Path: blob/main/py-polars/tests/unit/operations/namespaces/string/test_string.py
8396 views
from __future__ import annotations12from typing import Any34import pytest56import polars as pl7import polars.selectors as cs8from polars.exceptions import (9ColumnNotFoundError,10ComputeError,11InvalidOperationError,12PolarsInefficientMapWarning,13ShapeError,14)15from polars.testing import assert_frame_equal, assert_series_equal161718def test_str_slice() -> None:19df = pl.DataFrame({"a": ["foobar", "barfoo"]})20assert df["a"].str.slice(-3).to_list() == ["bar", "foo"]21assert df.select([pl.col("a").str.slice(2, 4)])["a"].to_list() == ["obar", "rfoo"]222324def test_str_slice_expr() -> None:25df = pl.DataFrame(26{27"a": ["foobar", None, "barfoo", "abcd", ""],28"offset": [1, 3, None, -3, 2],29"length": [3, 4, 2, None, 2],30}31)32out = df.select(33all_expr=pl.col("a").str.slice("offset", "length"),34offset_expr=pl.col("a").str.slice("offset", 2),35length_expr=pl.col("a").str.slice(0, "length"),36length_none=pl.col("a").str.slice("offset", None),37offset_length_lit=pl.col("a").str.slice(-3, 3),38str_lit=pl.lit("qwert").str.slice("offset", "length"),39)40expected = pl.DataFrame(41{42"all_expr": ["oob", None, None, "bcd", ""],43"offset_expr": ["oo", None, None, "bc", ""],44"length_expr": ["foo", None, "ba", "abcd", ""],45"length_none": ["oobar", None, None, "bcd", ""],46"offset_length_lit": ["bar", None, "foo", "bcd", ""],47"str_lit": ["wer", "rt", None, "ert", "er"],48}49)50assert_frame_equal(out, expected)5152# negative length is not allowed53with pytest.raises(InvalidOperationError):54df.select(pl.col("a").str.slice(0, -1))555657def test_str_slice_wrong_length() -> None:58df = pl.DataFrame({"num": ["-10", "-1", "0"]})59with pytest.raises(ShapeError):60df.select(pl.col("num").str.slice(pl.Series([1, 2])))616263@pytest.mark.parametrize(64("input", "n", "output"),65[66(["012345", "", None], 0, ["", "", None]),67(["012345", "", None], 2, ["01", "", None]),68(["012345", "", None], -2, ["0123", "", None]),69(["012345", "", None], 100, ["012345", "", None]),70(["012345", "", None], -100, ["", "", None]),71],72)73def test_str_head(input: list[str], n: int, output: list[str]) -> None:74assert pl.Series(input).str.head(n).to_list() == output757677@pytest.mark.parametrize(78("input", "n", "output"),79[80("你好世界", 0, ""),81("你好世界", 2, "你好"),82("你好世界", 999, "你好世界"),83("你好世界", -1, "你好世"),84("你好世界", -2, "你好"),85("你好世界", -999, ""),86],87)88def test_str_head_codepoints(input: str, n: int, output: str) -> None:89assert pl.Series([input]).str.head(n).to_list() == [output]909192def test_str_head_expr() -> None:93s = "012345"94df = pl.DataFrame(95{"a": [s, s, s, s, s, s, "", None], "n": [0, 2, -2, 100, -100, None, 3, -2]}96)97out = df.select(98n_expr=pl.col("a").str.head("n"),99n_pos2=pl.col("a").str.head(2),100n_neg2=pl.col("a").str.head(-2),101n_pos100=pl.col("a").str.head(100),102n_pos_neg100=pl.col("a").str.head(-100),103n_pos_0=pl.col("a").str.head(0),104str_lit=pl.col("a").str.head(pl.lit(2)),105lit_expr=pl.lit(s).str.head("n"),106lit_n=pl.lit(s).str.head(2),107)108expected = pl.DataFrame(109{110"n_expr": ["", "01", "0123", "012345", "", None, "", None],111"n_pos2": ["01", "01", "01", "01", "01", "01", "", None],112"n_neg2": ["0123", "0123", "0123", "0123", "0123", "0123", "", None],113"n_pos100": [s, s, s, s, s, s, "", None],114"n_pos_neg100": ["", "", "", "", "", "", "", None],115"n_pos_0": ["", "", "", "", "", "", "", None],116"str_lit": ["01", "01", "01", "01", "01", "01", "", None],117"lit_expr": ["", "01", "0123", "012345", "", None, "012", "0123"],118"lit_n": ["01", "01", "01", "01", "01", "01", "01", "01"],119}120)121assert_frame_equal(out, expected)122123124def test_str_head_wrong_length() -> None:125df = pl.DataFrame({"num": ["-10", "-1", "0"]})126with pytest.raises(ShapeError):127df.select(pl.col("num").str.head(pl.Series([1, 2])))128129130@pytest.mark.parametrize(131("input", "n", "output"),132[133(["012345", "", None], 0, ["", "", None]),134(["012345", "", None], 2, ["45", "", None]),135(["012345", "", None], -2, ["2345", "", None]),136(["012345", "", None], 100, ["012345", "", None]),137(["012345", "", None], -100, ["", "", None]),138],139)140def test_str_tail(input: list[str], n: int, output: list[str]) -> None:141assert pl.Series(input).str.tail(n).to_list() == output142143144@pytest.mark.parametrize(145("input", "n", "output"),146[147("你好世界", 0, ""),148("你好世界", 2, "世界"),149("你好世界", 999, "你好世界"),150("你好世界", -1, "好世界"),151("你好世界", -2, "世界"),152("你好世界", -999, ""),153],154)155def test_str_tail_codepoints(input: str, n: int, output: str) -> None:156assert pl.Series([input]).str.tail(n).to_list() == [output]157158159def test_str_tail_expr() -> None:160s = "012345"161df = pl.DataFrame(162{"a": [s, s, s, s, s, s, "", None], "n": [0, 2, -2, 100, -100, None, 3, -2]}163)164out = df.select(165n_expr=pl.col("a").str.tail("n"),166n_pos2=pl.col("a").str.tail(2),167n_neg2=pl.col("a").str.tail(-2),168n_pos100=pl.col("a").str.tail(100),169n_pos_neg100=pl.col("a").str.tail(-100),170n_pos_0=pl.col("a").str.tail(0),171str_lit=pl.col("a").str.tail(pl.lit(2)),172lit_expr=pl.lit(s).str.tail("n"),173lit_n=pl.lit(s).str.tail(2),174)175expected = pl.DataFrame(176{177"n_expr": ["", "45", "2345", "012345", "", None, "", None],178"n_pos2": ["45", "45", "45", "45", "45", "45", "", None],179"n_neg2": ["2345", "2345", "2345", "2345", "2345", "2345", "", None],180"n_pos100": [s, s, s, s, s, s, "", None],181"n_pos_neg100": ["", "", "", "", "", "", "", None],182"n_pos_0": ["", "", "", "", "", "", "", None],183"str_lit": ["45", "45", "45", "45", "45", "45", "", None],184"lit_expr": ["", "45", "2345", "012345", "", None, "345", "2345"],185"lit_n": ["45", "45", "45", "45", "45", "45", "45", "45"],186}187)188assert_frame_equal(out, expected)189190191def test_str_tail_wrong_length() -> None:192df = pl.DataFrame({"num": ["-10", "-1", "0"]})193with pytest.raises(ShapeError):194df.select(pl.col("num").str.tail(pl.Series([1, 2])))195196197def test_str_slice_multibyte() -> None:198ref = "你好世界"199s = pl.Series([ref])200201# Pad the string to simplify (negative) offsets starting before/after the string.202npad = 20203padref = "_" * npad + ref + "_" * npad204for start in range(-5, 6):205for length in range(6):206offset = npad + start if start >= 0 else npad + start + len(ref)207correct = padref[offset : offset + length].strip("_")208result = s.str.slice(start, length)209expected = pl.Series([correct])210assert_series_equal(result, expected)211212213def test_str_len_bytes() -> None:214s = pl.Series(["Café", None, "345", "東京"])215result = s.str.len_bytes()216expected = pl.Series([5, None, 3, 6], dtype=pl.UInt32)217assert_series_equal(result, expected)218219220def test_str_len_chars() -> None:221s = pl.Series(["Café", None, "345", "東京"])222result = s.str.len_chars()223expected = pl.Series([4, None, 3, 2], dtype=pl.UInt32)224assert_series_equal(result, expected)225226227def test_str_contains() -> None:228s = pl.Series(["messi", "ronaldo", "ibrahimovic"])229expected = pl.Series([True, False, False])230assert_series_equal(s.str.contains("mes"), expected)231232233def test_str_contains_wrong_length() -> None:234df = pl.DataFrame({"num": ["-10", "-1", "0"]})235with pytest.raises(ShapeError):236df.select(pl.col("num").str.contains(pl.Series(["a", "b"]))) # type: ignore [arg-type]237238239def test_count_match_literal() -> None:240s = pl.Series(["12 dbc 3xy", "cat\\w", "1zy3\\d\\d", None])241out = s.str.count_matches(r"\d", literal=True)242expected = pl.Series([0, 0, 2, None], dtype=pl.UInt32)243assert_series_equal(out, expected)244245out = s.str.count_matches(pl.Series([r"\w", r"\w", r"\d", r"\d"]), literal=True)246expected = pl.Series([0, 1, 2, None], dtype=pl.UInt32)247assert_series_equal(out, expected)248249250def test_str_encode() -> None:251s = pl.Series(["foo", "bar", None])252hex_encoded = pl.Series(["666f6f", "626172", None])253base64_encoded = pl.Series(["Zm9v", "YmFy", None])254255assert_series_equal(s.str.encode("hex"), hex_encoded)256assert_series_equal(s.str.encode("base64"), base64_encoded)257with pytest.raises(ValueError):258s.str.encode("utf8") # type: ignore[arg-type]259260261def test_str_decode() -> None:262hex_encoded = pl.Series(["666f6f", "626172", None])263base64_encoded = pl.Series(["Zm9v", "YmFy", None])264expected = pl.Series([b"foo", b"bar", None])265266assert_series_equal(hex_encoded.str.decode("hex"), expected)267assert_series_equal(base64_encoded.str.decode("base64"), expected)268269270def test_str_decode_exception() -> None:271s = pl.Series(["not a valid", "626172", None])272with pytest.raises(ComputeError):273s.str.decode(encoding="hex")274with pytest.raises(ComputeError):275s.str.decode(encoding="base64")276with pytest.raises(ValueError):277s.str.decode("utf8") # type: ignore[arg-type]278279280@pytest.mark.parametrize("strict", [True, False])281def test_str_find(strict: bool) -> None:282df = pl.DataFrame(283data=[284("Dubai", 3564931, "b[ai]", "ai"),285("Abu Dhabi", 1807000, "b[ai]", " "),286("Sharjah", 1405000, "[ai]n", "s"),287("Al Ain", 846747, "[ai]n", ""),288("Ajman", 490035, "[ai]n", "ma"),289("Ras Al Khaimah", 191753, "a.+a", "Kha"),290("Fujairah", 118933, "a.+a", None),291("Umm Al Quwain", 59098, "a.+a", "wa"),292(None, None, None, "n/a"),293],294schema={295"city": pl.String,296"population": pl.Int32,297"pat": pl.String,298"lit": pl.String,299},300orient="row",301)302city, pop, pat, lit = (pl.col(c) for c in ("city", "population", "pat", "lit"))303304for match_lit in (True, False):305res = df.select(306find_a_regex=city.str.find("(?i)a", strict=strict),307find_a_lit=city.str.find("a", literal=match_lit),308find_00_lit=pop.cast(pl.String).str.find("00", literal=match_lit),309find_col_lit=city.str.find(lit, strict=strict, literal=match_lit),310find_col_pat=city.str.find(pat, strict=strict),311)312assert res.to_dict(as_series=False) == {313"find_a_regex": [3, 0, 2, 0, 0, 1, 3, 4, None],314"find_a_lit": [3, 6, 2, None, 3, 1, 3, 10, None],315"find_00_lit": [None, 4, 4, None, 2, None, None, None, None],316"find_col_lit": [3, 3, None, 0, 2, 7, None, 9, None],317"find_col_pat": [2, 7, None, 4, 3, 1, 3, None, None],318}319320321def test_str_find_invalid_regex() -> None:322# test behaviour of 'strict' with invalid regular expressions323df = pl.DataFrame({"txt": ["AbCdEfG"]})324rx_invalid = "(?i)AB.))"325326with pytest.raises(ComputeError):327df.with_columns(pl.col("txt").str.find(rx_invalid, strict=True))328329res = df.with_columns(pl.col("txt").str.find(rx_invalid, strict=False))330assert res.item() is None331332333def test_str_find_escaped_chars() -> None:334# test behaviour of 'literal=True' with special chars335df = pl.DataFrame({"txt": ["123.*465", "x(x?)x"]})336337res = df.with_columns(338x1=pl.col("txt").str.find("(x?)", literal=True),339x2=pl.col("txt").str.find(".*4", literal=True),340x3=pl.col("txt").str.find("(x?)"),341x4=pl.col("txt").str.find(".*4"),342)343# ┌──────────┬──────┬──────┬─────┬──────┐344# │ txt ┆ x1 ┆ x2 ┆ x3 ┆ x4 │345# │ --- ┆ --- ┆ --- ┆ --- ┆ --- │346# │ str ┆ u32 ┆ u32 ┆ u32 ┆ u32 │347# ╞══════════╪══════╪══════╪═════╪══════╡348# │ 123.*465 ┆ null ┆ 3 ┆ 0 ┆ 0 │349# │ x(x?)x ┆ 1 ┆ null ┆ 0 ┆ null │350# └──────────┴──────┴──────┴─────┴──────┘351assert_frame_equal(352pl.DataFrame(353{354"txt": ["123.*465", "x(x?)x"],355"x1": [None, 1],356"x2": [3, None],357"x3": [0, 0],358"x4": [0, None],359}360).cast({cs.signed_integer(): pl.UInt32}),361res,362)363364365def test_str_find_wrong_length() -> None:366df = pl.DataFrame({"num": ["-10", "-1", "0"]})367with pytest.raises(ShapeError):368df.select(pl.col("num").str.find(pl.Series(["a", "b"]))) # type: ignore [arg-type]369370371def test_hex_decode_return_dtype() -> None:372data = {"a": ["68656c6c6f", "776f726c64"]}373expr = pl.col("a").str.decode("hex")374375df = pl.DataFrame(data).select(expr)376assert df.schema == {"a": pl.Binary}377378ldf = pl.LazyFrame(data).select(expr)379assert ldf.collect_schema() == {"a": pl.Binary}380381382def test_base64_decode_return_dtype() -> None:383data = {"a": ["Zm9v", "YmFy"]}384expr = pl.col("a").str.decode("base64")385386df = pl.DataFrame(data).select(expr)387assert df.schema == {"a": pl.Binary}388389ldf = pl.LazyFrame(data).select(expr)390assert ldf.collect_schema() == {"a": pl.Binary}391392393def test_str_replace_str_replace_all() -> None:394s = pl.Series(["hello", "world", "test", "rooted"])395expected = pl.Series(["hell0", "w0rld", "test", "r0oted"])396assert_series_equal(s.str.replace("o", "0"), expected)397398expected = pl.Series(["hell0", "w0rld", "test", "r00ted"])399assert_series_equal(s.str.replace_all("o", "0"), expected)400401402def test_str_replace_n_single() -> None:403s = pl.Series(["aba", "abaa"])404405assert s.str.replace("a", "b", n=1).to_list() == ["bba", "bbaa"]406assert s.str.replace("a", "b", n=2).to_list() == ["bbb", "bbba"]407assert s.str.replace("a", "b", n=3).to_list() == ["bbb", "bbbb"]408409410def test_str_replace_n_same_length() -> None:411# pat and val have the same length412# this triggers a fast path413s = pl.Series(["abfeab", "foobarabfooabab"])414assert s.str.replace("ab", "AB", n=1).to_list() == ["ABfeab", "foobarABfooabab"]415assert s.str.replace("ab", "AB", n=2).to_list() == ["ABfeAB", "foobarABfooABab"]416assert s.str.replace("ab", "AB", n=3).to_list() == ["ABfeAB", "foobarABfooABAB"]417418419def test_str_to_lowercase() -> None:420s = pl.Series(["Hello", "WORLD"])421expected = pl.Series(["hello", "world"])422assert_series_equal(s.str.to_lowercase(), expected)423424425def test_str_to_uppercase() -> None:426s = pl.Series(["Hello", "WORLD"])427expected = pl.Series(["HELLO", "WORLD"])428assert_series_equal(s.str.to_uppercase(), expected)429430431def test_str_case_cyrillic() -> None:432vals = ["Biтpyк", "Iвaн"]433s = pl.Series(vals)434assert s.str.to_lowercase().to_list() == [a.lower() for a in vals]435assert s.str.to_uppercase().to_list() == [a.upper() for a in vals]436437438def test_str_to_integer() -> None:439bin = pl.Series(["110", "101", "010"])440assert_series_equal(bin.str.to_integer(base=2), pl.Series([6, 5, 2]).cast(pl.Int64))441442hex = pl.Series(["fa1e", "ff00", "cafe", "invalid", None])443assert_series_equal(444hex.str.to_integer(base=16, strict=False),445pl.Series([64030, 65280, 51966, None, None]).cast(pl.Int64),446check_exact=True,447)448449with pytest.raises(ComputeError):450hex.str.to_integer(base=16)451452453@pytest.mark.parametrize("strict", [False, True])454def test_str_to_integer_invalid_base(strict: bool) -> None:455numbers = pl.Series(["1", "ZZZ", "-ABCZZZ", None])456with pytest.raises(ComputeError):457numbers.str.to_integer(base=100, strict=strict)458459df = pl.DataFrame({"str": numbers, "base": [0, 1, 100, None]})460with pytest.raises(ComputeError):461df.select(pl.col("str").str.to_integer(base=pl.col("base"), strict=strict))462463464def test_str_to_integer_base_expr() -> None:465df = pl.DataFrame(466{"str": ["110", "ff00", "234", None, "130"], "base": [2, 16, 10, 8, None]}467)468out = df.select(base_expr=pl.col("str").str.to_integer(base="base"))469expected = pl.DataFrame({"base_expr": [6, 65280, 234, None, None]})470assert_frame_equal(out, expected)471472# test strict raise473df = pl.DataFrame({"str": ["110", "ff00", "cafe", None], "base": [2, 10, 10, 8]})474475with pytest.raises(ComputeError):476df.select(pl.col("str").str.to_integer(base="base"))477478479def test_str_to_integer_base_literal() -> None:480df = pl.DataFrame(481{482"bin": ["110", "101", "-010", "invalid", None],483"hex": ["fa1e", "ff00", "cafe", "invalid", None],484}485)486result = df.with_columns(487pl.col("bin").str.to_integer(base=2, strict=False),488pl.col("hex").str.to_integer(base=16, strict=False),489)490491expected = pl.DataFrame(492{493"bin": [6, 5, -2, None, None],494"hex": [64030, 65280, 51966, None, None],495}496)497assert_frame_equal(result, expected)498499with pytest.raises(ComputeError):500df.with_columns(501pl.col("bin").str.to_integer(base=2),502pl.col("hex").str.to_integer(base=16),503)504505506def test_str_to_integer_dtype() -> None:507lf = pl.LazyFrame(508{509"str": ["1111111", "7f", "127", None, "42"],510"base": [2, 16, 10, 8, None],511}512)513out = lf.select(514i8=pl.col("str").str.to_integer(base="base", dtype=pl.Int8),515i16=pl.col("str").str.to_integer(base="base", dtype=pl.Int16),516i32=pl.col("str").str.to_integer(base="base", dtype=pl.Int32),517i64=pl.col("str").str.to_integer(base="base", dtype=pl.Int64),518u8=pl.col("str").str.to_integer(base="base", dtype=pl.UInt8),519u16=pl.col("str").str.to_integer(base="base", dtype=pl.UInt16),520u32=pl.col("str").str.to_integer(base="base", dtype=pl.UInt32),521u64=pl.col("str").str.to_integer(base="base", dtype=pl.UInt64),522default=pl.col("str").str.to_integer(base="base"),523).collect()524525expected = pl.DataFrame(526{527"i8": [127, 127, 127, None, None],528"i16": [127, 127, 127, None, None],529"i32": [127, 127, 127, None, None],530"i64": [127, 127, 127, None, None],531"u8": [127, 127, 127, None, None],532"u16": [127, 127, 127, None, None],533"u32": [127, 127, 127, None, None],534"u64": [127, 127, 127, None, None],535"default": [127, 127, 127, None, None],536},537schema={538"i8": pl.Int8,539"i16": pl.Int16,540"i32": pl.Int32,541"i64": pl.Int64,542"u8": pl.UInt8,543"u16": pl.UInt16,544"u32": pl.UInt32,545"u64": pl.UInt64,546"default": pl.Int64,547},548)549assert lf.collect_schema() == lf.collect().schema550assert_frame_equal(out, expected)551552553def test_str_to_integer_large() -> None:554df = pl.DataFrame(555{556"str": [557"-6129899454972456276923959272",558"1A44E53BFEBA967E6682FBB0",559"10100110111110110101110100000100110010101111000100011000000100010101010101101011111111101000",560None,561"7798994549724957734429272",562],563"base": [10, 16, 2, 8, None],564}565)566out = df.select(i128=pl.col("str").str.to_integer(base="base", dtype=pl.Int128))567expected = pl.DataFrame(568{569"i128": [570-6129899454972456276923959272,5718129899739726392769273592752,5723229899454972495776923959272,573None,574None,575]576},577schema={"i128": pl.Int128},578)579assert_frame_equal(out, expected)580581# test strict raise582df = pl.DataFrame(583{584"i128": [585"612989945497245627692395927261298994549724562769239592726129899454972456276923959272",586"1A44E53BFEBA967E6682FBB0",587"ABCDEFGHIJKLMNOPQRSTUVWXYZ",588"7798994549724957734429272",589None,590"7798994549724957734429272",591],592"base": [10, 2, 16, 10, 8, None],593}594)595596with pytest.raises(ComputeError):597df.select(pl.col("i128").str.to_integer(base="base", dtype=pl.Int128))598599600def test_str_strip_chars_expr() -> None:601df = pl.DataFrame(602{603"s": [" hello ", "^^world^^", "&&hi&&", " polars ", None],604"pat": [" ", "^", "&", None, "anything"],605}606)607608all_expr = df.select(609pl.col("s").str.strip_chars(pl.col("pat")).alias("strip_chars"),610pl.col("s").str.strip_chars_start(pl.col("pat")).alias("strip_chars_start"),611pl.col("s").str.strip_chars_end(pl.col("pat")).alias("strip_chars_end"),612)613614expected = pl.DataFrame(615{616"strip_chars": ["hello", "world", "hi", "polars", None],617"strip_chars_start": ["hello ", "world^^", "hi&&", "polars ", None],618"strip_chars_end": [" hello", "^^world", "&&hi", " polars", None],619}620)621622assert_frame_equal(all_expr, expected)623624strip_by_null = df.select(625pl.col("s").str.strip_chars(None).alias("strip_chars"),626pl.col("s").str.strip_chars_start(None).alias("strip_chars_start"),627pl.col("s").str.strip_chars_end(None).alias("strip_chars_end"),628)629630# only whitespace are striped.631expected = pl.DataFrame(632{633"strip_chars": ["hello", "^^world^^", "&&hi&&", "polars", None],634"strip_chars_start": ["hello ", "^^world^^", "&&hi&&", "polars ", None],635"strip_chars_end": [" hello", "^^world^^", "&&hi&&", " polars", None],636}637)638assert_frame_equal(strip_by_null, expected)639640641def test_str_strip_chars() -> None:642s = pl.Series([" hello ", "world\t "])643expected = pl.Series(["hello", "world"])644assert_series_equal(s.str.strip_chars(), expected)645646expected = pl.Series(["hell", "world"])647assert_series_equal(s.str.strip_chars().str.strip_chars("o"), expected)648649expected = pl.Series(["ell", "rld\t"])650assert_series_equal(s.str.strip_chars(" hwo"), expected)651652653def test_str_strip_chars_wrong_length() -> None:654df = pl.DataFrame({"num": ["-10", "-1", "0"]})655with pytest.raises(ShapeError):656df.select(pl.col("num").str.strip_chars(pl.Series(["a", "b"])))657658659def test_str_strip_chars_start() -> None:660s = pl.Series([" hello ", "\t world"])661expected = pl.Series(["hello ", "world"])662assert_series_equal(s.str.strip_chars_start(), expected)663664expected = pl.Series(["ello ", "world"])665assert_series_equal(s.str.strip_chars_start().str.strip_chars_start("h"), expected)666667expected = pl.Series(["ello ", "\t world"])668assert_series_equal(s.str.strip_chars_start("hw "), expected)669670671def test_str_strip_chars_start_wrong_length() -> None:672df = pl.DataFrame({"num": ["-10", "-1", "0"]})673with pytest.raises(ShapeError):674df.select(pl.col("num").str.strip_chars_start(pl.Series(["a", "b"])))675676677def test_str_strip_chars_end() -> None:678s = pl.Series([" hello ", "world\t "])679expected = pl.Series([" hello", "world"])680assert_series_equal(s.str.strip_chars_end(), expected)681682expected = pl.Series([" hell", "world"])683assert_series_equal(s.str.strip_chars_end().str.strip_chars_end("o"), expected)684685expected = pl.Series([" he", "wor"])686assert_series_equal(s.str.strip_chars_end("odl \t"), expected)687688689def test_str_strip_chars_end_wrong_length() -> None:690df = pl.DataFrame({"num": ["-10", "-1", "0"]})691with pytest.raises(ShapeError):692df.select(pl.col("num").str.strip_chars_end(pl.Series(["a", "b"])))693694695def test_str_strip_whitespace() -> None:696s = pl.Series("a", ["trailing ", " leading", " both "])697698expected = pl.Series("a", ["trailing", " leading", " both"])699assert_series_equal(s.str.strip_chars_end(), expected)700701expected = pl.Series("a", ["trailing ", "leading", "both "])702assert_series_equal(s.str.strip_chars_start(), expected)703704expected = pl.Series("a", ["trailing", "leading", "both"])705assert_series_equal(s.str.strip_chars(), expected)706707708def test_str_strip_prefix_literal() -> None:709s = pl.Series(["foo:bar", "foofoo:bar", "bar:bar", "foo", "", None])710expected = pl.Series([":bar", "foo:bar", "bar:bar", "", "", None])711assert_series_equal(s.str.strip_prefix("foo"), expected)712# test null literal713expected = pl.Series([None, None, None, None, None, None], dtype=pl.String)714assert_series_equal(s.str.strip_prefix(pl.lit(None, dtype=pl.String)), expected)715716717def test_str_strip_prefix_suffix_expr() -> None:718df = pl.DataFrame(719{720"s": ["foo-bar", "foobarbar", "barfoo", "", "anything", None],721"prefix": ["foo", "foobar", "foo", "", None, "bar"],722"suffix": ["bar", "barbar", "bar", "", None, "foo"],723}724)725out = df.select(726pl.col("s").str.strip_prefix(pl.col("prefix")).alias("strip_prefix"),727pl.col("s").str.strip_suffix(pl.col("suffix")).alias("strip_suffix"),728)729assert out.to_dict(as_series=False) == {730"strip_prefix": ["-bar", "bar", "barfoo", "", None, None],731"strip_suffix": ["foo-", "foo", "barfoo", "", None, None],732}733734735def test_str_strip_prefix_wrong_length() -> None:736df = pl.DataFrame({"num": ["-10", "-1", "0"]})737with pytest.raises(ShapeError):738df.select(pl.col("num").str.strip_prefix(pl.Series(["a", "b"])))739740741def test_str_strip_suffix() -> None:742s = pl.Series(["foo:bar", "foo:barbar", "foo:foo", "bar", "", None])743expected = pl.Series(["foo:", "foo:bar", "foo:foo", "", "", None])744assert_series_equal(s.str.strip_suffix("bar"), expected)745# test null literal746expected = pl.Series([None, None, None, None, None, None], dtype=pl.String)747assert_series_equal(s.str.strip_suffix(pl.lit(None, dtype=pl.String)), expected)748749750def test_str_strip_suffix_wrong_length() -> None:751df = pl.DataFrame({"num": ["-10", "-1", "0"]})752with pytest.raises(ShapeError):753df.select(pl.col("num").str.strip_suffix(pl.Series(["a", "b"])))754755756def test_str_split() -> None:757a = pl.Series("a", ["a, b", "a", "ab,c,de"])758for out in [a.str.split(","), pl.select(pl.lit(a).str.split(",")).to_series()]:759assert out[0].to_list() == ["a", " b"]760assert out[1].to_list() == ["a"]761assert out[2].to_list() == ["ab", "c", "de"]762763for out in [764a.str.split(",", inclusive=True),765pl.select(pl.lit(a).str.split(",", inclusive=True)).to_series(),766]:767assert out[0].to_list() == ["a,", " b"]768assert out[1].to_list() == ["a"]769assert out[2].to_list() == ["ab,", "c,", "de"]770771772def test_json_decode_series() -> None:773s = pl.Series(["[1, 2, 3]", None, "[4, 5, 6]"])774expected = pl.Series([[1, 2, 3], None, [4, 5, 6]])775dtype = pl.List(pl.Int64)776assert_series_equal(s.str.json_decode(None), expected)777assert_series_equal(s.str.json_decode(dtype), expected)778779s = pl.Series(['{"a": 1, "b": true}', None, '{"a": 2, "b": false}'])780expected = pl.Series([{"a": 1, "b": True}, None, {"a": 2, "b": False}])781dtype2 = pl.Struct([pl.Field("a", pl.Int64), pl.Field("b", pl.Boolean)])782assert_series_equal(s.str.json_decode(None), expected)783assert_series_equal(s.str.json_decode(dtype2), expected)784785expected = pl.Series([{"a": 1}, None, {"a": 2}])786dtype2 = pl.Struct([pl.Field("a", pl.Int64)])787assert_series_equal(s.str.json_decode(dtype2), expected)788789s = pl.Series([], dtype=pl.String)790expected = pl.Series([], dtype=pl.List(pl.Int64))791dtype = pl.List(pl.Int64)792assert_series_equal(s.str.json_decode(dtype), expected)793794795def test_json_decode_lazy_expr() -> None:796dtype = pl.Struct([pl.Field("a", pl.Int64), pl.Field("b", pl.Boolean)])797ldf = (798pl.DataFrame({"json": ['{"a": 1, "b": true}', None, '{"a": 2, "b": false}']})799.lazy()800.select(pl.col("json").str.json_decode(dtype))801)802expected = pl.DataFrame(803{"json": [{"a": 1, "b": True}, None, {"a": 2, "b": False}]}804).lazy()805assert ldf.collect_schema() == {"json": dtype}806assert_frame_equal(ldf, expected)807808809def test_json_decode_nested_struct() -> None:810json = [811'[{"key_1": "a"}]',812'[{"key_1": "a2", "key_2": 2}]',813'[{"key_1": "a3", "key_2": 3, "key_3": "c"}]',814]815s = pl.Series("json_str", json)816s_parsed = s.str.json_decode().rename("parsed_list_json")817818expected_dtype = pl.List(819pl.Struct(820[821pl.Field("key_1", pl.String),822pl.Field("key_2", pl.Int64),823pl.Field("key_3", pl.String),824]825)826)827assert s_parsed.dtype == expected_dtype828829key_1_values = s_parsed.to_frame().select(830pl.col("parsed_list_json")831.list.get(0)832.struct.field("key_1")833.alias("key_1_values")834)835expected_values = pl.Series("key_1_values", ["a", "a2", "a3"])836assert_series_equal(key_1_values.get_column("key_1_values"), expected_values)837838839def test_json_decode_primitive_to_list_11053() -> None:840df = pl.DataFrame(841{842"json": [843'{"col1": ["123"], "col2": "123"}',844'{"col1": ["xyz"], "col2": null}',845]846}847)848schema = pl.Struct(849{850"col1": pl.List(pl.String),851"col2": pl.List(pl.String),852}853)854855output = df.select(856pl.col("json").str.json_decode(schema).alias("decoded_json")857).unnest("decoded_json")858expected = pl.DataFrame({"col1": [["123"], ["xyz"]], "col2": [["123"], None]})859assert_frame_equal(output, expected)860861862def test_jsonpath_single() -> None:863s = pl.Series(['{"a":"1"}', None, '{"a":2}', '{"a":2.1}', '{"a":true}'])864expected = pl.Series(["1", None, "2", "2.1", "true"])865assert_series_equal(s.str.json_path_match("$.a"), expected)866867868def test_json_path_match() -> None:869df = pl.DataFrame(870{871"str": [872'{"a":"1"}',873None,874'{"b":2}',875'{"a":2.1, "b": "hello"}',876'{"a":true}',877],878"pat": ["$.a", "$.a", "$.b", "$.b", None],879}880)881out = df.select(882all_expr=pl.col("str").str.json_path_match(pl.col("pat")),883str_expr=pl.col("str").str.json_path_match("$.a"),884pat_expr=pl.lit('{"a": 1.1, "b": 10}').str.json_path_match(pl.col("pat")),885)886expected = pl.DataFrame(887{888"all_expr": ["1", None, "2", "hello", None],889"str_expr": ["1", None, None, "2.1", "true"],890"pat_expr": ["1.1", "1.1", "10", "10", None],891}892)893assert_frame_equal(out, expected)894895896def test_str_json_path_match_wrong_length() -> None:897df = pl.DataFrame({"num": ["-10", "-1", "0"]})898with pytest.raises((ShapeError, ComputeError)):899df.select(pl.col("num").str.json_path_match(pl.Series(["a", "b"])))900901902def test_extract_regex() -> None:903s = pl.Series(904[905"http://vote.com/ballon_dor?candidate=messi&ref=polars",906"http://vote.com/ballon_dor?candidat=jorginho&ref=polars",907"http://vote.com/ballon_dor?candidate=ronaldo&ref=polars",908]909)910expected = pl.Series(["messi", None, "ronaldo"])911assert_series_equal(s.str.extract(r"candidate=(\w+)", 1), expected)912913914def test_extract() -> None:915df = pl.DataFrame(916{917"s": ["aron123", "12butler", "charly*", "~david", None],918"pat": [r"^([a-zA-Z]+)", r"^(\d+)", None, "^(da)", r"(.*)"],919}920)921922out = df.select(923all_expr=pl.col("s").str.extract(pl.col("pat"), 1),924str_expr=pl.col("s").str.extract("^([a-zA-Z]+)", 1),925pat_expr=pl.lit("aron123").str.extract(pl.col("pat")),926)927expected = pl.DataFrame(928{929"all_expr": ["aron", "12", None, None, None],930"str_expr": ["aron", None, "charly", None, None],931"pat_expr": ["aron", None, None, None, "aron123"],932}933)934assert_frame_equal(out, expected)935936937def test_extract_binary() -> None:938df = pl.DataFrame({"foo": ["aron", "butler", "charly", "david"]})939out = df.filter(pl.col("foo").str.extract("^(a)", 1) == "a").to_series()940assert out[0] == "aron"941942943def test_str_join_returns_scalar() -> None:944df = pl.DataFrame(945[pl.Series("val", ["A", "B", "C", "D"]), pl.Series("id", [1, 1, 2, 2])]946)947grouped = (948df.group_by("id")949.agg(pl.col("val").str.join(delimiter=",").alias("grouped"))950.get_column("grouped")951)952assert grouped.dtype == pl.String953954955def test_contains() -> None:956# test strict/non strict957s_txt = pl.Series(["123", "456", "789"])958assert (959pl.Series([None, None, None]).cast(pl.Boolean).to_list()960== s_txt.str.contains("(not_valid_regex", literal=False, strict=False).to_list()961)962with pytest.raises(ComputeError):963s_txt.str.contains("(not_valid_regex", literal=False, strict=True)964assert (965pl.Series([True, False, False]).cast(pl.Boolean).to_list()966== s_txt.str.contains("1", literal=False, strict=False).to_list()967)968969df = pl.DataFrame(970data=[(1, "some * * text"), (2, "(with) special\n * chars"), (3, "**etc...?$")],971schema=["idx", "text"],972orient="row",973)974for pattern, as_literal, expected in (975(r"\* \*", False, [True, False, False]),976(r"* *", True, [True, False, False]),977(r"^\(", False, [False, True, False]),978(r"^\(", True, [False, False, False]),979(r"(", True, [False, True, False]),980(r"e", False, [True, True, True]),981(r"e", True, [True, True, True]),982(r"^\S+$", False, [False, False, True]),983(r"\?\$", False, [False, False, True]),984(r"?$", True, [False, False, True]),985):986# series987assert (988expected == df["text"].str.contains(pattern, literal=as_literal).to_list()989)990# frame select991assert (992expected993== df.select(pl.col("text").str.contains(pattern, literal=as_literal))[994"text"995].to_list()996)997# frame filter998assert sum(expected) == len(999df.filter(pl.col("text").str.contains(pattern, literal=as_literal))1000)100110021003def test_contains_expr() -> None:1004df = pl.DataFrame(1005{1006"text": [1007"some text",1008"(with) special\n .* chars",1009"**etc...?$",1010None,1011"b",1012"invalid_regex",1013],1014"pattern": [r"[me]", r".*", r"^\(", "a", None, "*"],1015}1016)10171018assert df.select(1019pl.col("text")1020.str.contains(pl.col("pattern"), literal=False, strict=False)1021.alias("contains"),1022pl.col("text")1023.str.contains(pl.col("pattern"), literal=True)1024.alias("contains_lit"),1025).to_dict(as_series=False) == {1026"contains": [True, True, False, None, None, None],1027"contains_lit": [False, True, False, None, None, False],1028}10291030with pytest.raises(ComputeError):1031df.select(1032pl.col("text").str.contains(pl.col("pattern"), literal=False, strict=True)1033)103410351036@pytest.mark.parametrize(1037("pattern", "case_insensitive", "expected"),1038[1039(["me"], False, True),1040(["Me"], False, False),1041(["Me"], True, True),1042(pl.Series(["me", "they"]), False, True),1043(pl.Series(["Me", "they"]), False, False),1044(pl.Series(["Me", "they"]), True, True),1045(["me", "they"], False, True),1046(["Me", "they"], False, False),1047(["Me", "they"], True, True),1048],1049)1050def test_contains_any(1051pattern: pl.Series | list[str],1052case_insensitive: bool,1053expected: bool,1054) -> None:1055df = pl.DataFrame({"text": ["Tell me what you want"]})1056# series1057assert (1058expected1059== df["text"]1060.str.contains_any(pattern, ascii_case_insensitive=case_insensitive)1061.item()1062)1063# expr1064assert (1065expected1066== df.select(1067pl.col("text").str.contains_any(1068pattern, ascii_case_insensitive=case_insensitive1069)1070)["text"].item()1071)1072# frame filter1073assert int(expected) == len(1074df.filter(1075pl.col("text").str.contains_any(1076pattern, ascii_case_insensitive=case_insensitive1077)1078)1079)108010811082def test_replace() -> None:1083df = pl.DataFrame(1084data=[(1, "* * text"), (2, "(with) special\n * chars **etc...?$")],1085schema=["idx", "text"],1086orient="row",1087)1088for pattern, replacement, as_literal, expected in (1089(r"\*", "-", False, ["- * text", "(with) special\n - chars **etc...?$"]),1090(r"*", "-", True, ["- * text", "(with) special\n - chars **etc...?$"]),1091(r"^\(", "[", False, ["* * text", "[with) special\n * chars **etc...?$"]),1092(r"^\(", "[", True, ["* * text", "(with) special\n * chars **etc...?$"]),1093(r"t$", "an", False, ["* * texan", "(with) special\n * chars **etc...?$"]),1094(r"t$", "an", True, ["* * text", "(with) special\n * chars **etc...?$"]),1095(r"(with) special", "$1", True, ["* * text", "$1\n * chars **etc...?$"]),1096(1097r"\((with)\) special",1098":$1:",1099False,1100["* * text", ":with:\n * chars **etc...?$"],1101),1102):1103# series1104assert (1105expected1106== df["text"]1107.str.replace(pattern, replacement, literal=as_literal)1108.to_list()1109)1110# expr1111assert (1112expected1113== df.select(1114pl.col("text").str.replace(pattern, replacement, literal=as_literal)1115)["text"].to_list()1116)11171118assert pl.Series(["."]).str.replace(".", "$0", literal=True)[0] == "$0"1119assert pl.Series(["(.)(?)"]).str.replace(".", "$1", literal=True)[0] == "($1)(?)"112011211122def test_replace_all() -> None:1123df = pl.DataFrame(1124data=[(1, "* * text"), (2, "(with) special\n * chars **etc...?$")],1125schema=["idx", "text"],1126orient="row",1127)1128for pattern, replacement, as_literal, expected in (1129(r"\*", "-", False, ["- - text", "(with) special\n - chars --etc...?$"]),1130(r"*", "-", True, ["- - text", "(with) special\n - chars --etc...?$"]),1131(r"\W", "", False, ["text", "withspecialcharsetc"]),1132(r".?$", "", True, ["* * text", "(with) special\n * chars **etc.."]),1133(1134r"(with) special",1135"$1",1136True,1137["* * text", "$1\n * chars **etc...?$"],1138),1139(1140r"\((with)\) special",1141":$1:",1142False,1143["* * text", ":with:\n * chars **etc...?$"],1144),1145(1146r"(\b)[\w\s]{2,}(\b)",1147"$1(blah)$3",1148False,1149["* * (blah)", "((blah)) (blah)\n * (blah) **(blah)...?$"],1150),1151):1152# series1153assert (1154expected1155== df["text"]1156.str.replace_all(pattern, replacement, literal=as_literal)1157.to_list()1158)1159# expr1160assert (1161expected1162== df.select(1163pl.col("text").str.replace_all(pattern, replacement, literal=as_literal)1164)["text"].to_list()1165)1166# invalid regex (but valid literal - requires "literal=True")1167with pytest.raises(ComputeError):1168df["text"].str.replace_all("*", "")11691170assert (1171pl.Series([r"(.)(\?)(\?)"]).str.replace_all("\\?", "$0", literal=True)[0]1172== "(.)($0)($0)"1173)1174assert (1175pl.Series([r"(.)(\?)(\?)"]).str.replace_all("\\?", "$0", literal=False)[0]1176== "(.)(\\?)(\\?)"1177)117811791180def test_replace_all_literal_no_captures() -> None:1181# When using literal = True, capture groups should be disabled11821183# Single row code path in Rust1184df = pl.DataFrame({"text": ["I found <amt> yesterday."], "amt": ["$1"]})1185df = df.with_columns(1186pl.col("text")1187.str.replace_all("<amt>", pl.col("amt"), literal=True)1188.alias("text2")1189)1190assert df.get_column("text2")[0] == "I found $1 yesterday."11911192# Multi-row code path in Rust1193df2 = pl.DataFrame(1194{1195"text": ["I found <amt> yesterday.", "I lost <amt> yesterday."],1196"amt": ["$1", "$2"],1197}1198)1199df2 = df2.with_columns(1200pl.col("text")1201.str.replace_all("<amt>", pl.col("amt"), literal=True)1202.alias("text2")1203)1204assert df2.get_column("text2")[0] == "I found $1 yesterday."1205assert df2.get_column("text2")[1] == "I lost $2 yesterday."120612071208def test_replace_literal_no_captures() -> None:1209# When using literal = True, capture groups should be disabled12101211# Single row code path in Rust1212df = pl.DataFrame({"text": ["I found <amt> yesterday."], "amt": ["$1"]})1213df = df.with_columns(1214pl.col("text").str.replace("<amt>", pl.col("amt"), literal=True).alias("text2")1215)1216assert df.get_column("text2")[0] == "I found $1 yesterday."12171218# Multi-row code path in Rust1219# A string shorter than 32 chars,1220# and one longer than 32 chars to test both sub-paths1221df2 = pl.DataFrame(1222{1223"text": [1224"I found <amt> yesterday.",1225"I lost <amt> yesterday and this string is longer than 32 characters.",1226],1227"amt": ["$1", "$2"],1228}1229)1230df2 = df2.with_columns(1231pl.col("text").str.replace("<amt>", pl.col("amt"), literal=True).alias("text2")1232)1233assert df2.get_column("text2")[0] == "I found $1 yesterday."1234assert (1235df2.get_column("text2")[1]1236== "I lost $2 yesterday and this string is longer than 32 characters."1237)123812391240def test_replace_expressions() -> None:1241df = pl.DataFrame({"foo": ["123 bla 45 asd", "xyz 678 910t"], "value": ["A", "B"]})1242out = df.select([pl.col("foo").str.replace(pl.col("foo").first(), pl.col("value"))])1243assert out.to_dict(as_series=False) == {"foo": ["A", "xyz 678 910t"]}1244out = df.select([pl.col("foo").str.replace(pl.col("foo").last(), "value")])1245assert out.to_dict(as_series=False) == {"foo": ["123 bla 45 asd", "value"]}12461247df = pl.DataFrame(1248{"foo": ["1 bla 45 asd", "xyz 6t"], "pat": [r"\d", r"\W"], "value": ["A", "B"]}1249)1250out = df.select([pl.col("foo").str.replace_all(pl.col("pat").first(), "value")])1251assert out.to_dict(as_series=False) == {1252"foo": ["value bla valuevalue asd", "xyz valuet"]1253}125412551256@pytest.mark.parametrize(1257("pattern", "replacement", "case_insensitive", "leftmost", "expected"),1258[1259(["say"], "", False, False, "Tell me what you want"),1260(["me"], ["them"], False, False, "Tell them what you want"),1261(["who"], ["them"], False, False, "Tell me what you want"),1262(["me", "you"], "it", False, False, "Tell it what it want"),1263(["Me", "you"], "it", False, False, "Tell me what it want"),1264(["me", "you"], ["it"], False, False, "Tell it what it want"),1265(["me", "you"], ["you", "me"], False, False, "Tell you what me want"),1266(["me", "You", "them"], "it", False, False, "Tell it what you want"),1267(["Me", "you"], "it", True, False, "Tell it what it want"),1268(["me", "YOU"], ["you", "me"], True, False, "Tell you what me want"),1269(1270pl.Series(["me", "YOU"]),1271["you", "me"],1272False,1273False,1274"Tell you what you want",1275),1276(pl.Series(["me", "YOU"]), ["you", "me"], True, False, "Tell you what me want"),1277(1278["Tell me", "Tell"],1279["Don't tell", "Text"],1280False,1281False,1282"Text me what you want",1283),1284(1285["Tell me", "Tell"],1286["Don't tell me", "Text"],1287False,1288True,1289"Don't tell me what you want",1290),1291],1292)1293def test_replace_many(1294pattern: pl.Series | list[str],1295replacement: pl.Series | list[str] | str,1296case_insensitive: bool,1297leftmost: bool,1298expected: str,1299) -> None:1300df = pl.DataFrame({"text": ["Tell me what you want"]})1301# series1302val = (1303df["text"]1304.str.replace_many(1305pattern,1306replacement,1307ascii_case_insensitive=case_insensitive,1308leftmost=leftmost,1309)1310.item()1311)1312assert expected == val, val1313# expr1314val = df.select(1315pl.col("text").str.replace_many(1316pattern,1317replacement,1318ascii_case_insensitive=case_insensitive,1319leftmost=leftmost,1320)1321).item()1322assert expected == val, val132313241325def test_replace_many_groupby() -> None:1326df = pl.DataFrame(1327{1328"x": ["a", "b", "c", "d", "e", "f", "g", "h", "i"],1329"g": [0, 0, 0, 1, 1, 1, 2, 2, 2],1330}1331)1332out = df.group_by("g").agg(pl.col.x.str.replace_many(pl.col.x.head(2), ""))1333expected = pl.DataFrame(1334{1335"g": [0, 1, 2],1336"x": [["", "", "c"], ["", "", "f"], ["", "", "i"]],1337}1338)1339assert_frame_equal(out, expected, check_row_order=False)134013411342@pytest.mark.parametrize(1343("mapping", "case_insensitive", "expected"),1344[1345({}, False, "Tell me what you want"),1346({"me": "them"}, False, "Tell them what you want"),1347({"who": "them"}, False, "Tell me what you want"),1348({"me": "it", "you": "it"}, False, "Tell it what it want"),1349({"Me": "it", "you": "it"}, False, "Tell me what it want"),1350({"me": "you", "you": "me"}, False, "Tell you what me want"),1351({}, True, "Tell me what you want"),1352({"Me": "it", "you": "it"}, True, "Tell it what it want"),1353({"me": "you", "YOU": "me"}, True, "Tell you what me want"),1354],1355)1356def test_replace_many_mapping(1357mapping: dict[str, str],1358case_insensitive: bool,1359expected: str,1360) -> None:1361df = pl.DataFrame({"text": ["Tell me what you want"]})1362# series1363assert (1364expected1365== df["text"]1366.str.replace_many(mapping, ascii_case_insensitive=case_insensitive)1367.item()1368)1369# expr1370assert (1371expected1372== df.select(1373pl.col("text").str.replace_many(1374mapping,1375ascii_case_insensitive=case_insensitive,1376)1377).item()1378)137913801381def test_replace_many_invalid_inputs() -> None:1382df = pl.DataFrame({"text": ["Tell me what you want"]})13831384# Ensure a string as the first argument is parsed as a column name.1385with pytest.raises(ColumnNotFoundError, match="me"):1386df.select(pl.col("text").str.replace_many("me", "you"))13871388with pytest.raises(InvalidOperationError):1389df.select(pl.col("text").str.replace_many(1, 2))13901391with pytest.raises(InvalidOperationError):1392df.select(pl.col("text").str.replace_many([1], [2]))13931394with pytest.raises(InvalidOperationError):1395df.select(pl.col("text").str.replace_many(["me"], None))13961397with pytest.raises(TypeError):1398df.select(pl.col("text").str.replace_many(["me"]))13991400with pytest.raises(1401InvalidOperationError,1402match="expected the same amount of patterns as replacement strings",1403):1404df.select(pl.col("text").str.replace_many(["a"], ["b", "c"]))14051406s = df.to_series()14071408with pytest.raises(ColumnNotFoundError, match="me"):1409s.str.replace_many("me", "you") # type: ignore[arg-type]14101411with pytest.raises(TypeError):1412df.select(pl.col("text").str.replace_many(["me"]))14131414with pytest.raises(1415InvalidOperationError,1416match="expected the same amount of patterns as replacement strings",1417):1418s.str.replace_many(["a"], ["b", "c"])141914201421def test_extract_all_count() -> None:1422df = pl.DataFrame({"foo": ["123 bla 45 asd", "xaz 678 910t", "boo", None]})1423assert (1424df.select(1425pl.col("foo").str.extract_all(r"a").alias("extract"),1426pl.col("foo").str.count_matches(r"a").alias("count"),1427).to_dict(as_series=False)1428) == {"extract": [["a", "a"], ["a"], [], None], "count": [2, 1, 0, None]}14291430assert df["foo"].str.extract_all(r"a").dtype == pl.List1431assert df["foo"].str.count_matches(r"a").dtype == pl.UInt32143214331434def test_count_matches_many() -> None:1435df = pl.DataFrame(1436{1437"foo": ["123 bla 45 asd", "xyz 678 910t", None, "boo"],1438"bar": [r"\d", r"[a-z]", r"\d", None],1439}1440)1441assert (1442df.select(1443pl.col("foo").str.count_matches(pl.col("bar")).alias("count")1444).to_dict(as_series=False)1445) == {"count": [5, 4, None, None]}14461447assert df["foo"].str.count_matches(df["bar"]).dtype == pl.UInt3214481449# Test broadcast.1450broad = df.select(1451pl.col("foo").str.count_matches(pl.col("bar").first()).alias("count"),1452pl.col("foo").str.count_matches(pl.col("bar").last()).alias("count_null"),1453)1454assert broad.to_dict(as_series=False) == {1455"count": [5, 6, None, 0],1456"count_null": [None, None, None, None],1457}1458assert broad.schema == {"count": pl.UInt32, "count_null": pl.UInt32}145914601461def test_extract_all_many() -> None:1462df = pl.DataFrame(1463{1464"foo": ["ab", "abc", "abcd", "foo", None, "boo"],1465"re": ["a", "bc", "a.c", "a", "a", None],1466}1467)1468assert df["foo"].str.extract_all(df["re"]).to_list() == [1469["a"],1470["bc"],1471["abc"],1472[],1473None,1474None,1475]14761477# Test broadcast.1478broad = df.select(1479pl.col("foo").str.extract_all(pl.col("re").first()).alias("a"),1480pl.col("foo").str.extract_all(pl.col("re").last()).alias("null"),1481)1482assert broad.to_dict(as_series=False) == {1483"a": [["a"], ["a"], ["a"], [], None, []],1484"null": [None] * 6,1485}1486assert broad.schema == {"a": pl.List(pl.String), "null": pl.List(pl.String)}148714881489@pytest.mark.may_fail_cloud # reason: zero-field struct1490def test_extract_groups_empty() -> None:1491df = pl.DataFrame({"iso_code": ["ISO 80000-1:2009", "ISO/IEC/IEEE 29148:2018"]})14921493assert df.select(pl.col("iso_code").str.extract_groups("")).to_dict(1494as_series=False1495) == {"iso_code": [{}, {}]}14961497q = df.lazy().select(pl.col("iso_code").str.extract_groups(""))1498assert q.collect_schema() == q.collect().schema149915001501def test_extract_groups() -> None:1502def _named_groups_builder(pattern: str, groups: dict[str, str]) -> str:1503return pattern.format(1504**{name: f"(?<{name}>{value})" for name, value in groups.items()}1505)15061507expected = {1508"authority": ["ISO", "ISO/IEC/IEEE"],1509"spec_num": ["80000", "29148"],1510"part_num": ["1", None],1511"revision_year": ["2009", "2018"],1512}15131514pattern = _named_groups_builder(1515r"{authority}\s{spec_num}(?:-{part_num})?(?::{revision_year})",1516{1517"authority": r"^ISO(?:/[A-Z]+)*",1518"spec_num": r"\d+",1519"part_num": r"\d+",1520"revision_year": r"\d{4}",1521},1522)15231524df = pl.DataFrame({"iso_code": ["ISO 80000-1:2009", "ISO/IEC/IEEE 29148:2018"]})15251526assert (1527df.select(pl.col("iso_code").str.extract_groups(pattern))1528.unnest("iso_code")1529.to_dict(as_series=False)1530== expected1531)15321533assert df.select(1534pl.col("iso_code").str.extract_groups(r"\A(ISO\S*).*?(\d+)")1535).to_dict(as_series=False) == {1536"iso_code": [{"1": "ISO", "2": "80000"}, {"1": "ISO/IEC/IEEE", "2": "29148"}]1537}15381539assert df.select(1540pl.col("iso_code").str.extract_groups(r"\A(ISO\S*).*?(?<year>\d+)\z")1541).to_dict(as_series=False) == {1542"iso_code": [1543{"1": "ISO", "year": "2009"},1544{"1": "ISO/IEC/IEEE", "year": "2018"},1545]1546}15471548assert pl.select(1549pl.lit(r"foobar").str.extract_groups(r"(?<foo>.{3})|(?<bar>...)")1550).to_dict(as_series=False) == {"literal": [{"foo": "foo", "bar": None}]}155115521553def test_starts_ends_with() -> None:1554df = pl.DataFrame(1555{1556"a": ["hamburger_with_tomatoes", "nuts", "lollypop", None],1557"sub": ["ham", "ts", None, "anything"],1558}1559)15601561assert df.select(1562pl.col("a").str.ends_with("pop").alias("ends_pop"),1563pl.col("a").str.ends_with(pl.lit(None)).alias("ends_None"),1564pl.col("a").str.ends_with(pl.col("sub")).alias("ends_sub"),1565pl.col("a").str.starts_with("ham").alias("starts_ham"),1566pl.col("a").str.starts_with(pl.lit(None)).alias("starts_None"),1567pl.col("a").str.starts_with(pl.col("sub")).alias("starts_sub"),1568).to_dict(as_series=False) == {1569"ends_pop": [False, False, True, None],1570"ends_None": [None, None, None, None],1571"ends_sub": [False, True, None, None],1572"starts_ham": [True, False, False, None],1573"starts_None": [None, None, None, None],1574"starts_sub": [True, False, None, None],1575}157615771578def test_json_path_match_type_4905() -> None:1579df = pl.DataFrame({"json_val": ['{"a":"hello"}', None, '{"a":"world"}']})1580assert df.filter(1581pl.col("json_val").str.json_path_match("$.a").is_in(["hello"])1582).to_dict(as_series=False) == {"json_val": ['{"a":"hello"}']}158315841585def test_decode_strict() -> None:1586df = pl.DataFrame(1587{"strings": ["0IbQvTc3", "0J%2FQldCf0JA%3D", "0J%2FRgNC%2B0YHRgtC%2B"]}1588)1589result = df.select(pl.col("strings").str.decode("base64", strict=False))1590expected = {"strings": [b"\xd0\x86\xd0\xbd77", None, None]}1591assert result.to_dict(as_series=False) == expected15921593with pytest.raises(ComputeError):1594df.select(pl.col("strings").str.decode("base64", strict=True))159515961597def test_split() -> None:1598df = pl.DataFrame({"x": ["a_a", None, "b", "c_c_c", ""]})1599out = df.select([pl.col("x").str.split("_")])16001601expected = pl.DataFrame(1602[1603{"x": ["a", "a"]},1604{"x": None},1605{"x": ["b"]},1606{"x": ["c", "c", "c"]},1607{"x": [""]},1608]1609)16101611assert_frame_equal(out, expected)1612assert_frame_equal(df["x"].str.split("_").to_frame(), expected)16131614out = df.select([pl.col("x").str.split("_", inclusive=True)])16151616expected = pl.DataFrame(1617[1618{"x": ["a_", "a"]},1619{"x": None},1620{"x": ["b"]},1621{"x": ["c_", "c_", "c"]},1622{"x": []},1623]1624)16251626assert_frame_equal(out, expected)1627assert_frame_equal(df["x"].str.split("_", inclusive=True).to_frame(), expected)16281629out = df.select([pl.col("x").str.split("")])16301631expected = pl.DataFrame(1632[1633{"x": ["a", "_", "a"]},1634{"x": None},1635{"x": ["b"]},1636{"x": ["c", "_", "c", "_", "c"]},1637{"x": []},1638]1639)16401641assert_frame_equal(out, expected)1642assert_frame_equal(df["x"].str.split("").to_frame(), expected)16431644out = df.select([pl.col("x").str.split("", inclusive=True)])16451646expected = pl.DataFrame(1647[1648{"x": ["a", "_", "a"]},1649{"x": None},1650{"x": ["b"]},1651{"x": ["c", "_", "c", "_", "c"]},1652{"x": []},1653]1654)16551656assert_frame_equal(out, expected)1657assert_frame_equal(df["x"].str.split("", inclusive=True).to_frame(), expected)16581659plan = (1660df.lazy()1661.select(1662a=pl.col("x").str.split(" ", inclusive=False),1663b=pl.col("x").str.split_exact(" ", 1, inclusive=False),1664)1665.explain()1666)16671668assert "str.split(" in plan1669assert "str.split_exact(" in plan16701671plan = (1672df.lazy()1673.select(1674a=pl.col("x").str.split(" ", inclusive=True),1675b=pl.col("x").str.split_exact(" ", 1, inclusive=True),1676)1677.explain()1678)16791680assert "str.split_inclusive(" in plan1681assert "str.split_exact_inclusive(" in plan168216831684def test_split_expr() -> None:1685df = pl.DataFrame(1686{1687"x": ["a_a", None, "b", "c*c*c", "dddd", ""],1688"by": ["_", "#", "^", "*", "", ""],1689}1690)1691out = df.select([pl.col("x").str.split(pl.col("by"))])1692expected = pl.DataFrame(1693[1694{"x": ["a", "a"]},1695{"x": None},1696{"x": ["b"]},1697{"x": ["c", "c", "c"]},1698{"x": ["d", "d", "d", "d"]},1699{"x": []},1700]1701)1702assert_frame_equal(out, expected)17031704out = df.select([pl.col("x").str.split(pl.col("by"), inclusive=True)])1705expected = pl.DataFrame(1706[1707{"x": ["a_", "a"]},1708{"x": None},1709{"x": ["b"]},1710{"x": ["c*", "c*", "c"]},1711{"x": ["d", "d", "d", "d"]},1712{"x": []},1713]1714)1715assert_frame_equal(out, expected)171617171718def test_split_exact() -> None:1719df = pl.DataFrame({"x": ["a_a", None, "b", "c_c", ""]})1720out = df.select([pl.col("x").str.split_exact("_", 2, inclusive=False)]).unnest("x")17211722expected = pl.DataFrame(1723{1724"field_0": ["a", None, "b", "c", ""],1725"field_1": ["a", None, None, "c", None],1726"field_2": pl.Series([None, None, None, None, None], dtype=pl.String),1727}1728)17291730assert_frame_equal(out, expected)1731out2 = df["x"].str.split_exact("_", 2, inclusive=False).to_frame().unnest("x")1732assert_frame_equal(out2, expected)17331734out = df.select([pl.col("x").str.split_exact("_", 1, inclusive=True)]).unnest("x")17351736expected = pl.DataFrame(1737{1738"field_0": ["a_", None, "b", "c_", None],1739"field_1": ["a", None, None, "c", None],1740}1741)1742assert_frame_equal(out, expected)1743assert df["x"].str.split_exact("_", 1).dtype == pl.Struct1744assert df["x"].str.split_exact("_", 1, inclusive=False).dtype == pl.Struct17451746out = df.select([pl.col("x").str.split_exact("", 1)]).unnest("x")17471748expected = pl.DataFrame(1749{1750"field_0": ["a", None, "b", "c", None],1751"field_1": ["_", None, None, "_", None],1752}1753)1754assert_frame_equal(out, expected)17551756out = df.select([pl.col("x").str.split_exact("", 1, inclusive=True)]).unnest("x")17571758expected = pl.DataFrame(1759{1760"field_0": ["a", None, "b", "c", None],1761"field_1": ["_", None, None, "_", None],1762}1763)1764assert_frame_equal(out, expected)176517661767def test_split_exact_expr() -> None:1768df = pl.DataFrame(1769{1770"x": ["a_a", None, "b", "c^c^c", "d#d", "eeee", ""],1771"by": ["_", "&", "$", "^", None, "", ""],1772}1773)17741775out = df.select(1776pl.col("x").str.split_exact(pl.col("by"), 2, inclusive=False)1777).unnest("x")17781779expected = pl.DataFrame(1780{1781"field_0": ["a", None, "b", "c", None, "e", None],1782"field_1": ["a", None, None, "c", None, "e", None],1783"field_2": pl.Series(1784[None, None, None, "c", None, "e", None], dtype=pl.String1785),1786}1787)17881789assert_frame_equal(out, expected)17901791out2 = df.select(1792pl.col("x").str.split_exact(pl.col("by"), 2, inclusive=True)1793).unnest("x")17941795expected2 = pl.DataFrame(1796{1797"field_0": ["a_", None, "b", "c^", None, "e", None],1798"field_1": ["a", None, None, "c^", None, "e", None],1799"field_2": pl.Series(1800[None, None, None, "c", None, "e", None], dtype=pl.String1801),1802}1803)1804assert_frame_equal(out2, expected2)180518061807def test_splitn() -> None:1808df = pl.DataFrame({"x": ["a_a", None, "b", "c_c_c", ""]})1809out = df.select([pl.col("x").str.splitn("_", 2)]).unnest("x")18101811expected = pl.DataFrame(1812{1813"field_0": ["a", None, "b", "c", ""],1814"field_1": ["a", None, None, "c_c", None],1815}1816)18171818assert_frame_equal(out, expected)1819assert_frame_equal(df["x"].str.splitn("_", 2).to_frame().unnest("x"), expected)18201821out = df.select([pl.col("x").str.splitn("", 2)]).unnest("x")18221823expected = pl.DataFrame(1824{1825"field_0": ["a", None, "b", "c", None],1826"field_1": ["_a", None, None, "_c_c", None],1827}1828)18291830assert_frame_equal(out, expected)1831assert_frame_equal(df["x"].str.splitn("", 2).to_frame().unnest("x"), expected)183218331834def test_splitn_expr() -> None:1835df = pl.DataFrame(1836{1837"x": ["a_a", None, "b", "c^c^c", "d#d", "eeee", ""],1838"by": ["_", "&", "$", "^", None, "", ""],1839}1840)18411842out = df.select(pl.col("x").str.splitn(pl.col("by"), 2)).unnest("x")18431844expected = pl.DataFrame(1845{1846"field_0": ["a", None, "b", "c", None, "e", None],1847"field_1": ["a", None, None, "c^c", None, "eee", None],1848}1849)18501851assert_frame_equal(out, expected)185218531854def test_titlecase() -> None:1855df = pl.DataFrame(1856{1857"quotes": [1858"'e.t. phone home'",1859"you talkin' to me?",1860"i feel the need--the need for speed",1861"to infinity,and BEYOND!",1862"say 'what' again!i dare you - I\u00a0double-dare you!",1863"What.we.got.here... is#failure#to#communicate",1864"welcome to my world",1865"double space",1866"and\ta\t tab",1867"by jean-paul sartre, 'esq'",1868"SOMETIMES/life/gives/you/a/2nd/chance",1869]1870}1871)18721873with pytest.warns(PolarsInefficientMapWarning):1874assert_frame_equal(1875df.select(pl.col("quotes").str.to_titlecase()),1876df.select(pl.col("quotes").map_elements(lambda s: s.title())),1877)187818791880def test_string_replace_with_nulls_10124() -> None:1881df = pl.DataFrame({"col1": ["S", "S", "S", None, "S", "S", "S", "S"]})18821883assert df.select(1884pl.col("col1"),1885pl.col("col1").str.replace("S", "O", n=1).alias("n_1"),1886pl.col("col1").str.replace("S", "O", n=3).alias("n_3"),1887).to_dict(as_series=False) == {1888"col1": ["S", "S", "S", None, "S", "S", "S", "S"],1889"n_1": ["O", "O", "O", None, "O", "O", "O", "O"],1890"n_3": ["O", "O", "O", None, "O", "O", "O", "O"],1891}189218931894def test_string_extract_groups_lazy_schema_10305() -> None:1895df = pl.LazyFrame(1896data={1897"url": [1898"http://vote.com/ballon_dor?candidate=messi&ref=python",1899"http://vote.com/ballon_dor?candidate=weghorst&ref=polars",1900"http://vote.com/ballon_dor?error=404&ref=rust",1901]1902}1903)1904pattern = r"candidate=(?<candidate>\w+)&ref=(?<ref>\w+)"1905df = df.select(captures=pl.col("url").str.extract_groups(pattern)).unnest(1906"captures"1907)19081909assert df.collect_schema() == {"candidate": pl.String, "ref": pl.String}191019111912def test_string_reverse() -> None:1913df = pl.DataFrame(1914{1915"text": [None, "foo", "bar", "i like pizza&#", None, "man\u0303ana"],1916}1917)1918expected = pl.DataFrame(1919[1920pl.Series(1921"text",1922[None, "oof", "rab", "#&azzip ekil i", None, "anan\u0303am"],1923dtype=pl.String,1924),1925]1926)19271928result = df.select(pl.col("text").str.reverse())1929assert_frame_equal(result, expected)193019311932@pytest.mark.parametrize(1933("data", "expected_data"),1934[1935(["", None, "a"], ["", None, "b"]),1936([None, None, "a"], [None, None, "b"]),1937(["", "", ""], ["", "", ""]),1938([None, None, None], [None, None, None]),1939(["a", "", None], ["b", "", None]),1940],1941)1942def test_replace_lit_n_char_13385(1943data: list[str | None], expected_data: list[str | None]1944) -> None:1945s = pl.Series(data, dtype=pl.String)1946res = s.str.replace("a", "b", literal=True)1947expected_s = pl.Series(expected_data, dtype=pl.String)1948assert_series_equal(res, expected_s)194919501951def test_find_many_raises() -> None:1952df = pl.DataFrame({"values": ["discontent", "foobar"]})1953patterns = ["winter", "disco", "onte", "discontent"]1954with pytest.raises(1955ValueError, match="can not match overlapping patterns when leftmost == True"1956):1957df.select(1958pl.col("values").str.find_many(patterns, leftmost=True, overlapping=True)1959)196019611962def test_extract_many_raises() -> None:1963df = pl.DataFrame({"values": ["discontent", "foobar"]})1964patterns = ["winter", "disco", "onte", "discontent"]1965with pytest.raises(1966ValueError, match="can not match overlapping patterns when leftmost == True"1967):1968df.select(1969pl.col("values").str.extract_many(patterns, leftmost=True, overlapping=True)1970)197119721973def test_extract_many() -> None:1974df = pl.DataFrame({"values": ["discontent", "foobar"]})1975patterns = ["winter", "disco", "onte", "discontent"]1976assert df.with_columns(1977pl.col("values").str.extract_many(patterns, overlapping=False).alias("matches"),1978pl.col("values")1979.str.extract_many(patterns, overlapping=True)1980.alias("matches_overlapping"),1981).to_dict(as_series=False) == {1982"values": ["discontent", "foobar"],1983"matches": [["disco"], []],1984"matches_overlapping": [["disco", "onte", "discontent"], []],1985}19861987# many patterns1988df = pl.DataFrame(1989{1990"values": ["discontent", "rhapsody"],1991"patterns": [1992["winter", "disco", "onte", "discontent"],1993["rhap", "ody", "coalesce"],1994],1995}1996)19971998# extract_many1999assert df.select(pl.col("values").str.extract_many("patterns")).to_dict(2000as_series=False2001) == {"values": [["disco"], ["rhap", "ody"]]}20022003# find_many2004f1 = df.select(pl.col("values").str.find_many("patterns"))2005f2 = df["values"].str.find_many(df["patterns"])20062007assert_series_equal(f1["values"], f2)2008assert f2.to_list() == [[0], [0, 5]]200920102011def test_json_decode_raise_on_data_type_mismatch_13061() -> None:2012assert_series_equal(2013pl.Series(["null", "null"]).str.json_decode(infer_schema_length=1),2014pl.Series([None, None]),2015)20162017with pytest.raises(ComputeError):2018pl.Series(["null", "1"]).str.json_decode(infer_schema_length=1)20192020assert_series_equal(2021pl.Series(["null", "1"]).str.json_decode(infer_schema_length=2),2022pl.Series([None, 1]),2023)202420252026def test_json_decode_struct_schema() -> None:2027with pytest.raises(ComputeError, match="extra field in struct data: b"):2028pl.Series([r'{"a": 1}', r'{"a": 2, "b": 2}']).str.json_decode(2029infer_schema_length=12030)20312032assert_series_equal(2033pl.Series([r'{"a": 1}', r'{"a": 2, "b": 2}']).str.json_decode(2034infer_schema_length=22035),2036pl.Series([{"a": 1, "b": None}, {"a": 2, "b": 2}]),2037)20382039# If the schema was explicitly given, then we ignore extra fields.2040# TODO: There should be a `columns=` parameter to this.2041assert_series_equal(2042pl.Series([r'{"a": 1}', r'{"a": 2, "b": 2}']).str.json_decode(2043dtype=pl.Struct({"a": pl.Int64})2044),2045pl.Series([{"a": 1}, {"a": 2}]),2046)204720482049def test_escape_regex() -> None:2050df = pl.DataFrame({"text": ["abc", "def", None, "abc(\\w+)"]})2051result_df = df.with_columns(pl.col("text").str.escape_regex().alias("escaped"))2052expected_df = pl.DataFrame(2053{2054"text": ["abc", "def", None, "abc(\\w+)"],2055"escaped": ["abc", "def", None, "abc\\(\\\\w\\+\\)"],2056}2057)20582059assert_frame_equal(result_df, expected_df)2060assert_series_equal(result_df["escaped"], expected_df["escaped"])206120622063@pytest.mark.parametrize(2064("form", "expected_data"),2065[2066("NFC", ["01²", "KADOKAWA"]), # noqa: RUF0012067("NFD", ["01²", "KADOKAWA"]), # noqa: RUF0012068("NFKC", ["012", "KADOKAWA"]),2069("NFKD", ["012", "KADOKAWA"]),2070],2071)2072def test_string_normalize(form: Any, expected_data: list[str | None]) -> None:2073s = pl.Series(["01²", "KADOKAWA"], dtype=pl.String) # noqa: RUF0012074res = s.str.normalize(form)2075expected_s = pl.Series(expected_data, dtype=pl.String)2076assert_series_equal(res, expected_s)207720782079def test_string_normalize_wrong_input() -> None:2080with pytest.raises(ValueError, match="`form` must be one of"):2081pl.Series(["01²"], dtype=pl.String).str.normalize("foobar") # type: ignore[arg-type]208220832084def test_to_integer_unequal_lengths_22034() -> None:2085s = pl.Series("a", ["1", "2", "3"], pl.String)2086with pytest.raises(pl.exceptions.ShapeError):2087s.str.to_integer(base=pl.Series([4, 5, 5, 4]))208820892090def test_broadcast_self() -> None:2091s = pl.Series("a", ["3"], pl.String)2092with pytest.raises(2093pl.exceptions.ComputeError, match="strict integer parsing failed"2094):2095s.str.to_integer(base=pl.Series([2, 2, 3, 4]))209620972098def test_strptime_unequal_length_22018() -> None:2099s = pl.Series(["2020-01-01 01:00Z", "2020-01-01 02:00Z"])2100with pytest.raises(pl.exceptions.ShapeError):2101s.str.strptime(2102pl.Datetime, "%Y-%m-%d %H:%M%#z", ambiguous=pl.Series(["a", "b", "d"])2103)210421052106@pytest.mark.parametrize("inclusive", [False, True])2107def test_str_split_unequal_length_22018(inclusive: bool) -> None:2108with pytest.raises(pl.exceptions.ShapeError):2109pl.Series(["a-c", "x-y"]).str.split(2110pl.Series(["-", "/", "+"]), inclusive=inclusive2111)211221132114def test_str_split_self_broadcast() -> None:2115assert_series_equal(2116pl.Series(["a-/c"]).str.split(pl.Series(["-", "/", "+"])),2117pl.Series([["a", "/c"], ["a-", "c"], ["a-/c"]]),2118)211921202121def test_replace_many_mapping_in_list() -> None:2122assert_series_equal(2123pl.Series([["a", "b"]]).list.eval(2124pl.element().replace_strict({"a": 1, "b": 2})2125),2126pl.Series([[1, 2]]),2127)212821292130def test_str_replace_n_zero_23570() -> None:2131# more than 32 bytes2132abc_long = "abc " * 20 + "abc"2133df = pl.DataFrame(2134{"a": [abc_long, "abc abc abc", "abc ghi"], "b": ["jkl", "pqr", "xyz"]}2135)2136expected = df21372138out = df.with_columns(pl.col("a").str.replace("abc", "XYZ", n=0))2139assert_frame_equal(out, expected)21402141out = df.with_columns(pl.col("a").str.replace("abc", pl.col("b"), n=0))2142assert_frame_equal(out, expected)214321442145def test_str_replace_null_19601() -> None:2146df = pl.DataFrame({"key": ["1", "2"], "1": ["---", None]})21472148assert_frame_equal(2149df.select(result=pl.col("key").str.replace("1", pl.col("1"))),2150pl.DataFrame({"result": ["---", "2"]}),2151)215221532154def test_str_json_decode_25237() -> None:2155s = pl.Series(['[{"a": 0, "b": 1}, {"b": 2}]'])21562157dtypes = {s.str.json_decode().dtype for _ in range(20)}21582159assert len(dtypes) == 1216021612162def test_json_decode_decimal_25789() -> None:2163s = pl.Series(2164['{"a": 1.23}', '{"a": 4.56}', '{"a": null}', '{"a": "30.1271239481230948"}']2165)2166result = s.str.json_decode(dtype=pl.Struct({"a": pl.Decimal(4, 2)}))2167expected = pl.Series(2168[{"a": 1.23}, {"a": 4.56}, {"a": None}, {"a": 30.13}],2169dtype=pl.Struct({"a": pl.Decimal(4, 2)}),2170)2171assert_series_equal(result, expected)21722173with pytest.raises(2174ComputeError, match=r"error deserializing value.*30.127.* as Decimal\(3, 2\)"2175):2176s.str.json_decode(dtype=pl.Struct({"a": pl.Decimal(3, 2)}))217721782179def test_json_decode_i128() -> None:2180s = pl.Series(2181[2182'{"a":170141183460469231731687303715884105723}',2183'{"a":null}',2184'{"a":-170141183460469231731687303715759193239}',2185]2186)2187result = s.str.json_decode(dtype=pl.Struct({"a": pl.Int128}))2188expected = pl.Series(2189[{"a": 2**127 - 5}, {"a": None}, {"a": -(2**127) + 124912489}],2190dtype=pl.Struct({"a": pl.Int128}),2191)2192assert_series_equal(result, expected)219321942195def test_json_decode_u128() -> None:2196s = pl.Series(['{"a":340282366920938463463374607431768211451}', '{"a":null}'])2197result = s.str.json_decode(dtype=pl.Struct({"a": pl.UInt128}))2198expected = pl.Series(2199[{"a": 2**128 - 5}, {"a": None}],2200dtype=pl.Struct({"a": pl.UInt128}),2201)2202assert_series_equal(result, expected)220322042205@pytest.mark.parametrize("dtype", [pl.Enum(["bar", "foo"]), pl.Categorical])2206def test_json_decode_categorical_enum(dtype: pl.DataType) -> None:2207s = pl.Series(['{"a":"foo"}', '{"a":"bar"}', '{"a":null}', '{"a":"foo"}'])2208result = s.str.json_decode(dtype=pl.Struct({"a": dtype}))2209expected = pl.Series(2210[{"a": "foo"}, {"a": "bar"}, {"a": None}, {"a": "foo"}],2211dtype=pl.Struct({"a": dtype}),2212)2213assert_series_equal(result, expected)221422152216def test_str_split_regex() -> None:2217df = pl.DataFrame({"s": ["foo1bar", "foo99bar", "foo1bar2baz"]})22182219out = df.select(split=pl.col("s").str.split(by=r"\d+", literal=False))2220expected = pl.DataFrame(2221{"split": [["foo", "bar"], ["foo", "bar"], ["foo", "bar", "baz"]]}2222)22232224assert_frame_equal(out, expected)222522262227def test_str_split_regex_inclusive() -> None:2228df = pl.DataFrame({"s": ["foo1bar", "foo99bar", "foo1bar2baz"]})22292230out = df.select(2231split=pl.col("s").str.split(by=r"\d+", literal=False, inclusive=True)2232)2233expected = pl.DataFrame(2234{"split": [["foo1", "bar"], ["foo99", "bar"], ["foo1", "bar2", "baz"]]}2235)22362237assert_frame_equal(out, expected)223822392240def test_str_split_regex_expr() -> None:2241df = pl.DataFrame(2242{2243"s": ["foo1bar", "foo bar", "foo-bar baz"],2244"by": [r"\d", r"\s", r"-"],2245}2246)22472248out = df.select(split=pl.col("s").str.split(by=pl.col("by"), literal=False))2249expected = pl.DataFrame(2250{"split": [["foo", "bar"], ["foo", "bar"], ["foo", "bar baz"]]}2251)22522253assert_frame_equal(out, expected)225422552256def test_str_split_regex_expr_inclusive() -> None:2257df = pl.DataFrame(2258{2259"s": ["foo1bar", "foo bar", "foo-bar baz"],2260"by": [r"\d", r"\s", r"-"],2261}2262)22632264out = df.select(2265split=pl.col("s").str.split(by=pl.col("by"), literal=False, inclusive=True)2266)2267expected = pl.DataFrame(2268{"split": [["foo1", "bar"], ["foo ", "bar"], ["foo-", "bar baz"]]}2269)22702271assert_frame_equal(out, expected)227222732274def test_str_split_regex_invalid_pattern_strict_true() -> None:2275df = pl.DataFrame({"s": ["foo1bar", "abc", "123xyz"]})22762277with pytest.raises(ComputeError):2278df.select(split=pl.col("s").str.split(by="(", literal=False, strict=True))227922802281def test_str_split_regex_invalid_pattern_strict_false() -> None:2282df = pl.DataFrame({"s": ["foo1bar", "abc", "123xyz"]})22832284out = df.select(split=pl.col("s").str.split(by="(", literal=False, strict=False))22852286expected = pl.DataFrame(2287{2288"split": pl.Series(2289"split",2290[None, None, None],2291dtype=pl.List(pl.String),2292)2293}2294)22952296assert_frame_equal(out, expected)229722982299def test_str_split_regex_scalar_string_expr() -> None:2300df = pl.DataFrame({"by": [r"\d", r"\d+", r"bar"]})23012302out = df.select(2303split=pl.lit("foo1bar2baz").str.split(by=pl.col("by"), literal=False)2304)23052306expected = pl.DataFrame(2307{2308"split": [2309["foo", "bar", "baz"], # split by \d2310["foo", "bar", "baz"], # split by \d+2311["foo1", "2baz"], # split by "bar"2312]2313}2314)23152316assert_frame_equal(out, expected)231723182319