# Source path: py-polars/tests/unit/sql/test_strings.py
from __future__ import annotations12from pathlib import Path34import pytest56import polars as pl7from polars.exceptions import SQLSyntaxError8from polars.testing import assert_frame_equal91011# TODO: Do not rely on I/O for these tests12@pytest.fixture13def foods_ipc_path() -> Path:14return Path(__file__).parent.parent / "io" / "files" / "foods1.ipc"151617def test_string_case() -> None:18df = pl.DataFrame({"words": ["Test SOME words"]})1920with pl.SQLContext(frame=df) as ctx:21res = ctx.execute(22"""23SELECT24words,25INITCAP(words) as cap,26UPPER(words) as upper,27LOWER(words) as lower,28FROM frame29"""30).collect()3132assert res.to_dict(as_series=False) == {33"words": ["Test SOME words"],34"cap": ["Test Some Words"],35"upper": ["TEST SOME WORDS"],36"lower": ["test some words"],37}383940def test_string_concat() -> None:41lf = pl.LazyFrame(42{43"x": ["a", None, "c"],44"y": ["d", "e", "f"],45"z": [1, 2, 3],46}47)48res = lf.sql(49"""50SELECT51("x" || "x" || "y") AS c0,52("x" || "y" || "z") AS c1,53CONCAT(("x" || '-'), "y") AS c2,54CONCAT("x", "x", "y") AS c3,55CONCAT("x", "y", ("z" * 2)) AS c4,56CONCAT_WS(':', "x", "y", "z") AS c5,57CONCAT_WS('', "y", "z", '!') AS c658FROM self59""",60).collect()6162assert res.to_dict(as_series=False) == {63"c0": ["aad", None, "ccf"],64"c1": ["ad1", None, "cf3"],65"c2": ["a-d", "e", "c-f"],66"c3": ["aad", "e", "ccf"],67"c4": ["ad2", "e4", "cf6"],68"c5": ["a:d:1", "e:2", "c:f:3"],69"c6": ["d1!", "e2!", "f3!"],70}717273@pytest.mark.parametrize(74"invalid_concat", ["CONCAT()", "CONCAT_WS()", "CONCAT_WS(':')"]75)76def test_string_concat_errors(invalid_concat: str) -> None:77lf = pl.LazyFrame({"x": ["a", "b", "c"]})78with pytest.raises(79SQLSyntaxError,80match=r"CONCAT.*expects at least \d argument[s]? 
\(found \d\)",81):82pl.SQLContext(data=lf).execute(f"SELECT {invalid_concat} FROM data")838485def test_string_left_right_reverse() -> None:86df = pl.DataFrame({"txt": ["abcde", "abc", "a", None]})87ctx = pl.SQLContext(df=df)88res = ctx.execute(89"""90SELECT91LEFT(txt,2) AS "l",92RIGHT(txt,2) AS "r",93REVERSE(txt) AS "rev"94FROM df95""",96).collect()9798assert res.to_dict(as_series=False) == {99"l": ["ab", "ab", "a", None],100"r": ["de", "bc", "a", None],101"rev": ["edcba", "cba", "a", None],102}103for func, invalid_arg, invalid_err in (104("LEFT", "'xyz'", '"xyz"'),105("RIGHT", "6.66", "(dyn float: 6.66)"),106):107with pytest.raises(108SQLSyntaxError,109match=rf"""invalid 'n_chars' for {func} \({invalid_err}\)""",110):111ctx.execute(f"""SELECT {func}(txt,{invalid_arg}) FROM df""").collect()112113114def test_string_left_negative_expr() -> None:115# negative values and expressions116df = pl.DataFrame({"s": ["alphabet", "alphabet"], "n": [-6, 6]})117with pl.SQLContext(df=df, eager=True) as sql:118res = sql.execute(119"""120SELECT121LEFT("s",-50) AS l0, -- empty string122LEFT("s",-3) AS l1, -- all but last three chars123LEFT("s",SIGN(-1)) AS l2, -- all but last char (expr => -1)124LEFT("s",0) AS l3, -- empty string125LEFT("s",NULL) AS l4, -- null126LEFT("s",1) AS l5, -- first char127LEFT("s",SIGN(1)) AS l6, -- first char (expr => 1)128LEFT("s",3) AS l7, -- first three chars129LEFT("s",50) AS l8, -- entire string130LEFT("s","n") AS l9, -- from other col131FROM df132"""133)134assert res.to_dict(as_series=False) == {135"l0": ["", ""],136"l1": ["alpha", "alpha"],137"l2": ["alphabe", "alphabe"],138"l3": ["", ""],139"l4": [None, None],140"l5": ["a", "a"],141"l6": ["a", "a"],142"l7": ["alp", "alp"],143"l8": ["alphabet", "alphabet"],144"l9": ["al", "alphab"],145}146147148def test_string_right_negative_expr() -> None:149# negative values and expressions150df = pl.DataFrame({"s": ["alphabet", "alphabet"], "n": [-6, 6]})151with pl.SQLContext(df=df, eager=True) as sql:152res = 
sql.execute(153"""154SELECT155RIGHT("s",-50) AS l0, -- empty string156RIGHT("s",-3) AS l1, -- all but first three chars157RIGHT("s",SIGN(-1)) AS l2, -- all but first char (expr => -1)158RIGHT("s",0) AS l3, -- empty string159RIGHT("s",NULL) AS l4, -- null160RIGHT("s",1) AS l5, -- last char161RIGHT("s",SIGN(1)) AS l6, -- last char (expr => 1)162RIGHT("s",3) AS l7, -- last three chars163RIGHT("s",50) AS l8, -- entire string164RIGHT("s","n") AS l9, -- from other col165FROM df166"""167)168assert res.to_dict(as_series=False) == {169"l0": ["", ""],170"l1": ["habet", "habet"],171"l2": ["lphabet", "lphabet"],172"l3": ["", ""],173"l4": [None, None],174"l5": ["t", "t"],175"l6": ["t", "t"],176"l7": ["bet", "bet"],177"l8": ["alphabet", "alphabet"],178"l9": ["et", "phabet"],179}180181182def test_string_lengths() -> None:183df = pl.DataFrame({"words": ["Café", None, "東京", ""]})184185with pl.SQLContext(frame=df) as ctx:186res = ctx.execute(187"""188SELECT189words,190LENGTH(words) AS n_chrs1,191CHAR_LENGTH(words) AS n_chrs2,192CHARACTER_LENGTH(words) AS n_chrs3,193OCTET_LENGTH(words) AS n_bytes,194BIT_LENGTH(words) AS n_bits195FROM frame196"""197).collect()198199assert res.to_dict(as_series=False) == {200"words": ["Café", None, "東京", ""],201"n_chrs1": [4, None, 2, 0],202"n_chrs2": [4, None, 2, 0],203"n_chrs3": [4, None, 2, 0],204"n_bytes": [5, None, 6, 0],205"n_bits": [40, None, 48, 0],206}207208209@pytest.mark.parametrize(210("pattern", "like", "expected"),211[212("a%", "LIKE", [1, 4]),213("a%", "ILIKE", [0, 1, 3, 4]),214("ab%", "LIKE", [1]),215("AB%", "ILIKE", [0, 1]),216("ab_", "LIKE", [1]),217("A__", "ILIKE", [0, 1]),218("_0%_", "LIKE", [2, 4]),219("%0", "LIKE", [2]),220("0%", "LIKE", [2]),221("__0%", "~~", [2, 3]),222("%*%", "~~*", [3]),223("____", "~~", [4]),224("a%C", "~~", []),225("a%C", "~~*", [0, 1, 3]),226("%C?", "~~*", [4]),227("a0c?", "~~", [4]),228("000", "~~", [2]),229("00", "~~", []),230],231)232def test_string_like(pattern: str, like: str, expected: list[int]) -> 
None:233df = pl.DataFrame(234{235"idx": [0, 1, 2, 3, 4],236"txt": ["ABC", "abc", "000", "A[0]*C", "a0c?"],237}238)239with pl.SQLContext(df=df) as ctx:240for not_ in ("", ("NOT " if like.endswith("LIKE") else "!")):241out = ctx.execute(242f"SELECT idx FROM df WHERE txt {not_}{like} '{pattern}'"243).collect()244245res = out["idx"].to_list()246if not_:247expected = [i for i in df["idx"] if i not in expected]248assert res == expected249250251def test_string_like_multiline() -> None:252s1 = "Hello World"253s2 = "Hello\nWorld"254s3 = "hello\nWORLD"255256df = pl.DataFrame({"idx": [0, 1, 2], "txt": [s1, s2, s3]})257258# starts with...259res1 = df.sql("SELECT * FROM self WHERE txt LIKE 'Hello%' ORDER BY idx")260res2 = df.sql("SELECT * FROM self WHERE txt ILIKE 'HELLO%' ORDER BY idx")261262assert res1["txt"].to_list() == [s1, s2]263assert res2["txt"].to_list() == [s1, s2, s3]264265# ends with...266res3 = df.sql("SELECT * FROM self WHERE txt LIKE '%WORLD' ORDER BY idx")267res4 = df.sql("SELECT * FROM self WHERE txt ILIKE '%\nWORLD' ORDER BY idx")268269assert res3["txt"].to_list() == [s3]270assert res4["txt"].to_list() == [s2, s3]271272# exact match273for s in (s1, s2, s3):274assert df.sql(f"SELECT txt FROM self WHERE txt LIKE '{s}'").item() == s275276277@pytest.mark.parametrize("form", ["NFKC", "NFKD"])278def test_string_normalize(form: str) -> None:279df = pl.DataFrame({"txt": ["Test", "𝕋𝕖𝕤𝕥", "𝕿𝖊𝖘𝖙", "𝗧𝗲𝘀𝘁", "Ⓣⓔⓢⓣ"]}) # noqa: RUF001280res = df.sql(281f"""282SELECT txt, NORMALIZE(txt,{form}) AS norm_txt283FROM self284"""285)286assert res.to_dict(as_series=False) == {287"txt": ["Test", "𝕋𝕖𝕤𝕥", "𝕿𝖊𝖘𝖙", "𝗧𝗲𝘀𝘁", "Ⓣⓔⓢⓣ"], # noqa: RUF001288"norm_txt": ["Test", "Test", "Test", "Test", "Test"],289}290291292def test_string_position() -> None:293df = pl.Series(294name="city",295values=["Dubai", "Abu Dhabi", "Sharjah", "Al Ain", "Ajman", "Ras Al Khaimah"],296).to_frame()297298with pl.SQLContext(cities=df, eager=True) as ctx:299res = ctx.execute(300"""301SELECT302POSITION('a' IN city) 
AS a_lc1,303POSITION('A' IN city) AS a_uc1,304STRPOS(city,'a') AS a_lc2,305STRPOS(city,'A') AS a_uc2,306FROM cities307"""308)309expected_lc = [4, 7, 3, 0, 4, 2]310expected_uc = [0, 1, 0, 1, 1, 5]311312assert res.to_dict(as_series=False) == {313"a_lc1": expected_lc,314"a_uc1": expected_uc,315"a_lc2": expected_lc,316"a_uc2": expected_uc,317}318319df = pl.DataFrame({"txt": ["AbCdEXz", "XyzFDkE"]})320with pl.SQLContext(txt=df) as ctx:321res = ctx.execute(322"""323SELECT324txt,325POSITION('E' IN txt) AS match_E,326STRPOS(txt,'X') AS match_X327FROM txt328""",329eager=True,330)331assert_frame_equal(332res,333pl.DataFrame(334data={335"txt": ["AbCdEXz", "XyzFDkE"],336"match_E": [5, 7],337"match_X": [6, 1],338},339schema={340"txt": pl.String,341"match_E": pl.UInt32,342"match_X": pl.UInt32,343},344),345)346347348def test_string_replace() -> None:349df = pl.DataFrame({"words": ["Yemeni coffee is the best coffee", "", None]})350with pl.SQLContext(df=df) as ctx:351out = ctx.execute(352"""353SELECT354REPLACE(355REPLACE(words, 'coffee', 'tea'),356'Yemeni',357'English breakfast'358)359FROM df360"""361).collect()362363res = out["words"].to_list()364assert res == ["English breakfast tea is the best tea", "", None]365366with pytest.raises(367SQLSyntaxError, match=r"REPLACE expects 3 arguments \(found 2\)"368):369ctx.execute("SELECT REPLACE(words,'coffee') FROM df")370371372def test_string_split() -> None:373df = pl.DataFrame({"s": ["xx,yy,zz", "abc,,xyz", "", None]})374res = df.sql("SELECT *, STRING_TO_ARRAY(s,',') AS s_array FROM self")375376assert res.schema == {"s": pl.String, "s_array": pl.List(pl.String)}377assert res.to_dict(as_series=False) == {378"s": ["xx,yy,zz", "abc,,xyz", "", None],379"s_array": [["xx", "yy", "zz"], ["abc", "", "xyz"], [""], None],380}381382383def test_string_split_part() -> None:384df = pl.DataFrame({"s": ["xx,yy,zz", "abc,,xyz,???,hmm", "", None]})385res = df.sql(386"""387SELECT388SPLIT_PART(s,',',1) AS "s+1",389SPLIT_PART(s,',',3) AS 
"s+3",390SPLIT_PART(s,',',-2) AS "s-2",391FROM self392"""393)394assert res.to_dict(as_series=False) == {395"s+1": ["xx", "abc", "", None],396"s+3": ["zz", "xyz", "", None],397"s-2": ["yy", "???", "", None],398}399400401def test_string_substr() -> None:402df = pl.DataFrame(403{"scol": ["abcdefg", "abcde", "abc", None], "n": [-2, 3, 2, None]}404)405with pl.SQLContext(df=df) as ctx:406res = ctx.execute(407"""408SELECT409-- note: sql is 1-indexed410SUBSTR(scol,1) AS s1,411SUBSTR(scol,2) AS s2,412SUBSTR(scol,3) AS s3,413SUBSTR(scol,1,5) AS s1_5,414SUBSTR(scol,2,2) AS s2_2,415SUBSTR(scol,3,1) AS s3_1,416SUBSTR(scol,-3) AS "s-3",417SUBSTR(scol,-3,3) AS "s-3_3",418SUBSTR(scol,-3,4) AS "s-3_4",419SUBSTR(scol,-3,5) AS "s-3_5",420SUBSTR(scol,-10,13) AS "s-10_13",421SUBSTR(scol,"n",2) AS "s-n2",422SUBSTR(scol,2,"n"+3) AS "s-2n3"423FROM df424"""425).collect()426427with pytest.raises(428SQLSyntaxError,429match=r"SUBSTR does not support negative length \(-99\)",430):431ctx.execute("SELECT SUBSTR(scol,2,-99) FROM df")432433with pytest.raises(434SQLSyntaxError,435match=r"SUBSTR expects 2-3 arguments \(found 1\)",436):437pl.sql_expr("SUBSTR(s)")438439assert res.to_dict(as_series=False) == {440"s1": ["abcdefg", "abcde", "abc", None],441"s2": ["bcdefg", "bcde", "bc", None],442"s3": ["cdefg", "cde", "c", None],443"s1_5": ["abcde", "abcde", "abc", None],444"s2_2": ["bc", "bc", "bc", None],445"s3_1": ["c", "c", "c", None],446"s-3": ["abcdefg", "abcde", "abc", None],447"s-3_3": ["", "", "", None],448"s-3_4": ["", "", "", None],449"s-3_5": ["a", "a", "a", None],450"s-10_13": ["ab", "ab", "ab", None],451"s-n2": ["", "cd", "bc", None],452"s-2n3": ["b", "bcde", "bc", None],453}454455456def test_string_trim(foods_ipc_path: Path) -> None:457lf = pl.scan_ipc(foods_ipc_path)458out = lf.sql(459"""460SELECT DISTINCT TRIM(LEADING 'vmf' FROM category) as new_category461FROM self ORDER BY new_category DESC462"""463).collect()464assert out.to_dict(as_series=False) == {465"new_category": ["seafood", 
"ruit", "egetables", "eat"]466}467with pytest.raises(468SQLSyntaxError,469match="unsupported TRIM syntax",470):471# currently unsupported (snowflake-style) trim syntax472lf.sql("SELECT DISTINCT TRIM('*^xxxx^*', '^*') as new_category FROM self")473474475