Path: blob/main/py-polars/tests/unit/sql/test_order_by.py
8406 views
from __future__ import annotations12from pathlib import Path34import pytest56import polars as pl7from polars.exceptions import SQLInterfaceError, SQLSyntaxError8from tests.unit.sql.asserts import assert_sql_matches91011@pytest.fixture12def foods_ipc_path() -> Path:13return Path(__file__).parent.parent / "io" / "files" / "foods1.ipc"141516def test_order_by_basic(foods_ipc_path: Path) -> None:17foods = pl.scan_ipc(foods_ipc_path)1819order_by_distinct_res = foods.sql(20"""21SELECT DISTINCT category22FROM self23ORDER BY category DESC24"""25).collect()26assert order_by_distinct_res.to_dict(as_series=False) == {27"category": ["vegetables", "seafood", "meat", "fruit"]28}2930for category in ("category", "category AS cat"):31category_col = category.split(" ")[-1]32order_by_group_by_res = foods.sql(33f"""34SELECT {category}35FROM self36GROUP BY category37ORDER BY {category_col} DESC38"""39).collect()40assert order_by_group_by_res.to_dict(as_series=False) == {41category_col: ["vegetables", "seafood", "meat", "fruit"]42}4344order_by_constructed_group_by_res = foods.sql(45"""46SELECT category, SUM(calories) as summed_calories47FROM self48GROUP BY category49ORDER BY summed_calories DESC50"""51).collect()52assert order_by_constructed_group_by_res.to_dict(as_series=False) == {53"category": ["seafood", "meat", "fruit", "vegetables"],54"summed_calories": [1250, 540, 410, 192],55}5657order_by_unselected_res = foods.sql(58"""59SELECT SUM(calories) as summed_calories60FROM self61GROUP BY category62ORDER BY summed_calories DESC63"""64).collect()65assert order_by_unselected_res.to_dict(as_series=False) == {66"summed_calories": [1250, 540, 410, 192],67}686970def test_order_by_misc_selection() -> None:71df = pl.DataFrame({"x": [None, 1, 2, 3], "y": [4, 2, None, 8]})7273# order by aliased col74res = df.sql("SELECT x, y AS y2 FROM self ORDER BY y2")75assert res.to_dict(as_series=False) == {"x": [1, None, 3, 2], "y2": [2, 4, 8, None]}7677res = df.sql("SELECT x, y AS y2 FROM self ORDER BY y2 DESC")78assert res.to_dict(as_series=False) == {"x": [2, 3, None, 1], "y2": [None, 8, 4, 2]}7980# order by col found in wildcard81res = df.sql("SELECT *, y AS y2 FROM self ORDER BY y")82assert res.to_dict(as_series=False) == {83"x": [1, None, 3, 2],84"y": [2, 4, 8, None],85"y2": [2, 4, 8, None],86}87res = df.sql("SELECT *, y AS y2 FROM self ORDER BY y NULLS FIRST")88assert res.to_dict(as_series=False) == {89"x": [2, 1, None, 3],90"y": [None, 2, 4, 8],91"y2": [None, 2, 4, 8],92}9394# order by col found in qualified wildcard95res = df.sql("SELECT self.* FROM self ORDER BY x NULLS FIRST")96assert res.to_dict(as_series=False) == {"x": [None, 1, 2, 3], "y": [4, 2, None, 8]}9798res = df.sql("SELECT self.* FROM self ORDER BY y NULLS FIRST")99assert res.to_dict(as_series=False) == {"x": [2, 1, None, 3], "y": [None, 2, 4, 8]}100101# order by col excluded from wildcard102res = df.sql("SELECT * EXCLUDE y FROM self ORDER BY y")103assert res.to_dict(as_series=False) == {"x": [1, None, 3, 2]}104105res = df.sql("SELECT * EXCLUDE y FROM self ORDER BY y NULLS FIRST")106assert res.to_dict(as_series=False) == {"x": [2, 1, None, 3]}107108# order by col excluded from qualified wildcard109res = df.sql("SELECT self.* EXCLUDE y FROM self ORDER BY y")110assert res.to_dict(as_series=False) == {"x": [1, None, 3, 2]}111112# order by expression113res = df.sql("SELECT (x % y) AS xmy FROM self ORDER BY -(x % y)")114assert res.to_dict(as_series=False) == {"xmy": [3, 1, None, None]}115116res = df.sql("SELECT (x % y) AS xmy FROM self ORDER BY x % y NULLS FIRST")117assert res.to_dict(as_series=False) == {"xmy": [None, None, 1, 3]}118119# confirm that 'order by all' syntax prioritises cols120df = pl.DataFrame({"SOME": [0, 1], "ALL": [1, 0]})121res = df.sql("SELECT * FROM self ORDER BY ALL")122assert res.to_dict(as_series=False) == {"SOME": [1, 0], "ALL": [0, 1]}123124res = df.sql("SELECT * FROM self ORDER BY ALL DESC")125assert res.to_dict(as_series=False) == {"SOME": [0, 1], "ALL": [1, 0]}126127128def test_order_by_misc_16579() -> None:129res = pl.DataFrame(130{131"x": ["apple", "orange"],132"y": ["sheep", "alligator"],133"z": ["hello", "world"],134}135).sql(136"""137SELECT z, y, x138FROM self ORDER BY y DESC139"""140)141assert res.columns == ["z", "y", "x"]142assert res.to_dict(as_series=False) == {143"z": ["hello", "world"],144"y": ["sheep", "alligator"],145"x": ["apple", "orange"],146}147148149def test_order_by_multi_nulls_first_last() -> None:150df = pl.DataFrame({"x": [None, 1, None, 3], "y": [3, 2, None, 1]})151# ┌──────┬──────┐152# │ x ┆ y │153# │ --- ┆ --- │154# │ i64 ┆ i64 │155# ╞══════╪══════╡156# │ null ┆ 3 │157# │ 1 ┆ 2 │158# │ null ┆ null │159# │ 3 ┆ 1 │160# └──────┴──────┘161162res1 = df.sql("SELECT * FROM self ORDER BY x, y")163res2 = df.sql("SELECT * FROM self ORDER BY ALL")164for res in (res1, res2):165assert res.to_dict(as_series=False) == {166"x": [1, 3, None, None],167"y": [2, 1, 3, None],168}169170res = df.sql("SELECT * FROM self ORDER BY x NULLS FIRST, y")171assert res.to_dict(as_series=False) == {172"x": [None, None, 1, 3],173"y": [3, None, 2, 1],174}175176res = df.sql("SELECT * FROM self ORDER BY x, y NULLS FIRST")177assert res.to_dict(as_series=False) == {178"x": [1, 3, None, None],179"y": [2, 1, None, 3],180}181182res1 = df.sql("SELECT * FROM self ORDER BY x NULLS FIRST, y NULLS FIRST")183res2 = df.sql("SELECT * FROM self ORDER BY All NULLS FIRST")184for res in (res1, res2):185assert res.to_dict(as_series=False) == {186"x": [None, None, 1, 3],187"y": [None, 3, 2, 1],188}189190res1 = df.sql("SELECT * FROM self ORDER BY x DESC NULLS FIRST, y DESC NULLS FIRST")191res2 = df.sql("SELECT * FROM self ORDER BY all DESC NULLS FIRST")192for res in (res1, res2):193assert res.to_dict(as_series=False) == {194"x": [None, None, 3, 1],195"y": [None, 3, 1, 2],196}197198res = df.sql("SELECT * FROM self ORDER BY x DESC NULLS FIRST, y DESC NULLS LAST")199assert res.to_dict(as_series=False) == {200"x": [None, None, 3, 1],201"y": [3, None, 1, 2],202}203204res = df.sql("SELECT * FROM self ORDER BY y DESC NULLS FIRST, x NULLS LAST")205assert res.to_dict(as_series=False) == {206"x": [None, None, 1, 3],207"y": [None, 3, 2, 1],208}209210211def test_order_by_ordinal() -> None:212df = pl.DataFrame({"x": [None, 1, None, 3], "y": [3, 2, None, 1]})213214res = df.sql("SELECT * FROM self ORDER BY 1, 2")215assert res.to_dict(as_series=False) == {216"x": [1, 3, None, None],217"y": [2, 1, 3, None],218}219220res = df.sql("SELECT * FROM self ORDER BY 1 DESC, 2")221assert res.to_dict(as_series=False) == {222"x": [None, None, 3, 1],223"y": [3, None, 1, 2],224}225226res = df.sql("SELECT * FROM self ORDER BY 1 DESC NULLS LAST, 2 ASC")227assert res.to_dict(as_series=False) == {228"x": [3, 1, None, None],229"y": [1, 2, 3, None],230}231232res = df.sql("SELECT * FROM self ORDER BY 1 DESC NULLS LAST, 2 ASC NULLS FIRST")233assert res.to_dict(as_series=False) == {234"x": [3, 1, None, None],235"y": [1, 2, None, 3],236}237238res = df.sql("SELECT * FROM self ORDER BY 1 DESC, 2 DESC NULLS FIRST")239assert res.to_dict(as_series=False) == {240"x": [None, None, 3, 1],241"y": [None, 3, 1, 2],242}243244245def test_order_by_errors() -> None:246df = pl.DataFrame({"a": ["w", "x", "y", "z"], "b": [1, 2, 3, 4]})247248with pytest.raises(249SQLInterfaceError,250match="ORDER BY ordinal value must refer to a valid column; found 99",251):252df.sql("SELECT * FROM self ORDER BY 99")253254with pytest.raises(255SQLSyntaxError,256match="negative ordinal values are invalid for ORDER BY; found -1",257):258df.sql("SELECT * FROM self ORDER BY -1")259260261@pytest.mark.parametrize(262"query",263[264# basic aliasing: ORDER BY original column name after aliasing265"SELECT a b FROM self GROUP BY a ORDER BY a",266"SELECT a AS b FROM self GROUP BY a ORDER BY a",267# table-qualified aliasing268"SELECT self.a b FROM self GROUP BY self.a ORDER BY self.a",269"SELECT self.a AS b FROM self GROUP BY self.a ORDER BY self.a",270# mixed qualified/unqualified271"SELECT a b FROM self GROUP BY a ORDER BY self.a",272"SELECT self.a b FROM self GROUP BY a ORDER BY a",273# ORDER BY alias name (should still work)274"SELECT a b FROM self GROUP BY a ORDER BY b",275"SELECT a AS b FROM self GROUP BY a ORDER BY b",276],277)278def test_order_by_aliased_group_key(query: str) -> None:279"""Test ORDER BY with original column name when aliased in SELECT."""280df = pl.DataFrame({"a": [3, 1, 2], "b": [30, 10, 20]})281assert_sql_matches(df, query=query, compare_with="sqlite")282283284@pytest.mark.parametrize(285"query",286[287# cross-aliasing: columns swap names288"SELECT a AS b, b AS a FROM self GROUP BY a, b ORDER BY self.a",289"SELECT a AS b, b AS a FROM self GROUP BY a, b ORDER BY self.b",290# cross-aliasing with expressions291"SELECT a AS b, -b AS a FROM self GROUP BY a, b ORDER BY self.a",292"SELECT a AS b, -b AS a FROM self GROUP BY a, b ORDER BY self.b",293# cross-aliasing with aggregate294"SELECT a AS b, SUM(b) AS a FROM self GROUP BY a ORDER BY self.a",295"SELECT a AS b, SUM(b) AS a FROM self GROUP BY a ORDER BY self.b",296],297)298def test_order_by_cross_aliased_columns(query: str) -> None:299"""Test ORDER BY with cross-aliasing where columns swap names."""300df = pl.DataFrame({"a": [3, 1, 2], "b": [30, 10, 20]})301assert_sql_matches(df, query=query, compare_with="sqlite")302303304@pytest.mark.parametrize(305"query",306[307# multiple columns with various aliasing patterns308"SELECT a x, b y, a + b z FROM self GROUP BY a, b ORDER BY a",309"SELECT a x, b y, a + b z FROM self GROUP BY a, b ORDER BY self.a",310"SELECT a x, b y, a + b z FROM self GROUP BY a, b ORDER BY b DESC",311"SELECT a x, b y, a + b z FROM self GROUP BY a, b ORDER BY self.b DESC",312# ORDER BY referencing original columns313"SELECT a x, b y FROM self GROUP BY a, b ORDER BY a + b",314"SELECT a x, b y FROM self GROUP BY a, b ORDER BY self.a + self.b",315# ORDER BY with ordinal316"SELECT a x, b y FROM self GROUP BY a, b ORDER BY 1",317"SELECT a x, b y FROM self GROUP BY a, b ORDER BY 2 DESC",318],319)320def test_order_by_multi_column_aliasing(query: str) -> None:321"""Test ORDER BY with multiple aliased columns and expressions."""322df = pl.DataFrame({"a": [3, 1, 2, 4], "b": [30, 10, 20, 15]})323assert_sql_matches(df, query=query, compare_with="sqlite")324325326@pytest.mark.parametrize(327"query",328[329# aggregate with aliased group key330"SELECT a grp, SUM(b) total FROM self GROUP BY a ORDER BY a",331"SELECT a grp, SUM(b) total FROM self GROUP BY a ORDER BY self.a",332"SELECT a grp, SUM(b) total FROM self GROUP BY a ORDER BY grp",333"SELECT a grp, SUM(b) total FROM self GROUP BY a ORDER BY total DESC",334# multiple aggregates335"SELECT a grp, SUM(b) s, AVG(b) avg FROM self GROUP BY a ORDER BY a",336"SELECT a grp, SUM(b) s, AVG(b) avg FROM self GROUP BY a ORDER BY self.a DESC",337],338)339def test_order_by_aggregate_with_aliased_key(query: str) -> None:340"""Test ORDER BY with aggregates and aliased group keys."""341df = pl.DataFrame({"a": [1, 1, 2, 2, 3], "b": [10, 20, 30, 40, 50]})342assert_sql_matches(df, query=query, compare_with="sqlite")343344345