CoCalc -- test_order

GitHub Repository: pola-rs/polars
Path: blob/main/py-polars/tests/unit/sql/test_order_by.py
⁶⁹³⁹ views
1
from __future__ import annotations
2

3
from pathlib import Path
4

5
import pytest
6

7
import polars as pl
8
from polars.exceptions import SQLInterfaceError, SQLSyntaxError
9

10

11
@pytest.fixture
12
def foods_ipc_path() -> Path:
13
    return Path(__file__).parent.parent / "io" / "files" / "foods1.ipc"
14

15

16
def test_order_by_basic(foods_ipc_path: Path) -> None:
17
    foods = pl.scan_ipc(foods_ipc_path)
18

19
    order_by_distinct_res = foods.sql(
20
        """
21
        SELECT DISTINCT category
22
        FROM self
23
        ORDER BY category DESC
24
        """
25
    ).collect()
26
    assert order_by_distinct_res.to_dict(as_series=False) == {
27
        "category": ["vegetables", "seafood", "meat", "fruit"]
28
    }
29

30
    for category in ("category", "category AS cat"):
31
        category_col = category.split(" ")[-1]
32
        order_by_group_by_res = foods.sql(
33
            f"""
34
            SELECT {category}
35
            FROM self
36
            GROUP BY category
37
            ORDER BY {category_col} DESC
38
            """
39
        ).collect()
40
        assert order_by_group_by_res.to_dict(as_series=False) == {
41
            category_col: ["vegetables", "seafood", "meat", "fruit"]
42
        }
43

44
    order_by_constructed_group_by_res = foods.sql(
45
        """
46
        SELECT category, SUM(calories) as summed_calories
47
        FROM self
48
        GROUP BY category
49
        ORDER BY summed_calories DESC
50
        """
51
    ).collect()
52
    assert order_by_constructed_group_by_res.to_dict(as_series=False) == {
53
        "category": ["seafood", "meat", "fruit", "vegetables"],
54
        "summed_calories": [1250, 540, 410, 192],
55
    }
56

57
    order_by_unselected_res = foods.sql(
58
        """
59
        SELECT SUM(calories) as summed_calories
60
        FROM self
61
        GROUP BY category
62
        ORDER BY summed_calories DESC
63
        """
64
    ).collect()
65
    assert order_by_unselected_res.to_dict(as_series=False) == {
66
        "summed_calories": [1250, 540, 410, 192],
67
    }
68

69

70
def test_order_by_misc_selection() -> None:
71
    df = pl.DataFrame({"x": [None, 1, 2, 3], "y": [4, 2, None, 8]})
72

73
    # order by aliased col
74
    res = df.sql("SELECT x, y AS y2 FROM self ORDER BY y2")
75
    assert res.to_dict(as_series=False) == {"x": [1, None, 3, 2], "y2": [2, 4, 8, None]}
76

77
    res = df.sql("SELECT x, y AS y2 FROM self ORDER BY y2 DESC")
78
    assert res.to_dict(as_series=False) == {"x": [2, 3, None, 1], "y2": [None, 8, 4, 2]}
79

80
    # order by col found in wildcard
81
    res = df.sql("SELECT *, y AS y2 FROM self ORDER BY y")
82
    assert res.to_dict(as_series=False) == {
83
        "x": [1, None, 3, 2],
84
        "y": [2, 4, 8, None],
85
        "y2": [2, 4, 8, None],
86
    }
87
    res = df.sql("SELECT *, y AS y2 FROM self ORDER BY y NULLS FIRST")
88
    assert res.to_dict(as_series=False) == {
89
        "x": [2, 1, None, 3],
90
        "y": [None, 2, 4, 8],
91
        "y2": [None, 2, 4, 8],
92
    }
93

94
    # order by col found in qualified wildcard
95
    res = df.sql("SELECT self.* FROM self ORDER BY x NULLS FIRST")
96
    assert res.to_dict(as_series=False) == {"x": [None, 1, 2, 3], "y": [4, 2, None, 8]}
97

98
    res = df.sql("SELECT self.* FROM self ORDER BY y NULLS FIRST")
99
    assert res.to_dict(as_series=False) == {"x": [2, 1, None, 3], "y": [None, 2, 4, 8]}
100

101
    # order by col excluded from wildcard
102
    res = df.sql("SELECT * EXCLUDE y FROM self ORDER BY y")
103
    assert res.to_dict(as_series=False) == {"x": [1, None, 3, 2]}
104

105
    res = df.sql("SELECT * EXCLUDE y FROM self ORDER BY y NULLS FIRST")
106
    assert res.to_dict(as_series=False) == {"x": [2, 1, None, 3]}
107

108
    # order by col excluded from qualified wildcard
109
    res = df.sql("SELECT self.* EXCLUDE y FROM self ORDER BY y")
110
    assert res.to_dict(as_series=False) == {"x": [1, None, 3, 2]}
111

112
    # order by expression
113
    res = df.sql("SELECT (x % y) AS xmy FROM self ORDER BY -(x % y)")
114
    assert res.to_dict(as_series=False) == {"xmy": [3, 1, None, None]}
115

116
    res = df.sql("SELECT (x % y) AS xmy FROM self ORDER BY x % y NULLS FIRST")
117
    assert res.to_dict(as_series=False) == {"xmy": [None, None, 1, 3]}
118

119
    # confirm that 'order by all' syntax prioritises cols
120
    df = pl.DataFrame({"SOME": [0, 1], "ALL": [1, 0]})
121
    res = df.sql("SELECT * FROM self ORDER BY ALL")
122
    assert res.to_dict(as_series=False) == {"SOME": [1, 0], "ALL": [0, 1]}
123

124
    res = df.sql("SELECT * FROM self ORDER BY ALL DESC")
125
    assert res.to_dict(as_series=False) == {"SOME": [0, 1], "ALL": [1, 0]}
126

127

128
def test_order_by_misc_16579() -> None:
129
    res = pl.DataFrame(
130
        {
131
            "x": ["apple", "orange"],
132
            "y": ["sheep", "alligator"],
133
            "z": ["hello", "world"],
134
        }
135
    ).sql(
136
        """
137
        SELECT z, y, x
138
        FROM self ORDER BY y DESC
139
        """
140
    )
141
    assert res.columns == ["z", "y", "x"]
142
    assert res.to_dict(as_series=False) == {
143
        "z": ["hello", "world"],
144
        "y": ["sheep", "alligator"],
145
        "x": ["apple", "orange"],
146
    }
147

148

149
def test_order_by_multi_nulls_first_last() -> None:
150
    df = pl.DataFrame({"x": [None, 1, None, 3], "y": [3, 2, None, 1]})
151
    # ┌──────┬──────┐
152
    # │ x    ┆ y    │
153
    # │ ---  ┆ ---  │
154
    # │ i64  ┆ i64  │
155
    # ╞══════╪══════╡
156
    # │ null ┆ 3    │
157
    # │ 1    ┆ 2    │
158
    # │ null ┆ null │
159
    # │ 3    ┆ 1    │
160
    # └──────┴──────┘
161

162
    res1 = df.sql("SELECT * FROM self ORDER BY x, y")
163
    res2 = df.sql("SELECT * FROM self ORDER BY ALL")
164
    for res in (res1, res2):
165
        assert res.to_dict(as_series=False) == {
166
            "x": [1, 3, None, None],
167
            "y": [2, 1, 3, None],
168
        }
169

170
    res = df.sql("SELECT * FROM self ORDER BY x NULLS FIRST, y")
171
    assert res.to_dict(as_series=False) == {
172
        "x": [None, None, 1, 3],
173
        "y": [3, None, 2, 1],
174
    }
175

176
    res = df.sql("SELECT * FROM self ORDER BY x, y NULLS FIRST")
177
    assert res.to_dict(as_series=False) == {
178
        "x": [1, 3, None, None],
179
        "y": [2, 1, None, 3],
180
    }
181

182
    res1 = df.sql("SELECT * FROM self ORDER BY x NULLS FIRST, y NULLS FIRST")
183
    res2 = df.sql("SELECT * FROM self ORDER BY All NULLS FIRST")
184
    for res in (res1, res2):
185
        assert res.to_dict(as_series=False) == {
186
            "x": [None, None, 1, 3],
187
            "y": [None, 3, 2, 1],
188
        }
189

190
    res1 = df.sql("SELECT * FROM self ORDER BY x DESC NULLS FIRST, y DESC NULLS FIRST")
191
    res2 = df.sql("SELECT * FROM self ORDER BY all DESC NULLS FIRST")
192
    for res in (res1, res2):
193
        assert res.to_dict(as_series=False) == {
194
            "x": [None, None, 3, 1],
195
            "y": [None, 3, 1, 2],
196
        }
197

198
    res = df.sql("SELECT * FROM self ORDER BY x DESC NULLS FIRST, y DESC NULLS LAST")
199
    assert res.to_dict(as_series=False) == {
200
        "x": [None, None, 3, 1],
201
        "y": [3, None, 1, 2],
202
    }
203

204
    res = df.sql("SELECT * FROM self ORDER BY y DESC NULLS FIRST, x NULLS LAST")
205
    assert res.to_dict(as_series=False) == {
206
        "x": [None, None, 1, 3],
207
        "y": [None, 3, 2, 1],
208
    }
209

210

211
def test_order_by_ordinal() -> None:
212
    df = pl.DataFrame({"x": [None, 1, None, 3], "y": [3, 2, None, 1]})
213

214
    res = df.sql("SELECT * FROM self ORDER BY 1, 2")
215
    assert res.to_dict(as_series=False) == {
216
        "x": [1, 3, None, None],
217
        "y": [2, 1, 3, None],
218
    }
219

220
    res = df.sql("SELECT * FROM self ORDER BY 1 DESC, 2")
221
    assert res.to_dict(as_series=False) == {
222
        "x": [None, None, 3, 1],
223
        "y": [3, None, 1, 2],
224
    }
225

226
    res = df.sql("SELECT * FROM self ORDER BY 1 DESC NULLS LAST, 2 ASC")
227
    assert res.to_dict(as_series=False) == {
228
        "x": [3, 1, None, None],
229
        "y": [1, 2, 3, None],
230
    }
231

232
    res = df.sql("SELECT * FROM self ORDER BY 1 DESC NULLS LAST, 2 ASC NULLS FIRST")
233
    assert res.to_dict(as_series=False) == {
234
        "x": [3, 1, None, None],
235
        "y": [1, 2, None, 3],
236
    }
237

238
    res = df.sql("SELECT * FROM self ORDER BY 1 DESC, 2 DESC NULLS FIRST")
239
    assert res.to_dict(as_series=False) == {
240
        "x": [None, None, 3, 1],
241
        "y": [None, 3, 1, 2],
242
    }
243

244

245
def test_order_by_errors() -> None:
246
    df = pl.DataFrame({"a": ["w", "x", "y", "z"], "b": [1, 2, 3, 4]})
247

248
    with pytest.raises(
249
        SQLInterfaceError,
250
        match="ORDER BY ordinal value must refer to a valid column; found 99",
251
    ):
252
        df.sql("SELECT * FROM self ORDER BY 99")
253

254
    with pytest.raises(
255
        SQLSyntaxError,
256
        match="negative ordinal values are invalid for ORDER BY; found -1",
257
    ):
258
        df.sql("SELECT * FROM self ORDER BY -1")
259

260
Product

Resources

Company