Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/py-polars/tests/unit/sql/test_order_by.py
8406 views
1
from __future__ import annotations
2
3
from pathlib import Path
4
5
import pytest
6
7
import polars as pl
8
from polars.exceptions import SQLInterfaceError, SQLSyntaxError
9
from tests.unit.sql.asserts import assert_sql_matches
10
11
12
@pytest.fixture
13
def foods_ipc_path() -> Path:
14
return Path(__file__).parent.parent / "io" / "files" / "foods1.ipc"
15
16
17
def test_order_by_basic(foods_ipc_path: Path) -> None:
18
foods = pl.scan_ipc(foods_ipc_path)
19
20
order_by_distinct_res = foods.sql(
21
"""
22
SELECT DISTINCT category
23
FROM self
24
ORDER BY category DESC
25
"""
26
).collect()
27
assert order_by_distinct_res.to_dict(as_series=False) == {
28
"category": ["vegetables", "seafood", "meat", "fruit"]
29
}
30
31
for category in ("category", "category AS cat"):
32
category_col = category.split(" ")[-1]
33
order_by_group_by_res = foods.sql(
34
f"""
35
SELECT {category}
36
FROM self
37
GROUP BY category
38
ORDER BY {category_col} DESC
39
"""
40
).collect()
41
assert order_by_group_by_res.to_dict(as_series=False) == {
42
category_col: ["vegetables", "seafood", "meat", "fruit"]
43
}
44
45
order_by_constructed_group_by_res = foods.sql(
46
"""
47
SELECT category, SUM(calories) as summed_calories
48
FROM self
49
GROUP BY category
50
ORDER BY summed_calories DESC
51
"""
52
).collect()
53
assert order_by_constructed_group_by_res.to_dict(as_series=False) == {
54
"category": ["seafood", "meat", "fruit", "vegetables"],
55
"summed_calories": [1250, 540, 410, 192],
56
}
57
58
order_by_unselected_res = foods.sql(
59
"""
60
SELECT SUM(calories) as summed_calories
61
FROM self
62
GROUP BY category
63
ORDER BY summed_calories DESC
64
"""
65
).collect()
66
assert order_by_unselected_res.to_dict(as_series=False) == {
67
"summed_calories": [1250, 540, 410, 192],
68
}
69
70
71
def test_order_by_misc_selection() -> None:
72
df = pl.DataFrame({"x": [None, 1, 2, 3], "y": [4, 2, None, 8]})
73
74
# order by aliased col
75
res = df.sql("SELECT x, y AS y2 FROM self ORDER BY y2")
76
assert res.to_dict(as_series=False) == {"x": [1, None, 3, 2], "y2": [2, 4, 8, None]}
77
78
res = df.sql("SELECT x, y AS y2 FROM self ORDER BY y2 DESC")
79
assert res.to_dict(as_series=False) == {"x": [2, 3, None, 1], "y2": [None, 8, 4, 2]}
80
81
# order by col found in wildcard
82
res = df.sql("SELECT *, y AS y2 FROM self ORDER BY y")
83
assert res.to_dict(as_series=False) == {
84
"x": [1, None, 3, 2],
85
"y": [2, 4, 8, None],
86
"y2": [2, 4, 8, None],
87
}
88
res = df.sql("SELECT *, y AS y2 FROM self ORDER BY y NULLS FIRST")
89
assert res.to_dict(as_series=False) == {
90
"x": [2, 1, None, 3],
91
"y": [None, 2, 4, 8],
92
"y2": [None, 2, 4, 8],
93
}
94
95
# order by col found in qualified wildcard
96
res = df.sql("SELECT self.* FROM self ORDER BY x NULLS FIRST")
97
assert res.to_dict(as_series=False) == {"x": [None, 1, 2, 3], "y": [4, 2, None, 8]}
98
99
res = df.sql("SELECT self.* FROM self ORDER BY y NULLS FIRST")
100
assert res.to_dict(as_series=False) == {"x": [2, 1, None, 3], "y": [None, 2, 4, 8]}
101
102
# order by col excluded from wildcard
103
res = df.sql("SELECT * EXCLUDE y FROM self ORDER BY y")
104
assert res.to_dict(as_series=False) == {"x": [1, None, 3, 2]}
105
106
res = df.sql("SELECT * EXCLUDE y FROM self ORDER BY y NULLS FIRST")
107
assert res.to_dict(as_series=False) == {"x": [2, 1, None, 3]}
108
109
# order by col excluded from qualified wildcard
110
res = df.sql("SELECT self.* EXCLUDE y FROM self ORDER BY y")
111
assert res.to_dict(as_series=False) == {"x": [1, None, 3, 2]}
112
113
# order by expression
114
res = df.sql("SELECT (x % y) AS xmy FROM self ORDER BY -(x % y)")
115
assert res.to_dict(as_series=False) == {"xmy": [3, 1, None, None]}
116
117
res = df.sql("SELECT (x % y) AS xmy FROM self ORDER BY x % y NULLS FIRST")
118
assert res.to_dict(as_series=False) == {"xmy": [None, None, 1, 3]}
119
120
# confirm that 'order by all' syntax prioritises cols
121
df = pl.DataFrame({"SOME": [0, 1], "ALL": [1, 0]})
122
res = df.sql("SELECT * FROM self ORDER BY ALL")
123
assert res.to_dict(as_series=False) == {"SOME": [1, 0], "ALL": [0, 1]}
124
125
res = df.sql("SELECT * FROM self ORDER BY ALL DESC")
126
assert res.to_dict(as_series=False) == {"SOME": [0, 1], "ALL": [1, 0]}
127
128
129
def test_order_by_misc_16579() -> None:
130
res = pl.DataFrame(
131
{
132
"x": ["apple", "orange"],
133
"y": ["sheep", "alligator"],
134
"z": ["hello", "world"],
135
}
136
).sql(
137
"""
138
SELECT z, y, x
139
FROM self ORDER BY y DESC
140
"""
141
)
142
assert res.columns == ["z", "y", "x"]
143
assert res.to_dict(as_series=False) == {
144
"z": ["hello", "world"],
145
"y": ["sheep", "alligator"],
146
"x": ["apple", "orange"],
147
}
148
149
150
def test_order_by_multi_nulls_first_last() -> None:
151
df = pl.DataFrame({"x": [None, 1, None, 3], "y": [3, 2, None, 1]})
152
# ┌──────┬──────┐
153
# │ x ┆ y │
154
# │ --- ┆ --- │
155
# │ i64 ┆ i64 │
156
# ╞══════╪══════╡
157
# │ null ┆ 3 │
158
# │ 1 ┆ 2 │
159
# │ null ┆ null │
160
# │ 3 ┆ 1 │
161
# └──────┴──────┘
162
163
res1 = df.sql("SELECT * FROM self ORDER BY x, y")
164
res2 = df.sql("SELECT * FROM self ORDER BY ALL")
165
for res in (res1, res2):
166
assert res.to_dict(as_series=False) == {
167
"x": [1, 3, None, None],
168
"y": [2, 1, 3, None],
169
}
170
171
res = df.sql("SELECT * FROM self ORDER BY x NULLS FIRST, y")
172
assert res.to_dict(as_series=False) == {
173
"x": [None, None, 1, 3],
174
"y": [3, None, 2, 1],
175
}
176
177
res = df.sql("SELECT * FROM self ORDER BY x, y NULLS FIRST")
178
assert res.to_dict(as_series=False) == {
179
"x": [1, 3, None, None],
180
"y": [2, 1, None, 3],
181
}
182
183
res1 = df.sql("SELECT * FROM self ORDER BY x NULLS FIRST, y NULLS FIRST")
184
res2 = df.sql("SELECT * FROM self ORDER BY All NULLS FIRST")
185
for res in (res1, res2):
186
assert res.to_dict(as_series=False) == {
187
"x": [None, None, 1, 3],
188
"y": [None, 3, 2, 1],
189
}
190
191
res1 = df.sql("SELECT * FROM self ORDER BY x DESC NULLS FIRST, y DESC NULLS FIRST")
192
res2 = df.sql("SELECT * FROM self ORDER BY all DESC NULLS FIRST")
193
for res in (res1, res2):
194
assert res.to_dict(as_series=False) == {
195
"x": [None, None, 3, 1],
196
"y": [None, 3, 1, 2],
197
}
198
199
res = df.sql("SELECT * FROM self ORDER BY x DESC NULLS FIRST, y DESC NULLS LAST")
200
assert res.to_dict(as_series=False) == {
201
"x": [None, None, 3, 1],
202
"y": [3, None, 1, 2],
203
}
204
205
res = df.sql("SELECT * FROM self ORDER BY y DESC NULLS FIRST, x NULLS LAST")
206
assert res.to_dict(as_series=False) == {
207
"x": [None, None, 1, 3],
208
"y": [None, 3, 2, 1],
209
}
210
211
212
def test_order_by_ordinal() -> None:
213
df = pl.DataFrame({"x": [None, 1, None, 3], "y": [3, 2, None, 1]})
214
215
res = df.sql("SELECT * FROM self ORDER BY 1, 2")
216
assert res.to_dict(as_series=False) == {
217
"x": [1, 3, None, None],
218
"y": [2, 1, 3, None],
219
}
220
221
res = df.sql("SELECT * FROM self ORDER BY 1 DESC, 2")
222
assert res.to_dict(as_series=False) == {
223
"x": [None, None, 3, 1],
224
"y": [3, None, 1, 2],
225
}
226
227
res = df.sql("SELECT * FROM self ORDER BY 1 DESC NULLS LAST, 2 ASC")
228
assert res.to_dict(as_series=False) == {
229
"x": [3, 1, None, None],
230
"y": [1, 2, 3, None],
231
}
232
233
res = df.sql("SELECT * FROM self ORDER BY 1 DESC NULLS LAST, 2 ASC NULLS FIRST")
234
assert res.to_dict(as_series=False) == {
235
"x": [3, 1, None, None],
236
"y": [1, 2, None, 3],
237
}
238
239
res = df.sql("SELECT * FROM self ORDER BY 1 DESC, 2 DESC NULLS FIRST")
240
assert res.to_dict(as_series=False) == {
241
"x": [None, None, 3, 1],
242
"y": [None, 3, 1, 2],
243
}
244
245
246
def test_order_by_errors() -> None:
247
df = pl.DataFrame({"a": ["w", "x", "y", "z"], "b": [1, 2, 3, 4]})
248
249
with pytest.raises(
250
SQLInterfaceError,
251
match="ORDER BY ordinal value must refer to a valid column; found 99",
252
):
253
df.sql("SELECT * FROM self ORDER BY 99")
254
255
with pytest.raises(
256
SQLSyntaxError,
257
match="negative ordinal values are invalid for ORDER BY; found -1",
258
):
259
df.sql("SELECT * FROM self ORDER BY -1")
260
261
262
@pytest.mark.parametrize(
263
"query",
264
[
265
# basic aliasing: ORDER BY original column name after aliasing
266
"SELECT a b FROM self GROUP BY a ORDER BY a",
267
"SELECT a AS b FROM self GROUP BY a ORDER BY a",
268
# table-qualified aliasing
269
"SELECT self.a b FROM self GROUP BY self.a ORDER BY self.a",
270
"SELECT self.a AS b FROM self GROUP BY self.a ORDER BY self.a",
271
# mixed qualified/unqualified
272
"SELECT a b FROM self GROUP BY a ORDER BY self.a",
273
"SELECT self.a b FROM self GROUP BY a ORDER BY a",
274
# ORDER BY alias name (should still work)
275
"SELECT a b FROM self GROUP BY a ORDER BY b",
276
"SELECT a AS b FROM self GROUP BY a ORDER BY b",
277
],
278
)
279
def test_order_by_aliased_group_key(query: str) -> None:
280
"""Test ORDER BY with original column name when aliased in SELECT."""
281
df = pl.DataFrame({"a": [3, 1, 2], "b": [30, 10, 20]})
282
assert_sql_matches(df, query=query, compare_with="sqlite")
283
284
285
@pytest.mark.parametrize(
286
"query",
287
[
288
# cross-aliasing: columns swap names
289
"SELECT a AS b, b AS a FROM self GROUP BY a, b ORDER BY self.a",
290
"SELECT a AS b, b AS a FROM self GROUP BY a, b ORDER BY self.b",
291
# cross-aliasing with expressions
292
"SELECT a AS b, -b AS a FROM self GROUP BY a, b ORDER BY self.a",
293
"SELECT a AS b, -b AS a FROM self GROUP BY a, b ORDER BY self.b",
294
# cross-aliasing with aggregate
295
"SELECT a AS b, SUM(b) AS a FROM self GROUP BY a ORDER BY self.a",
296
"SELECT a AS b, SUM(b) AS a FROM self GROUP BY a ORDER BY self.b",
297
],
298
)
299
def test_order_by_cross_aliased_columns(query: str) -> None:
300
"""Test ORDER BY with cross-aliasing where columns swap names."""
301
df = pl.DataFrame({"a": [3, 1, 2], "b": [30, 10, 20]})
302
assert_sql_matches(df, query=query, compare_with="sqlite")
303
304
305
@pytest.mark.parametrize(
306
"query",
307
[
308
# multiple columns with various aliasing patterns
309
"SELECT a x, b y, a + b z FROM self GROUP BY a, b ORDER BY a",
310
"SELECT a x, b y, a + b z FROM self GROUP BY a, b ORDER BY self.a",
311
"SELECT a x, b y, a + b z FROM self GROUP BY a, b ORDER BY b DESC",
312
"SELECT a x, b y, a + b z FROM self GROUP BY a, b ORDER BY self.b DESC",
313
# ORDER BY referencing original columns
314
"SELECT a x, b y FROM self GROUP BY a, b ORDER BY a + b",
315
"SELECT a x, b y FROM self GROUP BY a, b ORDER BY self.a + self.b",
316
# ORDER BY with ordinal
317
"SELECT a x, b y FROM self GROUP BY a, b ORDER BY 1",
318
"SELECT a x, b y FROM self GROUP BY a, b ORDER BY 2 DESC",
319
],
320
)
321
def test_order_by_multi_column_aliasing(query: str) -> None:
322
"""Test ORDER BY with multiple aliased columns and expressions."""
323
df = pl.DataFrame({"a": [3, 1, 2, 4], "b": [30, 10, 20, 15]})
324
assert_sql_matches(df, query=query, compare_with="sqlite")
325
326
327
@pytest.mark.parametrize(
328
"query",
329
[
330
# aggregate with aliased group key
331
"SELECT a grp, SUM(b) total FROM self GROUP BY a ORDER BY a",
332
"SELECT a grp, SUM(b) total FROM self GROUP BY a ORDER BY self.a",
333
"SELECT a grp, SUM(b) total FROM self GROUP BY a ORDER BY grp",
334
"SELECT a grp, SUM(b) total FROM self GROUP BY a ORDER BY total DESC",
335
# multiple aggregates
336
"SELECT a grp, SUM(b) s, AVG(b) avg FROM self GROUP BY a ORDER BY a",
337
"SELECT a grp, SUM(b) s, AVG(b) avg FROM self GROUP BY a ORDER BY self.a DESC",
338
],
339
)
340
def test_order_by_aggregate_with_aliased_key(query: str) -> None:
341
"""Test ORDER BY with aggregates and aliased group keys."""
342
df = pl.DataFrame({"a": [1, 1, 2, 2, 3], "b": [10, 20, 30, 40, 50]})
343
assert_sql_matches(df, query=query, compare_with="sqlite")
344
345