Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/py-polars/tests/unit/sql/test_order_by.py
6939 views
1
from __future__ import annotations
2
3
from pathlib import Path
4
5
import pytest
6
7
import polars as pl
8
from polars.exceptions import SQLInterfaceError, SQLSyntaxError
9
10
11
@pytest.fixture
12
def foods_ipc_path() -> Path:
13
return Path(__file__).parent.parent / "io" / "files" / "foods1.ipc"
14
15
16
def test_order_by_basic(foods_ipc_path: Path) -> None:
17
foods = pl.scan_ipc(foods_ipc_path)
18
19
order_by_distinct_res = foods.sql(
20
"""
21
SELECT DISTINCT category
22
FROM self
23
ORDER BY category DESC
24
"""
25
).collect()
26
assert order_by_distinct_res.to_dict(as_series=False) == {
27
"category": ["vegetables", "seafood", "meat", "fruit"]
28
}
29
30
for category in ("category", "category AS cat"):
31
category_col = category.split(" ")[-1]
32
order_by_group_by_res = foods.sql(
33
f"""
34
SELECT {category}
35
FROM self
36
GROUP BY category
37
ORDER BY {category_col} DESC
38
"""
39
).collect()
40
assert order_by_group_by_res.to_dict(as_series=False) == {
41
category_col: ["vegetables", "seafood", "meat", "fruit"]
42
}
43
44
order_by_constructed_group_by_res = foods.sql(
45
"""
46
SELECT category, SUM(calories) as summed_calories
47
FROM self
48
GROUP BY category
49
ORDER BY summed_calories DESC
50
"""
51
).collect()
52
assert order_by_constructed_group_by_res.to_dict(as_series=False) == {
53
"category": ["seafood", "meat", "fruit", "vegetables"],
54
"summed_calories": [1250, 540, 410, 192],
55
}
56
57
order_by_unselected_res = foods.sql(
58
"""
59
SELECT SUM(calories) as summed_calories
60
FROM self
61
GROUP BY category
62
ORDER BY summed_calories DESC
63
"""
64
).collect()
65
assert order_by_unselected_res.to_dict(as_series=False) == {
66
"summed_calories": [1250, 540, 410, 192],
67
}
68
69
70
def test_order_by_misc_selection() -> None:
71
df = pl.DataFrame({"x": [None, 1, 2, 3], "y": [4, 2, None, 8]})
72
73
# order by aliased col
74
res = df.sql("SELECT x, y AS y2 FROM self ORDER BY y2")
75
assert res.to_dict(as_series=False) == {"x": [1, None, 3, 2], "y2": [2, 4, 8, None]}
76
77
res = df.sql("SELECT x, y AS y2 FROM self ORDER BY y2 DESC")
78
assert res.to_dict(as_series=False) == {"x": [2, 3, None, 1], "y2": [None, 8, 4, 2]}
79
80
# order by col found in wildcard
81
res = df.sql("SELECT *, y AS y2 FROM self ORDER BY y")
82
assert res.to_dict(as_series=False) == {
83
"x": [1, None, 3, 2],
84
"y": [2, 4, 8, None],
85
"y2": [2, 4, 8, None],
86
}
87
res = df.sql("SELECT *, y AS y2 FROM self ORDER BY y NULLS FIRST")
88
assert res.to_dict(as_series=False) == {
89
"x": [2, 1, None, 3],
90
"y": [None, 2, 4, 8],
91
"y2": [None, 2, 4, 8],
92
}
93
94
# order by col found in qualified wildcard
95
res = df.sql("SELECT self.* FROM self ORDER BY x NULLS FIRST")
96
assert res.to_dict(as_series=False) == {"x": [None, 1, 2, 3], "y": [4, 2, None, 8]}
97
98
res = df.sql("SELECT self.* FROM self ORDER BY y NULLS FIRST")
99
assert res.to_dict(as_series=False) == {"x": [2, 1, None, 3], "y": [None, 2, 4, 8]}
100
101
# order by col excluded from wildcard
102
res = df.sql("SELECT * EXCLUDE y FROM self ORDER BY y")
103
assert res.to_dict(as_series=False) == {"x": [1, None, 3, 2]}
104
105
res = df.sql("SELECT * EXCLUDE y FROM self ORDER BY y NULLS FIRST")
106
assert res.to_dict(as_series=False) == {"x": [2, 1, None, 3]}
107
108
# order by col excluded from qualified wildcard
109
res = df.sql("SELECT self.* EXCLUDE y FROM self ORDER BY y")
110
assert res.to_dict(as_series=False) == {"x": [1, None, 3, 2]}
111
112
# order by expression
113
res = df.sql("SELECT (x % y) AS xmy FROM self ORDER BY -(x % y)")
114
assert res.to_dict(as_series=False) == {"xmy": [3, 1, None, None]}
115
116
res = df.sql("SELECT (x % y) AS xmy FROM self ORDER BY x % y NULLS FIRST")
117
assert res.to_dict(as_series=False) == {"xmy": [None, None, 1, 3]}
118
119
# confirm that 'order by all' syntax prioritises cols
120
df = pl.DataFrame({"SOME": [0, 1], "ALL": [1, 0]})
121
res = df.sql("SELECT * FROM self ORDER BY ALL")
122
assert res.to_dict(as_series=False) == {"SOME": [1, 0], "ALL": [0, 1]}
123
124
res = df.sql("SELECT * FROM self ORDER BY ALL DESC")
125
assert res.to_dict(as_series=False) == {"SOME": [0, 1], "ALL": [1, 0]}
126
127
128
def test_order_by_misc_16579() -> None:
129
res = pl.DataFrame(
130
{
131
"x": ["apple", "orange"],
132
"y": ["sheep", "alligator"],
133
"z": ["hello", "world"],
134
}
135
).sql(
136
"""
137
SELECT z, y, x
138
FROM self ORDER BY y DESC
139
"""
140
)
141
assert res.columns == ["z", "y", "x"]
142
assert res.to_dict(as_series=False) == {
143
"z": ["hello", "world"],
144
"y": ["sheep", "alligator"],
145
"x": ["apple", "orange"],
146
}
147
148
149
def test_order_by_multi_nulls_first_last() -> None:
150
df = pl.DataFrame({"x": [None, 1, None, 3], "y": [3, 2, None, 1]})
151
# ┌──────┬──────┐
152
# │ x ┆ y │
153
# │ --- ┆ --- │
154
# │ i64 ┆ i64 │
155
# ╞══════╪══════╡
156
# │ null ┆ 3 │
157
# │ 1 ┆ 2 │
158
# │ null ┆ null │
159
# │ 3 ┆ 1 │
160
# └──────┴──────┘
161
162
res1 = df.sql("SELECT * FROM self ORDER BY x, y")
163
res2 = df.sql("SELECT * FROM self ORDER BY ALL")
164
for res in (res1, res2):
165
assert res.to_dict(as_series=False) == {
166
"x": [1, 3, None, None],
167
"y": [2, 1, 3, None],
168
}
169
170
res = df.sql("SELECT * FROM self ORDER BY x NULLS FIRST, y")
171
assert res.to_dict(as_series=False) == {
172
"x": [None, None, 1, 3],
173
"y": [3, None, 2, 1],
174
}
175
176
res = df.sql("SELECT * FROM self ORDER BY x, y NULLS FIRST")
177
assert res.to_dict(as_series=False) == {
178
"x": [1, 3, None, None],
179
"y": [2, 1, None, 3],
180
}
181
182
res1 = df.sql("SELECT * FROM self ORDER BY x NULLS FIRST, y NULLS FIRST")
183
res2 = df.sql("SELECT * FROM self ORDER BY All NULLS FIRST")
184
for res in (res1, res2):
185
assert res.to_dict(as_series=False) == {
186
"x": [None, None, 1, 3],
187
"y": [None, 3, 2, 1],
188
}
189
190
res1 = df.sql("SELECT * FROM self ORDER BY x DESC NULLS FIRST, y DESC NULLS FIRST")
191
res2 = df.sql("SELECT * FROM self ORDER BY all DESC NULLS FIRST")
192
for res in (res1, res2):
193
assert res.to_dict(as_series=False) == {
194
"x": [None, None, 3, 1],
195
"y": [None, 3, 1, 2],
196
}
197
198
res = df.sql("SELECT * FROM self ORDER BY x DESC NULLS FIRST, y DESC NULLS LAST")
199
assert res.to_dict(as_series=False) == {
200
"x": [None, None, 3, 1],
201
"y": [3, None, 1, 2],
202
}
203
204
res = df.sql("SELECT * FROM self ORDER BY y DESC NULLS FIRST, x NULLS LAST")
205
assert res.to_dict(as_series=False) == {
206
"x": [None, None, 1, 3],
207
"y": [None, 3, 2, 1],
208
}
209
210
211
def test_order_by_ordinal() -> None:
212
df = pl.DataFrame({"x": [None, 1, None, 3], "y": [3, 2, None, 1]})
213
214
res = df.sql("SELECT * FROM self ORDER BY 1, 2")
215
assert res.to_dict(as_series=False) == {
216
"x": [1, 3, None, None],
217
"y": [2, 1, 3, None],
218
}
219
220
res = df.sql("SELECT * FROM self ORDER BY 1 DESC, 2")
221
assert res.to_dict(as_series=False) == {
222
"x": [None, None, 3, 1],
223
"y": [3, None, 1, 2],
224
}
225
226
res = df.sql("SELECT * FROM self ORDER BY 1 DESC NULLS LAST, 2 ASC")
227
assert res.to_dict(as_series=False) == {
228
"x": [3, 1, None, None],
229
"y": [1, 2, 3, None],
230
}
231
232
res = df.sql("SELECT * FROM self ORDER BY 1 DESC NULLS LAST, 2 ASC NULLS FIRST")
233
assert res.to_dict(as_series=False) == {
234
"x": [3, 1, None, None],
235
"y": [1, 2, None, 3],
236
}
237
238
res = df.sql("SELECT * FROM self ORDER BY 1 DESC, 2 DESC NULLS FIRST")
239
assert res.to_dict(as_series=False) == {
240
"x": [None, None, 3, 1],
241
"y": [None, 3, 1, 2],
242
}
243
244
245
def test_order_by_errors() -> None:
246
df = pl.DataFrame({"a": ["w", "x", "y", "z"], "b": [1, 2, 3, 4]})
247
248
with pytest.raises(
249
SQLInterfaceError,
250
match="ORDER BY ordinal value must refer to a valid column; found 99",
251
):
252
df.sql("SELECT * FROM self ORDER BY 99")
253
254
with pytest.raises(
255
SQLSyntaxError,
256
match="negative ordinal values are invalid for ORDER BY; found -1",
257
):
258
df.sql("SELECT * FROM self ORDER BY -1")
259
260