Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/py-polars/tests/unit/sql/test_distinct_on.py
7884 views
1
from __future__ import annotations
2
3
import polars as pl
4
from tests.unit.sql import assert_sql_matches
5
6
# Note: SQLite does not support "DISTINCT ON", so only compare results against DuckDB
7
8
9
def test_distinct_on_single_column(df_distinct: pl.DataFrame) -> None:
10
"""Test DISTINCT ON with single column - keeps first row per distinct value."""
11
assert_sql_matches(
12
df_distinct,
13
query="""
14
SELECT DISTINCT ON (category)
15
category, subcategory, value, status, score
16
FROM self
17
ORDER BY category NULLS FIRST, value
18
""",
19
compare_with="duckdb",
20
expected={
21
"category": [None, "A", "B", "C"],
22
"subcategory": ["x", "x", "y", "x"],
23
"value": [600, 100, 200, 400],
24
"status": ["active", "active", "active", "inactive"],
25
"score": [70, 10, 30, 50],
26
},
27
)
28
29
30
def test_distinct_on_multiple_columns(df_distinct: pl.DataFrame) -> None:
31
"""Test DISTINCT ON with multiple columns."""
32
assert_sql_matches(
33
df_distinct,
34
query="""
35
SELECT DISTINCT ON (category, subcategory)
36
category, subcategory, value, status, score
37
FROM self
38
ORDER BY category NULLS FIRST, subcategory, score
39
""",
40
compare_with="duckdb",
41
expected={
42
"category": [None, None, "A", "B", "B", "C", "C"],
43
"subcategory": ["x", "y", "x", "y", "z", "x", "y"],
44
"value": [600, 700, 100, 200, 300, 400, 500],
45
"status": [
46
"active",
47
"inactive",
48
"active",
49
"active",
50
"active",
51
"inactive",
52
"active",
53
],
54
"score": [70, 80, 10, 30, 40, 50, 60],
55
},
56
)
57
58
59
def test_distinct_on_after_join(df_distinct: pl.DataFrame) -> None:
60
"""Test DISTINCT ON applied after a JOIN operation."""
61
df_supplements = pl.DataFrame(
62
{
63
"category": ["A", "A", "B", "B", "C"],
64
"supplement_id": [1, 2, 3, 4, 5],
65
"supplement_score": [95, 85, 75, 90, 80],
66
}
67
)
68
assert_sql_matches(
69
frames={"data": df_distinct, "supplements": df_supplements},
70
query="""
71
SELECT DISTINCT ON (d.category)
72
d.category,
73
d.value,
74
s.supplement_id,
75
s.supplement_score
76
FROM data d
77
INNER JOIN supplements s ON d.category = s.category
78
ORDER BY
79
d.category,
80
s.supplement_score DESC,
81
d.value ASC
82
""",
83
compare_with="duckdb",
84
expected={
85
"category": ["A", "B", "C"],
86
"value": [100, 200, 400],
87
"supplement_id": [1, 4, 5],
88
"supplement_score": [95, 90, 80],
89
},
90
)
91
92
93
def test_distinct_on_with_ordering(df_distinct: pl.DataFrame) -> None:
94
"""Test DISTINCT ON where ORDER BY determines which row is kept."""
95
# ascending 'value' order
96
assert_sql_matches(
97
df_distinct,
98
query="""
99
SELECT DISTINCT ON (status)
100
status, value
101
FROM self
102
WHERE category IS NOT NULL
103
ORDER BY status, value ASC
104
""",
105
compare_with="duckdb",
106
expected={
107
"status": ["active", "inactive"],
108
"value": [100, 200],
109
},
110
)
111
112
# descending 'value' order
113
assert_sql_matches(
114
df_distinct,
115
query="""
116
SELECT DISTINCT ON (status)
117
status, value
118
FROM self
119
WHERE category IS NOT NULL
120
ORDER BY status, value DESC
121
""",
122
compare_with="duckdb",
123
expected={
124
"status": ["active", "inactive"],
125
"value": [500, 400],
126
},
127
)
128
129
# ascending category with 'nulls first', descending score
130
assert_sql_matches(
131
df_distinct,
132
query="""
133
SELECT DISTINCT ON (category)
134
category, subcategory, value, score
135
FROM self
136
ORDER BY category NULLS FIRST, score DESC
137
""",
138
compare_with="duckdb",
139
expected={
140
"category": [None, "A", "B", "C"],
141
"subcategory": ["y", "x", "z", "y"],
142
"value": [700, 100, 300, 500],
143
"score": [80, 20, 40, 60],
144
},
145
)
146
147
# mixed ordering, multiple 'distinct on' cols
148
assert_sql_matches(
149
df_distinct,
150
query="""
151
SELECT DISTINCT ON (category, status)
152
category, status, subcategory, value, score
153
FROM self
154
ORDER BY category NULLS FIRST, status, value DESC, score ASC
155
""",
156
compare_with="duckdb",
157
expected={
158
"category": [None, None, "A", "B", "B", "C", "C"],
159
"status": [
160
"active",
161
"inactive",
162
"active",
163
"active",
164
"inactive",
165
"active",
166
"inactive",
167
],
168
"subcategory": ["x", "y", "x", "z", "y", "y", "x"],
169
"value": [600, 700, 100, 300, 200, 500, 400],
170
"score": [70, 80, 10, 40, 30, 60, 50],
171
},
172
)
173
174
175
def test_distinct_on_with_distinct_aggregation_in_join(
176
df_distinct: pl.DataFrame,
177
) -> None:
178
"""Test DISTINCT COUNT in aggregations used within JOINs."""
179
df_targets = pl.DataFrame(
180
{
181
"category": ["A", "B", "C", "A"],
182
"target_subcats": [1, 2, 3, 4],
183
}
184
)
185
assert_sql_matches(
186
frames={"dist": df_distinct, "targets": df_targets},
187
query="""
188
SELECT DISTINCT ON (a.category)
189
a.category,
190
a.unique_subcats,
191
t.target_subcats,
192
CASE
193
WHEN a.unique_subcats >= t.target_subcats THEN 'yes'
194
ELSE 'no'
195
END AS met_target
196
FROM (
197
SELECT
198
category,
199
COUNT(DISTINCT subcategory) AS unique_subcats
200
FROM dist
201
WHERE category IS NOT NULL
202
GROUP BY category
203
) a
204
INNER JOIN targets t ON a.category = t.category
205
ORDER BY a.category, target_subcats DESC
206
""",
207
compare_with=["duckdb"],
208
expected={
209
"category": ["A", "B", "C"],
210
"unique_subcats": [1, 2, 2],
211
"target_subcats": [4, 2, 3],
212
"met_target": ["no", "yes", "no"],
213
},
214
)
215
216