Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/py-polars/tests/unit/sql/test_distinct.py
7884 views
1
from __future__ import annotations
2
3
import polars as pl
4
from tests.unit.sql import assert_sql_matches
5
6
7
def test_distinct_basic_single_column(df_distinct: pl.DataFrame) -> None:
8
"""Test DISTINCT on a single column."""
9
assert_sql_matches(
10
df_distinct,
11
query="""
12
SELECT DISTINCT category
13
FROM self ORDER BY category NULLS FIRST
14
""",
15
compare_with=["sqlite"],
16
expected={"category": [None, "A", "B", "C"]},
17
)
18
19
20
def test_distinct_basic_multiple_columns(df_distinct: pl.DataFrame) -> None:
21
"""Test DISTINCT across multiple columns."""
22
assert_sql_matches(
23
df_distinct,
24
query="""
25
SELECT DISTINCT category, subcategory
26
FROM self ORDER BY category NULLS FIRST, subcategory
27
""",
28
compare_with=["sqlite"],
29
expected={
30
"category": [None, None, "A", "B", "B", "C", "C"],
31
"subcategory": ["x", "y", "x", "y", "z", "x", "y"],
32
},
33
)
34
35
36
def test_distinct_basic_all_columns(df_distinct: pl.DataFrame) -> None:
37
"""Test DISTINCT across all columns."""
38
assert_sql_matches(
39
df_distinct,
40
query="""
41
SELECT DISTINCT * FROM self
42
ORDER BY category NULLS FIRST, subcategory, value, status, score
43
""",
44
compare_with=["sqlite"],
45
expected={
46
"category": [None, None, "A", "A", "B", "B", "B", "C", "C"],
47
"subcategory": ["x", "y", "x", "x", "y", "y", "z", "x", "y"],
48
"value": [600, 700, 100, 100, 200, 200, 300, 400, 500],
49
"status": [
50
"active",
51
"inactive",
52
"active",
53
"active",
54
"active",
55
"inactive",
56
"active",
57
"inactive",
58
"active",
59
],
60
"score": [70, 80, 10, 20, 30, 30, 40, 50, 60],
61
},
62
)
63
64
65
def test_distinct_with_expressions(df_distinct: pl.DataFrame) -> None:
66
"""Test DISTINCT with column expressions and aggregations."""
67
assert_sql_matches(
68
df_distinct,
69
query="""
70
SELECT DISTINCT
71
category,
72
value * 2 AS doubled_value
73
FROM self
74
WHERE category IS NOT NULL
75
ORDER BY category, doubled_value
76
""",
77
compare_with=["sqlite"],
78
expected={
79
"category": ["A", "B", "B", "C", "C"],
80
"doubled_value": [200, 400, 600, 800, 1000],
81
},
82
)
83
84
85
def test_distinct_with_full_outer_join(df_distinct: pl.DataFrame) -> None:
86
"""Test DISTINCT with FULL OUTER JOIN producing NULLs on both sides."""
87
df_extended = pl.DataFrame(
88
{
89
"category": ["A", "D", "E"],
90
"extra_info": ["info_a", "info_d", "info_e"],
91
}
92
)
93
assert_sql_matches(
94
frames={"data": df_distinct, "extended": df_extended},
95
query="""
96
SELECT DISTINCT
97
COALESCE(d.category, e.category) AS category,
98
e.extra_info
99
FROM (SELECT DISTINCT category FROM data WHERE category IS NOT NULL) d
100
FULL JOIN extended e USING (category)
101
ORDER BY category
102
""",
103
compare_with=["sqlite"],
104
expected={
105
"category": ["A", "B", "C", "D", "E"],
106
"extra_info": ["info_a", None, None, "info_d", "info_e"],
107
},
108
)
109
110
111
def test_distinct_with_group_by(df_distinct: pl.DataFrame) -> None:
112
"""Test DISTINCT in combination with GROUP BY."""
113
assert_sql_matches(
114
df_distinct,
115
query="""
116
SELECT
117
category,
118
COUNT(DISTINCT subcategory) AS distinct_subcats,
119
COUNT(DISTINCT status) AS distinct_statuses,
120
SUM(value) AS total_value
121
FROM self
122
WHERE category IS NOT NULL
123
GROUP BY category
124
ORDER BY category
125
""",
126
compare_with=["sqlite"],
127
expected={
128
"category": ["A", "B", "C"],
129
"distinct_subcats": [1, 2, 2],
130
"distinct_statuses": [1, 2, 2],
131
"total_value": [300, 700, 900],
132
},
133
)
134
assert_sql_matches(
135
df_distinct,
136
query="""
137
SELECT *
138
FROM (
139
SELECT
140
subcategory,
141
COUNT(DISTINCT category) AS distinct_categories
142
FROM self
143
WHERE category IS NOT NULL
144
GROUP BY subcategory
145
) AS agg
146
WHERE distinct_categories > 1
147
ORDER BY subcategory
148
""",
149
compare_with=["sqlite"],
150
expected={
151
"subcategory": ["x", "y"],
152
"distinct_categories": [2, 2],
153
},
154
)
155
156
157
def test_distinct_with_join(df_distinct: pl.DataFrame) -> None:
158
"""Test DISTINCT with multiway JOINs."""
159
df_categories = pl.DataFrame(
160
{
161
"category": ["A", "B", "C", "D"],
162
"category_name": ["Alpha", "Beta", "Gamma", "Delta"],
163
}
164
)
165
df_status_info = pl.DataFrame(
166
{
167
"status": ["active", "inactive", "pending"],
168
"priority": [1, 2, 3],
169
}
170
)
171
assert_sql_matches(
172
{
173
"data": df_distinct,
174
"categories": df_categories,
175
"status_info": df_status_info,
176
},
177
query="""
178
SELECT DISTINCT
179
d.category,
180
c.category_name,
181
d.status,
182
s.priority
183
FROM data d
184
INNER JOIN categories c ON d.category = c.category
185
INNER JOIN status_info s ON d.status = s.status
186
ORDER BY d.category, d.status
187
""",
188
compare_with=["sqlite"],
189
expected={
190
"category": ["A", "B", "B", "C", "C"],
191
"category_name": ["Alpha", "Beta", "Beta", "Gamma", "Gamma"],
192
"status": ["active", "active", "inactive", "active", "inactive"],
193
"priority": [1, 1, 2, 1, 2],
194
},
195
)
196
197
198
def test_distinct_with_left_join_nulls(df_distinct: pl.DataFrame) -> None:
199
"""Test DISTINCT behaviour with NULL values introduced by LEFT JOIN."""
200
df_lookup = pl.DataFrame(
201
{
202
"category": ["A", "B"],
203
"region": ["North", "South"],
204
}
205
)
206
assert_sql_matches(
207
frames={"data": df_distinct, "lookup": df_lookup},
208
query="""
209
SELECT DISTINCT
210
d.category,
211
l.region
212
FROM data d
213
LEFT JOIN lookup l ON d.category = l.category
214
ORDER BY d.category NULLS FIRST, l.region
215
""",
216
compare_with=["sqlite"],
217
expected={
218
"category": [None, "A", "B", "C"],
219
"region": [None, "North", "South", None],
220
},
221
)
222
223
224
def test_distinct_with_nulls_handling(df_distinct: pl.DataFrame) -> None:
225
"""Test that DISTINCT treats NULL values as different from each other."""
226
assert_sql_matches(
227
df_distinct,
228
query="""
229
SELECT DISTINCT category, status
230
FROM self ORDER BY category NULLS FIRST, status
231
""",
232
compare_with=["sqlite"],
233
expected={
234
"category": [None, None, "A", "B", "B", "C", "C"],
235
"status": [
236
"active",
237
"inactive",
238
"active",
239
"active",
240
"inactive",
241
"active",
242
"inactive",
243
],
244
},
245
)
246
247
248
def test_distinct_with_where_filter(df_distinct: pl.DataFrame) -> None:
249
"""Test DISTINCT with various WHERE clause filters."""
250
assert_sql_matches(
251
df_distinct,
252
query="""
253
SELECT DISTINCT category, status
254
FROM self
255
WHERE value >= 200 AND category IS NOT NULL
256
ORDER BY category, status
257
""",
258
compare_with=["sqlite"],
259
expected={
260
"category": ["B", "B", "C", "C"],
261
"status": ["active", "inactive", "active", "inactive"],
262
},
263
)
264
265