Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/py-polars/tests/unit/sql/test_rank_functions.py
7884 views
1
from __future__ import annotations
2
3
import pytest
4
5
import polars as pl
6
from tests.unit.sql import assert_sql_matches
7
8
9
@pytest.fixture
10
def df_test() -> pl.DataFrame:
11
return pl.DataFrame(
12
{
13
"id": [1, 2, 3, 4, 5, 6, 7],
14
"category": ["A", "A", "A", "B", "B", "B", "C"],
15
"value": [20, 10, 25, 10, 40, 25, 35],
16
}
17
)
18
19
20
def test_rank_funcs_comparison(df_test: pl.DataFrame) -> None:
21
# Compare ROW_NUMBER, RANK, and DENSE_RANK; can see the
22
# differences between them when there are tied values
23
query = """
24
SELECT
25
value,
26
ROW_NUMBER() OVER (ORDER BY value) AS row_num,
27
RANK() OVER (ORDER BY value) AS rank,
28
DENSE_RANK() OVER (ORDER BY value) AS dense_rank
29
FROM self
30
ORDER BY value, id
31
"""
32
assert_sql_matches(
33
df_test,
34
query=query,
35
compare_with="sqlite",
36
expected={
37
"value": [10, 10, 20, 25, 25, 35, 40],
38
"row_num": [1, 2, 3, 4, 5, 6, 7],
39
"rank": [1, 1, 3, 4, 4, 6, 7],
40
"dense_rank": [1, 1, 2, 3, 3, 4, 5],
41
},
42
)
43
44
45
def test_rank_funcs_with_partition(df_test: pl.DataFrame) -> None:
46
# All three ranking functions should return identical
47
# results if there are no ties within the partitions
48
query = """
49
SELECT
50
category,
51
value,
52
ROW_NUMBER() OVER (PARTITION BY category ORDER BY value) AS row_num,
53
RANK() OVER (PARTITION BY category ORDER BY value) AS rank,
54
DENSE_RANK() OVER (PARTITION BY category ORDER BY value) AS dense
55
FROM self
56
ORDER BY category, value
57
"""
58
assert_sql_matches(
59
df_test,
60
query=query,
61
compare_with="sqlite",
62
expected={
63
"category": ["A", "A", "A", "B", "B", "B", "C"],
64
"value": [10, 20, 25, 10, 25, 40, 35],
65
# No ties within each partition, so identical results
66
"row_num": [1, 2, 3, 1, 2, 3, 1],
67
"rank": [1, 2, 3, 1, 2, 3, 1],
68
"dense": [1, 2, 3, 1, 2, 3, 1],
69
},
70
)
71
72
# We expect to see differences (in the same query) when there *are* ties
73
df = pl.DataFrame(
74
{
75
"id": [1, 2, 3, 4, 5, 6, 7, 8],
76
"category": ["A", "A", "A", "B", "B", "B", "B", "C"],
77
"value": [10, 10, 20, 20, 30, 30, 30, 30],
78
}
79
)
80
assert_sql_matches(
81
df,
82
query=query,
83
compare_with="sqlite",
84
expected={
85
"category": ["A", "A", "A", "B", "B", "B", "B", "C"],
86
"value": [10, 10, 20, 20, 30, 30, 30, 30],
87
# ROW_NUMBER: always unique
88
"row_num": [1, 2, 3, 1, 2, 3, 4, 1],
89
# RANK: ties get same rank, next rank skips
90
"rank": [1, 1, 3, 1, 2, 2, 2, 1],
91
# DENSE_RANK: ties get same rank, next rank is consecutive
92
"dense": [1, 1, 2, 1, 2, 2, 2, 1],
93
},
94
)
95
96
97
def test_rank_funcs_desc(df_test: pl.DataFrame) -> None:
98
query = """
99
SELECT
100
value,
101
ROW_NUMBER() OVER (ORDER BY value DESC, category DESC) AS row_num,
102
RANK() OVER (ORDER BY value DESC, category DESC) AS rank,
103
DENSE_RANK() OVER (ORDER BY value DESC, category DESC) AS dense_rank
104
FROM self
105
ORDER BY value, id DESC
106
"""
107
assert_sql_matches(
108
df_test,
109
query=query,
110
compare_with="sqlite",
111
expected={
112
"value": [10, 10, 20, 25, 25, 35, 40],
113
"row_num": [6, 7, 5, 3, 4, 2, 1],
114
"rank": [6, 7, 5, 3, 4, 2, 1],
115
"dense_rank": [6, 7, 5, 3, 4, 2, 1],
116
},
117
)
118
119
120
def test_rank_funcs_require_order_by(df_test: pl.DataFrame) -> None:
121
# ROW_NUMBER without ORDER BY is fine (uses arbitrary order)
122
query_row_num = "SELECT id, ROW_NUMBER() OVER () FROM self"
123
result = df_test.sql(query_row_num)
124
assert result.height == 7 # Just verify it runs
125
126
# RANK without ORDER BY should error
127
query_rank = "SELECT id, RANK() OVER (PARTITION BY category) FROM self"
128
with pytest.raises(
129
pl.exceptions.SQLSyntaxError,
130
match="RANK requires an OVER clause with ORDER BY",
131
):
132
df_test.sql(query_rank)
133
134
# DENSE_RANK without ORDER BY should error
135
query_dense = "SELECT id, DENSE_RANK() OVER () FROM self"
136
with pytest.raises(
137
pl.exceptions.SQLSyntaxError,
138
match="DENSE_RANK requires an OVER clause with ORDER BY",
139
):
140
df_test.sql(query_dense)
141
142