Path: blob/main/py-polars/tests/unit/sql/test_rank_functions.py
7884 views
from __future__ import annotations12import pytest34import polars as pl5from tests.unit.sql import assert_sql_matches678@pytest.fixture9def df_test() -> pl.DataFrame:10return pl.DataFrame(11{12"id": [1, 2, 3, 4, 5, 6, 7],13"category": ["A", "A", "A", "B", "B", "B", "C"],14"value": [20, 10, 25, 10, 40, 25, 35],15}16)171819def test_rank_funcs_comparison(df_test: pl.DataFrame) -> None:20# Compare ROW_NUMBER, RANK, and DENSE_RANK; can see the21# differences between them when there are tied values22query = """23SELECT24value,25ROW_NUMBER() OVER (ORDER BY value) AS row_num,26RANK() OVER (ORDER BY value) AS rank,27DENSE_RANK() OVER (ORDER BY value) AS dense_rank28FROM self29ORDER BY value, id30"""31assert_sql_matches(32df_test,33query=query,34compare_with="sqlite",35expected={36"value": [10, 10, 20, 25, 25, 35, 40],37"row_num": [1, 2, 3, 4, 5, 6, 7],38"rank": [1, 1, 3, 4, 4, 6, 7],39"dense_rank": [1, 1, 2, 3, 3, 4, 5],40},41)424344def test_rank_funcs_with_partition(df_test: pl.DataFrame) -> None:45# All three ranking functions should return identical46# results if there are no ties within the partitions47query = """48SELECT49category,50value,51ROW_NUMBER() OVER (PARTITION BY category ORDER BY value) AS row_num,52RANK() OVER (PARTITION BY category ORDER BY value) AS rank,53DENSE_RANK() OVER (PARTITION BY category ORDER BY value) AS dense54FROM self55ORDER BY category, value56"""57assert_sql_matches(58df_test,59query=query,60compare_with="sqlite",61expected={62"category": ["A", "A", "A", "B", "B", "B", "C"],63"value": [10, 20, 25, 10, 25, 40, 35],64# No ties within each partition, so identical results65"row_num": [1, 2, 3, 1, 2, 3, 1],66"rank": [1, 2, 3, 1, 2, 3, 1],67"dense": [1, 2, 3, 1, 2, 3, 1],68},69)7071# We expect to see differences (in the same query) when there *are* ties72df = pl.DataFrame(73{74"id": [1, 2, 3, 4, 5, 6, 7, 8],75"category": ["A", "A", "A", "B", "B", "B", "B", "C"],76"value": [10, 10, 20, 20, 30, 30, 30, 30],77}78)79assert_sql_matches(80df,81query=query,82compare_with="sqlite",83expected={84"category": ["A", "A", "A", "B", "B", "B", "B", "C"],85"value": [10, 10, 20, 20, 30, 30, 30, 30],86# ROW_NUMBER: always unique87"row_num": [1, 2, 3, 1, 2, 3, 4, 1],88# RANK: ties get same rank, next rank skips89"rank": [1, 1, 3, 1, 2, 2, 2, 1],90# DENSE_RANK: ties get same rank, next rank is consecutive91"dense": [1, 1, 2, 1, 2, 2, 2, 1],92},93)949596def test_rank_funcs_desc(df_test: pl.DataFrame) -> None:97query = """98SELECT99value,100ROW_NUMBER() OVER (ORDER BY value DESC, category DESC) AS row_num,101RANK() OVER (ORDER BY value DESC, category DESC) AS rank,102DENSE_RANK() OVER (ORDER BY value DESC, category DESC) AS dense_rank103FROM self104ORDER BY value, id DESC105"""106assert_sql_matches(107df_test,108query=query,109compare_with="sqlite",110expected={111"value": [10, 10, 20, 25, 25, 35, 40],112"row_num": [6, 7, 5, 3, 4, 2, 1],113"rank": [6, 7, 5, 3, 4, 2, 1],114"dense_rank": [6, 7, 5, 3, 4, 2, 1],115},116)117118119def test_rank_funcs_require_order_by(df_test: pl.DataFrame) -> None:120# ROW_NUMBER without ORDER BY is fine (uses arbitrary order)121query_row_num = "SELECT id, ROW_NUMBER() OVER () FROM self"122result = df_test.sql(query_row_num)123assert result.height == 7 # Just verify it runs124125# RANK without ORDER BY should error126query_rank = "SELECT id, RANK() OVER (PARTITION BY category) FROM self"127with pytest.raises(128pl.exceptions.SQLSyntaxError,129match="RANK requires an OVER clause with ORDER BY",130):131df_test.sql(query_rank)132133# DENSE_RANK without ORDER BY should error134query_dense = "SELECT id, DENSE_RANK() OVER () FROM self"135with pytest.raises(136pl.exceptions.SQLSyntaxError,137match="DENSE_RANK requires an OVER clause with ORDER BY",138):139df_test.sql(query_dense)140141142