Path: blob/main/py-polars/tests/unit/sql/test_distinct_on.py
7884 views
from __future__ import annotations12import polars as pl3from tests.unit.sql import assert_sql_matches45# Note: SQLite does not support "DISTINCT ON", so only compare results against DuckDB678def test_distinct_on_single_column(df_distinct: pl.DataFrame) -> None:9"""Test DISTINCT ON with single column - keeps first row per distinct value."""10assert_sql_matches(11df_distinct,12query="""13SELECT DISTINCT ON (category)14category, subcategory, value, status, score15FROM self16ORDER BY category NULLS FIRST, value17""",18compare_with="duckdb",19expected={20"category": [None, "A", "B", "C"],21"subcategory": ["x", "x", "y", "x"],22"value": [600, 100, 200, 400],23"status": ["active", "active", "active", "inactive"],24"score": [70, 10, 30, 50],25},26)272829def test_distinct_on_multiple_columns(df_distinct: pl.DataFrame) -> None:30"""Test DISTINCT ON with multiple columns."""31assert_sql_matches(32df_distinct,33query="""34SELECT DISTINCT ON (category, subcategory)35category, subcategory, value, status, score36FROM self37ORDER BY category NULLS FIRST, subcategory, score38""",39compare_with="duckdb",40expected={41"category": [None, None, "A", "B", "B", "C", "C"],42"subcategory": ["x", "y", "x", "y", "z", "x", "y"],43"value": [600, 700, 100, 200, 300, 400, 500],44"status": [45"active",46"inactive",47"active",48"active",49"active",50"inactive",51"active",52],53"score": [70, 80, 10, 30, 40, 50, 60],54},55)565758def test_distinct_on_after_join(df_distinct: pl.DataFrame) -> None:59"""Test DISTINCT ON applied after a JOIN operation."""60df_supplements = pl.DataFrame(61{62"category": ["A", "A", "B", "B", "C"],63"supplement_id": [1, 2, 3, 4, 5],64"supplement_score": [95, 85, 75, 90, 80],65}66)67assert_sql_matches(68frames={"data": df_distinct, "supplements": df_supplements},69query="""70SELECT DISTINCT ON (d.category)71d.category,72d.value,73s.supplement_id,74s.supplement_score75FROM data d76INNER JOIN supplements s ON d.category = s.category77ORDER BY78d.category,79s.supplement_score DESC,80d.value ASC81""",82compare_with="duckdb",83expected={84"category": ["A", "B", "C"],85"value": [100, 200, 400],86"supplement_id": [1, 4, 5],87"supplement_score": [95, 90, 80],88},89)909192def test_distinct_on_with_ordering(df_distinct: pl.DataFrame) -> None:93"""Test DISTINCT ON where ORDER BY determines which row is kept."""94# ascending 'value' order95assert_sql_matches(96df_distinct,97query="""98SELECT DISTINCT ON (status)99status, value100FROM self101WHERE category IS NOT NULL102ORDER BY status, value ASC103""",104compare_with="duckdb",105expected={106"status": ["active", "inactive"],107"value": [100, 200],108},109)110111# descending 'value' order112assert_sql_matches(113df_distinct,114query="""115SELECT DISTINCT ON (status)116status, value117FROM self118WHERE category IS NOT NULL119ORDER BY status, value DESC120""",121compare_with="duckdb",122expected={123"status": ["active", "inactive"],124"value": [500, 400],125},126)127128# ascending category with 'nulls first', descending score129assert_sql_matches(130df_distinct,131query="""132SELECT DISTINCT ON (category)133category, subcategory, value, score134FROM self135ORDER BY category NULLS FIRST, score DESC136""",137compare_with="duckdb",138expected={139"category": [None, "A", "B", "C"],140"subcategory": ["y", "x", "z", "y"],141"value": [700, 100, 300, 500],142"score": [80, 20, 40, 60],143},144)145146# mixed ordering, multiple 'distinct on' cols147assert_sql_matches(148df_distinct,149query="""150SELECT DISTINCT ON (category, status)151category, status, subcategory, value, score152FROM self153ORDER BY category NULLS FIRST, status, value DESC, score ASC154""",155compare_with="duckdb",156expected={157"category": [None, None, "A", "B", "B", "C", "C"],158"status": [159"active",160"inactive",161"active",162"active",163"inactive",164"active",165"inactive",166],167"subcategory": ["x", "y", "x", "z", "y", "y", "x"],168"value": [600, 700, 100, 300, 200, 500, 400],169"score": [70, 80, 10, 40, 30, 60, 50],170},171)172173174def test_distinct_on_with_distinct_aggregation_in_join(175df_distinct: pl.DataFrame,176) -> None:177"""Test DISTINCT COUNT in aggregations used within JOINs."""178df_targets = pl.DataFrame(179{180"category": ["A", "B", "C", "A"],181"target_subcats": [1, 2, 3, 4],182}183)184assert_sql_matches(185frames={"dist": df_distinct, "targets": df_targets},186query="""187SELECT DISTINCT ON (a.category)188a.category,189a.unique_subcats,190t.target_subcats,191CASE192WHEN a.unique_subcats >= t.target_subcats THEN 'yes'193ELSE 'no'194END AS met_target195FROM (196SELECT197category,198COUNT(DISTINCT subcategory) AS unique_subcats199FROM dist200WHERE category IS NOT NULL201GROUP BY category202) a203INNER JOIN targets t ON a.category = t.category204ORDER BY a.category, target_subcats DESC205""",206compare_with=["duckdb"],207expected={208"category": ["A", "B", "C"],209"unique_subcats": [1, 2, 2],210"target_subcats": [4, 2, 3],211"met_target": ["no", "yes", "no"],212},213)214215216