Path: blob/main/py-polars/tests/unit/sql/test_distinct.py
7884 views
from __future__ import annotations12import polars as pl3from tests.unit.sql import assert_sql_matches456def test_distinct_basic_single_column(df_distinct: pl.DataFrame) -> None:7"""Test DISTINCT on a single column."""8assert_sql_matches(9df_distinct,10query="""11SELECT DISTINCT category12FROM self ORDER BY category NULLS FIRST13""",14compare_with=["sqlite"],15expected={"category": [None, "A", "B", "C"]},16)171819def test_distinct_basic_multiple_columns(df_distinct: pl.DataFrame) -> None:20"""Test DISTINCT across multiple columns."""21assert_sql_matches(22df_distinct,23query="""24SELECT DISTINCT category, subcategory25FROM self ORDER BY category NULLS FIRST, subcategory26""",27compare_with=["sqlite"],28expected={29"category": [None, None, "A", "B", "B", "C", "C"],30"subcategory": ["x", "y", "x", "y", "z", "x", "y"],31},32)333435def test_distinct_basic_all_columns(df_distinct: pl.DataFrame) -> None:36"""Test DISTINCT across all columns."""37assert_sql_matches(38df_distinct,39query="""40SELECT DISTINCT * FROM self41ORDER BY category NULLS FIRST, subcategory, value, status, score42""",43compare_with=["sqlite"],44expected={45"category": [None, None, "A", "A", "B", "B", "B", "C", "C"],46"subcategory": ["x", "y", "x", "x", "y", "y", "z", "x", "y"],47"value": [600, 700, 100, 100, 200, 200, 300, 400, 500],48"status": [49"active",50"inactive",51"active",52"active",53"active",54"inactive",55"active",56"inactive",57"active",58],59"score": [70, 80, 10, 20, 30, 30, 40, 50, 60],60},61)626364def test_distinct_with_expressions(df_distinct: pl.DataFrame) -> None:65"""Test DISTINCT with column expressions and aggregations."""66assert_sql_matches(67df_distinct,68query="""69SELECT DISTINCT70category,71value * 2 AS doubled_value72FROM self73WHERE category IS NOT NULL74ORDER BY category, doubled_value75""",76compare_with=["sqlite"],77expected={78"category": ["A", "B", "B", "C", "C"],79"doubled_value": [200, 400, 600, 800, 1000],80},81)828384def test_distinct_with_full_outer_join(df_distinct: pl.DataFrame) -> None:85"""Test DISTINCT with FULL OUTER JOIN producing NULLs on both sides."""86df_extended = pl.DataFrame(87{88"category": ["A", "D", "E"],89"extra_info": ["info_a", "info_d", "info_e"],90}91)92assert_sql_matches(93frames={"data": df_distinct, "extended": df_extended},94query="""95SELECT DISTINCT96COALESCE(d.category, e.category) AS category,97e.extra_info98FROM (SELECT DISTINCT category FROM data WHERE category IS NOT NULL) d99FULL JOIN extended e USING (category)100ORDER BY category101""",102compare_with=["sqlite"],103expected={104"category": ["A", "B", "C", "D", "E"],105"extra_info": ["info_a", None, None, "info_d", "info_e"],106},107)108109110def test_distinct_with_group_by(df_distinct: pl.DataFrame) -> None:111"""Test DISTINCT in combination with GROUP BY."""112assert_sql_matches(113df_distinct,114query="""115SELECT116category,117COUNT(DISTINCT subcategory) AS distinct_subcats,118COUNT(DISTINCT status) AS distinct_statuses,119SUM(value) AS total_value120FROM self121WHERE category IS NOT NULL122GROUP BY category123ORDER BY category124""",125compare_with=["sqlite"],126expected={127"category": ["A", "B", "C"],128"distinct_subcats": [1, 2, 2],129"distinct_statuses": [1, 2, 2],130"total_value": [300, 700, 900],131},132)133assert_sql_matches(134df_distinct,135query="""136SELECT *137FROM (138SELECT139subcategory,140COUNT(DISTINCT category) AS distinct_categories141FROM self142WHERE category IS NOT NULL143GROUP BY subcategory144) AS agg145WHERE distinct_categories > 1146ORDER BY subcategory147""",148compare_with=["sqlite"],149expected={150"subcategory": ["x", "y"],151"distinct_categories": [2, 2],152},153)154155156def test_distinct_with_join(df_distinct: pl.DataFrame) -> None:157"""Test DISTINCT with multiway JOINs."""158df_categories = pl.DataFrame(159{160"category": ["A", "B", "C", "D"],161"category_name": ["Alpha", "Beta", "Gamma", "Delta"],162}163)164df_status_info = pl.DataFrame(165{166"status": ["active", "inactive", "pending"],167"priority": [1, 2, 3],168}169)170assert_sql_matches(171{172"data": df_distinct,173"categories": df_categories,174"status_info": df_status_info,175},176query="""177SELECT DISTINCT178d.category,179c.category_name,180d.status,181s.priority182FROM data d183INNER JOIN categories c ON d.category = c.category184INNER JOIN status_info s ON d.status = s.status185ORDER BY d.category, d.status186""",187compare_with=["sqlite"],188expected={189"category": ["A", "B", "B", "C", "C"],190"category_name": ["Alpha", "Beta", "Beta", "Gamma", "Gamma"],191"status": ["active", "active", "inactive", "active", "inactive"],192"priority": [1, 1, 2, 1, 2],193},194)195196197def test_distinct_with_left_join_nulls(df_distinct: pl.DataFrame) -> None:198"""Test DISTINCT behaviour with NULL values introduced by LEFT JOIN."""199df_lookup = pl.DataFrame(200{201"category": ["A", "B"],202"region": ["North", "South"],203}204)205assert_sql_matches(206frames={"data": df_distinct, "lookup": df_lookup},207query="""208SELECT DISTINCT209d.category,210l.region211FROM data d212LEFT JOIN lookup l ON d.category = l.category213ORDER BY d.category NULLS FIRST, l.region214""",215compare_with=["sqlite"],216expected={217"category": [None, "A", "B", "C"],218"region": [None, "North", "South", None],219},220)221222223def test_distinct_with_nulls_handling(df_distinct: pl.DataFrame) -> None:224"""Test that DISTINCT treats NULL values as different from each other."""225assert_sql_matches(226df_distinct,227query="""228SELECT DISTINCT category, status229FROM self ORDER BY category NULLS FIRST, status230""",231compare_with=["sqlite"],232expected={233"category": [None, None, "A", "B", "B", "C", "C"],234"status": [235"active",236"inactive",237"active",238"active",239"inactive",240"active",241"inactive",242],243},244)245246247def test_distinct_with_where_filter(df_distinct: pl.DataFrame) -> None:248"""Test DISTINCT with various WHERE clause filters."""249assert_sql_matches(250df_distinct,251query="""252SELECT DISTINCT category, status253FROM self254WHERE value >= 200 AND category IS NOT NULL255ORDER BY category, status256""",257compare_with=["sqlite"],258expected={259"category": ["B", "B", "C", "C"],260"status": ["active", "inactive", "active", "inactive"],261},262)263264265