Path: py-polars/tests/unit/test_projections.py
from typing import Literal

import numpy as np
import pytest

import polars as pl
from polars.testing import assert_frame_equal


def test_projection_on_semi_join_4789() -> None:
    lfa = pl.DataFrame({"a": [1], "p": [1]}).lazy()

    lfb = pl.DataFrame({"seq": [1], "p": [1]}).lazy()

    ab = lfa.join(lfb, on="p", how="semi").inspect()

    intermediate_agg = (ab.group_by("a").agg([pl.col("a").alias("seq")])).select(
        ["a", "seq"]
    )

    q = ab.join(intermediate_agg, on="a")

    assert q.collect().to_dict(as_series=False) == {"a": [1], "p": [1], "seq": [[1]]}


def test_unpivot_projection_pd_block_4997() -> None:
    assert (
        pl.DataFrame({"col1": ["a"], "col2": ["b"]})
        .with_row_index()
        .lazy()
        .unpivot(index="index")
        .group_by("index")
        .agg(pl.col("variable").alias("result"))
        .collect()
    ).to_dict(as_series=False) == {"index": [0], "result": [["col1", "col2"]]}


def test_double_projection_pushdown() -> None:
    assert (
        "2/3 COLUMNS"
        in (
            pl.DataFrame({"c0": [], "c1": [], "c2": []})
            .lazy()
            .select(["c0", "c1", "c2"])
            .select(["c0", "c1"])
        ).explain()
    )


def test_group_by_projection_pushdown() -> None:
    assert (
        "2/3 COLUMNS"
        in (
            pl.DataFrame({"c0": [], "c1": [], "c2": []})
            .lazy()
            .group_by("c0")
            .agg(
                [
                    pl.col("c1").sum().alias("sum(c1)"),
                    pl.col("c2").mean().alias("mean(c2)"),
                ]
            )
            .select(["sum(c1)"])
        ).explain()
    )


def test_unnest_projection_pushdown() -> None:
    lf = pl.DataFrame({"x|y|z": [1, 2], "a|b|c": [2, 3]}).lazy()

    mlf = (
        lf.unpivot()
        .with_columns(pl.col("variable").str.split_exact("|", 2))
        .unnest("variable")
    )
    mlf = mlf.select(
        pl.col("field_1").cast(pl.Categorical).alias("row"),
        pl.col("field_2").cast(pl.Categorical).alias("col"),
        pl.col("value"),
    )

    out = (
        mlf.sort(
            [pl.col.row.cast(pl.String), pl.col.col.cast(pl.String)],
            maintain_order=True,
        )
        .collect()
        .to_dict(as_series=False)
    )
    assert out == {
        "row": ["b", "b", "y", "y"],
        "col": ["c", "c", "z", "z"],
        "value": [2, 3, 1, 2],
    }


def test_hconcat_projection_pushdown() -> None:
    lf1 = pl.LazyFrame({"a": [0, 1, 2], "b": [3, 4, 5]})
    lf2 = pl.LazyFrame({"c": [6, 7, 8], "d": [9, 10, 11]})
    query = pl.concat([lf1, lf2], how="horizontal").select(["a", "d"])

    explanation = query.explain()
    assert explanation.count("1/2 COLUMNS") == 2

    out = query.collect()
    expected = pl.DataFrame({"a": [0, 1, 2], "d": [9, 10, 11]})
    assert_frame_equal(out, expected)


def test_hconcat_projection_pushdown_length_maintained() -> None:
    # We can't eliminate the second input completely as this affects
    # the length of the result, even though no columns are used.
    lf1 = pl.LazyFrame({"a": [0, 1], "b": [2, 3]})
    lf2 = pl.LazyFrame({"c": [4, 5, 6, 7], "d": [8, 9, 10, 11]})
    query = pl.concat([lf1, lf2], how="horizontal").select(["a"])

    explanation = query.explain()
    assert "1/2 COLUMNS" in explanation

    out = query.collect()
    expected = pl.DataFrame({"a": [0, 1, None, None]})
    assert_frame_equal(out, expected)


@pytest.mark.may_fail_auto_streaming
@pytest.mark.may_fail_cloud
def test_unnest_columns_available() -> None:
    df = pl.DataFrame(
        {
            "title": ["Avatar", "spectre", "King Kong"],
            "content_rating": ["PG-13"] * 3,
            "genres": [
                "Action|Adventure|Fantasy|Sci-Fi",
                "Action|Adventure|Thriller",
                "Action|Adventure|Drama|Romance",
            ],
        }
    ).lazy()

    q = df.with_columns(
        pl.col("genres")
        .str.split("|")
        .list.to_struct(upper_bound=4, fields=lambda i: f"genre{i + 1}")
    ).unnest("genres")

    out = q.collect()
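    # All four struct fields produced by `list.to_struct` should survive the
    # unnest, with missing genres padded with nulls.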
    assert out.to_dict(as_series=False) == {
        "title": ["Avatar", "spectre", "King Kong"],
        "content_rating": ["PG-13", "PG-13", "PG-13"],
        "genre1": ["Action", "Action", "Action"],
        "genre2": ["Adventure", "Adventure", "Adventure"],
        "genre3": ["Fantasy", "Thriller", "Drama"],
        "genre4": ["Sci-Fi", None, "Romance"],
    }


def test_double_projection_union() -> None:
    lf1 = pl.DataFrame(
        {
            "a": [1, 2, 3, 4],
            "b": [2, 3, 4, 5],
            "c": [1, 1, 2, 2],
            "d": [1, 2, 2, 1],
        }
    ).lazy()

    lf2 = pl.DataFrame(
        {
            "a": [5, 6, 7, 8],
            "b": [6, 7, 8, 9],
            "c": [1, 2, 1, 3],
        }
    ).lazy()

    # In this query the group_by projects only 2 columns, which is one
    # less than the upstream projection, so the union will fail if
    # the select node does not prune one column.
    q = lf1.select(["a", "b", "c"])

    q = pl.concat([q, lf2])

    q = q.group_by("c", maintain_order=True).agg([pl.col("a")])
    assert q.collect().to_dict(as_series=False) == {
        "c": [1, 2, 3],
        "a": [[1, 2, 5, 7], [3, 4, 6], [8]],
    }


def test_asof_join_projection_() -> None:
    lf1 = (
        pl.DataFrame(
            {
                "m": np.linspace(0, 5, 7),
                "a": np.linspace(0, 5, 7),
                "b": np.linspace(0, 5, 7),
                "c": pl.Series(np.linspace(0, 5, 7)).cast(str),
                "d": np.linspace(0, 5, 7),
            }
        )
        .lazy()
        .set_sorted("b")
    )
    lf2 = (
        pl.DataFrame(
            {
                "group": [0, 2, 3, 0, 1, 2, 3],
                "val": [0.0, 2.5, 2.6, 2.7, 3.4, 4.0, 5.0],
                "c": ["x", "x", "x", "y", "y", "y", "y"],
            }
        )
        .with_columns(pl.col("val").alias("b"))
        .lazy()
        .set_sorted("b")
    )

    joined = lf1.join_asof(
        lf2,
        on="b",
        by=["c"],
        strategy="backward",
    )

    expressions = [
        "m",
        "a",
        "b",
        "c",
        "d",
        pl.lit(0, dtype=pl.Int64).alias("group"),
        pl.lit(0.1).alias("val"),
    ]
    dirty_lf1 = lf1.select(expressions)

    concatted = pl.concat([joined, dirty_lf1])
    assert_frame_equal(
        concatted.select(["b", "a"]).collect(),
        pl.DataFrame(
            {
                "b": [
                    0.0,
                    0.8333333333333334,
                    1.6666666666666667,
                    2.5,
                    3.3333333333333335,
                    4.166666666666667,
                    5.0,
                    0.0,
                    0.8333333333333334,
                    1.6666666666666667,
                    2.5,
                    3.3333333333333335,
                    4.166666666666667,
                    5.0,
                ],
                "a": [
                    0.0,
                    0.8333333333333334,
                    1.6666666666666667,
                    2.5,
                    3.3333333333333335,
                    4.166666666666667,
                    5.0,
                    0.0,
                    0.8333333333333334,
                    1.6666666666666667,
                    2.5,
                    3.3333333333333335,
                    4.166666666666667,
                    5.0,
                ],
            }
        ),
        check_row_order=False,
    )


def test_merge_sorted_projection_pd() -> None:
    lf = pl.LazyFrame(
        {
            "foo": [1, 2, 3, 4],
            "bar": ["patrick", "lukas", "onion", "afk"],
        }
    ).sort("foo")

    lf2 = pl.LazyFrame({"foo": [5, 6], "bar": ["nice", "false"]}).sort("foo")

    assert (
        lf.merge_sorted(lf2, key="foo").reverse().select(["bar"])
    ).collect().to_dict(as_series=False) == {
        "bar": ["false", "nice", "afk", "onion", "lukas", "patrick"]
    }


def test_distinct_projection_pd_7578() -> None:
    lf = pl.LazyFrame(
        {
            "foo": ["0", "1", "2", "1", "2"],
            "bar": ["a", "a", "a", "b", "b"],
        }
    )

    result = lf.unique().group_by("bar").agg(pl.len())
    expected = pl.LazyFrame(
        {
            "bar": ["a", "b"],
            "len": [3, 2],
        },
        schema_overrides={"len": pl.UInt32},
    )
    assert_frame_equal(result, expected, check_row_order=False)


def test_join_suffix_collision_9562() -> None:
    df = pl.DataFrame(
        {
            "foo": [1, 2, 3],
            "bar": [6.0, 7.0, 8.0],
            "ham": ["a", "b", "c"],
        }
    )
    other_df = pl.DataFrame(
        {
            "apple": ["x", "y", "z"],
            "ham": ["a", "b", "d"],
        }
    )
    df.join(other_df, on="ham")
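    # Lazy variant of the join above with an explicit suffix; projecting only
    # the join key should not be tripped up by the suffix handling (issue 9562).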
    assert df.lazy().join(
        other_df.lazy(),
        how="inner",
        left_on="ham",
        right_on="ham",
        suffix="m",
        maintain_order="right",
    ).select("ham").collect().to_dict(as_series=False) == {"ham": ["a", "b"]}


def test_projection_join_names_9955() -> None:
    batting = pl.LazyFrame(
        {
            "playerID": ["abercda01"],
            "yearID": [1871],
            "lgID": ["NA"],
        }
    )

    awards_players = pl.LazyFrame(
        {
            "playerID": ["bondto01"],
            "yearID": [1877],
            "lgID": ["NL"],
        }
    )

    right = awards_players.filter(pl.col("lgID") == "NL").select("playerID")

    q = batting.join(
        right,
        left_on=[pl.col("playerID")],
        right_on=[pl.col("playerID")],
        how="inner",
    )

    q = q.select(*batting.collect_schema().keys())

    assert q.collect().schema == {
        "playerID": pl.String,
        "yearID": pl.Int64,
        "lgID": pl.String,
    }


def test_projection_rename_10595() -> None:
    lf = pl.LazyFrame(schema={"a": pl.Float32, "b": pl.Float32})

    result = lf.select("a", "b").rename({"b": "a", "a": "b"}).select("a")
    assert result.collect().schema == {"a": pl.Float32}

    result = (
        lf.select("a", "b")
        .rename({"c": "d", "b": "a", "d": "c", "a": "b"}, strict=False)
        .select("a")
    )
    assert result.collect().schema == {"a": pl.Float32}


def test_projection_count_11841() -> None:
    pl.LazyFrame({"x": 1}).select(records=pl.len()).select(
        pl.lit(1).alias("x"), pl.all()
    ).collect()


def test_schema_full_outer_join_projection_pd_13287() -> None:
    lf = pl.LazyFrame({"a": [1, 1], "b": [2, 3]})
    lf2 = pl.LazyFrame({"a": [1, 1], "c": [2, 3]})

    assert lf.join(
        lf2,
        how="full",
        left_on="a",
        right_on="c",
        maintain_order="right_left",
    ).with_columns(
        pl.col("a").fill_null(pl.col("c")),
    ).select("a").collect().to_dict(as_series=False) == {"a": [2, 3, 1, 1]}


def test_projection_pushdown_full_outer_join_duplicates() -> None:
    df1 = pl.DataFrame({"a": [1, 2, 3], "b": [10, 20, 30]}).lazy()
    df2 = pl.DataFrame({"a": [1, 2, 3], "b": [10, 20, 30]}).lazy()
    assert (
        df1.join(df2, on="a", how="full", maintain_order="right")
        .with_columns(c=0)
        .select("a", "c")
        .collect()
    ).to_dict(as_series=False) == {"a": [1, 2, 3], "c": [0, 0, 0]}


def test_rolling_key_projected_13617() -> None:
    df = pl.DataFrame({"idx": [1, 2], "value": ["a", "b"]}).set_sorted("idx")
    ldf = df.lazy().select(pl.col("value").rolling("idx", period="1i"))
    plan = ldf.explain(optimizations=pl.QueryOptFlags(projection_pushdown=True))
    assert r"2/2 COLUMNS" in plan
    out = ldf.collect(optimizations=pl.QueryOptFlags(projection_pushdown=True))
    assert out.to_dict(as_series=False) == {"value": [["a"], ["b"]]}


def test_projection_drop_with_series_lit_14382() -> None:
    df = pl.DataFrame({"b": [1, 6, 8, 7]})
    df2 = pl.DataFrame({"a": [1, 2, 4, 4], "b": [True, True, True, False]})

    q = (
        df2.lazy()
        .select(
            *["a", "b"], pl.lit("b").alias("b_name"), df.get_column("b").alias("b_old")
        )
        .filter(pl.col("b").not_())
        .drop("b")
    )
    assert q.collect().to_dict(as_series=False) == {
        "a": [4],
        "b_name": ["b"],
        "b_old": [7],
    }


def test_cached_schema_15651() -> None:
    q = pl.LazyFrame({"col1": [1], "col2": [2], "col3": [3]})
    q = q.with_row_index()
    q = q.filter(~pl.col("col1").is_null())
    # create a subplan diverging from q
    _ = q.select(pl.len()).collect(
        optimizations=pl.QueryOptFlags(projection_pushdown=True)
    )

    # ensure that q's "cached" columns are still correct
    assert q.collect_schema().names() == q.collect().columns


def test_double_projection_pushdown_15895() -> None:
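    # select(C="A", A="B") renames "A" to "C" and "B" to "A"; projection
    # pushdown must not confuse the reused name "A" (issue 15895).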
    df = (
        pl.LazyFrame({"A": [0], "B": [1]})
        .select(C="A", A="B")
        .group_by(1)
        .all()
        .collect(optimizations=pl.QueryOptFlags(projection_pushdown=True))
    )
    assert df.to_dict(as_series=False) == {
        "literal": [1],
        "C": [[0]],
        "A": [[1]],
    }


@pytest.mark.parametrize("join_type", ["inner", "left", "full"])
def test_non_coalesce_join_projection_pushdown_16515(
    join_type: Literal["inner", "left", "full"],
) -> None:
    left = pl.LazyFrame({"x": 1})
    right = pl.LazyFrame({"y": 1})

    assert (
        left.join(right, how=join_type, left_on="x", right_on="y", coalesce=False)
        .select("y")
        .collect()
        .item()
        == 1
    )


@pytest.mark.parametrize("join_type", ["inner", "left", "full"])
def test_non_coalesce_multi_key_join_projection_pushdown_16554(
    join_type: Literal["inner", "left", "full"],
) -> None:
    lf1 = pl.LazyFrame(
        {
            "a": [1, 2, 3, 4, 5],
            "b": [1, 2, 3, 4, 5],
        }
    )
    lf2 = pl.LazyFrame(
        {
            "a": [0, 2, 3, 4, 5],
            "b": [1, 2, 3, 5, 6],
            "c": [7, 5, 3, 5, 7],
        }
    )

    expect = (
        lf1.with_columns(a2="a")
        .join(
            other=lf2,
            how=join_type,
            left_on=["a", "a2"],
            right_on=["b", "c"],
            coalesce=False,
        )
        .select("a", "b", "c")
        .collect()
    )

    out = (
        lf1.join(
            other=lf2,
            how=join_type,
            left_on=["a", "a"],
            right_on=["b", "c"],
            coalesce=False,
        )
        .select("a", "b", "c")
        .collect()
    )

    assert_frame_equal(out, expect, check_row_order=False)


@pytest.mark.parametrize("how", ["semi", "anti"])
def test_projection_pushdown_semi_anti_no_selection(
    how: Literal["semi", "anti"],
) -> None:
    q_a = pl.LazyFrame({"a": [1, 2, 3]})

    q_b = pl.LazyFrame({"b": [1, 2, 3], "c": [1, 2, 3]})

    assert "1/2 COLUMNS" in (
        q_a.join(q_b, left_on="a", right_on="b", how=how).explain()
    )


def test_projection_empty_frame_len_16904() -> None:
    df = pl.LazyFrame({})

    q = df.select(pl.len())

    assert "0/0 COLUMNS" in q.explain()

    expect = pl.DataFrame({"len": [0]}, schema_overrides={"len": pl.UInt32()})
    assert_frame_equal(q.collect(), expect)


def test_projection_literal_no_alias_17739() -> None:
    df = pl.LazyFrame({})
    assert df.select(pl.lit(False)).select("literal").collect().to_dict(
        as_series=False
    ) == {"literal": [False]}


def test_projections_collapse_17781() -> None:
    frame1 = pl.LazyFrame(
        {
            "index": [0],
            "data1": [0],
            "data2": [0],
        }
    )
    frame2 = pl.LazyFrame(
        {
            "index": [0],
            "label1": [True],
            "label2": [False],
            "label3": [False],
        },
        schema=[
            ("index", pl.Int64),
            ("label1", pl.Boolean),
            ("label2", pl.Boolean),
            ("label3", pl.Boolean),
        ],
    )
    cols = ["index", "data1", "label1", "label2"]

    lf = None
    for lfj in [frame1, frame2]:
        use_columns = [c for c in cols if c in lfj.collect_schema().names()]
        lfj = lfj.select(use_columns)
        lfj = lfj.select(use_columns)
        if lf is None:
            lf = lfj
        else:
            lf = lf.join(lfj, on="index", how="left")
    assert "SELECT " not in lf.explain()  # type: ignore[union-attr]


def test_with_columns_projection_pushdown() -> None:
    # # Summary
    # `process_hstack` in projection PD incorrectly took a fast-path meant for
    # LP nodes that don't add new columns to the schema, which stops projection
    # PD if it sees that the schema lengths on the upper node match.
    #
    # To trigger this, we drop the same number of columns before and after
    # the with_columns, and in the with_columns we also add the same number of
    # columns.
    lf = (
        pl.scan_csv(
            b"""\
a,b,c,d,e
1,1,1,1,1
""",
            include_file_paths="path",
        )
        .drop("a", "b")
        .with_columns(pl.lit(1).alias(x) for x in ["x", "y"])
        .drop("c", "d")
    )

    plan = lf.explain().strip()

    assert plan.startswith("WITH_COLUMNS:")
    # [dyn int: 1.alias("x"), dyn int: 1.alias("y")]
    # Csv SCAN [20 in-mem bytes]
    assert plan.endswith("1/6 COLUMNS")


def test_projection_pushdown_height_20221() -> None:
    q = pl.LazyFrame({"a": range(5)}).select("a", b=pl.col("a").first()).select("b")
    assert_frame_equal(q.collect(), pl.DataFrame({"b": [0, 0, 0, 0, 0]}))


def test_select_len_20337() -> None:
    strs = [str(i) for i in range(3)]
    q = pl.LazyFrame({"a": strs, "b": strs, "c": strs, "d": range(3)})

    q = q.group_by(pl.col("c")).agg(
        (pl.col("d") * j).alias(f"mult {j}") for j in [1, 2]
    )

    q = q.with_row_index("foo")
    assert q.select(pl.len()).collect().item() == 3


def test_filter_count_projection_20902() -> None:
    lineitem_ldf = pl.LazyFrame(
        {
            "l_partkey": [1],
            "l_quantity": [1],
            "l_extendedprice": [1],
        }
    )
    assert (
        "1/3 COLUMNS"
        in lineitem_ldf.filter(pl.col("l_partkey").is_between(10, 20))
        .select(pl.len())
        .explain()
    )


def test_projection_count_21154() -> None:
    lf = pl.LazyFrame(
        {
            "a": [1, 2, 3],
            "b": [4, 5, 6],
        }
    )

    assert lf.unique("a").select(pl.len()).collect().to_dict(as_series=False) == {
        "len": [3]
    }