Path: blob/main/py-polars/tests/unit/operations/test_pivot.py
6939 views
from __future__ import annotations12from datetime import date, timedelta3from typing import TYPE_CHECKING, Any45import pytest67import polars as pl8import polars.selectors as cs9from polars.exceptions import ComputeError, DuplicateError10from polars.testing import assert_frame_equal1112if TYPE_CHECKING:13from polars._typing import PivotAgg, PolarsIntegerType, PolarsTemporalType141516def test_pivot() -> None:17df = pl.DataFrame(18{19"foo": ["A", "A", "B", "B", "C"],20"bar": ["k", "l", "m", "n", "o"],21"N": [1, 2, 2, 4, 2],22}23)24result = df.pivot("bar", values="N", aggregate_function=None)2526expected = pl.DataFrame(27[28("A", 1, 2, None, None, None),29("B", None, None, 2, 4, None),30("C", None, None, None, None, 2),31],32schema=["foo", "k", "l", "m", "n", "o"],33orient="row",34)35assert_frame_equal(result, expected)363738def test_pivot_no_values() -> None:39df = pl.DataFrame(40{41"foo": ["A", "A", "B", "B", "C"],42"bar": ["k", "l", "m", "n", "o"],43"N1": [1, 2, 2, 4, 2],44"N2": [1, 2, 2, 4, 2],45}46)47result = df.pivot(on="bar", index="foo", aggregate_function=None)48expected = pl.DataFrame(49{50"foo": ["A", "B", "C"],51"N1_k": [1, None, None],52"N1_l": [2, None, None],53"N1_m": [None, 2, None],54"N1_n": [None, 4, None],55"N1_o": [None, None, 2],56"N2_k": [1, None, None],57"N2_l": [2, None, None],58"N2_m": [None, 2, None],59"N2_n": [None, 4, None],60"N2_o": [None, None, 2],61}62)6364assert_frame_equal(result, expected)656667def test_pivot_list() -> None:68df = pl.DataFrame({"a": [1, 2, 3], "b": [[1, 1], [2, 2], [3, 3]]})6970expected = pl.DataFrame(71{72"a": [1, 2, 3],73"1": [[1, 1], None, None],74"2": [None, [2, 2], None],75"3": [None, None, [3, 3]],76}77)78out = df.pivot(79index="a",80on="a",81values="b",82aggregate_function="first",83sort_columns=True,84)85assert_frame_equal(out, expected)868788@pytest.mark.parametrize(89("agg_fn", "expected_rows"),90[91("first", [("a", 2, None, None), ("b", None, None, 10)]),92("len", [("a", 2, 0, 0), ("b", 0, 2, 1)]),93("min", [("a", 2, None, None), ("b", None, 8, 10)]),94("max", [("a", 4, None, None), ("b", None, 8, 10)]),95("sum", [("a", 6, 0, 0), ("b", 0, 8, 10)]),96("mean", [("a", 3.0, None, None), ("b", None, 8.0, 10.0)]),97("median", [("a", 3.0, None, None), ("b", None, 8.0, 10.0)]),98],99)100def test_pivot_aggregate(agg_fn: PivotAgg, expected_rows: list[tuple[Any]]) -> None:101df = pl.DataFrame(102{103"a": [1, 1, 2, 2, 3],104"b": ["a", "a", "b", "b", "b"],105"c": [2, 4, None, 8, 10],106}107)108result = df.pivot(109index="b", on="a", values="c", aggregate_function=agg_fn, sort_columns=True110)111assert result.rows() == expected_rows112113114def test_pivot_categorical_3968() -> None:115df = pl.DataFrame(116{117"foo": ["one", "one", "one", "two", "two", "two"],118"bar": ["A", "B", "C", "A", "B", "C"],119"baz": [1, 2, 3, 4, 5, 6],120}121)122123result = df.with_columns(pl.col("baz").cast(str).cast(pl.Categorical))124125expected = pl.DataFrame(126{127"foo": ["one", "one", "one", "two", "two", "two"],128"bar": ["A", "B", "C", "A", "B", "C"],129"baz": ["1", "2", "3", "4", "5", "6"],130},131schema_overrides={"baz": pl.Categorical},132)133assert_frame_equal(result, expected, categorical_as_str=True)134135136def test_pivot_categorical_index() -> None:137df = pl.DataFrame(138{"A": ["Fire", "Water", "Water", "Fire"], "B": ["Car", "Car", "Car", "Ship"]},139schema=[("A", pl.Categorical), ("B", pl.Categorical)],140)141142result = df.pivot(index=["A"], on="B", values="B", aggregate_function="len")143expected = {"A": ["Fire", "Water"], "Car": [1, 2], "Ship": [1, 0]}144assert result.to_dict(as_series=False) == expected145146# test expression dispatch147result = df.pivot(index=["A"], on="B", values="B", aggregate_function=pl.len())148assert result.to_dict(as_series=False) == expected149150df = pl.DataFrame(151{152"A": ["Fire", "Water", "Water", "Fire"],153"B": ["Car", "Car", "Car", "Ship"],154"C": ["Paper", "Paper", "Paper", "Paper"],155},156schema=[("A", pl.Categorical), ("B", pl.Categorical), ("C", pl.Categorical)],157)158result = df.pivot(index=["A", "C"], on="B", values="B", aggregate_function="len")159expected = {160"A": ["Fire", "Water"],161"C": ["Paper", "Paper"],162"Car": [1, 2],163"Ship": [1, 0],164}165assert result.to_dict(as_series=False) == expected166167168def test_pivot_multiple_values_column_names_5116() -> None:169df = pl.DataFrame(170{171"x1": [1, 2, 3, 4, 5, 6, 7, 8],172"x2": [8, 7, 6, 5, 4, 3, 2, 1],173"c1": ["A", "B"] * 4,174"c2": ["C", "C", "D", "D"] * 2,175}176)177178with pytest.raises(ComputeError, match="found multiple elements in the same group"):179df.pivot(180index="c1",181on="c2",182values=["x1", "x2"],183separator="|",184aggregate_function=None,185)186187result = df.pivot(188index="c1",189on="c2",190values=["x1", "x2"],191separator="|",192aggregate_function="first",193)194expected = {195"c1": ["A", "B"],196"x1|C": [1, 2],197"x1|D": [3, 4],198"x2|C": [8, 7],199"x2|D": [6, 5],200}201assert result.to_dict(as_series=False) == expected202203204def test_pivot_duplicate_names_7731() -> None:205df = pl.DataFrame(206{207"a": [1, 4],208"b": [1.5, 2.5],209"c": ["x", "x"],210"d": [7, 8],211"e": ["x", "y"],212}213)214result = df.pivot(215index=cs.float(),216on=cs.string(),217values=cs.integer(),218aggregate_function="first",219).to_dict(as_series=False)220expected = {221"b": [1.5, 2.5],222'a_{"x","x"}': [1, None],223'a_{"x","y"}': [None, 4],224'd_{"x","x"}': [7, None],225'd_{"x","y"}': [None, 8],226}227assert result == expected228229230def test_pivot_duplicate_names_11663() -> None:231df = pl.DataFrame({"a": [1, 2], "b": [1, 2], "c": ["x", "x"], "d": ["x", "y"]})232result = df.pivot(index="b", on=["c", "d"], values="a").to_dict(as_series=False)233expected = {"b": [1, 2], '{"x","x"}': [1, None], '{"x","y"}': [None, 2]}234assert result == expected235236237def test_pivot_multiple_columns_12407() -> None:238df = pl.DataFrame(239{240"a": ["beep", "bop"],241"b": ["a", "b"],242"c": ["s", "f"],243"d": [7, 8],244"e": ["x", "y"],245}246)247result = df.pivot(248index="b", on=["c", "e"], values=["a"], aggregate_function="len"249).to_dict(as_series=False)250expected = {"b": ["a", "b"], '{"s","x"}': [1, 0], '{"f","y"}': [0, 1]}251assert result == expected252253254def test_pivot_struct_13120() -> None:255df = pl.DataFrame(256{257"index": [1, 2, 3, 1, 2, 3],258"item_type": ["a", "a", "a", "b", "b", "b"],259"item_id": [123, 123, 123, 456, 456, 456],260"values": [4, 5, 6, 7, 8, 9],261}262)263df = df.with_columns(pl.struct(["item_type", "item_id"]).alias("columns")).drop(264"item_type", "item_id"265)266result = df.pivot(index="index", on="columns", values="values").to_dict(267as_series=False268)269expected = {"index": [1, 2, 3], '{"a",123}': [4, 5, 6], '{"b",456}': [7, 8, 9]}270assert result == expected271272273def test_pivot_index_struct_14101() -> None:274df = pl.DataFrame(275{276"a": [1, 2, 1],277"b": [{"a": 1}, {"a": 1}, {"a": 2}],278"c": ["x", "y", "y"],279"d": [1, 1, 3],280}281)282result = df.pivot(index="b", on="c", values="a")283expected = pl.DataFrame({"b": [{"a": 1}, {"a": 2}], "x": [1, None], "y": [2, 1]})284assert_frame_equal(result, expected)285286287def test_pivot_nested_struct_17065() -> None:288df = pl.DataFrame(289{290"foo": ["one", "two", "one", "two"],291"bar": ["x", "x", "y", "y"],292"baz": [293{"a": 1, "b": {"c": 2}},294{"a": 3, "b": {"c": 4}},295{"a": 5, "b": {"c": 6}},296{"a": 7, "b": {"c": 8}},297],298}299)300result = df.pivot(on="bar", index="foo", values="baz")301expected = pl.DataFrame(302{303"foo": ["one", "two"],304"x": [305{"a": 1, "b": {"c": 2}},306{"a": 3, "b": {"c": 4}},307],308"y": [309{"a": 5, "b": {"c": 6}},310{"a": 7, "b": {"c": 8}},311],312}313)314assert_frame_equal(result, expected)315316317def test_pivot_name_already_exists() -> None:318# This should be extremely rare...but still, good to check it319df = pl.DataFrame(320{321"a": ["a", "b"],322"b": ["a", "b"],323'{"a","b"}': [1, 2],324}325)326with pytest.raises(ComputeError, match="already exists in the DataFrame"):327df.pivot(328values='{"a","b"}',329index="a",330on=["a", "b"],331aggregate_function="first",332)333334335def test_pivot_floats() -> None:336df = pl.DataFrame(337{338"article": ["a", "a", "a", "b", "b", "b"],339"weight": [1.0, 1.0, 4.4, 1.0, 8.8, 1.0],340"quantity": [1.0, 5.0, 1.0, 1.0, 1.0, 7.5],341"price": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0],342}343)344345with pytest.raises(ComputeError, match="found multiple elements in the same group"):346result = df.pivot(347index="weight", on="quantity", values="price", aggregate_function=None348)349350result = df.pivot(351index="weight", on="quantity", values="price", aggregate_function="first"352)353expected = {354"weight": [1.0, 4.4, 8.8],355"1.0": [1.0, 3.0, 5.0],356"5.0": [2.0, None, None],357"7.5": [6.0, None, None],358}359assert result.to_dict(as_series=False) == expected360361result = df.pivot(362index=["article", "weight"],363on="quantity",364values="price",365aggregate_function=None,366)367expected = {368"article": ["a", "a", "b", "b"],369"weight": [1.0, 4.4, 1.0, 8.8],370"1.0": [1.0, 3.0, 4.0, 5.0],371"5.0": [2.0, None, None, None],372"7.5": [None, None, 6.0, None],373}374assert result.to_dict(as_series=False) == expected375376377def test_pivot_reinterpret_5907() -> None:378df = pl.DataFrame(379{380"A": pl.Series([3, -2, 3, -2], dtype=pl.Int32),381"B": ["x", "x", "y", "y"],382"C": [100, 50, 500, -80],383}384)385386result = df.pivot(387index=["A"], on=["B"], values=["C"], aggregate_function=pl.element().sum()388)389expected = {"A": [3, -2], "x": [100, 50], "y": [500, -80]}390assert result.to_dict(as_series=False) == expected391392393@pytest.mark.parametrize(394"dtype",395[396pl.Date,397pl.Datetime("ms"),398pl.Datetime("ns"),399pl.Datetime("us"),400pl.Datetime("ms", time_zone="Asia/Shanghai"),401pl.Duration("ms"),402pl.Duration("us"),403pl.Duration("ns"),404pl.Time,405],406)407def test_pivot_temporal_logical_types(dtype: PolarsTemporalType) -> None:408idx = pl.Series([7, 8, 9, 0, 1, 2, 3, 4]).cast(dtype)409df = pl.DataFrame(410{411"idx": idx,412"foo": ["a"] * 3 + ["b"] * 5,413"value": [0] * 8,414}415)416assert df.pivot(417index="idx", on="foo", values="value", aggregate_function=None418).to_dict(as_series=False) == {419"idx": idx.to_list(),420"a": [0, 0, 0, None, None, None, None, None],421"b": [None, None, None, 0, 0, 0, 0, 0],422}423424425def test_pivot_negative_duration() -> None:426df1 = pl.DataFrame({"root": [date(2020, i, 15) for i in (1, 2)]})427df2 = pl.DataFrame({"delta": [timedelta(days=i) for i in (-2, -1, 0, 1)]})428429df = df1.join(df2, how="cross", maintain_order="left_right").with_columns(430pl.Series(name="value", values=range(len(df1) * len(df2)))431)432assert df.pivot(433index="delta", on="root", values="value", aggregate_function=None434).to_dict(as_series=False) == {435"delta": [436timedelta(days=-2),437timedelta(days=-1),438timedelta(0),439timedelta(days=1),440],441"2020-01-15": [0, 1, 2, 3],442"2020-02-15": [4, 5, 6, 7],443}444445446def test_aggregate_function_default() -> None:447df = pl.DataFrame({"a": [1, 2], "b": ["foo", "foo"], "c": ["x", "x"]})448with pytest.raises(ComputeError, match="found multiple elements in the same group"):449df.pivot(index="b", on="c", values="a")450451452def test_pivot_aggregate_function_count_deprecated() -> None:453df = pl.DataFrame(454{455"foo": ["A", "A", "B", "B", "C"],456"N": [1, 2, 2, 4, 2],457"bar": ["k", "l", "m", "n", "o"],458}459)460with pytest.deprecated_call():461df.pivot(index="foo", on="bar", values="N", aggregate_function="count") # type: ignore[arg-type]462463464def test_pivot_struct() -> None:465data = {466"id": ["a", "a", "b", "c", "c", "c"],467"week": ["1", "2", "3", "4", "3", "1"],468"num1": [1, 3, 5, 4, 3, 6],469"num2": [4, 5, 3, 4, 6, 6],470}471df = pl.DataFrame(data).with_columns(nums=pl.struct(["num1", "num2"]))472473assert df.pivot(474values="nums", index="id", on="week", aggregate_function="first"475).to_dict(as_series=False) == {476"id": ["a", "b", "c"],477"1": [478{"num1": 1, "num2": 4},479None,480{"num1": 6, "num2": 6},481],482"2": [483{"num1": 3, "num2": 5},484None,485None,486],487"3": [488None,489{"num1": 5, "num2": 3},490{"num1": 3, "num2": 6},491],492"4": [493None,494None,495{"num1": 4, "num2": 4},496],497}498499500def test_duplicate_column_names_which_should_raise_14305() -> None:501df = pl.DataFrame({"a": [1, 3, 2], "c": ["a", "a", "a"], "d": [7, 8, 9]})502with pytest.raises(DuplicateError, match="has more than one occurrence"):503df.pivot(index="a", on="c", values="d")504505506def test_multi_index_containing_struct() -> None:507df = pl.DataFrame(508{509"a": [1, 2, 1],510"b": [{"a": 1}, {"a": 1}, {"a": 2}],511"c": ["x", "y", "y"],512"d": [1, 1, 3],513}514)515result = df.pivot(index=("b", "d"), on="c", values="a")516expected = pl.DataFrame(517{"b": [{"a": 1}, {"a": 2}], "d": [1, 3], "x": [1, None], "y": [2, 1]}518)519assert_frame_equal(result, expected)520521522def test_list_pivot() -> None:523df = pl.DataFrame(524{525"a": [1, 2, 3, 1],526"b": [[1, 2], [3, 4], [5, 6], [1, 2]],527"c": ["x", "x", "y", "y"],528"d": [1, 2, 3, 4],529}530)531assert df.pivot(532index=["a", "b"],533on="c",534values="d",535).to_dict(as_series=False) == {536"a": [1, 2, 3],537"b": [[1, 2], [3, 4], [5, 6]],538"x": [1, 2, None],539"y": [4, None, 3],540}541542543def test_pivot_string_17081() -> None:544df = pl.DataFrame(545{546"a": ["1", "2", "3"],547"b": ["4", "5", "6"],548"c": ["7", "8", "9"],549}550)551assert df.pivot(index="a", on="b", values="c", aggregate_function="min").to_dict(552as_series=False553) == {554"a": ["1", "2", "3"],555"4": ["7", None, None],556"5": [None, "8", None],557"6": [None, None, "9"],558}559560561def test_pivot_invalid() -> None:562with pytest.raises(563pl.exceptions.InvalidOperationError,564match="`index` and `values` cannot both be None in `pivot` operation",565):566pl.DataFrame({"a": [1, 2], "b": [2, 3], "c": [3, 4]}).pivot("a")567568569@pytest.mark.parametrize(570"dtype",571[pl.Int8, pl.Int16, pl.Int32, pl.Int64, pl.UInt8, pl.UInt16, pl.UInt32, pl.UInt64],572)573def test_pivot_empty_index_dtypes(dtype: PolarsIntegerType) -> None:574index = pl.Series([], dtype=dtype)575df = pl.DataFrame({"index": index, "on": [], "values": []})576result = df.pivot(index="index", on="on", values="values")577expected = pl.DataFrame({"index": index})578assert_frame_equal(result, expected)579580581def test_pivot_agg_column_ref_invalid_22479() -> None:582df = pl.DataFrame(583{"a": ["x", "x", "x"], "b": [1, 1, 1], "c": [7, 8, 9], "d": [0, 2, 1]}584)585with pytest.raises(586pl.exceptions.InvalidOperationError,587match="explicit column references are not allowed in aggregate_function",588):589df.pivot(590on="a", index="b", values="c", aggregate_function=pl.element().sort_by("d")591)592593594def test_pivot_agg_null_methods_23408() -> None:595df = pl.DataFrame(596{597"idx": [0, 0, 1, 1],598"on": ["a", "b", "a", "c"],599"val": ["aa", "bb", "aa", "cc"],600}601)602out = df.pivot(603on="on",604index="idx",605values="val",606aggregate_function=pl.element().first().is_null(),607)608expected = pl.DataFrame(609{"idx": [0, 1], "a": [False, False], "b": [False, True], "c": [True, False]}610)611assert_frame_equal(out, expected)612613out = df.pivot(614on="on",615index="idx",616values="val",617aggregate_function=pl.element().first().fill_null("xx"),618)619expected = pl.DataFrame(620{"idx": [0, 1], "a": ["aa", "aa"], "b": ["bb", "xx"], "c": ["xx", "cc"]}621)622assert_frame_equal(out, expected)623624625