Path: blob/main/py-polars/tests/unit/operations/test_pivot.py
8422 views
from __future__ import annotations12from datetime import date, timedelta3from typing import TYPE_CHECKING, Any45import pytest67import polars as pl8import polars.selectors as cs9from polars.exceptions import ComputeError, DuplicateError10from polars.testing import assert_frame_equal1112if TYPE_CHECKING:13from polars._typing import PivotAgg, PolarsIntegerType, PolarsTemporalType141516def test_pivot() -> None:17df = pl.DataFrame(18{19"foo": ["A", "A", "B", "B", "C"],20"bar": ["k", "l", "m", "n", "o"],21"N": [1, 2, 2, 4, 2],22}23)24result = df.pivot("bar", values="N", aggregate_function=None)2526expected = pl.DataFrame(27[28("A", 1, 2, None, None, None),29("B", None, None, 2, 4, None),30("C", None, None, None, None, 2),31],32schema=["foo", "k", "l", "m", "n", "o"],33orient="row",34)35assert_frame_equal(result, expected)363738def test_pivot_no_values() -> None:39df = pl.DataFrame(40{41"foo": ["A", "A", "B", "B", "C"],42"bar": ["k", "l", "m", "n", "o"],43"N1": [1, 2, 2, 4, 2],44"N2": [1, 2, 2, 4, 2],45}46)47result = df.pivot(on="bar", index="foo", aggregate_function=None)48expected = pl.DataFrame(49{50"foo": ["A", "B", "C"],51"N1_k": [1, None, None],52"N1_l": [2, None, None],53"N1_m": [None, 2, None],54"N1_n": [None, 4, None],55"N1_o": [None, None, 2],56"N2_k": [1, None, None],57"N2_l": [2, None, None],58"N2_m": [None, 2, None],59"N2_n": [None, 4, None],60"N2_o": [None, None, 2],61}62)6364assert_frame_equal(result, expected)656667def test_pivot_list() -> None:68df = pl.DataFrame({"a": [1, 2, 3], "b": [[1, 1], [2, 2], [3, 3]]})6970expected = pl.DataFrame(71{72"a": [1, 2, 3],73"1": [[1, 1], None, None],74"2": [None, [2, 2], None],75"3": [None, None, [3, 3]],76}77)78out = df.pivot(79index="a",80on="a",81values="b",82aggregate_function="first",83sort_columns=True,84)85assert_frame_equal(out, expected)868788@pytest.mark.parametrize(89("agg_fn", "expected_rows"),90[91("first", [("a", 2, None, None), ("b", None, None, 10)]),92("len", [("a", 2, 0, 0), ("b", 0, 2, 1)]),93("min", [("a", 2, None, None), ("b", None, 8, 10)]),94("max", [("a", 4, None, None), ("b", None, 8, 10)]),95("sum", [("a", 6, 0, 0), ("b", 0, 8, 10)]),96("mean", [("a", 3.0, None, None), ("b", None, 8.0, 10.0)]),97("median", [("a", 3.0, None, None), ("b", None, 8.0, 10.0)]),98],99)100def test_pivot_aggregate(agg_fn: PivotAgg, expected_rows: list[tuple[Any]]) -> None:101df = pl.DataFrame(102{103"a": [1, 1, 2, 2, 3],104"b": ["a", "a", "b", "b", "b"],105"c": [2, 4, None, 8, 10],106}107)108result = df.pivot(109index="b", on="a", values="c", aggregate_function=agg_fn, sort_columns=True110)111assert result.rows() == expected_rows112113114def test_pivot_categorical_3968() -> None:115df = pl.DataFrame(116{117"foo": ["one", "one", "one", "two", "two", "two"],118"bar": ["A", "B", "C", "A", "B", "C"],119"baz": [1, 2, 3, 4, 5, 6],120}121)122123result = df.with_columns(pl.col("baz").cast(str).cast(pl.Categorical))124125expected = pl.DataFrame(126{127"foo": ["one", "one", "one", "two", "two", "two"],128"bar": ["A", "B", "C", "A", "B", "C"],129"baz": ["1", "2", "3", "4", "5", "6"],130},131schema_overrides={"baz": pl.Categorical},132)133assert_frame_equal(result, expected, categorical_as_str=True)134135136@pytest.mark.parametrize("maintain_order", [False, True])137def test_pivot_categorical_index(maintain_order: bool) -> None:138df = pl.DataFrame(139{"A": ["Fire", "Water", "Water", "Fire"], "B": ["Car", "Car", "Car", "Ship"]},140schema=[("A", pl.Categorical), ("B", pl.Categorical)],141)142143result = df.pivot(144index=["A"],145on="B",146values="B",147aggregate_function="len",148maintain_order=maintain_order,149)150expected = pl.DataFrame(151{"A": ["Fire", "Water"], "Car": [1, 2], "Ship": [1, 0]},152schema={153"A": pl.Categorical,154"Car": pl.get_index_type(),155"Ship": pl.get_index_type(),156},157)158assert_frame_equal(result, expected, check_row_order=maintain_order)159160# test expression dispatch161result = df.pivot(162index=["A"],163on="B",164values="B",165aggregate_function=pl.len(),166maintain_order=maintain_order,167)168assert_frame_equal(result, expected, check_row_order=maintain_order)169170df = pl.DataFrame(171{172"A": ["Fire", "Water", "Water", "Fire"],173"B": ["Car", "Car", "Car", "Ship"],174"C": ["Paper", "Paper", "Paper", "Paper"],175},176schema=[("A", pl.Categorical), ("B", pl.Categorical), ("C", pl.Categorical)],177)178result = df.pivot(179index=["A", "C"],180on="B",181values="B",182aggregate_function="len",183maintain_order=maintain_order,184)185expected = pl.DataFrame(186{187"A": ["Fire", "Water"],188"C": ["Paper", "Paper"],189"Car": [1, 2],190"Ship": [1, 0],191},192schema={193"A": pl.Categorical,194"C": pl.Categorical,195"Car": pl.get_index_type(),196"Ship": pl.get_index_type(),197},198)199assert_frame_equal(result, expected, check_row_order=maintain_order)200201202def test_pivot_multiple_values_column_names_5116() -> None:203df = pl.DataFrame(204{205"x1": [1, 2, 3, 4, 5, 6, 7, 8],206"x2": [8, 7, 6, 5, 4, 3, 2, 1],207"c1": ["A", "B"] * 4,208"c2": ["C", "C", "D", "D"] * 2,209}210)211212with pytest.raises(213ComputeError,214match="aggregation 'item' expected no or a single value, got 2 values",215):216df.pivot(217index="c1",218on="c2",219values=["x1", "x2"],220separator="|",221aggregate_function=None,222)223224result = df.pivot(225index="c1",226on="c2",227values=["x1", "x2"],228separator="|",229aggregate_function="first",230)231expected = {232"c1": ["A", "B"],233"x1|C": [1, 2],234"x1|D": [3, 4],235"x2|C": [8, 7],236"x2|D": [6, 5],237}238assert result.to_dict(as_series=False) == expected239240241@pytest.mark.parametrize("maintain_order", [False, True])242def test_pivot_duplicate_names_7731(maintain_order: bool) -> None:243df = pl.DataFrame(244{245"a": [1, 4],246"b": [1.5, 2.5],247"c": ["x", "x"],248"d": [7, 8],249"e": ["x", "y"],250}251)252result = df.pivot(253index=cs.float(),254on=cs.string(),255values=cs.integer(),256aggregate_function="first",257maintain_order=maintain_order,258)259expected = pl.DataFrame(260{261"b": [1.5, 2.5],262'a_{"x","x"}': [1, None],263'a_{"x","y"}': [None, 4],264'd_{"x","x"}': [7, None],265'd_{"x","y"}': [None, 8],266}267)268assert_frame_equal(result, expected, check_row_order=maintain_order)269270271def test_pivot_duplicate_names_11663() -> None:272df = pl.DataFrame({"a": [1, 2], "b": [1, 2], "c": ["x", "x"], "d": ["x", "y"]})273result = df.pivot(index="b", on=["c", "d"], values="a").to_dict(as_series=False)274expected = {"b": [1, 2], '{"x","x"}': [1, None], '{"x","y"}': [None, 2]}275assert result == expected276277278def test_pivot_multiple_columns_12407() -> None:279df = pl.DataFrame(280{281"a": ["beep", "bop"],282"b": ["a", "b"],283"c": ["s", "f"],284"d": [7, 8],285"e": ["x", "y"],286}287)288result = df.pivot(289index="b", on=["c", "e"], values=["a"], aggregate_function="len"290).to_dict(as_series=False)291expected = {"b": ["a", "b"], '{"s","x"}': [1, 0], '{"f","y"}': [0, 1]}292assert result == expected293294295def test_pivot_struct_13120() -> None:296df = pl.DataFrame(297{298"index": [1, 2, 3, 1, 2, 3],299"item_type": ["a", "a", "a", "b", "b", "b"],300"item_id": [123, 123, 123, 456, 456, 456],301"values": [4, 5, 6, 7, 8, 9],302}303)304df = df.with_columns(pl.struct(["item_type", "item_id"]).alias("columns")).drop(305"item_type", "item_id"306)307result = df.pivot(index="index", on="columns", values="values").to_dict(308as_series=False309)310expected = {"index": [1, 2, 3], '{"a",123}': [4, 5, 6], '{"b",456}': [7, 8, 9]}311assert result == expected312313314def test_pivot_index_struct_14101() -> None:315df = pl.DataFrame(316{317"a": [1, 2, 1],318"b": [{"a": 1}, {"a": 1}, {"a": 2}],319"c": ["x", "y", "y"],320"d": [1, 1, 3],321}322)323result = df.pivot(index="b", on="c", values="a")324expected = pl.DataFrame({"b": [{"a": 1}, {"a": 2}], "x": [1, None], "y": [2, 1]})325assert_frame_equal(result, expected)326327328def test_pivot_nested_struct_17065() -> None:329df = pl.DataFrame(330{331"foo": ["one", "two", "one", "two"],332"bar": ["x", "x", "y", "y"],333"baz": [334{"a": 1, "b": {"c": 2}},335{"a": 3, "b": {"c": 4}},336{"a": 5, "b": {"c": 6}},337{"a": 7, "b": {"c": 8}},338],339}340)341result = df.pivot(on="bar", index="foo", values="baz")342expected = pl.DataFrame(343{344"foo": ["one", "two"],345"x": [346{"a": 1, "b": {"c": 2}},347{"a": 3, "b": {"c": 4}},348],349"y": [350{"a": 5, "b": {"c": 6}},351{"a": 7, "b": {"c": 8}},352],353}354)355assert_frame_equal(result, expected)356357358def test_pivot_name_already_exists() -> None:359# This should be extremely rare...but still, good to check it360df = pl.DataFrame(361{362"a": ["a", "b"],363"b": ["b", "a"],364'{"a","b"}': [1, 2],365}366)367with pytest.raises(DuplicateError, match="has more than one occurrence"):368df.pivot(369["a", "b"],370index='{"a","b"}',371values="a",372aggregate_function="first",373)374375376def test_pivot_floats() -> None:377df = pl.DataFrame(378{379"article": ["a", "a", "a", "b", "b", "b"],380"weight": [1.0, 1.0, 4.4, 1.0, 8.8, 1.0],381"quantity": [1.0, 5.0, 1.0, 1.0, 1.0, 7.5],382"price": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0],383}384)385386with pytest.raises(387ComputeError,388match="aggregation 'item' expected no or a single value, got 2 values",389):390result = df.pivot(391index="weight", on="quantity", values="price", aggregate_function=None392)393394result = df.pivot(395index="weight", on="quantity", values="price", aggregate_function="first"396)397expected = {398"weight": [1.0, 4.4, 8.8],399"1.0": [1.0, 3.0, 5.0],400"5.0": [2.0, None, None],401"7.5": [6.0, None, None],402}403assert result.to_dict(as_series=False) == expected404405result = df.pivot(406index=["article", "weight"],407on="quantity",408values="price",409aggregate_function=None,410)411expected = {412"article": ["a", "a", "b", "b"],413"weight": [1.0, 4.4, 1.0, 8.8],414"1.0": [1.0, 3.0, 4.0, 5.0],415"5.0": [2.0, None, None, None],416"7.5": [None, None, 6.0, None],417}418assert result.to_dict(as_series=False) == expected419420421def test_pivot_reinterpret_5907() -> None:422df = pl.DataFrame(423{424"A": pl.Series([3, -2, 3, -2], dtype=pl.Int32),425"B": ["x", "x", "y", "y"],426"C": [100, 50, 500, -80],427}428)429430result = df.pivot(431index=["A"], on=["B"], values=["C"], aggregate_function=pl.element().sum()432)433expected = {"A": [3, -2], "x": [100, 50], "y": [500, -80]}434assert result.to_dict(as_series=False) == expected435436437@pytest.mark.parametrize(438"dtype",439[440pl.Date,441pl.Datetime("ms"),442pl.Datetime("ns"),443pl.Datetime("us"),444pl.Datetime("ms", time_zone="Asia/Shanghai"),445pl.Duration("ms"),446pl.Duration("us"),447pl.Duration("ns"),448pl.Time,449],450)451def test_pivot_temporal_logical_types(dtype: PolarsTemporalType) -> None:452idx = pl.Series([7, 8, 9, 0, 1, 2, 3, 4]).cast(dtype)453df = pl.DataFrame(454{455"idx": idx,456"foo": ["a"] * 3 + ["b"] * 5,457"value": [0] * 8,458}459)460assert df.pivot(461index="idx", on="foo", values="value", aggregate_function=None462).to_dict(as_series=False) == {463"idx": idx.to_list(),464"a": [0, 0, 0, None, None, None, None, None],465"b": [None, None, None, 0, 0, 0, 0, 0],466}467468469def test_pivot_negative_duration() -> None:470df1 = pl.DataFrame({"root": [date(2020, i, 15) for i in (1, 2)]})471df2 = pl.DataFrame({"delta": [timedelta(days=i) for i in (-2, -1, 0, 1)]})472473df = df1.join(df2, how="cross", maintain_order="left_right").with_columns(474pl.Series(name="value", values=range(len(df1) * len(df2)))475)476assert df.pivot(477index="delta", on="root", values="value", aggregate_function=None478).to_dict(as_series=False) == {479"delta": [480timedelta(days=-2),481timedelta(days=-1),482timedelta(0),483timedelta(days=1),484],485"2020-01-15": [0, 1, 2, 3],486"2020-02-15": [4, 5, 6, 7],487}488489490def test_aggregate_function_default() -> None:491df = pl.DataFrame({"a": [1, 2], "b": ["foo", "foo"], "c": ["x", "x"]})492with pytest.raises(493ComputeError,494match="aggregation 'item' expected no or a single value, got 2 values",495):496df.pivot(index="b", on="c", values="a")497498499def test_pivot_aggregate_function_count_deprecated() -> None:500df = pl.DataFrame(501{502"foo": ["A", "A", "B", "B", "C"],503"N": [1, 2, 2, 4, 2],504"bar": ["k", "l", "m", "n", "o"],505}506)507with pytest.deprecated_call():508df.pivot(index="foo", on="bar", values="N", aggregate_function="count") # type: ignore[arg-type]509510511def test_pivot_struct() -> None:512data = {513"id": ["a", "a", "b", "c", "c", "c"],514"week": ["1", "2", "3", "4", "3", "1"],515"num1": [1, 3, 5, 4, 3, 6],516"num2": [4, 5, 3, 4, 6, 6],517}518df = pl.DataFrame(data).with_columns(nums=pl.struct(["num1", "num2"]))519520assert df.pivot(521values="nums", index="id", on="week", aggregate_function="first"522).to_dict(as_series=False) == {523"id": ["a", "b", "c"],524"1": [525{"num1": 1, "num2": 4},526None,527{"num1": 6, "num2": 6},528],529"2": [530{"num1": 3, "num2": 5},531None,532None,533],534"3": [535None,536{"num1": 5, "num2": 3},537{"num1": 3, "num2": 6},538],539"4": [540None,541None,542{"num1": 4, "num2": 4},543],544}545546547def test_duplicate_column_names_which_should_raise_14305() -> None:548df = pl.DataFrame({"a": [1, 3, 2], "c": ["a", "a", "a"], "d": [7, 8, 9]})549with pytest.raises(DuplicateError, match="has more than one occurrence"):550df.pivot(index="a", on="c", values="d")551552553def test_multi_index_containing_struct() -> None:554df = pl.DataFrame(555{556"a": [1, 2, 1],557"b": [{"a": 1}, {"a": 1}, {"a": 2}],558"c": ["x", "y", "y"],559"d": [1, 1, 3],560}561)562result = df.pivot(index=("b", "d"), on="c", values="a")563expected = pl.DataFrame(564{"b": [{"a": 1}, {"a": 2}], "d": [1, 3], "x": [1, None], "y": [2, 1]}565)566assert_frame_equal(result, expected)567568569def test_list_pivot() -> None:570df = pl.DataFrame(571{572"a": [1, 2, 3, 1],573"b": [[1, 2], [3, 4], [5, 6], [1, 2]],574"c": ["x", "x", "y", "y"],575"d": [1, 2, 3, 4],576}577)578assert df.pivot(579index=["a", "b"],580on="c",581values="d",582).to_dict(as_series=False) == {583"a": [1, 2, 3],584"b": [[1, 2], [3, 4], [5, 6]],585"x": [1, 2, None],586"y": [4, None, 3],587}588589590def test_pivot_string_17081() -> None:591df = pl.DataFrame(592{593"a": ["1", "2", "3"],594"b": ["4", "5", "6"],595"c": ["7", "8", "9"],596}597)598assert df.pivot(index="a", on="b", values="c", aggregate_function="min").to_dict(599as_series=False600) == {601"a": ["1", "2", "3"],602"4": ["7", None, None],603"5": [None, "8", None],604"6": [None, None, "9"],605}606607608def test_pivot_invalid() -> None:609with pytest.raises(610pl.exceptions.InvalidOperationError,611match="`pivot` needs either `index or `values` needs to be specified",612):613pl.DataFrame({"a": [1, 2], "b": [2, 3], "c": [3, 4]}).pivot("a")614615616@pytest.mark.parametrize(617"dtype",618[pl.Int8, pl.Int16, pl.Int32, pl.Int64, pl.UInt8, pl.UInt16, pl.UInt32, pl.UInt64],619)620def test_pivot_empty_index_dtypes(dtype: PolarsIntegerType) -> None:621index = pl.Series([], dtype=dtype)622df = pl.DataFrame({"index": index, "on": [], "values": []})623result = df.pivot(index="index", on="on", values="values")624expected = pl.DataFrame({"index": index})625assert_frame_equal(result, expected)626627628def test_pivot_agg_column_ref_invalid_22479() -> None:629df = pl.DataFrame(630{"a": ["x", "x", "x"], "b": [1, 1, 1], "c": [7, 8, 9], "d": [0, 2, 1]}631)632with pytest.raises(633pl.exceptions.InvalidOperationError,634match="explicit column references are not allowed in the `aggregate_function` of `pivot`",635):636df.pivot(637on="a", index="b", values="c", aggregate_function=pl.element().sort_by("d")638)639640641def test_pivot_agg_null_methods_23408() -> None:642df = pl.DataFrame(643{644"idx": [0, 0, 1, 1],645"on": ["a", "b", "a", "c"],646"val": ["aa", "bb", "aa", "cc"],647}648)649out = df.pivot(650on="on",651index="idx",652values="val",653aggregate_function=pl.element().first().is_null(),654)655expected = pl.DataFrame(656{"idx": [0, 1], "a": [False, False], "b": [False, True], "c": [True, False]}657)658assert_frame_equal(out, expected)659660out = df.pivot(661on="on",662index="idx",663values="val",664aggregate_function=pl.element().first().fill_null("xx"),665)666expected = pl.DataFrame(667{"idx": [0, 1], "a": ["aa", "aa"], "b": ["bb", "xx"], "c": ["xx", "cc"]}668)669assert_frame_equal(out, expected)670671672def test_pivot_obj_25527() -> None:673df = pl.DataFrame(674{675"idx": [0, 0, 1, 1],676"key": ["foo", "bar", "foo", "bar"],677"value": ["obj 0 foo", "obj 0 bar", "obj 1 foo", "obj 1 bar"],678},679schema={680"idx": pl.Int64,681"key": pl.String,682"value": pl.Object,683},684)685686out = df.pivot(on="key", index="idx")687assert out["foo"].to_list() == ["obj 0 foo", "obj 1 foo"]688assert out["foo"].dtype == pl.Object689assert out["bar"].to_list() == ["obj 0 bar", "obj 1 bar"]690assert out["bar"].dtype == pl.Object691692693