# Path: blob/main/py-polars/tests/unit/operations/test_explode.py
# 6939 views
"""Unit tests for explode operations on polars frames and Series."""

from __future__ import annotations

import pyarrow as pa
import pytest

import polars as pl
import polars.selectors as cs
from polars.exceptions import ShapeError
from polars.testing import assert_frame_equal, assert_series_equal


def test_explode_multiple() -> None:
    """Explode two list columns together via a selector, a list, and varargs."""
    df = pl.DataFrame({"a": [[1, 2], [3, 4]], "b": [[5, 6], [7, 8]]})

    expected = pl.DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]})
    assert_frame_equal(df.explode(cs.all()), expected)
    assert_frame_equal(df.explode(["a", "b"]), expected)
    assert_frame_equal(df.explode("a", "b"), expected)


def test_group_by_flatten_list() -> None:
    """Flatten list values per group inside a ``group_by`` aggregation."""
    df = pl.DataFrame({"group": ["a", "b", "b"], "values": [[1, 2], [2, 3], [4]]})
    result = df.group_by("group", maintain_order=True).agg(pl.col("values").flatten())

    expected = pl.DataFrame({"group": ["a", "b"], "values": [[1, 2], [2, 3, 4]]})
    assert_frame_equal(result, expected)


def test_explode_empty_df_3402() -> None:
    """Explode a zero-row large-list column and check the dtype is Int32."""
    df = pl.DataFrame({"a": pa.array([], type=pa.large_list(pa.int32()))})
    assert df.explode("a").dtypes == [pl.Int32]


def test_explode_empty_df_3460() -> None:
    """Explode a column holding one empty list and check the dtype is Int32."""
    df = pl.DataFrame({"a": pa.array([[]], type=pa.large_list(pa.int32()))})
    assert df.explode("a").dtypes == [pl.Int32]


def test_explode_empty_df_3902() -> None:
    """Expect empty lists to explode into a single null row."""
    df = pl.DataFrame(
        {
            "first": [1, 2, 3, 4, 5],
            "second": [["a"], [], ["b", "c"], [], ["d", "f", "g"]],
        }
    )
    expected = pl.DataFrame(
        {
            "first": [1, 2, 3, 3, 4, 5, 5, 5],
            "second": ["a", None, "b", "c", None, "d", "f", "g"],
        }
    )
    assert_frame_equal(df.explode("second"), expected)


def test_explode_empty_list_4003() -> None:
    """Expect a leading empty list to explode into a null entry."""
    df = pl.DataFrame(
        [
            {"id": 1, "nested": []},
            {"id": 2, "nested": [1]},
            {"id": 3, "nested": [2]},
        ]
    )
    assert df.explode("nested").to_dict(as_series=False) == {
        "id": [1, 2, 3],
        "nested": [None, 1, 2],
    }


def test_explode_empty_list_4107() -> None:
    """Check that the exploded row index equals a freshly generated row index."""
    df = pl.DataFrame({"b": [[1], [2], []] * 2}).with_row_index()

    assert_frame_equal(
        df.explode(["b"]), df.explode(["b"]).drop("index").with_row_index()
    )


def test_explode_correct_for_slice() -> None:
    """Explode sliced frames and check only the sliced rows are expanded."""
    df = pl.DataFrame({"b": [[1, 1], [2, 2], [3, 3], [4, 4]]})
    assert df.slice(2, 2).explode(["b"])["b"].to_list() == [3, 3, 4, 4]

    df = (
        (
            pl.DataFrame({"group": pl.arange(0, 5, eager=True)}).join(
                pl.DataFrame(
                    {
                        "b": [[1, 2, 3], [2, 3], [4], [1, 2, 3], [0]],
                    }
                ),
                how="cross",
            )
        )
        .sort("group", maintain_order=True)
        .with_row_index()
    )
    expected = pl.DataFrame(
        {
            "index": [0, 0, 0, 1, 1, 2, 3, 3, 3, 4, 5, 5, 5, 6, 6, 7, 8, 8, 8, 9],
            "group": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
            "b": [1, 2, 3, 2, 3, 4, 1, 2, 3, 0, 1, 2, 3, 2, 3, 4, 1, 2, 3, 0],
        },
        schema_overrides={"index": pl.UInt32},
    )
    assert_frame_equal(df.slice(0, 10).explode(["b"]), expected)


def test_sliced_null_explode() -> None:
    """Explode sliced list Series containing empty and null entries."""
    s = pl.Series("", [[1], [2], [3], [4], [], [6]])
    assert s.slice(2, 4).list.explode().to_list() == [3, 4, None, 6]
    assert s.slice(2, 2).list.explode().to_list() == [3, 4]
    assert pl.Series("", [[1], [2], None, [4], [], [6]]).slice(
        2, 4
    ).list.explode().to_list() == [None, 4, None, 6]

    s = pl.Series("", [["a"], ["b"], ["c"], ["d"], [], ["e"]])
    assert s.slice(2, 4).list.explode().to_list() == ["c", "d", None, "e"]
    assert s.slice(2, 2).list.explode().to_list() == ["c", "d"]
    assert pl.Series("", [["a"], ["b"], None, ["d"], [], ["e"]]).slice(
        2, 4
    ).list.explode().to_list() == [None, "d", None, "e"]

    s = pl.Series("", [[False], [False], [True], [False], [], [True]])
    assert s.slice(2, 2).list.explode().to_list() == [True, False]
    assert s.slice(2, 4).list.explode().to_list() == [True, False, None, True]


@pytest.mark.parametrize("maintain_order", [False, True])
def test_explode_in_agg_context(maintain_order: bool) -> None:
    """Flatten a list column per group after exploding a sibling column."""
    df = pl.DataFrame(
        {"idxs": [[0], [1], [0, 2]], "array": [[0.0, 3.5], [4.6, 0.0], [0.0, 7.8, 0.0]]}
    )

    assert_frame_equal(
        df.with_row_index()
        .explode("idxs")
        .group_by("index", maintain_order=maintain_order)
        .agg(pl.col("array").flatten()),
        pl.DataFrame(
            {
                "index": [0, 1, 2],
                "array": [[0.0, 3.5], [4.6, 0.0], [0.0, 7.8, 0.0, 0.0, 7.8, 0.0]],
            },
            schema_overrides={"index": pl.get_index_type()},
        ),
        # Row order is only compared when the group_by was asked to keep it.
        check_row_order=maintain_order,
    )


def test_explode_inner_lists_3985() -> None:
    """Explode nested lists through ``list.eval`` on a lazy aggregation."""
    df = pl.DataFrame(
        data={"id": [1, 1, 1], "categories": [["a"], ["b"], ["a", "c"]]}
    ).lazy()

    assert (
        df.group_by("id")
        .agg(pl.col("categories"))
        .with_columns(pl.col("categories").list.eval(pl.element().list.explode()))
    ).collect().to_dict(as_series=False) == {
        "id": [1],
        "categories": [["a", "b", "a", "c"]],
    }


def test_list_struct_explode_6905() -> None:
    """Explode a list-of-struct column whose first row is an empty list."""
    assert pl.DataFrame(
        {
            "group": [
                [],
                [
                    {"params": [1]},
                    {"params": []},
                ],
            ]
        },
        schema={"group": pl.List(pl.Struct([pl.Field("params", pl.List(pl.Int32))]))},
    )["group"].list.explode().to_list() == [
        None,
        {"params": [1]},
        {"params": []},
    ]


def test_explode_binary() -> None:
    """Explode integer lists cast to ``List(Binary)`` into byte strings."""
    assert pl.Series([[1, 2], [3]]).cast(
        pl.List(pl.Binary)
    ).list.explode().to_list() == [
        b"1",
        b"2",
        b"3",
    ]


def test_explode_null_list() -> None:
    """Take ``list.min`` of a sliced Series whose only remaining entry is null."""
    # NOTE(review): despite the test name, this calls ``list.min``, not explode.
    assert pl.Series([["a"], None], dtype=pl.List(pl.String))[
        1:2
    ].list.min().to_list() == [None]


def test_explode_invalid_element_count() -> None:
    """Expect ``ShapeError`` when exploded columns differ in element counts."""
    df = pl.DataFrame(
        {
            "col1": [["X", "Y", "Z"], ["F", "G"], ["P"]],
            "col2": [["A", "B", "C"], ["C"], ["D", "E"]],
        }
    ).with_row_index()
    with pytest.raises(
        ShapeError, match=r"exploded columns must have matching element counts"
    ):
        df.explode(["col1", "col2"])


def test_logical_explode() -> None:
    """Check the Categorical dtype survives aggregate, explode and unnest."""
    out = (
        pl.DataFrame(
            {"cats": ["Value1", "Value2", "Value1"]},
            schema_overrides={"cats": pl.Categorical},
        )
        .group_by(1)
        .agg(pl.struct("cats"))
        .explode("cats")
        .unnest("cats")
    )
    assert out["cats"].dtype == pl.Categorical
    assert out["cats"].to_list() == ["Value1", "Value2", "Value1"]


def test_explode_inner_null() -> None:
    """Explode empty ``List(Null)`` rows into a ``Null`` column of nulls."""
    expected = pl.DataFrame({"A": [None, None]}, schema={"A": pl.Null})
    out = pl.DataFrame({"A": [[], []]}, schema={"A": pl.List(pl.Null)}).explode("A")
    assert_frame_equal(out, expected)


def test_explode_array() -> None:
    """Explode an ``Array`` column of a LazyFrame by name and by selector."""
    df = pl.LazyFrame(
        {"a": [[1, 2], [2, 3]], "b": [1, 2]},
        schema_overrides={"a": pl.Array(pl.Int64, 2)},
    )
    expected = pl.DataFrame({"a": [1, 2, 2, 3], "b": [1, 1, 2, 2]})
    for ex in ("a", ~cs.integer()):
        out = df.explode(ex).collect()
        assert_frame_equal(out, expected)


def test_string_list_agg_explode() -> None:
    """Explode a ``list.eval`` result that lacks the FAST_EXPLODE flag."""
    df = pl.DataFrame({"a": [[None], ["b"]]})

    df = df.select(
        pl.col("a").list.eval(pl.element().filter(pl.element().is_not_null()))
    )
    assert not df["a"].flags["FAST_EXPLODE"]

    df2 = pl.DataFrame({"a": [[], ["b"]]})

    assert_frame_equal(df, df2)
    assert_frame_equal(df.explode("a"), df2.explode("a"))


def test_explode_null_struct() -> None:
    """Explode a list-of-struct column whose first row is null."""
    df = [
        {"col1": None},
        {
            "col1": [
                {"field1": None, "field2": None, "field3": None},
                {"field1": None, "field2": "some", "field3": "value"},
            ]
        },
    ]

    assert pl.DataFrame(df).explode("col1").to_dict(as_series=False) == {
        "col1": [
            None,
            {"field1": None, "field2": None, "field3": None},
            {"field1": None, "field2": "some", "field3": "value"},
        ]
    }


def test_df_explode_with_array() -> None:
    """Explode frames mixing ``Array`` and ``List`` columns, singly and together."""
    df = pl.DataFrame(
        {
            "arr": [["a", "b"], ["c", None], None, ["d", "e"]],
            "list": [[1, 2], [3], [4, None], None],
            "val": ["x", "y", "z", "q"],
        },
        schema={
            "arr": pl.Array(pl.String, 2),
            "list": pl.List(pl.Int64),
            "val": pl.String,
        },
    )

    expected_by_arr = pl.DataFrame(
        {
            "arr": ["a", "b", "c", None, None, "d", "e"],
            "list": [[1, 2], [1, 2], [3], [3], [4, None], None, None],
            "val": ["x", "x", "y", "y", "z", "q", "q"],
        }
    )
    assert_frame_equal(df.explode("arr"), expected_by_arr)

    expected_by_list = pl.DataFrame(
        {
            "arr": [["a", "b"], ["a", "b"], ["c", None], None, None, ["d", "e"]],
            "list": [1, 2, 3, 4, None, None],
            "val": ["x", "x", "y", "z", "z", "q"],
        },
        schema={
            "arr": pl.Array(pl.String, 2),
            "list": pl.Int64,
            "val": pl.String,
        },
    )
    assert_frame_equal(df.explode("list"), expected_by_list)

    df = pl.DataFrame(
        {
            "arr": [["a", "b"], ["c", None], None, ["d", "e"]],
            "list": [[1, 2], [3, 4], None, [5, None]],
            "val": [None, 1, 2, None],
        },
        schema={
            "arr": pl.Array(pl.String, 2),
            "list": pl.List(pl.Int64),
            "val": pl.Int64,
        },
    )
    expected_by_arr_and_list = pl.DataFrame(
        {
            "arr": ["a", "b", "c", None, None, "d", "e"],
            "list": [1, 2, 3, 4, None, 5, None],
            "val": [None, None, 1, 1, 2, None, None],
        },
        schema={
            "arr": pl.String,
            "list": pl.Int64,
            "val": pl.Int64,
        },
    )
    assert_frame_equal(df.explode("arr", "list"), expected_by_arr_and_list)


def test_explode_nullable_list() -> None:
    """Explode null list rows via both the frame and expression APIs."""
    df = pl.DataFrame({"layout1": [None, [1, 2]], "b": [False, True]}).with_columns(
        layout2=pl.when(pl.col("b")).then([1, 2]),
    )

    explode_df = df.explode("layout1", "layout2")
    expected_df = pl.DataFrame(
        {
            "layout1": [None, 1, 2],
            "b": [False, True, True],
            "layout2": [None, 1, 2],
        }
    )
    assert_frame_equal(explode_df, expected_df)

    explode_expr = df.select(
        pl.col("layout1").explode(),
        pl.col("layout2").explode(),
    )
    expected_df = pl.DataFrame(
        {
            "layout1": [None, 1, 2],
            "layout2": [None, 1, 2],
        }
    )
    assert_frame_equal(explode_expr, expected_df)


def test_group_by_flatten_string() -> None:
    """Split strings into characters and explode them inside an aggregation."""
    df = pl.DataFrame({"group": ["a", "b", "b"], "values": ["foo", "bar", "baz"]})

    result = df.group_by("group", maintain_order=True).agg(
        pl.col("values").str.split("").explode()
    )

    expected = pl.DataFrame(
        {
            "group": ["a", "b"],
            "values": [["f", "o", "o"], ["b", "a", "r", "b", "a", "z"]],
        }
    )
    assert_frame_equal(result, expected)


def test_fast_explode_merge_right_16923() -> None:
    """Explode a rechunked concat whose trailing frame holds a null list."""
    df = pl.concat(
        [
            pl.DataFrame({"foo": [["a", "b"], ["c"]]}),
            pl.DataFrame({"foo": [None]}, schema={"foo": pl.List(pl.Utf8)}),
        ],
        how="diagonal",
        rechunk=True,
    ).explode("foo")

    assert df.height == 4


def test_fast_explode_merge_left_16923() -> None:
    """Explode a rechunked concat whose leading frame holds a null list."""
    df = pl.concat(
        [
            pl.DataFrame({"foo": [None]}, schema={"foo": pl.List(pl.Utf8)}),
            pl.DataFrame({"foo": [["a", "b"], ["c"]]}),
        ],
        how="diagonal",
        rechunk=True,
    ).explode("foo")

    assert df.height == 4


@pytest.mark.parametrize(
    ("values", "exploded"),
    [
        (["foobar", None], ["f", "o", "o", "b", "a", "r", None]),
        ([None, "foo", "bar"], [None, "f", "o", "o", "b", "a", "r"]),
        (
            [None, "foo", "bar", None, "ham"],
            [None, "f", "o", "o", "b", "a", "r", None, "h", "a", "m"],
        ),
        (["foo", "bar", "ham"], ["f", "o", "o", "b", "a", "r", "h", "a", "m"]),
        (["", None, "foo", "bar"], ["", None, "f", "o", "o", "b", "a", "r"]),
        (["", "foo", "bar"], ["", "f", "o", "o", "b", "a", "r"]),
    ],
)
def test_series_str_explode_deprecated(
    values: list[str | None], exploded: list[str | None]
) -> None:
    """Expect ``Series.str.explode`` to emit a deprecation warning."""
    with pytest.deprecated_call():
        result = pl.Series(values).str.explode()
    assert result.to_list() == exploded


def test_expr_str_explode_deprecated() -> None:
    """Expect ``Expr.str.explode`` to emit a deprecation warning."""
    df = pl.Series("a", ["Hello", "World"])
    with pytest.deprecated_call():
        result = df.to_frame().select(pl.col("a").str.explode()).to_series()

    expected = pl.Series("a", ["H", "e", "l", "l", "o", "W", "o", "r", "l", "d"])
    assert_series_equal(result, expected)


def test_undefined_col_15852() -> None:
    """Expect ``ColumnNotFoundError`` when exploding a missing column lazily."""
    lf = pl.LazyFrame({"foo": [1]})

    with pytest.raises(pl.exceptions.ColumnNotFoundError):
        lf.explode("bar").join(lf, on="foo").collect()


def test_explode_17648() -> None:
    """Explode two columns of a sliced frame together."""
    df = pl.DataFrame({"a": [[1, 3], [2, 6, 7], [3, 9, 2], [4], [5, 1, 2, 3, 4]]})
    assert (
        df.slice(1, 2)
        .with_columns(pl.int_ranges(pl.col("a").list.len()).alias("count"))
        .explode("a", "count")
    ).to_dict(as_series=False) == {"a": [2, 6, 7, 3, 9, 2], "count": [0, 1, 2, 0, 1, 2]}


def test_explode_struct_nulls() -> None:
    """Explode struct lists containing a null element and an empty list."""
    df = pl.DataFrame({"A": [[{"B": 1}], [None], []]})
    assert df.explode("A").to_dict(as_series=False) == {"A": [{"B": 1}, None, None]}