Path: blob/main/py-polars/tests/unit/operations/test_explode.py
8406 views
"""Unit tests for ``explode`` on polars DataFrame, LazyFrame, Series and expressions."""

from __future__ import annotations

import pyarrow as pa
import pytest
from hypothesis import given

import polars as pl
import polars.selectors as cs
from polars.exceptions import ShapeError
from polars.testing import assert_frame_equal, assert_series_equal
from polars.testing.parametric import series


def test_explode_multiple() -> None:
    """Exploding two list columns at once works for selector, list and varargs."""
    df = pl.DataFrame({"a": [[1, 2], [3, 4]], "b": [[5, 6], [7, 8]]})

    expected = pl.DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]})
    assert_frame_equal(df.explode(cs.all()), expected)
    assert_frame_equal(df.explode(["a", "b"]), expected)
    assert_frame_equal(df.explode("a", "b"), expected)


def test_group_by_flatten_list() -> None:
    """``list.explode`` inside ``group_by().agg()`` concatenates each group's lists."""
    df = pl.DataFrame({"group": ["a", "b", "b"], "values": [[1, 2], [2, 3], [4]]})
    result = df.group_by("group", maintain_order=True).agg(
        pl.col("values").list.explode(keep_nulls=False, empty_as_null=False)
    )

    expected = pl.DataFrame({"group": ["a", "b"], "values": [[1, 2], [2, 3, 4]]})
    assert_frame_equal(result, expected)


def test_explode_empty_df_3402() -> None:
    """Exploding a zero-row ``large_list(int32)`` column yields dtype Int32."""
    df = pl.DataFrame({"a": pa.array([], type=pa.large_list(pa.int32()))})
    assert df.explode("a").dtypes == [pl.Int32]


def test_explode_empty_df_3460() -> None:
    """Exploding a single empty ``large_list(int32)`` row yields dtype Int32."""
    df = pl.DataFrame({"a": pa.array([[]], type=pa.large_list(pa.int32()))})
    assert df.explode("a").dtypes == [pl.Int32]


def test_explode_empty_df_3902() -> None:
    """Empty lists explode to one null row; the other column repeats per element."""
    df = pl.DataFrame(
        {
            "first": [1, 2, 3, 4, 5],
            "second": [["a"], [], ["b", "c"], [], ["d", "f", "g"]],
        }
    )
    expected = pl.DataFrame(
        {
            "first": [1, 2, 3, 3, 4, 5, 5, 5],
            "second": ["a", None, "b", "c", None, "d", "f", "g"],
        }
    )
    assert_frame_equal(df.explode("second"), expected)


def test_explode_empty_list_4003() -> None:
    """A leading empty list in a frame built from row dicts explodes to null."""
    df = pl.DataFrame(
        [
            {"id": 1, "nested": []},
            {"id": 2, "nested": [1]},
            {"id": 3, "nested": [2]},
        ]
    )
    assert df.explode("nested").to_dict(as_series=False) == {
        "id": [1, 2, 3],
        "nested": [None, 1, 2],
    }


def test_explode_empty_list_4107() -> None:
    """The row index carried through explode matches one recomputed afterwards."""
    df = pl.DataFrame({"b": [[1], [2], []] * 2}).with_row_index()

    # Every list here has zero or one element, so each input row maps to
    # exactly one output row and both indexes must agree.
    assert_frame_equal(
        df.explode(["b"]), df.explode(["b"]).drop("index").with_row_index()
    )


def test_explode_correct_for_slice() -> None:
    """Explode operates on the sliced rows only, not on the underlying buffer."""
    df = pl.DataFrame({"b": [[1, 1], [2, 2], [3, 3], [4, 4]]})
    assert df.slice(2, 2).explode(["b"])["b"].to_list() == [3, 3, 4, 4]

    df = (
        (
            pl.DataFrame({"group": pl.arange(0, 5, eager=True)}).join(
                pl.DataFrame(
                    {
                        "b": [[1, 2, 3], [2, 3], [4], [1, 2, 3], [0]],
                    }
                ),
                how="cross",
            )
        )
        .sort("group", maintain_order=True)
        .with_row_index()
    )
    expected = pl.DataFrame(
        {
            "index": [0, 0, 0, 1, 1, 2, 3, 3, 3, 4, 5, 5, 5, 6, 6, 7, 8, 8, 8, 9],
            "group": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
            "b": [1, 2, 3, 2, 3, 4, 1, 2, 3, 0, 1, 2, 3, 2, 3, 4, 1, 2, 3, 0],
        },
        schema_overrides={"index": pl.get_index_type()},
    )
    assert_frame_equal(df.slice(0, 10).explode(["b"]), expected)


def test_sliced_null_explode() -> None:
    """``list.explode`` on sliced Series with empty and null lists, per inner type."""
    # Integer inner type.
    s = pl.Series("", [[1], [2], [3], [4], [], [6]])
    assert s.slice(2, 4).list.explode().to_list() == [3, 4, None, 6]
    assert s.slice(2, 2).list.explode().to_list() == [3, 4]
    assert pl.Series("", [[1], [2], None, [4], [], [6]]).slice(
        2, 4
    ).list.explode().to_list() == [None, 4, None, 6]

    # String inner type.
    s = pl.Series("", [["a"], ["b"], ["c"], ["d"], [], ["e"]])
    assert s.slice(2, 4).list.explode().to_list() == ["c", "d", None, "e"]
    assert s.slice(2, 2).list.explode().to_list() == ["c", "d"]
    assert pl.Series("", [["a"], ["b"], None, ["d"], [], ["e"]]).slice(
        2, 4
    ).list.explode().to_list() == [None, "d", None, "e"]

    # Boolean inner type.
    s = pl.Series("", [[False], [False], [True], [False], [], [True]])
    assert s.slice(2, 2).list.explode().to_list() == [True, False]
    assert s.slice(2, 4).list.explode().to_list() == [True, False, None, True]


@pytest.mark.parametrize("maintain_order", [False, True])
def test_explode_in_agg_context(maintain_order: bool) -> None:
    """``list.explode`` in ``agg`` after a frame-level explode, ordered or not."""
    df = pl.DataFrame(
        {"idxs": [[0], [1], [0, 2]], "array": [[0.0, 3.5], [4.6, 0.0], [0.0, 7.8, 0.0]]}
    )

    assert_frame_equal(
        df.with_row_index()
        .explode("idxs")
        .group_by("index", maintain_order=maintain_order)
        .agg(pl.col("array").list.explode(keep_nulls=False, empty_as_null=False)),
        pl.DataFrame(
            {
                "index": [0, 1, 2],
                "array": [[0.0, 3.5], [4.6, 0.0], [0.0, 7.8, 0.0, 0.0, 7.8, 0.0]],
            },
            schema_overrides={"index": pl.get_index_type()},
        ),
        # Row order is only compared when the group_by was asked to keep it.
        check_row_order=maintain_order,
    )


def test_explode_inner_lists_3985() -> None:
    """``list.eval`` with an inner ``list.explode`` flattens aggregated lists."""
    df = pl.DataFrame(
        data={"id": [1, 1, 1], "categories": [["a"], ["b"], ["a", "c"]]}
    ).lazy()

    assert (
        df.group_by("id")
        .agg(pl.col("categories"))
        .with_columns(pl.col("categories").list.eval(pl.element().list.explode()))
    ).collect().to_dict(as_series=False) == {
        "id": [1],
        "categories": [["a", "b", "a", "c"]],
    }


def test_list_struct_explode_6905() -> None:
    """Exploding a list of structs: an empty outer list becomes a null row."""
    assert pl.DataFrame(
        {
            "group": [
                [],
                [
                    {"params": [1]},
                    {"params": []},
                ],
            ]
        },
        schema={"group": pl.List(pl.Struct([pl.Field("params", pl.List(pl.Int32))]))},
    )["group"].list.explode().to_list() == [
        None,
        {"params": [1]},
        {"params": []},
    ]


def test_explode_binary() -> None:
    """Exploding integers cast to ``List(Binary)`` yields their byte strings."""
    assert pl.Series([[1, 2], [3]]).cast(
        pl.List(pl.Binary)
    ).list.explode().to_list() == [
        b"1",
        b"2",
        b"3",
    ]


def test_explode_null_list() -> None:
    """``list.min`` on a slice holding only a null list returns ``[None]``.

    NOTE(review): despite the test name, the body calls ``list.min`` rather
    than ``explode`` -- confirm this is intentional.
    """
    assert pl.Series([["a"], None], dtype=pl.List(pl.String))[
        1:2
    ].list.min().to_list() == [None]


def test_explode_invalid_element_count() -> None:
    """Exploding columns with mismatched per-row lengths raises ``ShapeError``."""
    df = pl.DataFrame(
        {
            "col1": [["X", "Y", "Z"], ["F", "G"], ["P"]],
            "col2": [["A", "B", "C"], ["C"], ["D", "E"]],
        }
    ).with_row_index()
    with pytest.raises(
        ShapeError, match=r"exploded columns must have matching element counts"
    ):
        df.explode(["col1", "col2"])


def test_logical_explode() -> None:
    """Categorical dtype survives struct aggregation, explode and unnest."""
    out = (
        pl.DataFrame(
            {"cats": ["Value1", "Value2", "Value1"]},
            schema_overrides={"cats": pl.Categorical},
        )
        .group_by(1)
        .agg(pl.struct("cats"))
        .explode("cats")
        .unnest("cats")
    )
    assert out["cats"].dtype == pl.Categorical
    assert out["cats"].to_list() == ["Value1", "Value2", "Value1"]


def test_explode_inner_null() -> None:
    """Exploding empty ``List(Null)`` rows yields a ``Null`` column of nulls."""
    expected = pl.DataFrame({"A": [None, None]}, schema={"A": pl.Null})
    out = pl.DataFrame({"A": [[], []]}, schema={"A": pl.List(pl.Null)}).explode("A")
    assert_frame_equal(out, expected)


def test_explode_array() -> None:
    """LazyFrame explode of an Array column, chosen by name and by selector."""
    df = pl.LazyFrame(
        {"a": [[1, 2], [2, 3]], "b": [1, 2]},
        schema_overrides={"a": pl.Array(pl.Int64, 2)},
    )
    expected = pl.DataFrame({"a": [1, 2, 2, 3], "b": [1, 1, 2, 2]})
    for ex in ("a", ~cs.integer()):
        out = df.explode(ex).collect()
        assert_frame_equal(out, expected)


def test_string_list_agg_explode() -> None:
    """A list column emptied via ``list.eval`` explodes like a literal empty list."""
    df = pl.DataFrame({"a": [[None], ["b"]]})

    df = df.select(
        pl.col("a").list.eval(pl.element().filter(pl.element().is_not_null()))
    )
    # The filtered column now contains an empty list, so the flag must be unset.
    assert not df["a"].flags["FAST_EXPLODE"]

    df2 = pl.DataFrame({"a": [[], ["b"]]})

    assert_frame_equal(df, df2)
    assert_frame_equal(df.explode("a"), df2.explode("a"))


def test_explode_null_struct() -> None:
    """A null list-of-structs row explodes to a single null entry."""
    rows = [
        {"col1": None},
        {
            "col1": [
                {"field1": None, "field2": None, "field3": None},
                {"field1": None, "field2": "some", "field3": "value"},
            ]
        },
    ]

    assert pl.DataFrame(rows).explode("col1").to_dict(as_series=False) == {
        "col1": [
            None,
            {"field1": None, "field2": None, "field3": None},
            {"field1": None, "field2": "some", "field3": "value"},
        ]
    }


def test_df_explode_with_array() -> None:
    """Explode by an Array column, by a List column, and by both together."""
    df = pl.DataFrame(
        {
            "arr": [["a", "b"], ["c", None], None, ["d", "e"]],
            "list": [[1, 2], [3], [4, None], None],
            "val": ["x", "y", "z", "q"],
        },
        schema={
            "arr": pl.Array(pl.String, 2),
            "list": pl.List(pl.Int64),
            "val": pl.String,
        },
    )

    expected_by_arr = pl.DataFrame(
        {
            "arr": ["a", "b", "c", None, None, "d", "e"],
            "list": [[1, 2], [1, 2], [3], [3], [4, None], None, None],
            "val": ["x", "x", "y", "y", "z", "q", "q"],
        }
    )
    assert_frame_equal(df.explode("arr"), expected_by_arr)

    expected_by_list = pl.DataFrame(
        {
            "arr": [["a", "b"], ["a", "b"], ["c", None], None, None, ["d", "e"]],
            "list": [1, 2, 3, 4, None, None],
            "val": ["x", "x", "y", "z", "z", "q"],
        },
        schema={
            "arr": pl.Array(pl.String, 2),
            "list": pl.Int64,
            "val": pl.String,
        },
    )
    assert_frame_equal(df.explode("list"), expected_by_list)

    # Second frame: both nested columns have matching per-row lengths, so
    # they can be exploded in a single call.
    df = pl.DataFrame(
        {
            "arr": [["a", "b"], ["c", None], None, ["d", "e"]],
            "list": [[1, 2], [3, 4], None, [5, None]],
            "val": [None, 1, 2, None],
        },
        schema={
            "arr": pl.Array(pl.String, 2),
            "list": pl.List(pl.Int64),
            "val": pl.Int64,
        },
    )
    expected_by_arr_and_list = pl.DataFrame(
        {
            "arr": ["a", "b", "c", None, None, "d", "e"],
            "list": [1, 2, 3, 4, None, 5, None],
            "val": [None, None, 1, 1, 2, None, None],
        },
        schema={
            "arr": pl.String,
            "list": pl.Int64,
            "val": pl.Int64,
        },
    )
    assert_frame_equal(df.explode("arr", "list"), expected_by_arr_and_list)


def test_explode_nullable_list() -> None:
    """Null list rows explode to null, via ``DataFrame.explode`` and ``Expr.explode``."""
    df = pl.DataFrame({"layout1": [None, [1, 2]], "b": [False, True]}).with_columns(
        layout2=pl.when(pl.col("b")).then([1, 2]),
    )

    explode_df = df.explode("layout1", "layout2")
    expected_df = pl.DataFrame(
        {
            "layout1": [None, 1, 2],
            "b": [False, True, True],
            "layout2": [None, 1, 2],
        }
    )
    assert_frame_equal(explode_df, expected_df)

    explode_expr = df.select(
        pl.col("layout1").explode(),
        pl.col("layout2").explode(),
    )
    expected_df = pl.DataFrame(
        {
            "layout1": [None, 1, 2],
            "layout2": [None, 1, 2],
        }
    )
    assert_frame_equal(explode_expr, expected_df)


def test_group_by_flatten_string() -> None:
    """``str.split("").explode()`` in ``agg`` gives each group's characters."""
    df = pl.DataFrame({"group": ["a", "b", "b"], "values": ["foo", "bar", "baz"]})

    result = df.group_by("group", maintain_order=True).agg(
        pl.col("values").str.split("").explode()
    )

    expected = pl.DataFrame(
        {
            "group": ["a", "b"],
            "values": [["f", "o", "o"], ["b", "a", "r", "b", "a", "z"]],
        }
    )
    assert_frame_equal(result, expected)


def test_fast_explode_merge_right_16923() -> None:
    """Explode after concat with a null-list frame on the right gives 4 rows."""
    df = pl.concat(
        [
            pl.DataFrame({"foo": [["a", "b"], ["c"]]}),
            pl.DataFrame({"foo": [None]}, schema={"foo": pl.List(pl.Utf8)}),
        ],
        how="diagonal",
        rechunk=True,
    ).explode("foo")

    assert df.height == 4


def test_fast_explode_merge_left_16923() -> None:
    """Explode after concat with a null-list frame on the left gives 4 rows."""
    df = pl.concat(
        [
            pl.DataFrame({"foo": [None]}, schema={"foo": pl.List(pl.Utf8)}),
            pl.DataFrame({"foo": [["a", "b"], ["c"]]}),
        ],
        how="diagonal",
        rechunk=True,
    ).explode("foo")

    assert df.height == 4


@pytest.mark.parametrize(
    ("values", "exploded"),
    [
        (["foobar", None], ["f", "o", "o", "b", "a", "r", None]),
        ([None, "foo", "bar"], [None, "f", "o", "o", "b", "a", "r"]),
        (
            [None, "foo", "bar", None, "ham"],
            [None, "f", "o", "o", "b", "a", "r", None, "h", "a", "m"],
        ),
        (["foo", "bar", "ham"], ["f", "o", "o", "b", "a", "r", "h", "a", "m"]),
        (["", None, "foo", "bar"], ["", None, "f", "o", "o", "b", "a", "r"]),
        (["", "foo", "bar"], ["", "f", "o", "o", "b", "a", "r"]),
    ],
)
def test_series_str_explode_deprecated(
    values: list[str | None], exploded: list[str | None]
) -> None:
    """``Series.str.explode`` warns as deprecated but still splits into characters."""
    with pytest.deprecated_call():
        result = pl.Series(values).str.explode()
    assert result.to_list() == exploded


def test_expr_str_explode_deprecated() -> None:
    """``Expr.str.explode`` warns as deprecated but still splits into characters."""
    s = pl.Series("a", ["Hello", "World"])
    with pytest.deprecated_call():
        result = s.to_frame().select(pl.col("a").str.explode()).to_series()

    expected = pl.Series("a", ["H", "e", "l", "l", "o", "W", "o", "r", "l", "d"])
    assert_series_equal(result, expected)


def test_undefined_col_15852() -> None:
    """Exploding a missing column in a lazy query raises ``ColumnNotFoundError``."""
    lf = pl.LazyFrame({"foo": [1]})

    with pytest.raises(pl.exceptions.ColumnNotFoundError):
        lf.explode("bar").join(lf, on="foo").collect()


def test_explode_17648() -> None:
    """Exploding a sliced list column together with a derived ``int_ranges`` column."""
    df = pl.DataFrame({"a": [[1, 3], [2, 6, 7], [3, 9, 2], [4], [5, 1, 2, 3, 4]]})
    assert (
        df.slice(1, 2)
        .with_columns(pl.int_ranges(pl.col("a").list.len()).alias("count"))
        .explode("a", "count")
    ).to_dict(as_series=False) == {"a": [2, 6, 7, 3, 9, 2], "count": [0, 1, 2, 0, 1, 2]}


def test_explode_struct_nulls() -> None:
    """A null struct element and an empty list both explode to ``None``."""
    df = pl.DataFrame({"A": [[{"B": 1}], [None], []]})
    assert df.explode("A").to_dict(as_series=False) == {"A": [{"B": 1}, None, None]}


def test_explode_basic() -> None:
    """``Series.explode`` defaults plus the ``empty_as_null`` / ``keep_nulls`` flags."""
    s = pl.Series

    # Default behaviour: null lists and empty lists each produce one null.
    assert_series_equal(s([[1, 2, 3]]).explode(), pl.Series([1, 2, 3]))
    assert_series_equal(s([[1, 2, 3], None]).explode(), pl.Series([1, 2, 3, None]))
    assert_series_equal(s([[1, 2, 3], []]).explode(), pl.Series([1, 2, 3, None]))

    # A row nulled out by `when/then` behaves like a null list, whether the
    # value underneath the mask is non-empty or empty.
    masked = (
        s([[1, 2, 3], [1, 2], [1, 2]])
        .to_frame()
        .select(pl.when(pl.Series([True, False, True])).then(pl.col("")))
        .to_series()
    )
    assert_series_equal(masked.explode(), pl.Series([1, 2, 3, None, 1, 2]))
    masked = (
        s([[1, 2, 3], [], [1, 2]])
        .to_frame()
        .select(pl.when(pl.Series([True, False, True])).then(pl.col("")))
        .to_series()
    )
    assert_series_equal(masked.explode(), pl.Series([1, 2, 3, None, 1, 2]))

    assert_series_equal(
        s([[1, 2, 3]]).explode(empty_as_null=False, keep_nulls=False),
        pl.Series([1, 2, 3]),
    )

    # keep_nulls=False drops null lists but keeps null *elements*.
    assert_series_equal(s([[1, 2, 3], None]).explode(), pl.Series([1, 2, 3, None]))
    assert_series_equal(
        s([[1, 2, 3], None]).explode(keep_nulls=False), pl.Series([1, 2, 3])
    )
    assert_series_equal(
        s([[1, 2, 3], [None]]).explode(keep_nulls=False), pl.Series([1, 2, 3, None])
    )

    # empty_as_null=False drops empty lists but keeps null *elements*.
    assert_series_equal(s([[1, 2, 3], []]).explode(), pl.Series([1, 2, 3, None]))
    assert_series_equal(
        s([[1, 2, 3], []]).explode(empty_as_null=False), pl.Series([1, 2, 3])
    )
    assert_series_equal(
        s([[1, 2, 3], [None]]).explode(empty_as_null=False), pl.Series([1, 2, 3, None])
    )


@given(s=series(min_size=1))
@pytest.mark.parametrize("empty_as_null", [False, True])
@pytest.mark.parametrize("keep_nulls", [False, True])
def test_explode_parametric(
    s: pl.Series, empty_as_null: bool, keep_nulls: bool
) -> None:
    """Property test: ``implode`` then ``explode`` round-trips for generated Series.

    Checks the round trip alone and when concatenated with empty-list and
    null-list rows in various positions, for every flag combination.
    """
    a = {"empty_as_null": empty_as_null, "keep_nulls": keep_nulls}
    si = s.implode()

    # What a single empty / null list row is expected to explode into under
    # the current flags: `s.clear(1)` when kept, `s.clear()` when dropped.
    empty_list_item = s.clear(1) if empty_as_null else s.clear()
    null_list_item = s.clear(1) if keep_nulls else s.clear()

    assert_series_equal(si.explode(**a), s)
    assert_series_equal(s.clear().implode().explode(**a), empty_list_item)
    assert_series_equal(si.clear(1).explode(**a), null_list_item)

    assert_series_equal(
        pl.concat([si, s.clear().implode(), si]).explode(**a),
        pl.concat([s, empty_list_item, s]),
    )
    assert_series_equal(
        pl.concat([si, si.clear(1), si]).explode(**a), pl.concat([s, null_list_item, s])
    )

    # Rows nulled via a `when/then` mask must behave like null list rows.
    for mask in [
        (False, False, False),
        (True, False, True),
        (False, False, True),
        (True, False, False),
        (False, True, False),
    ]:
        masked = (
            pl.concat([si, si, si])
            .to_frame()
            .select(pl.when(pl.Series(mask)).then(pl.col(s.name)).alias(s.name))
            .to_series()
        )
        assert_series_equal(
            masked.explode(**a), pl.concat([s if m else null_list_item for m in mask])
        )

    # Repeat with several row counts, placing the special row first or last.
    for size in [2, 3, 7, 15]:
        assert_series_equal(pl.concat([si] * size).explode(**a), pl.concat([s] * size))

        assert_series_equal(
            pl.concat([s.clear().implode()] + [si] * size).explode(**a),
            pl.concat([empty_list_item] + [s] * size),
        )
        assert_series_equal(
            pl.concat([si] * size + [s.clear().implode()]).explode(**a),
            pl.concat([s] * size + [empty_list_item]),
        )

        assert_series_equal(
            pl.concat([si.clear(1)] + [si] * size).explode(**a),
            pl.concat([null_list_item] + [s] * size),
        )
        assert_series_equal(
            pl.concat([si] * size + [si.clear(1)]).explode(**a),
            pl.concat([s] * size + [null_list_item]),
        )


def test_explode_array_parameters() -> None:
    """``empty_as_null`` / ``keep_nulls`` on Array Series, including width 0."""
    s = pl.Series("a", [[1, 2, 3], [4, 5, 6], [7, 8, 9]], pl.Array(pl.Int64, 3))
    assert_series_equal(s.explode(), pl.Series("a", list(range(1, 10)), pl.Int64))

    s = pl.Series("a", [[1, 2, 3], [4, 5, 6], None], pl.Array(pl.Int64, 3))
    assert_series_equal(
        s.explode(), pl.Series("a", list(range(1, 7)) + [None], pl.Int64)
    )
    assert_series_equal(
        s.explode(keep_nulls=False), pl.Series("a", list(range(1, 7)), pl.Int64)
    )

    # Zero-width arrays: two empty rows and one null row.
    s = pl.Series("a", [[], [], None], pl.Array(pl.Int64, 0))
    assert_series_equal(s.explode(), pl.Series("a", [None] * 3, pl.Int64))
    assert_series_equal(
        s.explode(keep_nulls=False), pl.Series("a", [None] * 2, pl.Int64)
    )
    assert_series_equal(
        s.explode(empty_as_null=False), pl.Series("a", [None], pl.Int64)
    )
    assert_series_equal(
        s.explode(empty_as_null=False, keep_nulls=False), pl.Series("a", [], pl.Int64)
    )


def test_explode_params() -> None:
    """``DataFrame.explode`` with each ``empty_as_null`` / ``keep_nulls`` combination."""
    df = pl.DataFrame({"a": [[1, 2, 3], None, [4, 5, 6], []], "b": [1, 2, 3, 4]})

    assert_frame_equal(
        df.explode("a"),
        pl.DataFrame(
            {"a": [1, 2, 3, None, 4, 5, 6, None], "b": [1, 1, 1, 2, 3, 3, 3, 4]}
        ),
    )
    assert_frame_equal(
        df.explode("a", empty_as_null=False),
        pl.DataFrame({"a": [1, 2, 3, None, 4, 5, 6], "b": [1, 1, 1, 2, 3, 3, 3]}),
    )
    assert_frame_equal(
        df.explode("a", keep_nulls=False),
        pl.DataFrame({"a": [1, 2, 3, 4, 5, 6, None], "b": [1, 1, 1, 3, 3, 3, 4]}),
    )
    assert_frame_equal(
        df.explode("a", empty_as_null=False, keep_nulls=False),
        pl.DataFrame({"a": [1, 2, 3, 4, 5, 6], "b": [1, 1, 1, 3, 3, 3]}),
    )