Path: blob/main/py-polars/tests/unit/functions/test_concat.py
8421 views
import io
from typing import IO

import pytest

import polars as pl
from polars.testing import assert_frame_equal


@pytest.mark.may_fail_cloud  # reason: @serialize-stack-overflow
@pytest.mark.slow
def test_concat_expressions_stack_overflow() -> None:
    """Concatenate 10_000 literal expressions and check the resulting shape."""
    n = 10000
    e = pl.concat([pl.lit(x) for x in range(n)])

    df = pl.select(e)
    assert df.shape == (n, 1)


@pytest.mark.may_fail_cloud  # reason: @serialize-stack-overflow
@pytest.mark.slow
def test_concat_lf_stack_overflow() -> None:
    """Build a chain of `n` nested lazy concats and collect it."""
    n = 1000
    bar = pl.DataFrame({"a": 0}).lazy()

    for i in range(n):
        bar = pl.concat([bar, pl.DataFrame({"a": i}).lazy()])
    # One seed row plus one row appended per iteration.
    assert bar.collect().shape == (n + 1, 1)


def test_concat_horizontally_strict() -> None:
    """Horizontal concat raises on height mismatch when strict, pads otherwise."""
    df1 = pl.DataFrame({"c": [11], "d": [42]})  # 1 vs N (may broadcast)
    df2 = pl.DataFrame({"c": [11, 12], "d": [42, 24]})  # 2 vs N
    df3 = pl.DataFrame({"a": [0, 1, 2], "b": [1, 2, 3]})
    with pytest.raises(pl.exceptions.ShapeError):
        pl.concat([df1, df3], how="horizontal", strict=True)

    with pytest.raises(pl.exceptions.ShapeError):
        pl.concat([df2, df3], how="horizontal", strict=True)

    with pytest.raises(pl.exceptions.ShapeError):
        pl.concat([df1.lazy(), df3.lazy()], how="horizontal", strict=True).collect()

    with pytest.raises(pl.exceptions.ShapeError):
        pl.concat([df2.lazy(), df3.lazy()], how="horizontal", strict=True).collect()

    out = pl.concat([df1, df3], how="horizontal", strict=False)
    assert out.to_dict(as_series=False) == {
        "a": [0, 1, 2],
        "b": [1, 2, 3],
        "c": [11, None, None],
        "d": [42, None, None],
    }

    out = pl.concat([df2, df3], how="horizontal", strict=False)
    assert out.to_dict(as_series=False) == {
        "a": [0, 1, 2],
        "b": [1, 2, 3],
        "c": [11, 12, None],
        "d": [42, 24, None],
    }


def test_concat_vertically_relaxed() -> None:
    """`vertical_relaxed` yields the same supertype schema in either input order."""
    a = pl.DataFrame(
        data={"a": [1, 2, 3], "b": [True, False, None]},
        schema={"a": pl.Int8, "b": pl.Boolean},
    )
    b = pl.DataFrame(
        data={"a": [43, 2, 3], "b": [32, 1, None]},
        schema={"a": pl.Int16, "b": pl.Int64},
    )
    out = pl.concat([a, b], how="vertical_relaxed")
    assert out.schema == {"a": pl.Int16, "b": pl.Int64}
    assert out.to_dict(as_series=False) == {
        "a": [1, 2, 3, 43, 2, 3],
        "b": [1, 0, None, 32, 1, None],
    }
    out = pl.concat([b, a], how="vertical_relaxed")
    assert out.schema == {"a": pl.Int16, "b": pl.Int64}
    assert out.to_dict(as_series=False) == {
        "a": [43, 2, 3, 1, 2, 3],
        "b": [32, 1, None, 1, 0, None],
    }

    c = pl.DataFrame({"a": [1, 2], "b": [2, 1]})
    d = pl.DataFrame({"a": [1.0, 0.2], "b": [None, 0.1]})

    out = pl.concat([c, d], how="vertical_relaxed")
    assert out.schema == {"a": pl.Float64, "b": pl.Float64}
    assert out.to_dict(as_series=False) == {
        "a": [1.0, 2.0, 1.0, 0.2],
        "b": [2.0, 1.0, None, 0.1],
    }
    out = pl.concat([d, c], how="vertical_relaxed")
    assert out.schema == {"a": pl.Float64, "b": pl.Float64}
    assert out.to_dict(as_series=False) == {
        "a": [1.0, 0.2, 1.0, 2.0],
        "b": [None, 0.1, 2.0, 1.0],
    }


def test_concat_group_by() -> None:
    """Expression concat inside `group_by().agg()` concatenates per group."""
    df = pl.DataFrame(
        {
            "g": [0, 0, 0, 0, 1, 1, 1, 1],
            "a": [0, 1, 2, 3, 4, 5, 6, 7],
            "b": [8, 9, 10, 11, 12, 13, 14, 15],
        }
    )
    out = df.group_by("g").agg(pl.concat([pl.col.a, pl.col.b]))

    assert_frame_equal(
        out,
        pl.DataFrame(
            {
                "g": [0, 1],
                "a": [[0, 1, 2, 3, 8, 9, 10, 11], [4, 5, 6, 7, 12, 13, 14, 15]],
            }
        ),
        check_row_order=False,
    )


def test_concat_19877() -> None:
    """Concatenating two column expressions in `select` stacks their values."""
    df = pl.DataFrame({"a": [1, 2], "b": [3, 4]})
    out = df.select(pl.concat([pl.col("a"), pl.col("b")]))
    assert_frame_equal(out, pl.DataFrame({"a": [1, 2, 3, 4]}))


def test_concat_zip_series_21980() -> None:
    """A concat expression can be selected alongside a literal Series."""
    df = pl.DataFrame({"x": 1, "y": 2})
    out = df.select(pl.concat([pl.col.x, pl.col.y]), pl.Series([3, 4]))
    assert_frame_equal(out, pl.DataFrame({"x": [1, 2], "": [3, 4]}))


def test_concat_invalid_schema_err_20355() -> None:
    """Mismatched lazy schemas raise on collect with the streaming engine."""
    lf1 = pl.LazyFrame({"x": [1], "y": [None]})
    lf2 = pl.LazyFrame({"y": [1]})
    with pytest.raises(pl.exceptions.InvalidOperationError):
        pl.concat([lf1, lf2]).collect(engine="streaming")


def test_concat_df() -> None:
    """Basic DataFrame concat: rechunking, generator input, and error cases."""
    df1 = pl.DataFrame({"a": [2, 1, 3], "b": [1, 2, 3], "c": [1, 2, 3]})
    df2 = pl.concat([df1, df1], rechunk=True)

    assert df2.shape == (6, 3)
    assert df2.n_chunks() == 1
    assert df2.rows() == df1.rows() + df1.rows()
    assert pl.concat([df1, df1], rechunk=False).n_chunks() == 2

    # concat from generator of frames
    df3 = pl.concat(items=(df1 for _ in range(2)))
    assert_frame_equal(df2, df3)

    # check that df4 is not modified following concat of itself
    df4 = pl.from_records(((1, 2), (1, 2)))
    _ = pl.concat([df4, df4, df4])

    assert df4.shape == (2, 2)
    assert df4.rows() == [(1, 1), (2, 2)]

    # misc error conditions
    with pytest.raises(ValueError):
        _ = pl.concat([])

    with pytest.raises(ValueError):
        pl.concat([df1, df1], how="rubbish")  # type: ignore[arg-type]


def test_concat_to_empty() -> None:
    """Concatenating an empty frame with a populated one keeps the data."""
    assert pl.concat([pl.DataFrame([]), pl.DataFrame({"a": [1]})]).to_dict(
        as_series=False
    ) == {"a": [1]}


def test_concat_multiple_parquet_inmem() -> None:
    """Reading several in-memory parquet sources equals concatenating the frames."""
    f = io.BytesIO()
    g = io.BytesIO()

    df1 = pl.DataFrame(
        {
            "a": [1, 2, 3],
            "b": ["xyz", "abc", "wow"],
        }
    )
    df2 = pl.DataFrame(
        {
            "a": [5, 6, 7],
            "b": ["a", "few", "entries"],
        }
    )

    dfs = pl.concat([df1, df2])

    df1.write_parquet(f)
    df2.write_parquet(g)

    # Rewind before every read so each reader starts at the beginning.
    f.seek(0)
    g.seek(0)

    items: list[IO[bytes]] = [f, g]
    assert_frame_equal(pl.read_parquet(items), dfs)

    f.seek(0)
    g.seek(0)

    assert_frame_equal(pl.read_parquet(items, use_pyarrow=True), dfs)

    f.seek(0)
    g.seek(0)

    # Same check again, this time passing raw bytes instead of file objects.
    fb = f.read()
    gb = g.read()

    assert_frame_equal(pl.read_parquet([fb, gb]), dfs)
    assert_frame_equal(pl.read_parquet([fb, gb], use_pyarrow=True), dfs)


def test_concat_series() -> None:
    """Concatenating a Series with itself leaves the original length unchanged."""
    s = pl.Series("a", [2, 1, 3])

    assert pl.concat([s, s]).len() == 6
    # check if s remains unchanged
    assert s.len() == 3


def test_concat_null_20501() -> None:
    """Lazy concat of a string column with an all-null column keeps the values."""
    a = pl.DataFrame({"id": [1], "value": ["foo"]})
    b = pl.DataFrame({"id": [2], "value": [None]})

    assert pl.concat([a.lazy(), b.lazy()]).collect().to_dict(as_series=False) == {
        "id": [1, 2],
        "value": ["foo", None],
    }


def test_concat_single_element() -> None:
    """Concat of a single item returns that very object."""
    df = pl.DataFrame({"a": [1, 2, 3]})
    result = pl.concat([df])
    assert result is df

    s = pl.Series("test", [1, 2, 3])
    result_s = pl.concat([s])
    assert result_s is s


def test_concat_diagonal() -> None:
    """Diagonal concat unions the columns and fills the gaps with nulls."""
    df1 = pl.DataFrame({"a": [1, 2], "b": [3, 4]})
    df2 = pl.DataFrame({"a": [5, 6], "c": [7, 8]})
    df3 = pl.DataFrame({"b": [9, 10], "c": [11, 12]})

    result = pl.concat([df1, df2, df3], how="diagonal")
    expected = pl.DataFrame(
        {
            "a": [1, 2, 5, 6, None, None],
            "b": [3, 4, None, None, 9, 10],
            "c": [None, None, 7, 8, 11, 12],
        }
    )
    assert_frame_equal(result, expected)


def test_concat_diagonal_relaxed() -> None:
    """`diagonal_relaxed` unions the columns and upcasts to a common dtype."""
    df1 = pl.DataFrame(
        {"a": [1, 2], "c": [10, 20]}, schema={"a": pl.Int32, "c": pl.Int64}
    )
    df2 = pl.DataFrame(
        {"a": [3.5, 4.5], "b": [30.1, 40.2]}, schema={"a": pl.Float64, "b": pl.Float32}
    )
    df3 = pl.DataFrame({"b": [5, 6], "c": [50, 60]})

    result = pl.concat([df1, df2, df3], how="diagonal_relaxed")

    assert result.schema["a"] == pl.Float64
    assert result.schema["b"] == pl.Float64
    assert result.schema["c"] == pl.Int64

    expected = pl.DataFrame(
        {
            "a": [1.0, 2.0, 3.5, 4.5, None, None],
            "c": [10, 20, None, None, 50, 60],
            "b": [None, None, 30.1, 40.2, 5.0, 6.0],
        }
    )

    assert_frame_equal(result, expected)


def test_concat_horizontal() -> None:
    """Default horizontal concat pads shorter frames with nulls."""
    df1 = pl.DataFrame({"a": [1, 2, 3]})
    df2 = pl.DataFrame({"b": [4, 5]})
    df3 = pl.DataFrame({"c": [6, 7, 8, 9]})

    result = pl.concat([df1, df2, df3], how="horizontal")
    expected = pl.DataFrame(
        {"a": [1, 2, 3, None], "b": [4, 5, None, None], "c": [6, 7, 8, 9]}
    )
    assert_frame_equal(result, expected)


def test_concat_align_no_common_columns() -> None:
    """`how="align"` raises when the frames share no column."""
    df1 = pl.DataFrame({"a": [1, 2]})
    df2 = pl.DataFrame({"b": [3, 4]})

    with pytest.raises(
        pl.exceptions.InvalidOperationError, match="requires at least one common column"
    ):
        pl.concat([df1, df2], how="align")


def test_concat_align_lazy_frames() -> None:
    """`how="align"` on LazyFrames stays lazy and aligns on the shared column."""
    lf1 = pl.DataFrame({"id": [1, 2], "x": [3, 4]}).lazy()
    lf2 = pl.DataFrame({"id": [2, 3], "y": [5, 6]}).lazy()

    result = pl.concat([lf1, lf2], how="align")
    assert isinstance(result, pl.LazyFrame)

    collected = result.collect()
    expected = pl.DataFrame({"id": [1, 2, 3], "x": [3, 4, None], "y": [None, 5, 6]})
    assert_frame_equal(collected, expected, check_row_order=False)


def test_concat_lazyframe_horizontal() -> None:
    """Horizontal concat of LazyFrames stays lazy and pads the shorter frame."""
    lf1 = pl.DataFrame({"a": [1, 2]}).lazy()
    lf2 = pl.DataFrame({"b": [3, 4, 5]}).lazy()

    result = pl.concat([lf1, lf2], how="horizontal")
    assert isinstance(result, pl.LazyFrame)

    collected = result.collect()
    expected = pl.DataFrame({"a": [1, 2, None], "b": [3, 4, 5]})
    assert_frame_equal(collected, expected)


def test_concat_lazyframe_diagonal() -> None:
    """Diagonal concat of LazyFrames stays lazy and null-fills missing columns."""
    lf1 = pl.DataFrame({"a": [1, 2], "b": [3, 4]}).lazy()
    lf2 = pl.DataFrame({"a": [5, 6], "c": [7, 8]}).lazy()

    result = pl.concat([lf1, lf2], how="diagonal")
    assert isinstance(result, pl.LazyFrame)

    collected = result.collect()
    expected = pl.DataFrame(
        {"a": [1, 2, 5, 6], "b": [3, 4, None, None], "c": [None, None, 7, 8]}
    )
    assert_frame_equal(collected, expected)


def test_concat_series_invalid_strategy() -> None:
    """Series concat rejects the horizontal and diagonal strategies."""
    s1 = pl.Series("a", [1, 2, 3])
    s2 = pl.Series("b", [4, 5, 6])

    with pytest.raises(
        ValueError, match="Series only supports 'vertical' concat strategy"
    ):
        pl.concat([s1, s2], how="horizontal")

    with pytest.raises(
        ValueError, match="Series only supports 'vertical' concat strategy"
    ):
        pl.concat([s1, s2], how="diagonal")


def test_concat_invalid_how_parameter() -> None:
    """An unknown `how` value for DataFrames raises a ValueError."""
    df1 = pl.DataFrame({"a": [1, 2]})
    df2 = pl.DataFrame({"a": [3, 4]})

    with pytest.raises(ValueError, match="DataFrame `how` must be one of"):
        pl.concat([df1, df2], how="invalid_strategy")  # type: ignore[arg-type]


def test_concat_unsupported_type() -> None:
    """Concatenating plain integers raises a TypeError."""
    with pytest.raises(TypeError, match="did not expect type"):
        pl.concat([1, 2, 3])  # type: ignore[type-var]


def test_concat_expressions() -> None:
    """A concat of expressions can be aliased and evaluated in `select`."""
    expr1 = pl.col("a")
    expr2 = pl.col("b")
    concat_expr = pl.concat([expr1, expr2])

    df_input = pl.DataFrame({"a": [1, 2], "b": [3, 4]})
    result = df_input.select(concat_expr.alias("concatenated"))

    expected = pl.DataFrame({"concatenated": [1, 2, 3, 4]})
    assert_frame_equal(result, expected)


def test_concat_with_empty_dataframes() -> None:
    """An empty frame with a matching schema is a no-op on either side."""
    empty_df = pl.DataFrame(schema={"a": pl.Int64, "b": pl.String})
    df_with_data = pl.DataFrame({"a": [1, 2], "b": ["x", "y"]})

    result = pl.concat([empty_df, df_with_data])
    assert_frame_equal(result, df_with_data)

    result2 = pl.concat([df_with_data, empty_df])
    assert_frame_equal(result2, df_with_data)


def test_concat_with_empty_dataframes_nonstrict_25727() -> None:
    """Non-strict horizontal concat handles zero-column and zero-row inputs."""
    df = pl.LazyFrame({"a": [1, 2], "b": ["x", "y"]})
    result = pl.concat([df, df.select([])], how="horizontal", strict=False)
    expected = pl.LazyFrame({"a": [1, 2], "b": ["x", "y"]})
    assert_frame_equal(result, expected)

    empty_df = pl.LazyFrame(schema={"c": pl.Int64})
    result = pl.concat([empty_df, df], how="horizontal", strict=False)
    expected = pl.LazyFrame(
        {"c": [None, None], "a": [1, 2], "b": ["x", "y"]},
        schema={"c": pl.Int64, "a": pl.Int64, "b": pl.String},
    )
    assert_frame_equal(result, expected)