Path: blob/main/py-polars/tests/unit/dataframe/test_partition_by.py
6939 views
from typing import Any

import pytest

import polars as pl
import polars.selectors as cs


@pytest.fixture
def df() -> pl.DataFrame:
    """Return a five-row frame with two string columns and one integer column."""
    return pl.DataFrame(
        {
            "foo": ["A", "A", "B", "B", "C"],
            "N": [1, 2, 2, 4, 2],
            "bar": ["k", "l", "m", "m", "l"],
        }
    )


# The same expectation is checked for an explicit list of column names and for
# a selector that picks the string columns.
@pytest.mark.parametrize("input", [["foo", "bar"], cs.string()])
def test_partition_by(df: pl.DataFrame, input: Any) -> None:
    """Partition on ``foo`` and ``bar`` and compare against the four expected groups."""
    result = df.partition_by(input, maintain_order=True)
    expected = [
        {"foo": ["A"], "N": [1], "bar": ["k"]},
        {"foo": ["A"], "N": [2], "bar": ["l"]},
        {"foo": ["B", "B"], "N": [2, 4], "bar": ["m", "m"]},
        {"foo": ["C"], "N": [2], "bar": ["l"]},
    ]
    assert [a.to_dict(as_series=False) for a in result] == expected


def test_partition_by_include_key_false(df: pl.DataFrame) -> None:
    """With ``include_key=False`` only the non-key column ``N`` is expected per group."""
    result = df.partition_by("foo", "bar", maintain_order=True, include_key=False)
    expected = [
        {"N": [1]},
        {"N": [2]},
        {"N": [2, 4]},
        {"N": [2]},
    ]
    assert [a.to_dict(as_series=False) for a in result] == expected


def test_partition_by_single(df: pl.DataFrame) -> None:
    """Partition on the single column ``foo`` and compare against three groups."""
    result = df.partition_by("foo", maintain_order=True)
    expected = [
        {"foo": ["A", "A"], "N": [1, 2], "bar": ["k", "l"]},
        {"foo": ["B", "B"], "N": [2, 4], "bar": ["m", "m"]},
        {"foo": ["C"], "N": [2], "bar": ["l"]},
    ]
    assert [a.to_dict(as_series=False) for a in result] == expected


def test_partition_by_as_dict() -> None:
    """With ``as_dict=True`` the partitions are looked up by a tuple of key values."""
    df = pl.DataFrame({"a": ["one", "two", "one", "two"], "b": [1, 2, 3, 4]})

    # Partitioning on every column: the lookup key holds one value per column.
    result = df.partition_by(cs.all(), as_dict=True)
    result_first = result["one", 1]
    assert result_first.to_dict(as_series=False) == {"a": ["one"], "b": [1]}

    # Partitioning on one column: the lookup key is still a tuple (1-tuple).
    result = df.partition_by("a", as_dict=True)
    result_first = result["one",]
    assert result_first.to_dict(as_series=False) == {"a": ["one", "one"], "b": [1, 3]}


def test_partition_by_as_dict_include_keys_false() -> None:
    """Combine ``as_dict=True`` with ``include_key=False``: values omit column ``a``."""
    df = pl.DataFrame({"a": ["one", "two", "one", "two"], "b": [1, 2, 3, 4]})

    result = df.partition_by("a", include_key=False, as_dict=True)
    result_first = result["one",]
    assert result_first.to_dict(as_series=False) == {"b": [1, 3]}


def test_partition_by_as_dict_include_keys_false_maintain_order_false() -> None:
    """Expect ``ValueError`` for this combination with ``maintain_order=False``."""
    df = pl.DataFrame({"a": ["one", "two", "one", "two"], "b": [1, 2, 3, 4]})
    with pytest.raises(ValueError):
        df.partition_by(["a"], maintain_order=False, include_key=False, as_dict=True)


@pytest.mark.may_fail_cloud
@pytest.mark.slow
def test_partition_by_as_dict_include_keys_false_large() -> None:
    """Check on a 100,000-row sample that each dict key matches its partition."""
    # test with both as_dict and include_key=False
    df = pl.DataFrame(
        {
            "a": pl.int_range(0, 100, dtype=pl.UInt8, eager=True),
            "b": pl.int_range(0, 100, dtype=pl.UInt8, eager=True),
            "c": pl.int_range(0, 100, dtype=pl.UInt8, eager=True),
            "d": pl.int_range(0, 100, dtype=pl.UInt8, eager=True),
        }
    ).sample(n=100_000, with_replacement=True, shuffle=True)

    partitions = df.partition_by(["a", "b"], as_dict=True, include_key=False)
    # All four columns are built from the same range, so the values within a
    # row are equal. Once the key columns (a, b) are dropped, the first
    # remaining row (c, d) of each partition should therefore equal its key.
    assert all(key == value.row(0) for key, value in partitions.items())


def test_partition_by_tuple_typing_24112() -> None:
    """Unpack each dict key as a 1-tuple and assign its element to a ``str``."""
    df = pl.DataFrame({"id": ["a", "b", "a"], "val": [1, 2, 3]})
    # Iterating the dict yields its keys; each key is unpacked as a 1-tuple.
    for (id_,) in df.partition_by("id", as_dict=True):
        # NOTE(review): presumably a static-typing regression check tied to
        # issue 24112 (per the test name) - confirm. The annotated assignment
        # has no runtime assertion of its own.
        _should_work: str = id_