Book a Demo!
CoCalc Logo Icon
Store | Features | Docs | Share | Support | News | About | Policies | Sign Up | Sign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/py-polars/tests/unit/dataframe/test_partition_by.py
6939 views
1
from typing import Any
2
3
import pytest
4
5
import polars as pl
6
import polars.selectors as cs
7
8
9
@pytest.fixture
def df() -> pl.DataFrame:
    """Return a five-row frame with string columns `foo`/`bar` and integer column `N`."""
    columns = {
        "foo": ["A", "A", "B", "B", "C"],
        "N": [1, 2, 2, 4, 2],
        "bar": ["k", "l", "m", "m", "l"],
    }
    return pl.DataFrame(columns)
18
19
20
@pytest.mark.parametrize("input", [["foo", "bar"], cs.string()])
def test_partition_by(df: pl.DataFrame, input: Any) -> None:
    """Partition on `foo` and `bar`, given as a name list or as a string selector.

    Both spellings are expected to produce the same ordered partitions.
    """
    partitions = df.partition_by(input, maintain_order=True)
    as_dicts = [part.to_dict(as_series=False) for part in partitions]

    # One partition per distinct (foo, bar) pair, in order of first appearance.
    assert as_dicts == [
        {"foo": ["A"], "N": [1], "bar": ["k"]},
        {"foo": ["A"], "N": [2], "bar": ["l"]},
        {"foo": ["B", "B"], "N": [2, 4], "bar": ["m", "m"]},
        {"foo": ["C"], "N": [2], "bar": ["l"]},
    ]
30
31
32
def test_partition_by_include_key_false(df: pl.DataFrame) -> None:
    """With `include_key=False` the `foo`/`bar` key columns are dropped from each partition."""
    partitions = df.partition_by(
        "foo", "bar", include_key=False, maintain_order=True
    )
    remaining = [part.to_dict(as_series=False) for part in partitions]

    # Only the non-key column `N` is left in every partition.
    assert remaining == [
        {"N": [1]},
        {"N": [2]},
        {"N": [2, 4]},
        {"N": [2]},
    ]
41
42
43
def test_partition_by_single(df: pl.DataFrame) -> None:
    """Partition on the single column `foo`, keeping first-appearance order."""
    partitions = df.partition_by("foo", maintain_order=True)
    as_dicts = [part.to_dict(as_series=False) for part in partitions]

    assert as_dicts == [
        {"foo": ["A", "A"], "N": [1, 2], "bar": ["k", "l"]},
        {"foo": ["B", "B"], "N": [2, 4], "bar": ["m", "m"]},
        {"foo": ["C"], "N": [2], "bar": ["l"]},
    ]
51
52
53
def test_partition_by_as_dict() -> None:
    """With `as_dict=True` partitions are looked up by a tuple of key values."""
    frame = pl.DataFrame({"a": ["one", "two", "one", "two"], "b": [1, 2, 3, 4]})

    # Partitioning on every column: the key holds one value per column.
    by_all_columns = frame.partition_by(cs.all(), as_dict=True)
    first_of_all = by_all_columns[("one", 1)]
    assert first_of_all.to_dict(as_series=False) == {"a": ["one"], "b": [1]}

    # Partitioning on a single column: the key is still a (1-element) tuple.
    by_a = frame.partition_by("a", as_dict=True)
    first_of_a = by_a[("one",)]
    assert first_of_a.to_dict(as_series=False) == {"a": ["one", "one"], "b": [1, 3]}
62
63
64
def test_partition_by_as_dict_include_keys_false() -> None:
    """`as_dict=True` with `include_key=False` keys by `a` but omits `a` from the values."""
    frame = pl.DataFrame({"a": ["one", "two", "one", "two"], "b": [1, 2, 3, 4]})

    by_a = frame.partition_by("a", as_dict=True, include_key=False)
    partition_one = by_a[("one",)]

    assert partition_one.to_dict(as_series=False) == {"b": [1, 3]}
70
71
72
def test_partition_by_as_dict_include_keys_false_maintain_order_false() -> None:
    """Combining `as_dict=True`, `include_key=False` and `maintain_order=False` must raise."""
    frame = pl.DataFrame({"a": ["one", "two", "one", "two"], "b": [1, 2, 3, 4]})

    with pytest.raises(ValueError):
        frame.partition_by(
            ["a"],
            as_dict=True,
            include_key=False,
            maintain_order=False,
        )
76
77
78
@pytest.mark.may_fail_cloud
@pytest.mark.slow
def test_partition_by_as_dict_include_keys_false_large() -> None:
    """Check key/partition pairing on a large sample with `as_dict` and `include_key=False`."""
    # test with both as_dict and include_key=False
    # All four columns hold the same 0..99 range, so each sampled row carries
    # the same value in "a", "b", "c" and "d".
    identical_columns = {
        name: pl.int_range(0, 100, dtype=pl.UInt8, eager=True)
        for name in ("a", "b", "c", "d")
    }
    sampled = pl.DataFrame(identical_columns).sample(
        n=100_000, with_replacement=True, shuffle=True
    )

    partitions = sampled.partition_by(["a", "b"], as_dict=True, include_key=False)

    # With the key columns dropped, a partition's first row is its ("c", "d")
    # values, which must equal the ("a", "b") key it is stored under.
    assert all(key == part.row(0) for key, part in partitions.items())
93
94
95
def test_partition_by_tuple_typing_24112() -> None:
    """Unpack single-element partition keys and bind them to a `str`-annotated name.

    NOTE(review): judging by the test name this guards issue 24112; the
    annotated assignment is presumably there for the static type checker
    rather than for any runtime assertion -- confirm against the issue.
    """
    frame = pl.DataFrame({"id": ["a", "b", "a"], "val": [1, 2, 3]})
    partitions = frame.partition_by("id", as_dict=True)

    # Iterating the dict yields its keys, each a 1-element tuple.
    for (id_,) in partitions:
        _should_work: str = id_
99
100