CoCalc -- test_partition

GitHub Repository: pola-rs/polars
Path: blob/main/py-polars/tests/unit/dataframe/test_partition_by.py
6939 views
1
from typing import Any
2

3
import pytest
4

5
import polars as pl
6
import polars.selectors as cs
7

8

9
@pytest.fixture
def df() -> pl.DataFrame:
    """Provide a five-row frame with two string columns and one integer column."""
    columns = {
        "foo": ["A", "A", "B", "B", "C"],
        "N": [1, 2, 2, 4, 2],
        "bar": ["k", "l", "m", "m", "l"],
    }
    return pl.DataFrame(columns)
18

19

20
@pytest.mark.parametrize("input", [["foo", "bar"], cs.string()])
def test_partition_by(df: pl.DataFrame, input: Any) -> None:
    """Partition by ``foo`` and ``bar``, passed as column names or as a selector.

    Both parametrized inputs are checked against the same expected partitions.
    """
    partitions = df.partition_by(input, maintain_order=True)
    as_dicts = [part.to_dict(as_series=False) for part in partitions]

    # maintain_order=True: partitions are listed in order of first appearance.
    assert as_dicts == [
        {"foo": ["A"], "N": [1], "bar": ["k"]},
        {"foo": ["A"], "N": [2], "bar": ["l"]},
        {"foo": ["B", "B"], "N": [2, 4], "bar": ["m", "m"]},
        {"foo": ["C"], "N": [2], "bar": ["l"]},
    ]
30

31

32
def test_partition_by_include_key_false(df: pl.DataFrame) -> None:
    """With ``include_key=False`` each partition keeps only the non-key column ``N``."""
    partitions = df.partition_by(
        "foo",
        "bar",
        maintain_order=True,
        include_key=False,
    )
    as_dicts = [part.to_dict(as_series=False) for part in partitions]

    assert as_dicts == [
        {"N": [1]},
        {"N": [2]},
        {"N": [2, 4]},
        {"N": [2]},
    ]
41

42

43
def test_partition_by_single(df: pl.DataFrame) -> None:
    """Partition on the single key column ``foo`` and check all three groups."""
    partitions = df.partition_by("foo", maintain_order=True)
    as_dicts = [part.to_dict(as_series=False) for part in partitions]

    assert as_dicts == [
        {"foo": ["A", "A"], "N": [1, 2], "bar": ["k", "l"]},
        {"foo": ["B", "B"], "N": [2, 4], "bar": ["m", "m"]},
        {"foo": ["C"], "N": [2], "bar": ["l"]},
    ]
51

52

53
def test_partition_by_as_dict() -> None:
    """With ``as_dict=True`` partitions are looked up by a tuple of key values."""
    frame = pl.DataFrame({"a": ["one", "two", "one", "two"], "b": [1, 2, 3, 4]})

    # Keyed on every column: the lookup key is a two-element tuple.
    by_all_columns = frame.partition_by(cs.all(), as_dict=True)
    first_of_all = by_all_columns[("one", 1)]
    assert first_of_all.to_dict(as_series=False) == {"a": ["one"], "b": [1]}

    # Keyed on one column: the lookup key is still a tuple, with one element.
    by_a = frame.partition_by("a", as_dict=True)
    first_of_a = by_a[("one",)]
    assert first_of_a.to_dict(as_series=False) == {"a": ["one", "one"], "b": [1, 3]}
62

63

64
def test_partition_by_as_dict_include_keys_false() -> None:
    """Combine ``as_dict=True`` with ``include_key=False``: the key column is dropped."""
    frame = pl.DataFrame({"a": ["one", "two", "one", "two"], "b": [1, 2, 3, 4]})

    by_a = frame.partition_by("a", include_key=False, as_dict=True)

    # The dict key still carries the "a" value even though the column is gone.
    partition_one = by_a[("one",)]
    assert partition_one.to_dict(as_series=False) == {"b": [1, 3]}
70

71

72
def test_partition_by_as_dict_include_keys_false_maintain_order_false() -> None:
    """Expect ``ValueError`` for ``maintain_order=False, include_key=False, as_dict=True``."""
    frame = pl.DataFrame({"a": ["one", "two", "one", "two"], "b": [1, 2, 3, 4]})

    with pytest.raises(ValueError):
        frame.partition_by(
            ["a"],
            maintain_order=False,
            include_key=False,
            as_dict=True,
        )
76

77

78
@pytest.mark.may_fail_cloud
@pytest.mark.slow
def test_partition_by_as_dict_include_keys_false_large() -> None:
    """Check ``as_dict=True`` together with ``include_key=False`` on 100,000 rows."""
    # Every column is the same 0..99 range, so (assuming `sample` draws whole
    # rows) each sampled row holds one value repeated across a, b, c and d.
    columns = {
        name: pl.int_range(0, 100, dtype=pl.UInt8, eager=True)
        for name in ("a", "b", "c", "d")
    }
    frame = pl.DataFrame(columns).sample(
        n=100_000, with_replacement=True, shuffle=True
    )

    partitions = frame.partition_by(["a", "b"], as_dict=True, include_key=False)

    # With the key columns dropped, the first remaining row is (c, d), which
    # must equal the (a, b) key the partition is stored under.
    for key, part in partitions.items():
        assert key == part.row(0)
93

94

95
def test_partition_by_tuple_typing_24112() -> None:
    """Unpack each one-element dict key and bind it to a ``str``-annotated name."""
    frame = pl.DataFrame({"id": ["a", "b", "a"], "val": [1, 2, 3]})
    partitions = frame.partition_by("id", as_dict=True)

    # NOTE(review): no runtime assertion here; the annotated assignment is
    # presumably aimed at the static type checker (issue 24112, going by the
    # test name) — confirm before changing its form.
    for (id_,) in partitions:
        _should_work: str = id_
99

100
Product

Resources

Company