CoCalc -- test_describe.py

GitHub Repository: pola-rs/polars
Path: blob/main/py-polars/tests/unit/dataframe/test_describe.py
6939 views
1
from __future__ import annotations
2

3
from datetime import date, datetime, time
4

5
import pytest
6

7
import polars as pl
8
from polars.testing import assert_frame_equal
9

10

11
@pytest.mark.parametrize("lazy", [False, True])
def test_df_describe(lazy: bool) -> None:
    """Check `describe` output across many column dtypes.

    The input frame holds float, int, bool, string, categorical, datetime,
    date, time and duration columns. The test runs once on the eager frame
    and once on its lazy counterpart, and compares the result against a
    hand-written expected frame.
    """
    df = pl.DataFrame(
        {
            "a": [1.0, 2.8, 3.0],
            "b": [4, 5, None],
            "c": [True, False, True],
            "d": [None, "b", "c"],
            "e": ["usd", "eur", None],
            "f": [
                datetime(2020, 1, 1, 10, 30),
                datetime(2021, 7, 5, 15, 0),
                datetime(2022, 12, 31, 20, 30),
            ],
            "g": [date(2020, 1, 1), date(2021, 7, 5), date(2022, 12, 31)],
            "h": [time(10, 30), time(15, 0), time(20, 30)],
            "i": [1_000_000, 2_000_000, 3_000_000],
        },
        # "e" and "i" would otherwise be inferred as plain string / integer.
        schema_overrides={"e": pl.Categorical, "i": pl.Duration},
    )

    frame: pl.DataFrame | pl.LazyFrame = df.lazy() if lazy else df
    result = frame.describe()

    expected = pl.DataFrame(
        {
            "statistic": [
                "count",
                "null_count",
                "mean",
                "std",
                "min",
                "25%",
                "50%",
                "75%",
                "max",
            ],
            "a": [
                3.0,
                0.0,
                2.2666666666666666,
                1.1015141094572205,
                1.0,
                2.8,
                2.8,
                3.0,
                3.0,
            ],
            "b": [2.0, 1.0, 4.5, 0.7071067811865476, 4.0, 4.0, 5.0, 5.0, 5.0],
            "c": [3.0, 0.0, 2 / 3, None, False, None, None, None, True],
            # From here on every expected value is spelled as a string.
            "d": ["2", "1", None, None, "b", None, None, None, "c"],
            "e": ["2", "1", None, None, None, None, None, None, None],
            "f": [
                "3",
                "0",
                "2021-07-03 07:20:00",
                None,
                "2020-01-01 10:30:00",
                "2021-07-05 15:00:00",
                "2021-07-05 15:00:00",
                "2022-12-31 20:30:00",
                "2022-12-31 20:30:00",
            ],
            "g": [
                "3",
                "0",
                "2021-07-02 16:00:00",
                None,
                "2020-01-01",
                "2021-07-05",
                "2021-07-05",
                "2022-12-31",
                "2022-12-31",
            ],
            "h": [
                "3",
                "0",
                "15:20:00",
                None,
                "10:30:00",
                "15:00:00",
                "15:00:00",
                "20:30:00",
                "20:30:00",
            ],
            "i": [
                "3",
                "0",
                "0:00:02",
                None,
                "0:00:01",
                "0:00:02",
                "0:00:02",
                "0:00:03",
                "0:00:03",
            ],
        }
    )
    assert_frame_equal(result, expected)
111

112

113
def test_df_describe_nested() -> None:
    """Check `describe` on a frame with a struct column and a list column.

    Only `count` and `null_count` are expected to carry values; every other
    statistic row is expected to be null for both columns.
    """
    df = pl.DataFrame(
        {
            "struct": [{"x": 1, "y": 2}, {"x": 3, "y": 4}, {"x": 1, "y": 2}, None],
            "list": [[1, 2], [3, 4], [1, 2], None],
        }
    )
    actual = df.describe()

    # Statistics expected to be null for both nested columns.
    null_stats = ("mean", "std", "min", "25%", "50%", "75%", "max")
    expected_rows = [
        ("count", 3, 3),
        ("null_count", 1, 1),
        *((stat, None, None) for stat in null_stats),
    ]
    expected = pl.DataFrame(
        expected_rows,
        schema=["statistic", *df.columns],
        schema_overrides={"struct": pl.Float64, "list": pl.Float64},
        orient="row",
    )
    assert_frame_equal(actual, expected)
138

139

140
def test_df_describe_custom_percentiles() -> None:
    """Check the rows `describe` produces for a custom tuple of percentiles."""
    df = pl.DataFrame({"numeric": [1, 2, 1, None]})
    actual = df.describe(percentiles=(0.2, 0.4, 0.5, 0.6, 0.8))

    # Statistic labels and their expected values, in matching order.
    labels = [
        "count",
        "null_count",
        "mean",
        "std",
        "min",
        "20%",
        "40%",
        "50%",
        "60%",
        "80%",
        "max",
    ]
    values = [
        3.0,
        1.0,
        1.3333333333333333,
        0.5773502691896257,
        1.0,
        1.0,
        1.0,
        1.0,
        1.0,
        2.0,
        2.0,
    ]
    expected = pl.DataFrame(
        list(zip(labels, values)),
        schema=["statistic", *df.columns],
        orient="row",
    )
    assert_frame_equal(actual, expected)
161

162

163
@pytest.mark.parametrize("pcts", [None, []])
def test_df_describe_no_percentiles(pcts: list[float] | None) -> None:
    """Check `describe` when `percentiles` is `None` or an empty list.

    In both cases the expected frame contains no percentile rows.
    """
    df = pl.DataFrame({"numeric": [1, 2, 1, None]})
    actual = df.describe(percentiles=pcts)

    # Insertion order of this mapping is the expected row order.
    stats = {
        "count": 3.0,
        "null_count": 1.0,
        "mean": 1.3333333333333333,
        "std": 0.5773502691896257,
        "min": 1.0,
        "max": 2.0,
    }
    expected = pl.DataFrame(
        list(stats.items()),
        schema=["statistic", *df.columns],
        orient="row",
    )
    assert_frame_equal(actual, expected)
180

181

182
def test_df_describe_empty_column() -> None:
    """Check `describe` on a frame with one Int64 column and zero rows.

    Both counts are expected to be 0.0 and all other statistics null.
    """
    df = pl.DataFrame(schema={"a": pl.Int64})
    actual = df.describe()

    null_stats = ("mean", "std", "min", "25%", "50%", "75%", "max")
    expected_rows = [("count", 0.0), ("null_count", 0.0)]
    expected_rows += [(stat, None) for stat in null_stats]

    expected = pl.DataFrame(
        expected_rows,
        schema=["statistic", *df.columns],
        orient="row",
    )
    assert_frame_equal(actual, expected)
201

202

203
@pytest.mark.parametrize("lazy", [False, True])
def test_df_describe_empty(lazy: bool) -> None:
    """Check that `describe` on a frame with no columns raises `TypeError`.

    The error message is expected to name the frame class being described.
    """
    frame: pl.DataFrame | pl.LazyFrame
    if lazy:
        frame, cls_name = pl.LazyFrame(), "LazyFrame"
    else:
        frame, cls_name = pl.DataFrame(), "DataFrame"

    expected_msg = f"cannot describe a {cls_name} that has no columns"
    with pytest.raises(TypeError, match=expected_msg):
        frame.describe()
211

212

213
def test_df_describe_quantile_precision() -> None:
    """Check that fractional percentile labels appear in the `statistic` column."""
    df = pl.DataFrame({"a": range(10)})
    described = df.describe(percentiles=[0.99, 0.999, 0.9999])
    labels = described.get_column("statistic").to_list()

    # Collect any expected label that did not make it into the output.
    missing = [lbl for lbl in ("99%", "99.9%", "99.99%") if lbl not in labels]
    assert not missing
220

221

222
# https://github.com/pola-rs/polars/issues/9830
223
@pytest.mark.may_fail_cloud
def test_df_describe_object() -> None:
    """Check the `count` and `null_count` rows of `describe` on an Object column.

    Only the first two rows of the result are compared.
    """
    records = [{"a": 1, "b": 2}, {"a": 3, "b": 4}, {"a": 5, "b": 6}]
    df = pl.Series("object", records, dtype=pl.Object).to_frame()

    described = df.describe(percentiles=(0.05, 0.25, 0.5, 0.75, 0.95))

    expected_head = pl.DataFrame(
        {
            "statistic": ["count", "null_count"],
            "object": ["3", "0"],
        }
    )
    assert_frame_equal(described.head(2), expected_head)
237

238
Product

Resources

Company