Book a Demo!
CoCalc Logo Icon
Store · Features · Docs · Share · Support · News · About · Policies · Sign Up · Sign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/py-polars/tests/unit/dataframe/test_describe.py
6939 views
1
from __future__ import annotations
2
3
from datetime import date, datetime, time
4
5
import pytest
6
7
import polars as pl
8
from polars.testing import assert_frame_equal
9
10
11
@pytest.mark.parametrize("lazy", [False, True])
def test_df_describe(lazy: bool) -> None:
    """Compare `describe()` against a hand-written frame for mixed dtypes.

    The input has float, integer, boolean, string, categorical, datetime,
    date, time and duration columns, some of which contain nulls. The test
    runs once on the eager frame and once on its lazy counterpart.
    """
    df = pl.DataFrame(
        {
            "a": [1.0, 2.8, 3.0],
            "b": [4, 5, None],
            "c": [True, False, True],
            "d": [None, "b", "c"],
            "e": ["usd", "eur", None],
            "f": [
                datetime(2020, 1, 1, 10, 30),
                datetime(2021, 7, 5, 15, 0),
                datetime(2022, 12, 31, 20, 30),
            ],
            "g": [date(2020, 1, 1), date(2021, 7, 5), date(2022, 12, 31)],
            "h": [time(10, 30), time(15, 0), time(20, 30)],
            "i": [1_000_000, 2_000_000, 3_000_000],
        },
        # "e" and "i" would otherwise be inferred as string / integer columns.
        schema_overrides={"e": pl.Categorical, "i": pl.Duration},
    )

    frame: pl.DataFrame | pl.LazyFrame = df.lazy() if lazy else df
    result = frame.describe()

    # One entry per statistic, in the order listed under "statistic".
    expected = pl.DataFrame(
        {
            "statistic": [
                "count",
                "null_count",
                "mean",
                "std",
                "min",
                "25%",
                "50%",
                "75%",
                "max",
            ],
            "a": [
                3.0,
                0.0,
                2.2666666666666666,
                1.1015141094572205,
                1.0,
                2.8,
                2.8,
                3.0,
                3.0,
            ],
            "b": [2.0, 1.0, 4.5, 0.7071067811865476, 4.0, 4.0, 5.0, 5.0, 5.0],
            "c": [3.0, 0.0, 2 / 3, None, False, None, None, None, True],
            # The remaining columns are expected as strings.
            "d": ["2", "1", None, None, "b", None, None, None, "c"],
            "e": ["2", "1", None, None, None, None, None, None, None],
            "f": [
                "3",
                "0",
                "2021-07-03 07:20:00",
                None,
                "2020-01-01 10:30:00",
                "2021-07-05 15:00:00",
                "2021-07-05 15:00:00",
                "2022-12-31 20:30:00",
                "2022-12-31 20:30:00",
            ],
            "g": [
                "3",
                "0",
                "2021-07-02 16:00:00",
                None,
                "2020-01-01",
                "2021-07-05",
                "2021-07-05",
                "2022-12-31",
                "2022-12-31",
            ],
            "h": [
                "3",
                "0",
                "15:20:00",
                None,
                "10:30:00",
                "15:00:00",
                "15:00:00",
                "20:30:00",
                "20:30:00",
            ],
            "i": [
                "3",
                "0",
                "0:00:02",
                None,
                "0:00:01",
                "0:00:02",
                "0:00:02",
                "0:00:03",
                "0:00:03",
            ],
        }
    )
    assert_frame_equal(result, expected)
111
112
113
def test_df_describe_nested() -> None:
    """Check `describe()` on a frame holding a struct and a list column.

    The expected frame has values only for "count" and "null_count"; every
    other statistic is null for both columns.
    """
    nested = pl.DataFrame(
        {
            "struct": [{"x": 1, "y": 2}, {"x": 3, "y": 4}, {"x": 1, "y": 2}, None],
            "list": [[1, 2], [3, 4], [1, 2], None],
        }
    )
    got = nested.describe()

    # Two populated rows followed by one all-null row per remaining statistic.
    null_stats = ("mean", "std", "min", "25%", "50%", "75%", "max")
    rows = [("count", 3, 3), ("null_count", 1, 1)]
    rows += [(stat, None, None) for stat in null_stats]

    want = pl.DataFrame(
        rows,
        schema=["statistic", *nested.columns],
        schema_overrides={"struct": pl.Float64, "list": pl.Float64},
        orient="row",
    )
    assert_frame_equal(got, want)
138
139
140
def test_df_describe_custom_percentiles() -> None:
    """Check that user-supplied percentiles appear as labelled rows."""
    numbers = pl.DataFrame({"numeric": [1, 2, 1, None]})
    requested = (0.2, 0.4, 0.5, 0.6, 0.8)
    got = numbers.describe(percentiles=requested)

    # Statistic label -> expected value, in output row order.
    stat_values = {
        "count": 3.0,
        "null_count": 1.0,
        "mean": 1.3333333333333333,
        "std": 0.5773502691896257,
        "min": 1.0,
        "20%": 1.0,
        "40%": 1.0,
        "50%": 1.0,
        "60%": 1.0,
        "80%": 2.0,
        "max": 2.0,
    }
    want = pl.DataFrame(
        list(stat_values.items()),
        schema=["statistic", *numbers.columns],
        orient="row",
    )
    assert_frame_equal(got, want)
161
162
163
@pytest.mark.parametrize("pcts", [None, []])
def test_df_describe_no_percentiles(pcts: list[float] | None) -> None:
    """Check that `None` or an empty list yields no percentile rows."""
    numbers = pl.DataFrame({"numeric": [1, 2, 1, None]})
    got = numbers.describe(percentiles=pcts)

    # Statistic label -> expected value; no percentile rows between min and max.
    stat_values = {
        "count": 3.0,
        "null_count": 1.0,
        "mean": 1.3333333333333333,
        "std": 0.5773502691896257,
        "min": 1.0,
        "max": 2.0,
    }
    want = pl.DataFrame(
        list(stat_values.items()),
        schema=["statistic", *numbers.columns],
        orient="row",
    )
    assert_frame_equal(got, want)
180
181
182
def test_df_describe_empty_column() -> None:
    """Check `describe()` on a frame with one Int64 column and no rows.

    Both counts are expected to be 0.0 and every other statistic null.
    """
    empty = pl.DataFrame(schema={"a": pl.Int64})
    got = empty.describe()

    null_stats = ("mean", "std", "min", "25%", "50%", "75%", "max")
    rows = [("count", 0.0), ("null_count", 0.0)]
    rows += [(stat, None) for stat in null_stats]

    want = pl.DataFrame(
        rows,
        schema=["statistic", *empty.columns],
        orient="row",
    )
    assert_frame_equal(got, want)
201
202
203
@pytest.mark.parametrize("lazy", [False, True])
def test_df_describe_empty(lazy: bool) -> None:
    """Check that describing a column-less frame raises `TypeError`."""
    frame: pl.DataFrame | pl.LazyFrame
    if lazy:
        frame = pl.LazyFrame()
        cls_name = "LazyFrame"
    else:
        frame = pl.DataFrame()
        cls_name = "DataFrame"

    # The error message is expected to name the class being described.
    expected_message = f"cannot describe a {cls_name} that has no columns"
    with pytest.raises(TypeError, match=expected_message):
        frame.describe()
211
212
213
def test_df_describe_quantile_precision() -> None:
    """Check that fractional percentile labels keep their decimal digits."""
    summary = pl.DataFrame({"a": range(10)}).describe(
        percentiles=[0.99, 0.999, 0.9999]
    )
    labels = summary.get_column("statistic").to_list()

    # Collect any label that is absent so that one assertion covers all three.
    wanted_labels = ("99%", "99.9%", "99.99%")
    missing = [label for label in wanted_labels if label not in labels]
    assert not missing
220
221
222
# https://github.com/pola-rs/polars/issues/9830
@pytest.mark.may_fail_cloud
def test_df_describe_object() -> None:
    """Check the count rows of `describe()` for an Object-dtype column."""
    payload = [{"a": 1, "b": 2}, {"a": 3, "b": 4}, {"a": 5, "b": 6}]
    frame = pl.Series("object", payload, dtype=pl.Object).to_frame()

    summary = frame.describe(percentiles=(0.05, 0.25, 0.5, 0.75, 0.95))

    # Only the first two rows ("count" and "null_count") are compared.
    want = pl.DataFrame(
        {"statistic": ["count", "null_count"], "object": ["3", "0"]}
    )
    assert_frame_equal(summary.head(2), want)
237
238