# Path: blob/main/py-polars/tests/unit/dataframe/test_describe.py
# 6939 views
"""Unit tests for ``DataFrame.describe`` and ``LazyFrame.describe``."""

from __future__ import annotations

from datetime import date, datetime, time

import pytest

import polars as pl
from polars.testing import assert_frame_equal


@pytest.mark.parametrize("lazy", [False, True])
def test_df_describe(lazy: bool) -> None:
    """Describe a frame with mixed column dtypes, eagerly and lazily.

    The input holds float, int, bool, string, categorical, datetime, date,
    time and duration columns; the full summary table is compared against a
    hand-written expected frame.
    """
    df = pl.DataFrame(
        {
            "a": [1.0, 2.8, 3.0],
            "b": [4, 5, None],
            "c": [True, False, True],
            "d": [None, "b", "c"],
            "e": ["usd", "eur", None],
            "f": [
                datetime(2020, 1, 1, 10, 30),
                datetime(2021, 7, 5, 15, 0),
                datetime(2022, 12, 31, 20, 30),
            ],
            "g": [date(2020, 1, 1), date(2021, 7, 5), date(2022, 12, 31)],
            "h": [time(10, 30), time(15, 0), time(20, 30)],
            "i": [1_000_000, 2_000_000, 3_000_000],
        },
        schema_overrides={"e": pl.Categorical, "i": pl.Duration},
    )

    frame: pl.DataFrame | pl.LazyFrame = df.lazy() if lazy else df
    result = frame.describe()

    expected = pl.DataFrame(
        {
            "statistic": [
                "count",
                "null_count",
                "mean",
                "std",
                "min",
                "25%",
                "50%",
                "75%",
                "max",
            ],
            "a": [
                3.0,
                0.0,
                2.2666666666666666,
                1.1015141094572205,
                1.0,
                2.8,
                2.8,
                3.0,
                3.0,
            ],
            "b": [2.0, 1.0, 4.5, 0.7071067811865476, 4.0, 4.0, 5.0, 5.0, 5.0],
            "c": [3.0, 0.0, 2 / 3, None, False, None, None, None, True],
            "d": ["2", "1", None, None, "b", None, None, None, "c"],
            "e": ["2", "1", None, None, None, None, None, None, None],
            "f": [
                "3",
                "0",
                "2021-07-03 07:20:00",
                None,
                "2020-01-01 10:30:00",
                "2021-07-05 15:00:00",
                "2021-07-05 15:00:00",
                "2022-12-31 20:30:00",
                "2022-12-31 20:30:00",
            ],
            "g": [
                "3",
                "0",
                "2021-07-02 16:00:00",
                None,
                "2020-01-01",
                "2021-07-05",
                "2021-07-05",
                "2022-12-31",
                "2022-12-31",
            ],
            "h": [
                "3",
                "0",
                "15:20:00",
                None,
                "10:30:00",
                "15:00:00",
                "15:00:00",
                "20:30:00",
                "20:30:00",
            ],
            "i": [
                "3",
                "0",
                "0:00:02",
                None,
                "0:00:01",
                "0:00:02",
                "0:00:02",
                "0:00:03",
                "0:00:03",
            ],
        }
    )
    assert_frame_equal(result, expected)


def test_df_describe_nested() -> None:
    """Struct and list columns yield counts only; other statistics are null."""
    df = pl.DataFrame(
        {
            "struct": [{"x": 1, "y": 2}, {"x": 3, "y": 4}, {"x": 1, "y": 2}, None],
            "list": [[1, 2], [3, 4], [1, 2], None],
        }
    )
    result = df.describe()
    expected = pl.DataFrame(
        [
            ("count", 3, 3),
            ("null_count", 1, 1),
            ("mean", None, None),
            ("std", None, None),
            ("min", None, None),
            ("25%", None, None),
            ("50%", None, None),
            ("75%", None, None),
            ("max", None, None),
        ],
        schema=["statistic"] + df.columns,
        # The expected statistic columns are compared as Float64.
        schema_overrides={"struct": pl.Float64, "list": pl.Float64},
        orient="row",
    )
    assert_frame_equal(result, expected)


def test_df_describe_custom_percentiles() -> None:
    """User-supplied percentiles replace the default 25%/50%/75% rows."""
    df = pl.DataFrame({"numeric": [1, 2, 1, None]})
    result = df.describe(percentiles=(0.2, 0.4, 0.5, 0.6, 0.8))
    expected = pl.DataFrame(
        [
            ("count", 3.0),
            ("null_count", 1.0),
            ("mean", 1.3333333333333333),
            ("std", 0.5773502691896257),
            ("min", 1.0),
            ("20%", 1.0),
            ("40%", 1.0),
            ("50%", 1.0),
            ("60%", 1.0),
            ("80%", 2.0),
            ("max", 2.0),
        ],
        schema=["statistic"] + df.columns,
        orient="row",
    )
    assert_frame_equal(result, expected)


@pytest.mark.parametrize("pcts", [None, []])
def test_df_describe_no_percentiles(pcts: list[float] | None) -> None:
    """Passing ``None`` or an empty list omits all percentile rows."""
    df = pl.DataFrame({"numeric": [1, 2, 1, None]})
    result = df.describe(percentiles=pcts)
    expected = pl.DataFrame(
        [
            ("count", 3.0),
            ("null_count", 1.0),
            ("mean", 1.3333333333333333),
            ("std", 0.5773502691896257),
            ("min", 1.0),
            ("max", 2.0),
        ],
        schema=["statistic"] + df.columns,
        orient="row",
    )
    assert_frame_equal(result, expected)


def test_df_describe_empty_column() -> None:
    """A column with zero rows reports zero counts and null statistics."""
    df = pl.DataFrame(schema={"a": pl.Int64})
    result = df.describe()
    expected = pl.DataFrame(
        [
            ("count", 0.0),
            ("null_count", 0.0),
            ("mean", None),
            ("std", None),
            ("min", None),
            ("25%", None),
            ("50%", None),
            ("75%", None),
            ("max", None),
        ],
        schema=["statistic"] + df.columns,
        orient="row",
    )
    assert_frame_equal(result, expected)


@pytest.mark.parametrize("lazy", [False, True])
def test_df_describe_empty(lazy: bool) -> None:
    """Describing a frame with no columns raises ``TypeError``.

    The error message is expected to name the concrete frame class.
    """
    frame: pl.DataFrame | pl.LazyFrame = pl.LazyFrame() if lazy else pl.DataFrame()
    cls_name = "LazyFrame" if lazy else "DataFrame"
    with pytest.raises(
        TypeError, match=f"cannot describe a {cls_name} that has no columns"
    ):
        frame.describe()


def test_df_describe_quantile_precision() -> None:
    """Fractional percentiles keep their precision in the row labels."""
    df = pl.DataFrame({"a": range(10)})
    result = df.describe(percentiles=[0.99, 0.999, 0.9999])
    result_metrics = result.get_column("statistic").to_list()
    expected_metrics = ["99%", "99.9%", "99.99%"]
    # One assert per label so a failure names the missing label.
    for m in expected_metrics:
        assert m in result_metrics


# https://github.com/pola-rs/polars/issues/9830
@pytest.mark.may_fail_cloud
def test_df_describe_object() -> None:
    """Describing an ``Object`` column works and reports the counts."""
    df = pl.Series(
        "object",
        [{"a": 1, "b": 2}, {"a": 3, "b": 4}, {"a": 5, "b": 6}],
        dtype=pl.Object,
    ).to_frame()

    result = df.describe(percentiles=(0.05, 0.25, 0.5, 0.75, 0.95))

    expected = pl.DataFrame(
        {"statistic": ["count", "null_count"], "object": ["3", "0"]}
    )
    # Only the first two rows (count, null_count) are compared.
    assert_frame_equal(result.head(2), expected)