Path: blob/main/docs/source/src/python/user-guide/expressions/aggregation.py
7890 views
# --8<-- [start:dataframe]1import polars as pl23url = "hf://datasets/nameexhaustion/polars-docs/legislators-historical.csv"45schema_overrides = {6"first_name": pl.Categorical,7"gender": pl.Categorical,8"type": pl.Categorical,9"state": pl.Categorical,10"party": pl.Categorical,11}1213dataset = (14pl.read_csv(url, schema_overrides=schema_overrides)15.with_columns(pl.col("first", "middle", "last").name.suffix("_name"))16.with_columns(pl.col("birthday").str.to_date(strict=False))17)18# --8<-- [end:dataframe]1920# --8<-- [start:basic]21q = (22dataset.lazy()23.group_by("first_name")24.agg(25pl.len(),26pl.col("gender"),27pl.first("last_name"), # Short for `pl.col("last_name").first()`28)29.sort("len", descending=True)30.limit(5)31)3233df = q.collect()34print(df)35# --8<-- [end:basic]3637# --8<-- [start:conditional]38q = (39dataset.lazy()40.group_by("state")41.agg(42(pl.col("party") == "Anti-Administration").sum().alias("anti"),43(pl.col("party") == "Pro-Administration").sum().alias("pro"),44)45.sort("pro", descending=True)46.limit(5)47)4849df = q.collect()50print(df)51# --8<-- [end:conditional]5253# --8<-- [start:nested]54q = (55dataset.lazy()56.group_by("state", "party")57.agg(pl.len().alias("count"))58.filter(59(pl.col("party") == "Anti-Administration")60| (pl.col("party") == "Pro-Administration")61)62.sort("count", descending=True)63.limit(5)64)6566df = q.collect()67print(df)68# --8<-- [end:nested]697071# --8<-- [start:filter]72from datetime import date737475def compute_age():76return date.today().year - pl.col("birthday").dt.year()777879def avg_age(gender: str) -> pl.Expr:80return (81compute_age()82.filter(pl.col("gender") == gender)83.mean()84.alias(f"avg {gender} age")85)868788q = (89dataset.lazy()90.group_by("state")91.agg(92avg_age("M"),93avg_age("F"),94(pl.col("gender") == "M").sum().alias("# male"),95(pl.col("gender") == "F").sum().alias("# female"),96)97.limit(5)98)99100df = q.collect()101print(df)102# --8<-- [end:filter]103104105# --8<-- [start:filter-nested]106q = (107dataset.lazy()108.group_by("state", "gender")109.agg(110# The function `avg_age` is not needed:111compute_age().mean().alias("avg age"),112pl.len().alias("#"),113)114.sort("#", descending=True)115.limit(5)116)117118df = q.collect()119print(df)120# --8<-- [end:filter-nested]121122123# --8<-- [start:sort]124def get_name() -> pl.Expr:125return pl.col("first_name") + pl.lit(" ") + pl.col("last_name")126127128q = (129dataset.lazy()130.sort("birthday", descending=True)131.group_by("state")132.agg(133get_name().first().alias("youngest"),134get_name().last().alias("oldest"),135)136.limit(5)137)138139df = q.collect()140print(df)141# --8<-- [end:sort]142143144# --8<-- [start:sort2]145q = (146dataset.lazy()147.sort("birthday", descending=True)148.group_by("state")149.agg(150get_name().first().alias("youngest"),151get_name().last().alias("oldest"),152get_name().sort().first().alias("alphabetical_first"),153)154.limit(5)155)156157df = q.collect()158print(df)159# --8<-- [end:sort2]160161162# --8<-- [start:sort3]163q = (164dataset.lazy()165.sort("birthday", descending=True)166.group_by("state")167.agg(168get_name().first().alias("youngest"),169get_name().last().alias("oldest"),170get_name().sort().first().alias("alphabetical_first"),171pl.col("gender").sort_by(get_name()).first(),172)173.sort("state")174.limit(5)175)176177df = q.collect()178print(df)179# --8<-- [end:sort3]180181182