Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/docs/source/src/python/user-guide/expressions/aggregation.py
7890 views
1
# --8<-- [start:dataframe]
2
import polars as pl
3
4
url = "hf://datasets/nameexhaustion/polars-docs/legislators-historical.csv"
5
6
schema_overrides = {
7
"first_name": pl.Categorical,
8
"gender": pl.Categorical,
9
"type": pl.Categorical,
10
"state": pl.Categorical,
11
"party": pl.Categorical,
12
}
13
14
dataset = (
15
pl.read_csv(url, schema_overrides=schema_overrides)
16
.with_columns(pl.col("first", "middle", "last").name.suffix("_name"))
17
.with_columns(pl.col("birthday").str.to_date(strict=False))
18
)
19
# --8<-- [end:dataframe]
20
21
# --8<-- [start:basic]
22
q = (
23
dataset.lazy()
24
.group_by("first_name")
25
.agg(
26
pl.len(),
27
pl.col("gender"),
28
pl.first("last_name"), # Short for `pl.col("last_name").first()`
29
)
30
.sort("len", descending=True)
31
.limit(5)
32
)
33
34
df = q.collect()
35
print(df)
36
# --8<-- [end:basic]
37
38
# --8<-- [start:conditional]
39
q = (
40
dataset.lazy()
41
.group_by("state")
42
.agg(
43
(pl.col("party") == "Anti-Administration").sum().alias("anti"),
44
(pl.col("party") == "Pro-Administration").sum().alias("pro"),
45
)
46
.sort("pro", descending=True)
47
.limit(5)
48
)
49
50
df = q.collect()
51
print(df)
52
# --8<-- [end:conditional]
53
54
# --8<-- [start:nested]
55
q = (
56
dataset.lazy()
57
.group_by("state", "party")
58
.agg(pl.len().alias("count"))
59
.filter(
60
(pl.col("party") == "Anti-Administration")
61
| (pl.col("party") == "Pro-Administration")
62
)
63
.sort("count", descending=True)
64
.limit(5)
65
)
66
67
df = q.collect()
68
print(df)
69
# --8<-- [end:nested]
70
71
72
# --8<-- [start:filter]
73
from datetime import date
74
75
76
def compute_age():
77
return date.today().year - pl.col("birthday").dt.year()
78
79
80
def avg_age(gender: str) -> pl.Expr:
81
return (
82
compute_age()
83
.filter(pl.col("gender") == gender)
84
.mean()
85
.alias(f"avg {gender} age")
86
)
87
88
89
q = (
90
dataset.lazy()
91
.group_by("state")
92
.agg(
93
avg_age("M"),
94
avg_age("F"),
95
(pl.col("gender") == "M").sum().alias("# male"),
96
(pl.col("gender") == "F").sum().alias("# female"),
97
)
98
.limit(5)
99
)
100
101
df = q.collect()
102
print(df)
103
# --8<-- [end:filter]
104
105
106
# --8<-- [start:filter-nested]
107
q = (
108
dataset.lazy()
109
.group_by("state", "gender")
110
.agg(
111
# The function `avg_age` is not needed:
112
compute_age().mean().alias("avg age"),
113
pl.len().alias("#"),
114
)
115
.sort("#", descending=True)
116
.limit(5)
117
)
118
119
df = q.collect()
120
print(df)
121
# --8<-- [end:filter-nested]
122
123
124
# --8<-- [start:sort]
125
def get_name() -> pl.Expr:
126
return pl.col("first_name") + pl.lit(" ") + pl.col("last_name")
127
128
129
q = (
130
dataset.lazy()
131
.sort("birthday", descending=True)
132
.group_by("state")
133
.agg(
134
get_name().first().alias("youngest"),
135
get_name().last().alias("oldest"),
136
)
137
.limit(5)
138
)
139
140
df = q.collect()
141
print(df)
142
# --8<-- [end:sort]
143
144
145
# --8<-- [start:sort2]
146
q = (
147
dataset.lazy()
148
.sort("birthday", descending=True)
149
.group_by("state")
150
.agg(
151
get_name().first().alias("youngest"),
152
get_name().last().alias("oldest"),
153
get_name().sort().first().alias("alphabetical_first"),
154
)
155
.limit(5)
156
)
157
158
df = q.collect()
159
print(df)
160
# --8<-- [end:sort2]
161
162
163
# --8<-- [start:sort3]
164
q = (
165
dataset.lazy()
166
.sort("birthday", descending=True)
167
.group_by("state")
168
.agg(
169
get_name().first().alias("youngest"),
170
get_name().last().alias("oldest"),
171
get_name().sort().first().alias("alphabetical_first"),
172
pl.col("gender").sort_by(get_name()).first(),
173
)
174
.sort("state")
175
.limit(5)
176
)
177
178
df = q.collect()
179
print(df)
180
# --8<-- [end:sort3]
181
182