Book a Demo!
CoCalc Logo Icon
Store | Features | Docs | Share | Support | News | About | Policies | Sign Up | Sign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/docs/source/src/python/user-guide/getting-started.py
6940 views
# --8<-- [start:df]
import polars as pl
import datetime as dt

# Example frame reused by the later snippets in this file: one row per
# person with a name, a birthdate, a weight and a height.
df = pl.DataFrame(
    {
        "name": ["Alice Archer", "Ben Brown", "Chloe Cooper", "Daniel Donovan"],
        "birthdate": [
            dt.date(1997, 1, 10),
            dt.date(1985, 2, 15),
            dt.date(1983, 3, 22),
            dt.date(1981, 4, 30),
        ],
        "weight": [57.9, 72.5, 53.6, 83.1],  # (kg)
        "height": [1.56, 1.77, 1.65, 1.75],  # (m)
    }
)

print(df)
# --8<-- [end:df]

# --8<-- [start:csv]
# Round-trip the frame through a CSV file: write it out, then read it back.
# `try_parse_dates=True` asks the reader to attempt date parsing, so the
# "birthdate" column need not come back as plain strings.
# NOTE(review): the path is relative, so this presumably expects to be run
# from the repository root -- confirm.
df.write_csv("docs/assets/data/output.csv")
df_csv = pl.read_csv("docs/assets/data/output.csv", try_parse_dates=True)
print(df_csv)
# --8<-- [end:csv]

# --8<-- [start:select]
# Project three columns: the name unchanged, the year taken from the
# birthdate (as "birth_year"), and weight / height**2 (as "bmi").
result = df.select(
    pl.col("name"),
    pl.col("birthdate").dt.year().alias("birth_year"),
    (pl.col("weight") / (pl.col("height") ** 2)).alias("bmi"),
)
print(result)
# --8<-- [end:select]

# --8<-- [start:expression-expansion]
# A single expression applied to two columns at once: "weight" and "height"
# are each scaled by 0.95, rounded to 2 decimals, and get "-5%" appended to
# their column name.
result = df.select(
    pl.col("name"),
    (pl.col("weight", "height") * 0.95).round(2).name.suffix("-5%"),
)
print(result)
# --8<-- [end:expression-expansion]

# --8<-- [start:with_columns]
# Derive "birth_year" and "bmi"; each keyword argument names the column that
# its expression produces.
result = df.with_columns(
    birth_year=pl.col("birthdate").dt.year(),
    bmi=pl.col("weight") / (pl.col("height") ** 2),
)
print(result)
# --8<-- [end:with_columns]

# --8<-- [start:filter]
# Keep only the rows whose birth year is earlier than 1990.
result = df.filter(pl.col("birthdate").dt.year() < 1990)
print(result)
# --8<-- [end:filter]

# --8<-- [start:filter-multiple]
# Two predicates passed as separate arguments; a row is kept only when it
# satisfies both: birthdate between 1982-12-31 and 1996-01-01, and height
# above 1.7.
result = df.filter(
    pl.col("birthdate").is_between(dt.date(1982, 12, 31), dt.date(1996, 1, 1)),
    pl.col("height") > 1.7,
)
print(result)
# --8<-- [end:filter-multiple]

# --8<-- [start:group_by]
# Bucket rows by decade of birth (birth year floored to a multiple of 10)
# and count the rows in each bucket. `maintain_order=True` keeps the groups
# in an order consistent with the input rows.
result = df.group_by(
    (pl.col("birthdate").dt.year() // 10 * 10).alias("decade"),
    maintain_order=True,
).len()
print(result)
# --8<-- [end:group_by]

# --8<-- [start:group_by-agg]
# Per decade of birth, compute three aggregates: the number of rows, the
# mean weight rounded to 2 decimals, and the maximum height.
result = df.group_by(
    (pl.col("birthdate").dt.year() // 10 * 10).alias("decade"),
    maintain_order=True,
).agg(
    pl.len().alias("sample_size"),
    pl.col("weight").mean().round(2).alias("avg_weight"),
    pl.col("height").max().alias("tallest"),
)
print(result)
# --8<-- [end:group_by-agg]

# --8<-- [start:complex]
# Chained query:
#   1. add a "decade" column and reduce "name" to its first space-separated
#      token (no alias, so the result keeps the column name "name");
#   2. select every column except "birthdate";
#   3. group by "decade", keeping group order consistent with the input;
#   4. per group, gather the names and average "weight" and "height"
#      (rounded to 2 decimals, output columns prefixed with "avg_").
result = (
    df.with_columns(
        (pl.col("birthdate").dt.year() // 10 * 10).alias("decade"),
        pl.col("name").str.split(by=" ").list.first(),
    )
    .select(
        pl.all().exclude("birthdate"),
    )
    .group_by(
        pl.col("decade"),
        maintain_order=True,
    )
    .agg(
        pl.col("name"),
        pl.col("weight", "height").mean().round(2).name.prefix("avg_"),
    )
)
print(result)
# --8<-- [end:complex]

# --8<-- [start:join]
# Extra per-person attributes keyed by "name". The names are listed in a
# different order than in `df`, so the join below matches rows by key value
# rather than by position.
df2 = pl.DataFrame(
    {
        "name": ["Ben Brown", "Daniel Donovan", "Alice Archer", "Chloe Cooper"],
        "parent": [True, False, False, False],
        "siblings": [1, 2, 3, 4],
    }
)

# Left join: `df` is the left-hand frame and `df2` supplies the added columns.
print(df.join(df2, on="name", how="left"))
# --8<-- [end:join]

# --8<-- [start:concat]
# A second batch of people with the same four columns as `df`
# ("name", "birthdate", "weight", "height").
df3 = pl.DataFrame(
    {
        "name": ["Ethan Edwards", "Fiona Foster", "Grace Gibson", "Henry Harris"],
        "birthdate": [
            dt.date(1977, 5, 10),
            dt.date(1975, 6, 23),
            dt.date(1973, 7, 22),
            dt.date(1971, 8, 3),
        ],
        "weight": [67.9, 72.5, 57.6, 93.1],  # (kg)
        "height": [1.76, 1.6, 1.66, 1.8],  # (m)
    }
)

# Vertical concatenation: the rows of `df3` are stacked below those of `df`.
print(pl.concat([df, df3], how="vertical"))
# --8<-- [end:concat]