CoCalc -- getting-started.py

GitHub Repository: pola-rs/polars
Path: blob/main/docs/source/src/python/user-guide/getting-started.py
6940 views
1
# --8<-- [start:df]
2
import polars as pl
3
import datetime as dt
4

5
# Sample data used by the examples below: one row per person, with a string
# column, a date column and two float columns.
df = pl.DataFrame(
    {
        "name": ["Alice Archer", "Ben Brown", "Chloe Cooper", "Daniel Donovan"],
        "birthdate": [
            dt.date(1997, 1, 10),
            dt.date(1985, 2, 15),
            dt.date(1983, 3, 22),
            dt.date(1981, 4, 30),
        ],
        "weight": [57.9, 72.5, 53.6, 83.1],  # (kg)
        "height": [1.56, 1.77, 1.65, 1.75],  # (m)
    }
)

print(df)
20
# --8<-- [end:df]
21

22
# --8<-- [start:csv]
23
# Round-trip the frame through a CSV file: write it out, then load it back.
# `try_parse_dates=True` asks the reader to attempt to parse date-like text
# columns instead of leaving them as strings.
csv_path = "docs/assets/data/output.csv"
df.write_csv(csv_path)
df_csv = pl.read_csv(csv_path, try_parse_dates=True)
print(df_csv)
26
# --8<-- [end:csv]
27

28
# --8<-- [start:select]
29
# Build a new frame from three expressions: the name as-is, the year taken
# from the birthdate, and the body-mass index (weight divided by height
# squared).
result = df.select(
    pl.col("name"),
    pl.col("birthdate").dt.year().alias("birth_year"),
    (pl.col("weight") / (pl.col("height") ** 2)).alias("bmi"),
)
print(result)
35
# --8<-- [end:select]
36

37
# --8<-- [start:expression-expansion]
38
# `pl.col` given several names expands into one expression per column, so
# both "weight" and "height" are scaled by 0.95 and rounded to 2 decimals.
# `.name.suffix("-5%")` appends the suffix to each resulting column name.
result = df.select(
    pl.col("name"),
    (pl.col("weight", "height") * 0.95).round(2).name.suffix("-5%"),
)
print(result)
43
# --8<-- [end:expression-expansion]
44

45
# --8<-- [start:with_columns]
46
# `with_columns` keeps the existing columns and adds the computed ones; the
# keyword argument names become the names of the new columns.
result = df.with_columns(
    birth_year=pl.col("birthdate").dt.year(),
    bmi=pl.col("weight") / (pl.col("height") ** 2),
)
print(result)
51
# --8<-- [end:with_columns]
52

53
# --8<-- [start:filter]
54
# Keep only the rows whose birth year is earlier than 1990.
born_before_1990 = pl.col("birthdate").dt.year() < 1990
result = df.filter(born_before_1990)
print(result)
56
# --8<-- [end:filter]
57

58
# --8<-- [start:filter-multiple]
59
# Several predicates passed to `filter` must all hold for a row to be kept:
# here the birthdate has to fall in the given range AND the height has to
# exceed 1.7.
result = df.filter(
    pl.col("birthdate").is_between(dt.date(1982, 12, 31), dt.date(1996, 1, 1)),
    pl.col("height") > 1.7,
)
print(result)
64
# --8<-- [end:filter-multiple]
65

66
# --8<-- [start:group_by]
67
# Floor each birth year to its decade (e.g. 1997 -> 1990) and count the rows
# in each decade. `maintain_order=True` keeps the groups in the order in
# which they first appear in `df`.
result = df.group_by(
    (pl.col("birthdate").dt.year() // 10 * 10).alias("decade"),
    maintain_order=True,
).len()
print(result)
72
# --8<-- [end:group_by]
73

74
# --8<-- [start:group_by-agg]
75
# Group by decade of birth and compute, per group: the number of rows, the
# mean weight rounded to 2 decimals, and the maximum height.
result = df.group_by(
    (pl.col("birthdate").dt.year() // 10 * 10).alias("decade"),
    maintain_order=True,
).agg(
    pl.len().alias("sample_size"),
    pl.col("weight").mean().round(2).alias("avg_weight"),
    pl.col("height").max().alias("tallest"),
)
print(result)
84
# --8<-- [end:group_by-agg]
85

86
# --8<-- [start:complex]
87
# Chain several operations into one query.
result = (
    df.with_columns(
        # Add the decade of birth as a new column.
        (pl.col("birthdate").dt.year() // 10 * 10).alias("decade"),
        # No alias is given, so this replaces "name" with the first
        # space-separated token of the original value.
        pl.col("name").str.split(by=" ").list.first(),
    )
    .select(
        # Keep every column except the birthdate.
        pl.all().exclude("birthdate"),
    )
    .group_by(
        pl.col("decade"),
        maintain_order=True,
    )
    .agg(
        # Collect the names belonging to each decade.
        pl.col("name"),
        # Mean of each measurement, rounded to 2 decimals; output columns
        # are named "avg_weight" and "avg_height".
        pl.col("weight", "height").mean().round(2).name.prefix("avg_"),
    )
)
print(result)
105
# --8<-- [end:complex]
106

107
# --8<-- [start:join]
108
# Additional per-person attributes, keyed by the same "name" values as `df`
# but listed in a different row order.
df2 = pl.DataFrame(
    {
        "name": ["Ben Brown", "Daniel Donovan", "Alice Archer", "Chloe Cooper"],
        "parent": [True, False, False, False],
        "siblings": [1, 2, 3, 4],
    }
)

# Left join: keep every row of `df` and attach the columns of `df2` from the
# row with the matching "name".
print(df.join(df2, on="name", how="left"))
117
# --8<-- [end:join]
118

119
# --8<-- [start:concat]
120
# A second frame with the same four columns as `df`, holding other people.
df3 = pl.DataFrame(
    {
        "name": ["Ethan Edwards", "Fiona Foster", "Grace Gibson", "Henry Harris"],
        "birthdate": [
            dt.date(1977, 5, 10),
            dt.date(1975, 6, 23),
            dt.date(1973, 7, 22),
            dt.date(1971, 8, 3),
        ],
        "weight": [67.9, 72.5, 57.6, 93.1],  # (kg)
        "height": [1.76, 1.6, 1.66, 1.8],  # (m)
    }
)

# Vertical concatenation stacks the rows of `df3` below those of `df`.
print(pl.concat([df, df3], how="vertical"))
135
# --8<-- [end:concat]
136

137
Product

Resources

Company