CoCalc -- lists.py

GitHub Repository: pola-rs/polars
Path: blob/main/docs/source/src/python/user-guide/expressions/lists.py
⁷⁸⁹⁰ views
# --8<-- [start:list-example]
from datetime import datetime
import polars as pl

# One Python list per row in every column. The lists vary in length from row
# to row (some are empty), and each column has its own element type:
# strings, integers, and datetimes.
df = pl.DataFrame(
    {
        "names": [
            ["Anne", "Averill", "Adams"],
            ["Brandon", "Brooke", "Borden", "Branson"],
            ["Camila", "Campbell"],
            ["Dennis", "Doyle"],
        ],
        "children_ages": [
            [5, 7],
            [],
            [],
            [8, 11, 18],
        ],
        "medical_appointments": [
            [],
            [],
            [],
            [datetime(2022, 5, 22, 16, 30)],
        ],
    }
)

print(df)
# --8<-- [end:list-example]

# --8<-- [start:array-example]
# The explicit schema declares both columns as `pl.Array` with a fixed shape:
# five booleans per row, and a 3x3 grid of strings per row.
df = pl.DataFrame(
    {
        "bit_flags": [
            [True, True, True, True, False],
            [False, True, True, True, True],
        ],
        "tic_tac_toe": [
            [
                [" ", "x", "o"],
                [" ", "x", " "],
                ["o", "x", " "],
            ],
            [
                ["o", "x", "x"],
                [" ", "o", "x"],
                [" ", " ", "o"],
            ],
        ],
    },
    schema={
        "bit_flags": pl.Array(pl.Boolean, 5),
        "tic_tac_toe": pl.Array(pl.String, (3, 3)),
    },
)

print(df)
# --8<-- [end:array-example]

# --8<-- [start:numpy-array-inference]
import numpy as np

# 120 consecutive integers arranged as 5 entries, each of shape (2, 3, 4).
array = np.arange(0, 120).reshape((5, 2, 3, 4))  # 4D array

print(pl.Series(array).dtype)  # Column with the 3D subarrays
# --8<-- [end:numpy-array-inference]

# --8<-- [start:weather]
# Each station's readings are stored as one space-separated string in which
# non-numeric tokens such as "E1" are mixed in with the integer readings.
weather = pl.DataFrame(
    {
        "station": [f"Station {idx}" for idx in range(1, 6)],
        "temperatures": [
            "20 5 5 E1 7 13 19 9 6 20",
            "18 8 16 11 23 E2 8 E2 E2 E2 90 70 40",
            "19 24 E9 16 6 12 10 22",
            "E2 E0 15 7 8 10 E1 24 17 13 6",
            "14 8 E0 16 22 24 E1",
        ],
    }
)

print(weather)
# --8<-- [end:weather]

# --8<-- [start:split]
# Overwrite "temperatures" in place: each string is split on single spaces,
# giving one list of string tokens per station.
weather = weather.with_columns(
    pl.col("temperatures").str.split(" "),
)
print(weather)
# --8<-- [end:split]

# --8<-- [start:explode]
# Expand the list column so that every list element gets a row of its own.
result = weather.explode("temperatures")
print(result)
# --8<-- [end:explode]

# --8<-- [start:list-slicing]
# `head(3)` / `tail(3)` keep the first / last three tokens of each list;
# `slice(-3, 2)` starts three from the end and keeps two elements.
result = weather.with_columns(
    pl.col("temperatures").list.head(3).alias("head"),
    pl.col("temperatures").list.tail(3).alias("tail"),
    pl.col("temperatures").list.slice(-3, 2).alias("two_next_to_last"),
)
print(result)
# --8<-- [end:list-slicing]

# --8<-- [start:element-wise-casting]
# With `strict=False` a token that cannot be cast to Int64 becomes null
# instead of raising, so summing the `is_null` flags of each list counts the
# tokens that are not valid integers.
result = weather.with_columns(
    pl.col("temperatures")
    .list.eval(pl.element().cast(pl.Int64, strict=False).is_null())
    .list.sum()
    .alias("errors"),
)
print(result)
# --8<-- [end:element-wise-casting]

# --8<-- [start:element-wise-regex]
# Alternative count: flag every token that contains a letter (the `(?i)`
# prefix makes the match case-insensitive), then compare the outcome with the
# `result` frame that is already bound at this point in the script.
result2 = weather.with_columns(
    pl.col("temperatures")
    .list.eval(pl.element().str.contains("(?i)[a-z]"))
    .list.sum()
    .alias("errors"),
)
print(result.equals(result2))
# --8<-- [end:element-wise-regex]

# --8<-- [start:children]
# A single column whose rows are lists of {"name", "age"} records; the number
# of records differs from row to row.
df = pl.DataFrame(
    {
        "children": [
            [
                {"name": "Anne", "age": 5},
                {"name": "Averill", "age": 7},
            ],
            [
                {"name": "Brandon", "age": 12},
                {"name": "Brooke", "age": 9},
                {"name": "Branson", "age": 11},
            ],
            [{"name": "Camila", "age": 19}],
            [
                {"name": "Dennis", "age": 8},
                {"name": "Doyle", "age": 11},
                {"name": "Dina", "age": 18},
            ],
        ],
    }
)

print(df)
# --8<-- [end:children]

# --8<-- [start:list-sorting]
# `list.eval` evaluates the inner expression against the elements of each
# row's list. Here it orders the names by descending age and computes the
# minimum and maximum age; all three results are produced through `list.eval`.
result = df.select(
    pl.col("children")
    .list.eval(
        pl.element()
        .sort_by(pl.element().struct.field("age"), descending=True)
        .struct.field("name")
    )
    .alias("names_by_age"),
    pl.col("children")
    .list.eval(pl.element().struct.field("age").min())
    .alias("min_age"),
    pl.col("children")
    .list.eval(pl.element().struct.field("age").max())
    .alias("max_age"),
)
print(result)
# --8<-- [end:list-sorting]

# --8<-- [start:list-aggregation]
# Same query as the `list.eval`-only variant, except that the minimum and
# maximum age are computed with `list.agg`; the name ordering still uses
# `list.eval` because it returns several values per row.
result = df.select(
    pl.col("children")
    .list.eval(
        pl.element()
        .sort_by(pl.element().struct.field("age"), descending=True)
        .struct.field("name")
    )
    .alias("names_by_age"),
    pl.col("children")
    .list.agg(pl.element().struct.field("age").min())
    .alias("min_age"),
    pl.col("children")
    .list.agg(pl.element().struct.field("age").max())
    .alias("max_age"),
)
print(result)
# --8<-- [end:list-aggregation]

# --8<-- [start:list-entropy]
# Add a column holding the entropy of the "age" field, computed separately
# for each row's list through `list.agg`.
result = df.with_columns(
    pl.col("children")
    .list.agg(pl.element().struct.field("age").entropy())
    .alias("age_entropy"),
)
print(result)
# --8<-- [end:list-entropy]

# --8<-- [start:weather_by_day]
# Wide layout: ten stations, with one integer column per day.
weather_by_day = pl.DataFrame(
    {
        "station": [f"Station {idx}" for idx in range(1, 11)],
        "day_1": [17, 11, 8, 22, 9, 21, 20, 8, 8, 17],
        "day_2": [15, 11, 10, 8, 7, 14, 18, 21, 15, 13],
        "day_3": [16, 15, 24, 24, 8, 23, 19, 23, 16, 10],
    }
)
print(weather_by_day)
# --8<-- [end:weather_by_day]

# --8<-- [start:rank_pct]
# Reusable element-level expression: rank each value in descending order,
# divide by the element count, and round to two decimals.
rank_pct = (pl.element().rank(descending=True) / pl.element().count()).round(2)

result = weather_by_day.with_columns(
    # create the list of homogeneous data
    pl.concat_list(pl.all().exclude("station")).alias("all_temps")
).select(
    # select all columns except the intermediate list
    pl.all().exclude("all_temps"),
    # compute the rank by calling `list.eval`
    pl.col("all_temps").list.eval(rank_pct, parallel=True).alias("temps_rank"),
)

print(result)
# --8<-- [end:rank_pct]

# --8<-- [start:array-overview]
# Both columns are declared as fixed-size arrays (two strings and three
# 32-bit integers per row), so they are queried through the `arr` namespace.
df = pl.DataFrame(
    {
        "first_last": [
            ["Anne", "Adams"],
            ["Brandon", "Branson"],
            ["Camila", "Campbell"],
            ["Dennis", "Doyle"],
        ],
        "fav_numbers": [
            [42, 0, 1],
            [2, 3, 5],
            [13, 21, 34],
            [73, 3, 7],
        ],
    },
    schema={
        "first_last": pl.Array(pl.String, 2),
        "fav_numbers": pl.Array(pl.Int32, 3),
    },
)

result = df.select(
    pl.col("first_last").arr.join(" ").alias("name"),
    pl.col("fav_numbers").arr.sort(),
    pl.col("fav_numbers").arr.max().alias("largest_fav"),
    pl.col("fav_numbers").arr.sum().alias("summed"),
    pl.col("fav_numbers").arr.contains(3).alias("likes_3"),
)
print(result)
# --8<-- [end:array-overview]

259
Product

Resources

Company