Book a Demo!
CoCalc Logo Icon
Store | Features | Docs | Share | Support | News | About | Policies | Sign Up | Sign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/docs/source/src/python/user-guide/expressions/lists.py
7890 views
1
# --8<-- [start:list-example]
from datetime import datetime

import polars as pl

# Every column stores one list per row; the lists may differ in length from
# row to row and may also be empty.
family_data = {
    "names": [
        ["Anne", "Averill", "Adams"],
        ["Brandon", "Brooke", "Borden", "Branson"],
        ["Camila", "Campbell"],
        ["Dennis", "Doyle"],
    ],
    "children_ages": [
        [5, 7],
        [],
        [],
        [8, 11, 18],
    ],
    "medical_appointments": [
        [],
        [],
        [],
        [datetime(2022, 5, 22, 16, 30)],
    ],
}
df = pl.DataFrame(family_data)

print(df)
# --8<-- [end:list-example]

# --8<-- [start:array-example]
# The explicit schema requests the fixed-shape `Array` data type for both
# columns: five booleans per row, and a 3x3 grid of strings per row.
array_schema = {
    "bit_flags": pl.Array(pl.Boolean, 5),
    "tic_tac_toe": pl.Array(pl.String, (3, 3)),
}
array_data = {
    "bit_flags": [
        [True, True, True, True, False],
        [False, True, True, True, True],
    ],
    "tic_tac_toe": [
        [
            [" ", "x", "o"],
            [" ", "x", " "],
            ["o", "x", " "],
        ],
        [
            ["o", "x", "x"],
            [" ", "o", "x"],
            [" ", " ", "o"],
        ],
    ],
}
df = pl.DataFrame(array_data, schema=array_schema)

print(df)
# --8<-- [end:array-example]

# --8<-- [start:numpy-array-inference]
import numpy as np

# 120 consecutive integers laid out as a 4D array of shape (5, 2, 3, 4).
array = np.arange(120).reshape(5, 2, 3, 4)

# Building a Series from it yields a column holding the 3D subarrays.
series = pl.Series(array)
print(series.dtype)
# --8<-- [end:numpy-array-inference]

# --8<-- [start:weather]
# Raw input: one space-separated string of readings per station. Some tokens
# (such as "E1") are not plain integers.
stations = [f"Station {idx}" for idx in range(1, 6)]
raw_temperatures = [
    "20 5 5 E1 7 13 19 9 6 20",
    "18 8 16 11 23 E2 8 E2 E2 E2 90 70 40",
    "19 24 E9 16 6 12 10 22",
    "E2 E0 15 7 8 10 E1 24 17 13 6",
    "14 8 E0 16 22 24 E1",
]
weather = pl.DataFrame({"station": stations, "temperatures": raw_temperatures})

print(weather)
# --8<-- [end:weather]

# --8<-- [start:split]
# Replace each space-separated string with the list of its string tokens.
temperature_tokens = pl.col("temperatures").str.split(" ")
weather = weather.with_columns(temperature_tokens)
print(weather)
# --8<-- [end:split]

# --8<-- [start:explode]
# Unnest the list column so that each list element lands on a row of its own.
result = weather.explode("temperatures")
print(result)
# --8<-- [end:explode]

# --8<-- [start:list-slicing]
temperature_list = pl.col("temperatures").list

# Keyword names become the output column names.
result = weather.with_columns(
    head=temperature_list.head(3),
    tail=temperature_list.tail(3),
    # Start three elements from the end and keep two of them.
    two_next_to_last=temperature_list.slice(-3, 2),
)
print(result)
# --8<-- [end:list-slicing]

# --8<-- [start:element-wise-casting]
# A non-strict cast yields null for tokens that are not valid integers, so
# `is_null` flags exactly the tokens that failed to convert.
failed_cast = pl.element().cast(pl.Int64, strict=False).is_null()

# Summing the per-element booleans counts the failures in each list.
result = weather.with_columns(
    errors=pl.col("temperatures").list.eval(failed_cast).list.sum(),
)
print(result)
# --8<-- [end:element-wise-casting]

# --8<-- [start:element-wise-regex]
# Alternative error detection: flag every token that contains a letter
# (case-insensitive match).
has_letter = pl.element().str.contains("(?i)[a-z]")

result2 = weather.with_columns(
    errors=pl.col("temperatures").list.eval(has_letter).list.sum(),
)
# Compare against the frame produced by the casting-based approach.
print(result.equals(result2))
# --8<-- [end:element-wise-regex]

# --8<-- [start:children]
# One row per family; each row is a list of {"name", "age"} records.
children_per_family = [
    [
        {"name": "Anne", "age": 5},
        {"name": "Averill", "age": 7},
    ],
    [
        {"name": "Brandon", "age": 12},
        {"name": "Brooke", "age": 9},
        {"name": "Branson", "age": 11},
    ],
    [{"name": "Camila", "age": 19}],
    [
        {"name": "Dennis", "age": 8},
        {"name": "Doyle", "age": 11},
        {"name": "Dina", "age": 18},
    ],
]
df = pl.DataFrame({"children": children_per_family})

print(df)
# --8<-- [end:children]

# --8<-- [start:list-sorting]
# Building blocks evaluated against the elements of each list.
child = pl.element()
child_age = child.struct.field("age")
names_oldest_first = child.sort_by(child_age, descending=True).struct.field("name")

children = pl.col("children")
result = df.select(
    names_by_age=children.list.eval(names_oldest_first),
    min_age=children.list.eval(child_age.min()),
    max_age=children.list.eval(child_age.max()),
)
print(result)
# --8<-- [end:list-sorting]

# --8<-- [start:list-aggregation]
# Building blocks evaluated against the elements of each list.
child = pl.element()
child_age = child.struct.field("age")
names_oldest_first = child.sort_by(child_age, descending=True).struct.field("name")

children = pl.col("children")
result = df.select(
    # Sorting keeps a list per row, so it goes through `list.eval` ...
    names_by_age=children.list.eval(names_oldest_first),
    # ... while the min/max reductions go through `list.agg`.
    min_age=children.list.agg(child_age.min()),
    max_age=children.list.agg(child_age.max()),
)
print(result)
# --8<-- [end:list-aggregation]

# --8<-- [start:list-entropy]
# Entropy of the ages within each family's list of children.
age_entropy = pl.element().struct.field("age").entropy()

result = df.with_columns(
    age_entropy=pl.col("children").list.agg(age_entropy),
)
print(result)
# --8<-- [end:list-entropy]

# --8<-- [start:weather_by_day]
# Ten stations, one temperature column per day.
station_names = [f"Station {idx}" for idx in range(1, 11)]
weather_by_day = pl.DataFrame(
    {
        "station": station_names,
        "day_1": [17, 11, 8, 22, 9, 21, 20, 8, 8, 17],
        "day_2": [15, 11, 10, 8, 7, 14, 18, 21, 15, 13],
        "day_3": [16, 15, 24, 24, 8, 23, 19, 23, 16, 10],
    }
)
print(weather_by_day)
# --8<-- [end:weather_by_day]

# --8<-- [start:rank_pct]
# Descending rank of each element divided by the element count, rounded to
# two decimals.
temperature = pl.element()
rank_pct = (temperature.rank(descending=True) / temperature.count()).round(2)

# Step 1: gather every non-station column into a single list column.
with_all_temps = weather_by_day.with_columns(
    all_temps=pl.concat_list(pl.all().exclude("station")),
)

# Step 2: keep the original columns, drop the helper list, and add the
# per-row ranks computed with `list.eval`.
result = with_all_temps.select(
    pl.all().exclude("all_temps"),
    temps_rank=pl.col("all_temps").list.eval(rank_pct, parallel=True),
)

print(result)
# --8<-- [end:rank_pct]

# --8<-- [start:array-overview]
# Both columns are declared as fixed-size arrays: two strings and three
# 32-bit integers per row.
overview_schema = {
    "first_last": pl.Array(pl.String, 2),
    "fav_numbers": pl.Array(pl.Int32, 3),
}
overview_data = {
    "first_last": [
        ["Anne", "Adams"],
        ["Brandon", "Branson"],
        ["Camila", "Campbell"],
        ["Dennis", "Doyle"],
    ],
    "fav_numbers": [
        [42, 0, 1],
        [2, 3, 5],
        [13, 21, 34],
        [73, 3, 7],
    ],
}
df = pl.DataFrame(overview_data, schema=overview_schema)

# Operations from the `arr` namespace, applied per row.
first_last = pl.col("first_last")
fav_numbers = pl.col("fav_numbers")
result = df.select(
    first_last.arr.join(" ").alias("name"),
    fav_numbers.arr.sort(),
    fav_numbers.arr.max().alias("largest_fav"),
    fav_numbers.arr.sum().alias("summed"),
    fav_numbers.arr.contains(3).alias("likes_3"),
)
print(result)
# --8<-- [end:array-overview]