Book a Demo!
CoCalc Logo Icon
Store | Features | Docs | Share | Support | News | About | Policies | Sign Up | Sign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/py-polars/tests/unit/operations/test_explode.py
6939 views
1
from __future__ import annotations
2
3
import pyarrow as pa
4
import pytest
5
6
import polars as pl
7
import polars.selectors as cs
8
from polars.exceptions import ShapeError
9
from polars.testing import assert_frame_equal, assert_series_equal
10
11
12
def test_explode_multiple() -> None:
    """Explode two list columns together using selector, list and varargs forms."""
    frame = pl.DataFrame({"a": [[1, 2], [3, 4]], "b": [[5, 6], [7, 8]]})
    expected = pl.DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]})

    # Every way of naming the columns must produce the same exploded frame.
    by_selector = frame.explode(cs.all())
    by_list = frame.explode(["a", "b"])
    by_varargs = frame.explode("a", "b")
    for exploded in (by_selector, by_list, by_varargs):
        assert_frame_equal(exploded, expected)
19
20
21
def test_group_by_flatten_list() -> None:
    """Flattening a list column in a group_by aggregation concatenates per group."""
    frame = pl.DataFrame(
        {"group": ["a", "b", "b"], "values": [[1, 2], [2, 3], [4]]}
    )
    grouped = frame.group_by("group", maintain_order=True)
    flattened = grouped.agg(pl.col("values").flatten())

    assert_frame_equal(
        flattened,
        pl.DataFrame({"group": ["a", "b"], "values": [[1, 2], [2, 3, 4]]}),
    )
27
28
29
def test_explode_empty_df_3402() -> None:
    """Exploding a zero-row large-list column yields the inner Int32 dtype."""
    no_rows = pa.array([], type=pa.large_list(pa.int32()))
    exploded = pl.DataFrame({"a": no_rows}).explode("a")
    assert exploded.dtypes == [pl.Int32]
32
33
34
def test_explode_empty_df_3460() -> None:
    """Exploding a single empty large-list row yields the inner Int32 dtype."""
    one_empty_list = pa.array([[]], type=pa.large_list(pa.int32()))
    exploded = pl.DataFrame({"a": one_empty_list}).explode("a")
    assert exploded.dtypes == [pl.Int32]
37
38
39
def test_explode_empty_df_3902() -> None:
    """Empty lists interleaved with non-empty ones explode to a single null row."""
    frame = pl.DataFrame(
        {
            "first": [1, 2, 3, 4, 5],
            "second": [["a"], [], ["b", "c"], [], ["d", "f", "g"]],
        }
    )
    exploded = frame.explode("second")

    # Rows 2 and 4 hold empty lists and therefore contribute one None each.
    assert_frame_equal(
        exploded,
        pl.DataFrame(
            {
                "first": [1, 2, 3, 3, 4, 5, 5, 5],
                "second": ["a", None, "b", "c", None, "d", "f", "g"],
            }
        ),
    )
53
54
55
def test_explode_empty_list_4003() -> None:
    """A leading empty list in row-oriented input explodes to None."""
    rows = [
        {"id": 1, "nested": []},
        {"id": 2, "nested": [1]},
        {"id": 3, "nested": [2]},
    ]
    exploded = pl.DataFrame(rows).explode("nested")

    as_dict = exploded.to_dict(as_series=False)
    assert as_dict == {
        "id": [1, 2, 3],
        "nested": [None, 1, 2],
    }
67
68
69
def test_explode_empty_list_4107() -> None:
    """The row index carried through explode equals a freshly generated one."""
    frame = pl.DataFrame({"b": [[1], [2], []] * 2}).with_row_index()

    exploded = frame.explode(["b"])
    # Rebuild the index from scratch on a second exploded copy for comparison.
    reindexed = frame.explode(["b"]).drop("index").with_row_index()

    assert_frame_equal(exploded, reindexed)
75
76
77
def test_explode_correct_for_slice() -> None:
    """Exploding a sliced frame only emits values belonging to the sliced rows.

    Two scenarios are checked: a plain slice of a small list column, and a
    slice of a cross-joined, sorted frame that carries a row index.
    """
    df = pl.DataFrame({"b": [[1, 1], [2, 2], [3, 3], [4, 4]]})
    assert df.slice(2, 2).explode(["b"])["b"].to_list() == [3, 3, 4, 4]

    groups = pl.DataFrame({"group": pl.arange(0, 5, eager=True)})
    lists = pl.DataFrame(
        {
            "b": [[1, 2, 3], [2, 3], [4], [1, 2, 3], [0]],
        }
    )
    df = (
        groups.join(lists, how="cross")
        .sort("group", maintain_order=True)
        .with_row_index()
    )
    expected = pl.DataFrame(
        {
            "index": [0, 0, 0, 1, 1, 2, 3, 3, 3, 4, 5, 5, 5, 6, 6, 7, 8, 8, 8, 9],
            "group": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
            "b": [1, 2, 3, 2, 3, 4, 1, 2, 3, 0, 1, 2, 3, 2, 3, 4, 1, 2, 3, 0],
        },
        # Use the runtime index dtype instead of hard-coding UInt32, matching
        # how test_explode_in_agg_context types its with_row_index() column.
        schema_overrides={"index": pl.get_index_type()},
    )
    # The first ten rows cover groups 0 and 1 in full.
    assert_frame_equal(df.slice(0, 10).explode(["b"]), expected)
104
105
106
def test_sliced_null_explode() -> None:
    """Exploding sliced list series handles empty and null lists per dtype.

    Integer, string and boolean list series are sliced at offset 2 and then
    exploded; empty lists and null entries must each come out as one None.
    """
    ints = pl.Series("", [[1], [2], [3], [4], [], [6]])
    ints_with_null = pl.Series("", [[1], [2], None, [4], [], [6]])
    strs = pl.Series("", [["a"], ["b"], ["c"], ["d"], [], ["e"]])
    strs_with_null = pl.Series("", [["a"], ["b"], None, ["d"], [], ["e"]])
    bools = pl.Series("", [[False], [False], [True], [False], [], [True]])

    # (series, slice length starting at offset 2, expected exploded values)
    cases = [
        (ints, 4, [3, 4, None, 6]),
        (ints, 2, [3, 4]),
        (ints_with_null, 4, [None, 4, None, 6]),
        (strs, 4, ["c", "d", None, "e"]),
        (strs, 2, ["c", "d"]),
        (strs_with_null, 4, [None, "d", None, "e"]),
        (bools, 2, [True, False]),
        (bools, 4, [True, False, None, True]),
    ]
    for series, length, expected in cases:
        assert series.slice(2, length).list.explode().to_list() == expected
124
125
126
@pytest.mark.parametrize("maintain_order", [False, True])
def test_explode_in_agg_context(maintain_order: bool) -> None:
    """Flatten a list column per row index after exploding a sibling column.

    Row order is only compared when the group_by preserves it.
    """
    frame = pl.DataFrame(
        {"idxs": [[0], [1], [0, 2]], "array": [[0.0, 3.5], [4.6, 0.0], [0.0, 7.8, 0.0]]}
    )

    exploded = frame.with_row_index().explode("idxs")
    result = exploded.group_by("index", maintain_order=maintain_order).agg(
        pl.col("array").flatten()
    )

    # Index 2 had two "idxs" entries, so its array appears twice once flattened.
    expected = pl.DataFrame(
        {
            "index": [0, 1, 2],
            "array": [[0.0, 3.5], [4.6, 0.0], [0.0, 7.8, 0.0, 0.0, 7.8, 0.0]],
        },
        schema_overrides={"index": pl.get_index_type()},
    )
    assert_frame_equal(result, expected, check_row_order=maintain_order)
146
147
148
def test_explode_inner_lists_3985() -> None:
    """Explode the inner lists of an aggregated list-of-lists via list.eval."""
    lazy = pl.DataFrame(
        data={"id": [1, 1, 1], "categories": [["a"], ["b"], ["a", "c"]]}
    ).lazy()

    aggregated = lazy.group_by("id").agg(pl.col("categories"))
    flattened = aggregated.with_columns(
        pl.col("categories").list.eval(pl.element().list.explode())
    )

    assert flattened.collect().to_dict(as_series=False) == {
        "id": [1],
        "categories": [["a", "b", "a", "c"]],
    }
161
162
163
def test_list_struct_explode_6905() -> None:
    """Explode a list-of-struct column whose first row is an empty list."""
    struct_dtype = pl.Struct([pl.Field("params", pl.List(pl.Int32))])
    frame = pl.DataFrame(
        {
            "group": [
                [],
                [
                    {"params": [1]},
                    {"params": []},
                ],
            ]
        },
        schema={"group": pl.List(struct_dtype)},
    )

    exploded = frame["group"].list.explode().to_list()
    # The empty outer list becomes None; inner empty lists are preserved.
    assert exploded == [
        None,
        {"params": [1]},
        {"params": []},
    ]
180
181
182
def test_explode_binary() -> None:
    """Explode a list column after casting its elements to Binary."""
    as_binary = pl.Series([[1, 2], [3]]).cast(pl.List(pl.Binary))
    exploded = as_binary.list.explode().to_list()
    assert exploded == [
        b"1",
        b"2",
        b"3",
    ]
190
191
192
def test_explode_null_list() -> None:
    """Taking list.min() on a slice that holds only a null list returns [None]."""
    series = pl.Series([["a"], None], dtype=pl.List(pl.String))
    only_null = series[1:2]
    assert only_null.list.min().to_list() == [None]
196
197
198
def test_explode_invalid_element_count() -> None:
    """Exploding columns whose per-row list lengths differ raises ShapeError."""
    # Rows 1 and 2 have different list lengths in "col1" and "col2".
    mismatched = pl.DataFrame(
        {
            "col1": [["X", "Y", "Z"], ["F", "G"], ["P"]],
            "col2": [["A", "B", "C"], ["C"], ["D", "E"]],
        }
    ).with_row_index()

    expected_message = r"exploded columns must have matching element counts"
    with pytest.raises(ShapeError, match=expected_message):
        mismatched.explode(["col1", "col2"])
209
210
211
def test_logical_explode() -> None:
    """Categorical dtype and values survive struct agg, explode and unnest."""
    frame = pl.DataFrame(
        {"cats": ["Value1", "Value2", "Value1"]},
        schema_overrides={"cats": pl.Categorical},
    )
    # Group on the literal 1 so that every row lands in one group.
    aggregated = frame.group_by(1).agg(pl.struct("cats"))
    out = aggregated.explode("cats").unnest("cats")

    cats = out["cats"]
    assert cats.dtype == pl.Categorical
    assert cats.to_list() == ["Value1", "Value2", "Value1"]
224
225
226
def test_explode_inner_null() -> None:
    """Exploding empty List(Null) rows yields a Null column of None values."""
    frame = pl.DataFrame({"A": [[], []]}, schema={"A": pl.List(pl.Null)})
    exploded = frame.explode("A")
    assert_frame_equal(
        exploded,
        pl.DataFrame({"A": [None, None]}, schema={"A": pl.Null}),
    )
230
231
232
def test_explode_array() -> None:
    """Explode a fixed-size Array column in a LazyFrame by name and by selector."""
    lazy = pl.LazyFrame(
        {"a": [[1, 2], [2, 3]], "b": [1, 2]},
        schema_overrides={"a": pl.Array(pl.Int64, 2)},
    )
    expected = pl.DataFrame({"a": [1, 2, 2, 3], "b": [1, 1, 2, 2]})

    # Both the column name and the "not integer" selector must give this frame.
    targets = ("a", ~cs.integer())
    for target in targets:
        collected = lazy.explode(target).collect()
        assert_frame_equal(collected, expected)
241
242
243
def test_string_list_agg_explode() -> None:
    """Lists emptied by list.eval filtering explode like literal empty lists.

    After the filter, the FAST_EXPLODE flag on the column must not be set.
    """
    not_null = pl.element().is_not_null()
    filtered = pl.DataFrame({"a": [[None], ["b"]]}).select(
        pl.col("a").list.eval(pl.element().filter(not_null))
    )
    assert not filtered["a"].flags["FAST_EXPLODE"]

    literal = pl.DataFrame({"a": [[], ["b"]]})

    assert_frame_equal(filtered, literal)
    assert_frame_equal(filtered.explode("a"), literal.explode("a"))
255
256
257
def test_explode_null_struct() -> None:
    """Explode a list-of-struct column whose first row is null."""
    all_null = {"field1": None, "field2": None, "field3": None}
    partly_null = {"field1": None, "field2": "some", "field3": "value"}
    rows = [
        {"col1": None},
        {"col1": [all_null, partly_null]},
    ]

    exploded = pl.DataFrame(rows).explode("col1")

    assert exploded.to_dict(as_series=False) == {
        "col1": [
            None,
            {"field1": None, "field2": None, "field3": None},
            {"field1": None, "field2": "some", "field3": "value"},
        ]
    }
275
276
277
def test_df_explode_with_array() -> None:
    """Explode frames mixing Array and List columns, singly and together.

    Covers exploding only the Array column, only the List column, and both
    columns at once; null rows in either column are included.
    """
    arr_dtype = pl.Array(pl.String, 2)
    list_dtype = pl.List(pl.Int64)

    frame = pl.DataFrame(
        {
            "arr": [["a", "b"], ["c", None], None, ["d", "e"]],
            "list": [[1, 2], [3], [4, None], None],
            "val": ["x", "y", "z", "q"],
        },
        schema={"arr": arr_dtype, "list": list_dtype, "val": pl.String},
    )

    # Explode only the Array column; the null array row yields a single None.
    by_arr = frame.explode("arr")
    assert_frame_equal(
        by_arr,
        pl.DataFrame(
            {
                "arr": ["a", "b", "c", None, None, "d", "e"],
                "list": [[1, 2], [1, 2], [3], [3], [4, None], None, None],
                "val": ["x", "x", "y", "y", "z", "q", "q"],
            }
        ),
    )

    # Explode only the List column; the Array column keeps its dtype.
    by_list = frame.explode("list")
    assert_frame_equal(
        by_list,
        pl.DataFrame(
            {
                "arr": [["a", "b"], ["a", "b"], ["c", None], None, None, ["d", "e"]],
                "list": [1, 2, 3, 4, None, None],
                "val": ["x", "x", "y", "z", "z", "q"],
            },
            schema={"arr": arr_dtype, "list": pl.Int64, "val": pl.String},
        ),
    )

    # Explode both columns at once on a frame where row lengths line up.
    frame = pl.DataFrame(
        {
            "arr": [["a", "b"], ["c", None], None, ["d", "e"]],
            "list": [[1, 2], [3, 4], None, [5, None]],
            "val": [None, 1, 2, None],
        },
        schema={"arr": arr_dtype, "list": list_dtype, "val": pl.Int64},
    )
    by_both = frame.explode("arr", "list")
    assert_frame_equal(
        by_both,
        pl.DataFrame(
            {
                "arr": ["a", "b", "c", None, None, "d", "e"],
                "list": [1, 2, 3, 4, None, 5, None],
                "val": [None, None, 1, 1, 2, None, None],
            },
            schema={"arr": pl.String, "list": pl.Int64, "val": pl.Int64},
        ),
    )
339
340
341
def test_explode_nullable_list() -> None:
    """Explode two list columns with a null row, via frame and expression APIs."""
    base = pl.DataFrame({"layout1": [None, [1, 2]], "b": [False, True]})
    # "layout2" is [1, 2] where "b" is true and null otherwise.
    frame = base.with_columns(
        layout2=pl.when(pl.col("b")).then([1, 2]),
    )

    via_frame = frame.explode("layout1", "layout2")
    assert_frame_equal(
        via_frame,
        pl.DataFrame(
            {
                "layout1": [None, 1, 2],
                "b": [False, True, True],
                "layout2": [None, 1, 2],
            }
        ),
    )

    via_expr = frame.select(
        pl.col("layout1").explode(),
        pl.col("layout2").explode(),
    )
    assert_frame_equal(
        via_expr,
        pl.DataFrame(
            {
                "layout1": [None, 1, 2],
                "layout2": [None, 1, 2],
            }
        ),
    )
367
368
369
def test_group_by_flatten_string() -> None:
    """Split strings into characters and explode them inside a group_by agg."""
    frame = pl.DataFrame(
        {"group": ["a", "b", "b"], "values": ["foo", "bar", "baz"]}
    )

    chars = pl.col("values").str.split("").explode()
    result = frame.group_by("group", maintain_order=True).agg(chars)

    assert_frame_equal(
        result,
        pl.DataFrame(
            {
                "group": ["a", "b"],
                "values": [["f", "o", "o"], ["b", "a", "r", "b", "a", "z"]],
            }
        ),
    )
383
384
385
def test_fast_explode_merge_right_16923() -> None:
    """Explode a rechunked concat where the null-list frame comes last.

    The two non-null lists contribute three rows and the null list one more.
    """
    with_values = pl.DataFrame({"foo": [["a", "b"], ["c"]]})
    # pl.String replaces the pl.Utf8 alias, matching the rest of this module.
    only_null = pl.DataFrame({"foo": [None]}, schema={"foo": pl.List(pl.String)})

    df = pl.concat(
        [with_values, only_null],
        how="diagonal",
        rechunk=True,
    ).explode("foo")

    assert df.height == 4
396
397
398
def test_fast_explode_merge_left_16923() -> None:
    """Explode a rechunked concat where the null-list frame comes first.

    The null list contributes one row and the two non-null lists three more.
    """
    # pl.String replaces the pl.Utf8 alias, matching the rest of this module.
    only_null = pl.DataFrame({"foo": [None]}, schema={"foo": pl.List(pl.String)})
    with_values = pl.DataFrame({"foo": [["a", "b"], ["c"]]})

    df = pl.concat(
        [only_null, with_values],
        how="diagonal",
        rechunk=True,
    ).explode("foo")

    assert df.height == 4
409
410
411
@pytest.mark.parametrize(
    ("values", "exploded"),
    [
        (["foobar", None], ["f", "o", "o", "b", "a", "r", None]),
        ([None, "foo", "bar"], [None, "f", "o", "o", "b", "a", "r"]),
        (
            [None, "foo", "bar", None, "ham"],
            [None, "f", "o", "o", "b", "a", "r", None, "h", "a", "m"],
        ),
        (["foo", "bar", "ham"], ["f", "o", "o", "b", "a", "r", "h", "a", "m"]),
        (["", None, "foo", "bar"], ["", None, "f", "o", "o", "b", "a", "r"]),
        (["", "foo", "bar"], ["", "f", "o", "o", "b", "a", "r"]),
    ],
)
def test_series_str_explode_deprecated(
    values: list[str | None], exploded: list[str | None]
) -> None:
    """Series.str.explode emits a deprecation warning and splits into characters.

    Null and empty-string inputs are expected to pass through as single entries.
    """
    with pytest.deprecated_call():
        series = pl.Series(values)
        characters = series.str.explode()
    assert characters.to_list() == exploded
431
432
433
def test_expr_str_explode_deprecated() -> None:
    """Expr.str.explode emits a deprecation warning and splits into characters."""
    words = pl.Series("a", ["Hello", "World"])
    with pytest.deprecated_call():
        selected = words.to_frame().select(pl.col("a").str.explode())
        result = selected.to_series()

    assert_series_equal(
        result,
        pl.Series("a", ["H", "e", "l", "l", "o", "W", "o", "r", "l", "d"]),
    )
440
441
442
def test_undefined_col_15852() -> None:
    """Exploding a missing column in a lazy query raises ColumnNotFoundError."""
    lazy = pl.LazyFrame({"foo": [1]})
    not_found = pl.exceptions.ColumnNotFoundError

    with pytest.raises(not_found):
        lazy.explode("bar").join(lazy, on="foo").collect()
447
448
449
def test_explode_17648() -> None:
    """Explode a sliced list column together with a matching int_ranges column."""
    frame = pl.DataFrame({"a": [[1, 3], [2, 6, 7], [3, 9, 2], [4], [5, 1, 2, 3, 4]]})

    # Rows 1 and 2 each hold three elements, so "count" runs 0..2 twice.
    positions = pl.int_ranges(pl.col("a").list.len()).alias("count")
    exploded = frame.slice(1, 2).with_columns(positions).explode("a", "count")

    assert exploded.to_dict(as_series=False) == {
        "a": [2, 6, 7, 3, 9, 2],
        "count": [0, 1, 2, 0, 1, 2],
    }
456
457
458
def test_explode_struct_nulls() -> None:
    """A null struct element and an empty list both explode to None."""
    frame = pl.DataFrame({"A": [[{"B": 1}], [None], []]})
    exploded = frame.explode("A").to_dict(as_series=False)
    assert exploded == {"A": [{"B": 1}, None, None]}
461
462