Book a Demo!
CoCalc Logo Icon
Store | Features | Docs | Share | Support | News | About | Policies | Sign Up | Sign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/py-polars/tests/unit/operations/test_explode.py
8406 views
1
from __future__ import annotations
2
3
import pyarrow as pa
4
import pytest
5
from hypothesis import given
6
7
import polars as pl
8
import polars.selectors as cs
9
from polars.exceptions import ShapeError
10
from polars.testing import assert_frame_equal, assert_series_equal
11
from polars.testing.parametric import series
12
13
14
def test_explode_multiple() -> None:
    """Explode two list columns at once via a selector, a list and varargs."""
    frame = pl.DataFrame({"a": [[1, 2], [3, 4]], "b": [[5, 6], [7, 8]]})
    expected = pl.DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]})

    # The same pair of columns, addressed in three different ways.
    exploded_variants = (
        frame.explode(cs.all()),
        frame.explode(["a", "b"]),
        frame.explode("a", "b"),
    )
    for exploded in exploded_variants:
        assert_frame_equal(exploded, expected)
21
22
23
def test_group_by_flatten_list() -> None:
    """Exploding a list column inside ``agg`` concatenates each group's lists."""
    frame = pl.DataFrame({"group": ["a", "b", "b"], "values": [[1, 2], [2, 3], [4]]})
    flatten_values = pl.col("values").list.explode(
        keep_nulls=False, empty_as_null=False
    )

    result = frame.group_by("group", maintain_order=True).agg(flatten_values)

    expected = pl.DataFrame({"group": ["a", "b"], "values": [[1, 2], [2, 3, 4]]})
    assert_frame_equal(result, expected)
31
32
33
def test_explode_empty_df_3402() -> None:
    """Exploding a zero-row ``large_list<int32>`` column yields an ``Int32`` column."""
    no_rows = pa.array([], type=pa.large_list(pa.int32()))
    exploded = pl.DataFrame({"a": no_rows}).explode("a")
    assert exploded.dtypes == [pl.Int32]
36
37
38
def test_explode_empty_df_3460() -> None:
    """Exploding a single empty ``large_list<int32>`` row yields an ``Int32`` column."""
    one_empty_list = pa.array([[]], type=pa.large_list(pa.int32()))
    exploded = pl.DataFrame({"a": one_empty_list}).explode("a")
    assert exploded.dtypes == [pl.Int32]
41
42
43
def test_explode_empty_df_3902() -> None:
    """Empty lists explode to one null row each; other rows fan out per element."""
    source = pl.DataFrame(
        {
            "first": [1, 2, 3, 4, 5],
            "second": [["a"], [], ["b", "c"], [], ["d", "f", "g"]],
        }
    )
    exploded = source.explode("second")

    # Rows with first == 2 and first == 4 hold empty lists and stay as nulls.
    expected = pl.DataFrame(
        {
            "first": [1, 2, 3, 3, 4, 5, 5, 5],
            "second": ["a", None, "b", "c", None, "d", "f", "g"],
        }
    )
    assert_frame_equal(exploded, expected)
57
58
59
def test_explode_empty_list_4003() -> None:
    """A leading empty list in row-oriented input explodes to a null value."""
    rows = [
        {"id": 1, "nested": []},
        {"id": 2, "nested": [1]},
        {"id": 3, "nested": [2]},
    ]
    exploded = pl.DataFrame(rows).explode("nested")

    as_python = exploded.to_dict(as_series=False)
    assert as_python == {
        "id": [1, 2, 3],
        "nested": [None, 1, 2],
    }
71
72
73
def test_explode_empty_list_4107() -> None:
    """Exploding lists of length <= 1 must leave the row index unchanged."""
    frame = pl.DataFrame({"b": [[1], [2], []] * 2}).with_row_index()

    exploded = frame.explode(["b"])
    # Drop the carried-over index and regenerate it from scratch; since every
    # list has at most one element the two frames have to be identical.
    reindexed = frame.explode(["b"]).drop("index").with_row_index()

    assert_frame_equal(exploded, reindexed)
79
80
81
def test_explode_correct_for_slice() -> None:
    """Exploding a sliced frame only produces the rows inside the slice."""
    pairs = pl.DataFrame({"b": [[1, 1], [2, 2], [3, 3], [4, 4]]})
    tail_values = pairs.slice(2, 2).explode(["b"])["b"].to_list()
    assert tail_values == [3, 3, 4, 4]

    # Cross-join five group ids with five lists, then order by group and
    # attach a row index so the slice below covers groups 0 and 1 only.
    groups = pl.DataFrame({"group": pl.arange(0, 5, eager=True)})
    lists = pl.DataFrame(
        {
            "b": [[1, 2, 3], [2, 3], [4], [1, 2, 3], [0]],
        }
    )
    indexed = (
        groups.join(lists, how="cross")
        .sort("group", maintain_order=True)
        .with_row_index()
    )

    expected = pl.DataFrame(
        {
            "index": [0, 0, 0, 1, 1, 2, 3, 3, 3, 4, 5, 5, 5, 6, 6, 7, 8, 8, 8, 9],
            "group": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
            "b": [1, 2, 3, 2, 3, 4, 1, 2, 3, 0, 1, 2, 3, 2, 3, 4, 1, 2, 3, 0],
        },
        schema_overrides={"index": pl.get_index_type()},
    )
    first_ten_exploded = indexed.slice(0, 10).explode(["b"])
    assert_frame_equal(first_ten_exploded, expected)
108
109
110
def test_sliced_null_explode() -> None:
    """``list.explode`` on sliced series handles empty and null lists per dtype."""
    # Integer lists.
    ints = pl.Series("", [[1], [2], [3], [4], [], [6]])
    assert ints.slice(2, 4).list.explode().to_list() == [3, 4, None, 6]
    assert ints.slice(2, 2).list.explode().to_list() == [3, 4]
    ints_with_null = pl.Series("", [[1], [2], None, [4], [], [6]])
    exploded_ints = ints_with_null.slice(2, 4).list.explode()
    assert exploded_ints.to_list() == [None, 4, None, 6]

    # String lists.
    strings = pl.Series("", [["a"], ["b"], ["c"], ["d"], [], ["e"]])
    assert strings.slice(2, 4).list.explode().to_list() == ["c", "d", None, "e"]
    assert strings.slice(2, 2).list.explode().to_list() == ["c", "d"]
    strings_with_null = pl.Series("", [["a"], ["b"], None, ["d"], [], ["e"]])
    exploded_strings = strings_with_null.slice(2, 4).list.explode()
    assert exploded_strings.to_list() == [None, "d", None, "e"]

    # Boolean lists.
    bools = pl.Series("", [[False], [False], [True], [False], [], [True]])
    assert bools.slice(2, 2).list.explode().to_list() == [True, False]
    assert bools.slice(2, 4).list.explode().to_list() == [True, False, None, True]
128
129
130
@pytest.mark.parametrize("maintain_order", [False, True])
def test_explode_in_agg_context(maintain_order: bool) -> None:
    """Explode a list column inside ``group_by(...).agg`` after a frame-level explode.

    Row order is only compared when ``maintain_order`` is true.
    """
    frame = pl.DataFrame(
        {"idxs": [[0], [1], [0, 2]], "array": [[0.0, 3.5], [4.6, 0.0], [0.0, 7.8, 0.0]]}
    )

    flatten_array = pl.col("array").list.explode(
        keep_nulls=False, empty_as_null=False
    )
    result = (
        frame.with_row_index()
        .explode("idxs")
        .group_by("index", maintain_order=maintain_order)
        .agg(flatten_array)
    )

    # Index 2 had two "idxs" entries, so its "array" list shows up twice.
    expected = pl.DataFrame(
        {
            "index": [0, 1, 2],
            "array": [[0.0, 3.5], [4.6, 0.0], [0.0, 7.8, 0.0, 0.0, 7.8, 0.0]],
        },
        schema_overrides={"index": pl.get_index_type()},
    )
    assert_frame_equal(result, expected, check_row_order=maintain_order)
150
151
152
def test_explode_inner_lists_3985() -> None:
    """Flatten a list-of-lists produced by ``agg`` using ``list.eval`` + explode."""
    lazy_frame = pl.DataFrame(
        data={"id": [1, 1, 1], "categories": [["a"], ["b"], ["a", "c"]]}
    ).lazy()

    flatten_inner = pl.col("categories").list.eval(pl.element().list.explode())
    collected = (
        lazy_frame.group_by("id")
        .agg(pl.col("categories"))
        .with_columns(flatten_inner)
        .collect()
    )

    assert collected.to_dict(as_series=False) == {
        "id": [1],
        "categories": [["a", "b", "a", "c"]],
    }
165
166
167
def test_list_struct_explode_6905() -> None:
    """Explode a list-of-structs column whose first row is an empty list."""
    group_dtype = pl.List(pl.Struct([pl.Field("params", pl.List(pl.Int32))]))
    frame = pl.DataFrame(
        {
            "group": [
                [],
                [
                    {"params": [1]},
                    {"params": []},
                ],
            ]
        },
        schema={"group": group_dtype},
    )

    exploded = frame["group"].list.explode().to_list()

    # The empty outer list becomes a null; the inner empty list is kept as-is.
    assert exploded == [
        None,
        {"params": [1]},
        {"params": []},
    ]
184
185
186
def test_explode_binary() -> None:
    """Integer lists cast to ``List(Binary)`` explode into byte strings."""
    binary_lists = pl.Series([[1, 2], [3]]).cast(pl.List(pl.Binary))
    exploded = binary_lists.list.explode()
    assert exploded.to_list() == [
        b"1",
        b"2",
        b"3",
    ]
194
195
196
def test_explode_null_list() -> None:
    """``list.min`` over a slice holding only the null list returns one null."""
    lists = pl.Series([["a"], None], dtype=pl.List(pl.String))
    null_only = lists[1:2]
    assert null_only.list.min().to_list() == [None]
200
201
202
def test_explode_invalid_element_count() -> None:
    """Exploding columns with mismatched list lengths raises ``ShapeError``."""
    mismatched = pl.DataFrame(
        {
            "col1": [["X", "Y", "Z"], ["F", "G"], ["P"]],
            "col2": [["A", "B", "C"], ["C"], ["D", "E"]],
        }
    ).with_row_index()

    expected_message = r"exploded columns must have matching element counts"
    with pytest.raises(ShapeError, match=expected_message):
        mismatched.explode(["col1", "col2"])
213
214
215
def test_logical_explode() -> None:
    """A categorical wrapped in a struct keeps its dtype through explode/unnest."""
    categorical_frame = pl.DataFrame(
        {"cats": ["Value1", "Value2", "Value1"]},
        schema_overrides={"cats": pl.Categorical},
    )

    # Group everything under the literal key 1, collecting "cats" as structs.
    grouped = categorical_frame.group_by(1).agg(pl.struct("cats"))
    out = grouped.explode("cats").unnest("cats")

    cats = out["cats"]
    assert cats.dtype == pl.Categorical
    assert cats.to_list() == ["Value1", "Value2", "Value1"]
228
229
230
def test_explode_inner_null() -> None:
    """Empty ``List(Null)`` rows explode into a ``Null`` column of nulls."""
    empty_null_lists = pl.DataFrame({"A": [[], []]}, schema={"A": pl.List(pl.Null)})
    exploded = empty_null_lists.explode("A")

    expected = pl.DataFrame({"A": [None, None]}, schema={"A": pl.Null})
    assert_frame_equal(exploded, expected)
234
235
236
def test_explode_array() -> None:
    """Explode a fixed-size ``Array`` column of a LazyFrame by name and by selector."""
    lazy_frame = pl.LazyFrame(
        {"a": [[1, 2], [2, 3]], "b": [1, 2]},
        schema_overrides={"a": pl.Array(pl.Int64, 2)},
    )
    expected = pl.DataFrame({"a": [1, 2, 2, 3], "b": [1, 1, 2, 2]})

    # Target the column explicitly, then via the complement of the integer selector.
    targets = ("a", ~cs.integer())
    for target in targets:
        collected = lazy_frame.explode(target).collect()
        assert_frame_equal(collected, expected)
245
246
247
def test_string_list_agg_explode() -> None:
    """A list emptied by ``list.eval`` filtering explodes like a literal empty list."""
    drop_nulls = pl.col("a").list.eval(
        pl.element().filter(pl.element().is_not_null())
    )
    filtered = pl.DataFrame({"a": [[None], ["b"]]}).select(drop_nulls)
    # The filtered column must not advertise the FAST_EXPLODE flag.
    assert not filtered["a"].flags["FAST_EXPLODE"]

    reference = pl.DataFrame({"a": [[], ["b"]]})

    assert_frame_equal(filtered, reference)
    assert_frame_equal(filtered.explode("a"), reference.explode("a"))
259
260
261
def test_explode_null_struct() -> None:
    """Explode a list-of-structs column whose first row is null."""
    records = [
        {"col1": None},
        {
            "col1": [
                {"field1": None, "field2": None, "field3": None},
                {"field1": None, "field2": "some", "field3": "value"},
            ]
        },
    ]

    exploded = pl.DataFrame(records).explode("col1")

    # The null row survives as a null, followed by the two struct elements.
    assert exploded.to_dict(as_series=False) == {
        "col1": [
            None,
            {"field1": None, "field2": None, "field3": None},
            {"field1": None, "field2": "some", "field3": "value"},
        ]
    }
279
280
281
def test_df_explode_with_array() -> None:
    """Explode frames mixing an ``Array`` and a ``List`` column, singly and jointly."""
    mixed = pl.DataFrame(
        {
            "arr": [["a", "b"], ["c", None], None, ["d", "e"]],
            "list": [[1, 2], [3], [4, None], None],
            "val": ["x", "y", "z", "q"],
        },
        schema={
            "arr": pl.Array(pl.String, 2),
            "list": pl.List(pl.Int64),
            "val": pl.String,
        },
    )

    # Case 1: explode only the array column; "list" and "val" are repeated.
    by_arr = mixed.explode("arr")
    expected_by_arr = pl.DataFrame(
        {
            "arr": ["a", "b", "c", None, None, "d", "e"],
            "list": [[1, 2], [1, 2], [3], [3], [4, None], None, None],
            "val": ["x", "x", "y", "y", "z", "q", "q"],
        }
    )
    assert_frame_equal(by_arr, expected_by_arr)

    # Case 2: explode only the list column; "arr" keeps its Array dtype.
    by_list = mixed.explode("list")
    expected_by_list = pl.DataFrame(
        {
            "arr": [["a", "b"], ["a", "b"], ["c", None], None, None, ["d", "e"]],
            "list": [1, 2, 3, 4, None, None],
            "val": ["x", "x", "y", "z", "z", "q"],
        },
        schema={
            "arr": pl.Array(pl.String, 2),
            "list": pl.Int64,
            "val": pl.String,
        },
    )
    assert_frame_equal(by_list, expected_by_list)

    # Case 3: explode both columns together on a frame with matching lengths.
    aligned = pl.DataFrame(
        {
            "arr": [["a", "b"], ["c", None], None, ["d", "e"]],
            "list": [[1, 2], [3, 4], None, [5, None]],
            "val": [None, 1, 2, None],
        },
        schema={
            "arr": pl.Array(pl.String, 2),
            "list": pl.List(pl.Int64),
            "val": pl.Int64,
        },
    )
    by_arr_and_list = aligned.explode("arr", "list")
    expected_by_arr_and_list = pl.DataFrame(
        {
            "arr": ["a", "b", "c", None, None, "d", "e"],
            "list": [1, 2, 3, 4, None, 5, None],
            "val": [None, None, 1, 1, 2, None, None],
        },
        schema={
            "arr": pl.String,
            "list": pl.Int64,
            "val": pl.Int64,
        },
    )
    assert_frame_equal(by_arr_and_list, expected_by_arr_and_list)
343
344
345
def test_explode_nullable_list() -> None:
    """Null lists explode consistently via ``DataFrame.explode`` and ``Expr.explode``."""
    base = pl.DataFrame({"layout1": [None, [1, 2]], "b": [False, True]})
    # "layout2" is [1, 2] where "b" is true and null otherwise.
    frame = base.with_columns(
        layout2=pl.when(pl.col("b")).then([1, 2]),
    )

    # Frame-level explode of both list columns.
    via_frame = frame.explode("layout1", "layout2")
    expected_frame = pl.DataFrame(
        {
            "layout1": [None, 1, 2],
            "b": [False, True, True],
            "layout2": [None, 1, 2],
        }
    )
    assert_frame_equal(via_frame, expected_frame)

    # Expression-level explode of the same two columns.
    via_expr = frame.select(
        pl.col("layout1").explode(),
        pl.col("layout2").explode(),
    )
    expected_expr = pl.DataFrame(
        {
            "layout1": [None, 1, 2],
            "layout2": [None, 1, 2],
        }
    )
    assert_frame_equal(via_expr, expected_expr)
371
372
373
def test_group_by_flatten_string() -> None:
    """Splitting strings into characters and exploding inside ``agg`` merges per group."""
    frame = pl.DataFrame({"group": ["a", "b", "b"], "values": ["foo", "bar", "baz"]})

    characters = pl.col("values").str.split("").explode()
    result = frame.group_by("group", maintain_order=True).agg(characters)

    expected = pl.DataFrame(
        {
            "group": ["a", "b"],
            "values": [["f", "o", "o"], ["b", "a", "r", "b", "a", "z"]],
        }
    )
    assert_frame_equal(result, expected)
387
388
389
def test_fast_explode_merge_right_16923() -> None:
    """Explode a concat whose right-hand frame contributes a null list.

    The height check alone would pass if the null row were kept but the
    values came out wrong or misplaced, so the exploded values are asserted
    as well.
    """
    df = pl.concat(
        [
            pl.DataFrame({"foo": [["a", "b"], ["c"]]}),
            # ``pl.String`` is used for consistency with the rest of this module.
            pl.DataFrame({"foo": [None]}, schema={"foo": pl.List(pl.String)}),
        ],
        how="diagonal",
        rechunk=True,
    ).explode("foo")

    assert df.height == 4
    # The trailing null list must survive as a trailing null value.
    assert df["foo"].to_list() == ["a", "b", "c", None]
400
401
402
def test_fast_explode_merge_left_16923() -> None:
    """Explode a concat whose left-hand frame contributes a null list.

    The height check alone would pass if the null row were kept but the
    values came out wrong or misplaced, so the exploded values are asserted
    as well.
    """
    df = pl.concat(
        [
            # ``pl.String`` is used for consistency with the rest of this module.
            pl.DataFrame({"foo": [None]}, schema={"foo": pl.List(pl.String)}),
            pl.DataFrame({"foo": [["a", "b"], ["c"]]}),
        ],
        how="diagonal",
        rechunk=True,
    ).explode("foo")

    assert df.height == 4
    # The leading null list must survive as a leading null value.
    assert df["foo"].to_list() == [None, "a", "b", "c"]
413
414
415
@pytest.mark.parametrize(
    ("values", "exploded"),
    [
        (["foobar", None], ["f", "o", "o", "b", "a", "r", None]),
        ([None, "foo", "bar"], [None, "f", "o", "o", "b", "a", "r"]),
        (
            [None, "foo", "bar", None, "ham"],
            [None, "f", "o", "o", "b", "a", "r", None, "h", "a", "m"],
        ),
        (["foo", "bar", "ham"], ["f", "o", "o", "b", "a", "r", "h", "a", "m"]),
        (["", None, "foo", "bar"], ["", None, "f", "o", "o", "b", "a", "r"]),
        (["", "foo", "bar"], ["", "f", "o", "o", "b", "a", "r"]),
    ],
)
def test_series_str_explode_deprecated(
    values: list[str | None], exploded: list[str | None]
) -> None:
    """``Series.str.explode`` warns as deprecated and still splits into characters."""
    # Both the construction and the call stay inside the warning check.
    with pytest.deprecated_call():
        characters = pl.Series(values).str.explode()

    observed = characters.to_list()
    assert observed == exploded
435
436
437
def test_expr_str_explode_deprecated() -> None:
    """``Expr.str.explode`` warns as deprecated and still splits into characters."""
    words = pl.Series("a", ["Hello", "World"])

    with pytest.deprecated_call():
        characters = words.to_frame().select(pl.col("a").str.explode()).to_series()

    expected = pl.Series("a", ["H", "e", "l", "l", "o", "W", "o", "r", "l", "d"])
    assert_series_equal(characters, expected)
444
445
446
def test_undefined_col_15852() -> None:
    """Exploding a missing column in a lazy query raises ``ColumnNotFoundError``."""
    lazy_frame = pl.LazyFrame({"foo": [1]})

    # "bar" does not exist; collecting the explode + self-join must fail.
    with pytest.raises(pl.exceptions.ColumnNotFoundError):
        lazy_frame.explode("bar").join(lazy_frame, on="foo").collect()
451
452
453
def test_explode_17648() -> None:
    """Explode a sliced list column together with a per-row ``int_ranges`` column."""
    frame = pl.DataFrame({"a": [[1, 3], [2, 6, 7], [3, 9, 2], [4], [5, 1, 2, 3, 4]]})

    # One range per row, as long as that row's list.
    positions = pl.int_ranges(pl.col("a").list.len()).alias("count")
    exploded = frame.slice(1, 2).with_columns(positions).explode("a", "count")

    assert exploded.to_dict(as_series=False) == {
        "a": [2, 6, 7, 3, 9, 2],
        "count": [0, 1, 2, 0, 1, 2],
    }
460
461
462
def test_explode_struct_nulls() -> None:
    """A null struct element and an empty list both explode to a null."""
    frame = pl.DataFrame({"A": [[{"B": 1}], [None], []]})
    exploded = frame.explode("A").to_dict(as_series=False)
    assert exploded == {"A": [{"B": 1}, None, None]}
465
466
467
def test_explode_basic() -> None:
    """Check ``Series.explode`` defaults plus ``keep_nulls`` / ``empty_as_null``."""
    make = pl.Series

    def null_out_middle(lists: pl.Series) -> pl.Series:
        # Replace the second of three rows with a null via when/then.
        return (
            lists.to_frame()
            .select(pl.when(pl.Series([True, False, True])).then(pl.col("")))
            .to_series()
        )

    # Default behaviour: nulls and empty lists each become one null.
    assert_series_equal(make([[1, 2, 3]]).explode(), pl.Series([1, 2, 3]))
    assert_series_equal(
        make([[1, 2, 3], None]).explode(), pl.Series([1, 2, 3, None])
    )
    assert_series_equal(make([[1, 2, 3], []]).explode(), pl.Series([1, 2, 3, None]))

    # A masked-out row explodes to a null whether it held values or was empty.
    masked_full = null_out_middle(make([[1, 2, 3], [1, 2], [1, 2]]))
    assert_series_equal(masked_full.explode(), pl.Series([1, 2, 3, None, 1, 2]))
    masked_empty = null_out_middle(make([[1, 2, 3], [], [1, 2]]))
    assert_series_equal(masked_empty.explode(), pl.Series([1, 2, 3, None, 1, 2]))

    # Both flags off on input without nulls or empties.
    assert_series_equal(
        make([[1, 2, 3]]).explode(empty_as_null=False, keep_nulls=False),
        pl.Series([1, 2, 3]),
    )

    # keep_nulls only affects null lists, not null elements.
    assert_series_equal(
        make([[1, 2, 3], None]).explode(), pl.Series([1, 2, 3, None])
    )
    assert_series_equal(
        make([[1, 2, 3], None]).explode(keep_nulls=False), pl.Series([1, 2, 3])
    )
    assert_series_equal(
        make([[1, 2, 3], [None]]).explode(keep_nulls=False),
        pl.Series([1, 2, 3, None]),
    )

    # empty_as_null only affects empty lists, not null elements.
    assert_series_equal(make([[1, 2, 3], []]).explode(), pl.Series([1, 2, 3, None]))
    assert_series_equal(
        make([[1, 2, 3], []]).explode(empty_as_null=False), pl.Series([1, 2, 3])
    )
    assert_series_equal(
        make([[1, 2, 3], [None]]).explode(empty_as_null=False),
        pl.Series([1, 2, 3, None]),
    )
508
509
510
@given(s=series(min_size=1))
@pytest.mark.parametrize("empty_as_null", [False, True])
@pytest.mark.parametrize("keep_nulls", [False, True])
def test_explode_parametric(
    s: pl.Series, empty_as_null: bool, keep_nulls: bool
) -> None:
    """Property test: exploding imploded series reproduces the original values.

    ``s`` is generated by hypothesis; ``empty_as_null`` and ``keep_nulls`` are
    forwarded unchanged to every ``explode`` call.
    """
    explode_kwargs = {"empty_as_null": empty_as_null, "keep_nulls": keep_nulls}
    imploded = s.implode()
    empty_list = s.clear().implode()
    null_list = imploded.clear(1)

    # Expected output for one empty list / one null list: a single-element
    # cleared series when the matching flag is on, a zero-length one otherwise.
    empty_list_item = s.clear(1) if empty_as_null else s.clear()
    null_list_item = s.clear(1) if keep_nulls else s.clear()

    # Single-row inputs.
    assert_series_equal(imploded.explode(**explode_kwargs), s)
    assert_series_equal(empty_list.explode(**explode_kwargs), empty_list_item)
    assert_series_equal(null_list.explode(**explode_kwargs), null_list_item)

    # The special row sandwiched between two regular rows.
    assert_series_equal(
        pl.concat([imploded, empty_list, imploded]).explode(**explode_kwargs),
        pl.concat([s, empty_list_item, s]),
    )
    assert_series_equal(
        pl.concat([imploded, null_list, imploded]).explode(**explode_kwargs),
        pl.concat([s, null_list_item, s]),
    )

    # Rows nulled out through when/then, in several positions.
    tripled = pl.concat([imploded, imploded, imploded]).to_frame()
    masks = [
        (False, False, False),
        (True, False, True),
        (False, False, True),
        (True, False, False),
        (False, True, False),
    ]
    for mask in masks:
        keep_row = pl.when(pl.Series(mask)).then(pl.col(s.name)).alias(s.name)
        masked = tripled.select(keep_row).to_series()
        expected_parts = [s if kept else null_list_item for kept in mask]
        assert_series_equal(
            masked.explode(**explode_kwargs), pl.concat(expected_parts)
        )

    # Repeated rows, with the special row prepended or appended.
    for size in [2, 3, 7, 15]:
        lists = [imploded] * size
        values = [s] * size

        assert_series_equal(
            pl.concat(lists).explode(**explode_kwargs), pl.concat(values)
        )

        assert_series_equal(
            pl.concat([empty_list, *lists]).explode(**explode_kwargs),
            pl.concat([empty_list_item, *values]),
        )
        assert_series_equal(
            pl.concat([*lists, empty_list]).explode(**explode_kwargs),
            pl.concat([*values, empty_list_item]),
        )

        assert_series_equal(
            pl.concat([null_list, *lists]).explode(**explode_kwargs),
            pl.concat([null_list_item, *values]),
        )
        assert_series_equal(
            pl.concat([*lists, null_list]).explode(**explode_kwargs),
            pl.concat([*values, null_list_item]),
        )
571
572
573
def test_explode_array_parameters() -> None:
    """``keep_nulls`` / ``empty_as_null`` behaviour of explode on ``Array`` series."""
    # Fully populated width-3 arrays.
    full = pl.Series("a", [[1, 2, 3], [4, 5, 6], [7, 8, 9]], pl.Array(pl.Int64, 3))
    assert_series_equal(full.explode(), pl.Series("a", list(range(1, 10)), pl.Int64))

    # Width-3 arrays with a trailing null row.
    with_null = pl.Series("a", [[1, 2, 3], [4, 5, 6], None], pl.Array(pl.Int64, 3))
    one_to_six = list(range(1, 7))
    assert_series_equal(
        with_null.explode(), pl.Series("a", one_to_six + [None], pl.Int64)
    )
    assert_series_equal(
        with_null.explode(keep_nulls=False), pl.Series("a", one_to_six, pl.Int64)
    )

    # Width-0 arrays: two empty rows and one null row.
    zero_width = pl.Series("a", [[], [], None], pl.Array(pl.Int64, 0))
    assert_series_equal(zero_width.explode(), pl.Series("a", [None] * 3, pl.Int64))
    assert_series_equal(
        zero_width.explode(keep_nulls=False), pl.Series("a", [None] * 2, pl.Int64)
    )
    assert_series_equal(
        zero_width.explode(empty_as_null=False), pl.Series("a", [None], pl.Int64)
    )
    assert_series_equal(
        zero_width.explode(empty_as_null=False, keep_nulls=False),
        pl.Series("a", [], pl.Int64),
    )
596
597
598
def test_explode_params() -> None:
    """``DataFrame.explode`` with every ``empty_as_null`` / ``keep_nulls`` combination."""
    frame = pl.DataFrame({"a": [[1, 2, 3], None, [4, 5, 6], []], "b": [1, 2, 3, 4]})

    # (keyword arguments, expected columns): row b == 2 is the null list and
    # row b == 4 is the empty list.
    cases = [
        (
            {},
            {"a": [1, 2, 3, None, 4, 5, 6, None], "b": [1, 1, 1, 2, 3, 3, 3, 4]},
        ),
        (
            {"empty_as_null": False},
            {"a": [1, 2, 3, None, 4, 5, 6], "b": [1, 1, 1, 2, 3, 3, 3]},
        ),
        (
            {"keep_nulls": False},
            {"a": [1, 2, 3, 4, 5, 6, None], "b": [1, 1, 1, 3, 3, 3, 4]},
        ),
        (
            {"empty_as_null": False, "keep_nulls": False},
            {"a": [1, 2, 3, 4, 5, 6], "b": [1, 1, 1, 3, 3, 3]},
        ),
    ]
    for explode_kwargs, expected_columns in cases:
        assert_frame_equal(
            frame.explode("a", **explode_kwargs),
            pl.DataFrame(expected_columns),
        )
619
620