Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/py-polars/tests/unit/operations/test_pivot.py
6939 views
1
from __future__ import annotations
2
3
from datetime import date, timedelta
4
from typing import TYPE_CHECKING, Any
5
6
import pytest
7
8
import polars as pl
9
import polars.selectors as cs
10
from polars.exceptions import ComputeError, DuplicateError
11
from polars.testing import assert_frame_equal
12
13
if TYPE_CHECKING:
14
from polars._typing import PivotAgg, PolarsIntegerType, PolarsTemporalType
15
16
17
def test_pivot() -> None:
18
df = pl.DataFrame(
19
{
20
"foo": ["A", "A", "B", "B", "C"],
21
"bar": ["k", "l", "m", "n", "o"],
22
"N": [1, 2, 2, 4, 2],
23
}
24
)
25
result = df.pivot("bar", values="N", aggregate_function=None)
26
27
expected = pl.DataFrame(
28
[
29
("A", 1, 2, None, None, None),
30
("B", None, None, 2, 4, None),
31
("C", None, None, None, None, 2),
32
],
33
schema=["foo", "k", "l", "m", "n", "o"],
34
orient="row",
35
)
36
assert_frame_equal(result, expected)
37
38
39
def test_pivot_no_values() -> None:
40
df = pl.DataFrame(
41
{
42
"foo": ["A", "A", "B", "B", "C"],
43
"bar": ["k", "l", "m", "n", "o"],
44
"N1": [1, 2, 2, 4, 2],
45
"N2": [1, 2, 2, 4, 2],
46
}
47
)
48
result = df.pivot(on="bar", index="foo", aggregate_function=None)
49
expected = pl.DataFrame(
50
{
51
"foo": ["A", "B", "C"],
52
"N1_k": [1, None, None],
53
"N1_l": [2, None, None],
54
"N1_m": [None, 2, None],
55
"N1_n": [None, 4, None],
56
"N1_o": [None, None, 2],
57
"N2_k": [1, None, None],
58
"N2_l": [2, None, None],
59
"N2_m": [None, 2, None],
60
"N2_n": [None, 4, None],
61
"N2_o": [None, None, 2],
62
}
63
)
64
65
assert_frame_equal(result, expected)
66
67
68
def test_pivot_list() -> None:
69
df = pl.DataFrame({"a": [1, 2, 3], "b": [[1, 1], [2, 2], [3, 3]]})
70
71
expected = pl.DataFrame(
72
{
73
"a": [1, 2, 3],
74
"1": [[1, 1], None, None],
75
"2": [None, [2, 2], None],
76
"3": [None, None, [3, 3]],
77
}
78
)
79
out = df.pivot(
80
index="a",
81
on="a",
82
values="b",
83
aggregate_function="first",
84
sort_columns=True,
85
)
86
assert_frame_equal(out, expected)
87
88
89
@pytest.mark.parametrize(
90
("agg_fn", "expected_rows"),
91
[
92
("first", [("a", 2, None, None), ("b", None, None, 10)]),
93
("len", [("a", 2, 0, 0), ("b", 0, 2, 1)]),
94
("min", [("a", 2, None, None), ("b", None, 8, 10)]),
95
("max", [("a", 4, None, None), ("b", None, 8, 10)]),
96
("sum", [("a", 6, 0, 0), ("b", 0, 8, 10)]),
97
("mean", [("a", 3.0, None, None), ("b", None, 8.0, 10.0)]),
98
("median", [("a", 3.0, None, None), ("b", None, 8.0, 10.0)]),
99
],
100
)
101
def test_pivot_aggregate(agg_fn: PivotAgg, expected_rows: list[tuple[Any]]) -> None:
102
df = pl.DataFrame(
103
{
104
"a": [1, 1, 2, 2, 3],
105
"b": ["a", "a", "b", "b", "b"],
106
"c": [2, 4, None, 8, 10],
107
}
108
)
109
result = df.pivot(
110
index="b", on="a", values="c", aggregate_function=agg_fn, sort_columns=True
111
)
112
assert result.rows() == expected_rows
113
114
115
def test_pivot_categorical_3968() -> None:
116
df = pl.DataFrame(
117
{
118
"foo": ["one", "one", "one", "two", "two", "two"],
119
"bar": ["A", "B", "C", "A", "B", "C"],
120
"baz": [1, 2, 3, 4, 5, 6],
121
}
122
)
123
124
result = df.with_columns(pl.col("baz").cast(str).cast(pl.Categorical))
125
126
expected = pl.DataFrame(
127
{
128
"foo": ["one", "one", "one", "two", "two", "two"],
129
"bar": ["A", "B", "C", "A", "B", "C"],
130
"baz": ["1", "2", "3", "4", "5", "6"],
131
},
132
schema_overrides={"baz": pl.Categorical},
133
)
134
assert_frame_equal(result, expected, categorical_as_str=True)
135
136
137
def test_pivot_categorical_index() -> None:
138
df = pl.DataFrame(
139
{"A": ["Fire", "Water", "Water", "Fire"], "B": ["Car", "Car", "Car", "Ship"]},
140
schema=[("A", pl.Categorical), ("B", pl.Categorical)],
141
)
142
143
result = df.pivot(index=["A"], on="B", values="B", aggregate_function="len")
144
expected = {"A": ["Fire", "Water"], "Car": [1, 2], "Ship": [1, 0]}
145
assert result.to_dict(as_series=False) == expected
146
147
# test expression dispatch
148
result = df.pivot(index=["A"], on="B", values="B", aggregate_function=pl.len())
149
assert result.to_dict(as_series=False) == expected
150
151
df = pl.DataFrame(
152
{
153
"A": ["Fire", "Water", "Water", "Fire"],
154
"B": ["Car", "Car", "Car", "Ship"],
155
"C": ["Paper", "Paper", "Paper", "Paper"],
156
},
157
schema=[("A", pl.Categorical), ("B", pl.Categorical), ("C", pl.Categorical)],
158
)
159
result = df.pivot(index=["A", "C"], on="B", values="B", aggregate_function="len")
160
expected = {
161
"A": ["Fire", "Water"],
162
"C": ["Paper", "Paper"],
163
"Car": [1, 2],
164
"Ship": [1, 0],
165
}
166
assert result.to_dict(as_series=False) == expected
167
168
169
def test_pivot_multiple_values_column_names_5116() -> None:
170
df = pl.DataFrame(
171
{
172
"x1": [1, 2, 3, 4, 5, 6, 7, 8],
173
"x2": [8, 7, 6, 5, 4, 3, 2, 1],
174
"c1": ["A", "B"] * 4,
175
"c2": ["C", "C", "D", "D"] * 2,
176
}
177
)
178
179
with pytest.raises(ComputeError, match="found multiple elements in the same group"):
180
df.pivot(
181
index="c1",
182
on="c2",
183
values=["x1", "x2"],
184
separator="|",
185
aggregate_function=None,
186
)
187
188
result = df.pivot(
189
index="c1",
190
on="c2",
191
values=["x1", "x2"],
192
separator="|",
193
aggregate_function="first",
194
)
195
expected = {
196
"c1": ["A", "B"],
197
"x1|C": [1, 2],
198
"x1|D": [3, 4],
199
"x2|C": [8, 7],
200
"x2|D": [6, 5],
201
}
202
assert result.to_dict(as_series=False) == expected
203
204
205
def test_pivot_duplicate_names_7731() -> None:
206
df = pl.DataFrame(
207
{
208
"a": [1, 4],
209
"b": [1.5, 2.5],
210
"c": ["x", "x"],
211
"d": [7, 8],
212
"e": ["x", "y"],
213
}
214
)
215
result = df.pivot(
216
index=cs.float(),
217
on=cs.string(),
218
values=cs.integer(),
219
aggregate_function="first",
220
).to_dict(as_series=False)
221
expected = {
222
"b": [1.5, 2.5],
223
'a_{"x","x"}': [1, None],
224
'a_{"x","y"}': [None, 4],
225
'd_{"x","x"}': [7, None],
226
'd_{"x","y"}': [None, 8],
227
}
228
assert result == expected
229
230
231
def test_pivot_duplicate_names_11663() -> None:
232
df = pl.DataFrame({"a": [1, 2], "b": [1, 2], "c": ["x", "x"], "d": ["x", "y"]})
233
result = df.pivot(index="b", on=["c", "d"], values="a").to_dict(as_series=False)
234
expected = {"b": [1, 2], '{"x","x"}': [1, None], '{"x","y"}': [None, 2]}
235
assert result == expected
236
237
238
def test_pivot_multiple_columns_12407() -> None:
239
df = pl.DataFrame(
240
{
241
"a": ["beep", "bop"],
242
"b": ["a", "b"],
243
"c": ["s", "f"],
244
"d": [7, 8],
245
"e": ["x", "y"],
246
}
247
)
248
result = df.pivot(
249
index="b", on=["c", "e"], values=["a"], aggregate_function="len"
250
).to_dict(as_series=False)
251
expected = {"b": ["a", "b"], '{"s","x"}': [1, 0], '{"f","y"}': [0, 1]}
252
assert result == expected
253
254
255
def test_pivot_struct_13120() -> None:
256
df = pl.DataFrame(
257
{
258
"index": [1, 2, 3, 1, 2, 3],
259
"item_type": ["a", "a", "a", "b", "b", "b"],
260
"item_id": [123, 123, 123, 456, 456, 456],
261
"values": [4, 5, 6, 7, 8, 9],
262
}
263
)
264
df = df.with_columns(pl.struct(["item_type", "item_id"]).alias("columns")).drop(
265
"item_type", "item_id"
266
)
267
result = df.pivot(index="index", on="columns", values="values").to_dict(
268
as_series=False
269
)
270
expected = {"index": [1, 2, 3], '{"a",123}': [4, 5, 6], '{"b",456}': [7, 8, 9]}
271
assert result == expected
272
273
274
def test_pivot_index_struct_14101() -> None:
275
df = pl.DataFrame(
276
{
277
"a": [1, 2, 1],
278
"b": [{"a": 1}, {"a": 1}, {"a": 2}],
279
"c": ["x", "y", "y"],
280
"d": [1, 1, 3],
281
}
282
)
283
result = df.pivot(index="b", on="c", values="a")
284
expected = pl.DataFrame({"b": [{"a": 1}, {"a": 2}], "x": [1, None], "y": [2, 1]})
285
assert_frame_equal(result, expected)
286
287
288
def test_pivot_nested_struct_17065() -> None:
289
df = pl.DataFrame(
290
{
291
"foo": ["one", "two", "one", "two"],
292
"bar": ["x", "x", "y", "y"],
293
"baz": [
294
{"a": 1, "b": {"c": 2}},
295
{"a": 3, "b": {"c": 4}},
296
{"a": 5, "b": {"c": 6}},
297
{"a": 7, "b": {"c": 8}},
298
],
299
}
300
)
301
result = df.pivot(on="bar", index="foo", values="baz")
302
expected = pl.DataFrame(
303
{
304
"foo": ["one", "two"],
305
"x": [
306
{"a": 1, "b": {"c": 2}},
307
{"a": 3, "b": {"c": 4}},
308
],
309
"y": [
310
{"a": 5, "b": {"c": 6}},
311
{"a": 7, "b": {"c": 8}},
312
],
313
}
314
)
315
assert_frame_equal(result, expected)
316
317
318
def test_pivot_name_already_exists() -> None:
319
# This should be extremely rare...but still, good to check it
320
df = pl.DataFrame(
321
{
322
"a": ["a", "b"],
323
"b": ["a", "b"],
324
'{"a","b"}': [1, 2],
325
}
326
)
327
with pytest.raises(ComputeError, match="already exists in the DataFrame"):
328
df.pivot(
329
values='{"a","b"}',
330
index="a",
331
on=["a", "b"],
332
aggregate_function="first",
333
)
334
335
336
def test_pivot_floats() -> None:
337
df = pl.DataFrame(
338
{
339
"article": ["a", "a", "a", "b", "b", "b"],
340
"weight": [1.0, 1.0, 4.4, 1.0, 8.8, 1.0],
341
"quantity": [1.0, 5.0, 1.0, 1.0, 1.0, 7.5],
342
"price": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0],
343
}
344
)
345
346
with pytest.raises(ComputeError, match="found multiple elements in the same group"):
347
result = df.pivot(
348
index="weight", on="quantity", values="price", aggregate_function=None
349
)
350
351
result = df.pivot(
352
index="weight", on="quantity", values="price", aggregate_function="first"
353
)
354
expected = {
355
"weight": [1.0, 4.4, 8.8],
356
"1.0": [1.0, 3.0, 5.0],
357
"5.0": [2.0, None, None],
358
"7.5": [6.0, None, None],
359
}
360
assert result.to_dict(as_series=False) == expected
361
362
result = df.pivot(
363
index=["article", "weight"],
364
on="quantity",
365
values="price",
366
aggregate_function=None,
367
)
368
expected = {
369
"article": ["a", "a", "b", "b"],
370
"weight": [1.0, 4.4, 1.0, 8.8],
371
"1.0": [1.0, 3.0, 4.0, 5.0],
372
"5.0": [2.0, None, None, None],
373
"7.5": [None, None, 6.0, None],
374
}
375
assert result.to_dict(as_series=False) == expected
376
377
378
def test_pivot_reinterpret_5907() -> None:
379
df = pl.DataFrame(
380
{
381
"A": pl.Series([3, -2, 3, -2], dtype=pl.Int32),
382
"B": ["x", "x", "y", "y"],
383
"C": [100, 50, 500, -80],
384
}
385
)
386
387
result = df.pivot(
388
index=["A"], on=["B"], values=["C"], aggregate_function=pl.element().sum()
389
)
390
expected = {"A": [3, -2], "x": [100, 50], "y": [500, -80]}
391
assert result.to_dict(as_series=False) == expected
392
393
394
@pytest.mark.parametrize(
395
"dtype",
396
[
397
pl.Date,
398
pl.Datetime("ms"),
399
pl.Datetime("ns"),
400
pl.Datetime("us"),
401
pl.Datetime("ms", time_zone="Asia/Shanghai"),
402
pl.Duration("ms"),
403
pl.Duration("us"),
404
pl.Duration("ns"),
405
pl.Time,
406
],
407
)
408
def test_pivot_temporal_logical_types(dtype: PolarsTemporalType) -> None:
409
idx = pl.Series([7, 8, 9, 0, 1, 2, 3, 4]).cast(dtype)
410
df = pl.DataFrame(
411
{
412
"idx": idx,
413
"foo": ["a"] * 3 + ["b"] * 5,
414
"value": [0] * 8,
415
}
416
)
417
assert df.pivot(
418
index="idx", on="foo", values="value", aggregate_function=None
419
).to_dict(as_series=False) == {
420
"idx": idx.to_list(),
421
"a": [0, 0, 0, None, None, None, None, None],
422
"b": [None, None, None, 0, 0, 0, 0, 0],
423
}
424
425
426
def test_pivot_negative_duration() -> None:
427
df1 = pl.DataFrame({"root": [date(2020, i, 15) for i in (1, 2)]})
428
df2 = pl.DataFrame({"delta": [timedelta(days=i) for i in (-2, -1, 0, 1)]})
429
430
df = df1.join(df2, how="cross", maintain_order="left_right").with_columns(
431
pl.Series(name="value", values=range(len(df1) * len(df2)))
432
)
433
assert df.pivot(
434
index="delta", on="root", values="value", aggregate_function=None
435
).to_dict(as_series=False) == {
436
"delta": [
437
timedelta(days=-2),
438
timedelta(days=-1),
439
timedelta(0),
440
timedelta(days=1),
441
],
442
"2020-01-15": [0, 1, 2, 3],
443
"2020-02-15": [4, 5, 6, 7],
444
}
445
446
447
def test_aggregate_function_default() -> None:
448
df = pl.DataFrame({"a": [1, 2], "b": ["foo", "foo"], "c": ["x", "x"]})
449
with pytest.raises(ComputeError, match="found multiple elements in the same group"):
450
df.pivot(index="b", on="c", values="a")
451
452
453
def test_pivot_aggregate_function_count_deprecated() -> None:
454
df = pl.DataFrame(
455
{
456
"foo": ["A", "A", "B", "B", "C"],
457
"N": [1, 2, 2, 4, 2],
458
"bar": ["k", "l", "m", "n", "o"],
459
}
460
)
461
with pytest.deprecated_call():
462
df.pivot(index="foo", on="bar", values="N", aggregate_function="count") # type: ignore[arg-type]
463
464
465
def test_pivot_struct() -> None:
466
data = {
467
"id": ["a", "a", "b", "c", "c", "c"],
468
"week": ["1", "2", "3", "4", "3", "1"],
469
"num1": [1, 3, 5, 4, 3, 6],
470
"num2": [4, 5, 3, 4, 6, 6],
471
}
472
df = pl.DataFrame(data).with_columns(nums=pl.struct(["num1", "num2"]))
473
474
assert df.pivot(
475
values="nums", index="id", on="week", aggregate_function="first"
476
).to_dict(as_series=False) == {
477
"id": ["a", "b", "c"],
478
"1": [
479
{"num1": 1, "num2": 4},
480
None,
481
{"num1": 6, "num2": 6},
482
],
483
"2": [
484
{"num1": 3, "num2": 5},
485
None,
486
None,
487
],
488
"3": [
489
None,
490
{"num1": 5, "num2": 3},
491
{"num1": 3, "num2": 6},
492
],
493
"4": [
494
None,
495
None,
496
{"num1": 4, "num2": 4},
497
],
498
}
499
500
501
def test_duplicate_column_names_which_should_raise_14305() -> None:
502
df = pl.DataFrame({"a": [1, 3, 2], "c": ["a", "a", "a"], "d": [7, 8, 9]})
503
with pytest.raises(DuplicateError, match="has more than one occurrence"):
504
df.pivot(index="a", on="c", values="d")
505
506
507
def test_multi_index_containing_struct() -> None:
508
df = pl.DataFrame(
509
{
510
"a": [1, 2, 1],
511
"b": [{"a": 1}, {"a": 1}, {"a": 2}],
512
"c": ["x", "y", "y"],
513
"d": [1, 1, 3],
514
}
515
)
516
result = df.pivot(index=("b", "d"), on="c", values="a")
517
expected = pl.DataFrame(
518
{"b": [{"a": 1}, {"a": 2}], "d": [1, 3], "x": [1, None], "y": [2, 1]}
519
)
520
assert_frame_equal(result, expected)
521
522
523
def test_list_pivot() -> None:
524
df = pl.DataFrame(
525
{
526
"a": [1, 2, 3, 1],
527
"b": [[1, 2], [3, 4], [5, 6], [1, 2]],
528
"c": ["x", "x", "y", "y"],
529
"d": [1, 2, 3, 4],
530
}
531
)
532
assert df.pivot(
533
index=["a", "b"],
534
on="c",
535
values="d",
536
).to_dict(as_series=False) == {
537
"a": [1, 2, 3],
538
"b": [[1, 2], [3, 4], [5, 6]],
539
"x": [1, 2, None],
540
"y": [4, None, 3],
541
}
542
543
544
def test_pivot_string_17081() -> None:
545
df = pl.DataFrame(
546
{
547
"a": ["1", "2", "3"],
548
"b": ["4", "5", "6"],
549
"c": ["7", "8", "9"],
550
}
551
)
552
assert df.pivot(index="a", on="b", values="c", aggregate_function="min").to_dict(
553
as_series=False
554
) == {
555
"a": ["1", "2", "3"],
556
"4": ["7", None, None],
557
"5": [None, "8", None],
558
"6": [None, None, "9"],
559
}
560
561
562
def test_pivot_invalid() -> None:
563
with pytest.raises(
564
pl.exceptions.InvalidOperationError,
565
match="`index` and `values` cannot both be None in `pivot` operation",
566
):
567
pl.DataFrame({"a": [1, 2], "b": [2, 3], "c": [3, 4]}).pivot("a")
568
569
570
@pytest.mark.parametrize(
571
"dtype",
572
[pl.Int8, pl.Int16, pl.Int32, pl.Int64, pl.UInt8, pl.UInt16, pl.UInt32, pl.UInt64],
573
)
574
def test_pivot_empty_index_dtypes(dtype: PolarsIntegerType) -> None:
575
index = pl.Series([], dtype=dtype)
576
df = pl.DataFrame({"index": index, "on": [], "values": []})
577
result = df.pivot(index="index", on="on", values="values")
578
expected = pl.DataFrame({"index": index})
579
assert_frame_equal(result, expected)
580
581
582
def test_pivot_agg_column_ref_invalid_22479() -> None:
583
df = pl.DataFrame(
584
{"a": ["x", "x", "x"], "b": [1, 1, 1], "c": [7, 8, 9], "d": [0, 2, 1]}
585
)
586
with pytest.raises(
587
pl.exceptions.InvalidOperationError,
588
match="explicit column references are not allowed in aggregate_function",
589
):
590
df.pivot(
591
on="a", index="b", values="c", aggregate_function=pl.element().sort_by("d")
592
)
593
594
595
def test_pivot_agg_null_methods_23408() -> None:
596
df = pl.DataFrame(
597
{
598
"idx": [0, 0, 1, 1],
599
"on": ["a", "b", "a", "c"],
600
"val": ["aa", "bb", "aa", "cc"],
601
}
602
)
603
out = df.pivot(
604
on="on",
605
index="idx",
606
values="val",
607
aggregate_function=pl.element().first().is_null(),
608
)
609
expected = pl.DataFrame(
610
{"idx": [0, 1], "a": [False, False], "b": [False, True], "c": [True, False]}
611
)
612
assert_frame_equal(out, expected)
613
614
out = df.pivot(
615
on="on",
616
index="idx",
617
values="val",
618
aggregate_function=pl.element().first().fill_null("xx"),
619
)
620
expected = pl.DataFrame(
621
{"idx": [0, 1], "a": ["aa", "aa"], "b": ["bb", "xx"], "c": ["xx", "cc"]}
622
)
623
assert_frame_equal(out, expected)
624
625