CoCalc -- test

GitHub Repository: pola-rs/polars
Path: blob/main/py-polars/tests/unit/operations/test_pivot.py
⁶⁹³⁹ views
1
from __future__ import annotations
2

3
from datetime import date, timedelta
4
from typing import TYPE_CHECKING, Any
5

6
import pytest
7

8
import polars as pl
9
import polars.selectors as cs
10
from polars.exceptions import ComputeError, DuplicateError
11
from polars.testing import assert_frame_equal
12

13
if TYPE_CHECKING:
14
    from polars._typing import PivotAgg, PolarsIntegerType, PolarsTemporalType
15

16

17
def test_pivot() -> None:
18
    df = pl.DataFrame(
19
        {
20
            "foo": ["A", "A", "B", "B", "C"],
21
            "bar": ["k", "l", "m", "n", "o"],
22
            "N": [1, 2, 2, 4, 2],
23
        }
24
    )
25
    result = df.pivot("bar", values="N", aggregate_function=None)
26

27
    expected = pl.DataFrame(
28
        [
29
            ("A", 1, 2, None, None, None),
30
            ("B", None, None, 2, 4, None),
31
            ("C", None, None, None, None, 2),
32
        ],
33
        schema=["foo", "k", "l", "m", "n", "o"],
34
        orient="row",
35
    )
36
    assert_frame_equal(result, expected)
37

38

39
def test_pivot_no_values() -> None:
40
    df = pl.DataFrame(
41
        {
42
            "foo": ["A", "A", "B", "B", "C"],
43
            "bar": ["k", "l", "m", "n", "o"],
44
            "N1": [1, 2, 2, 4, 2],
45
            "N2": [1, 2, 2, 4, 2],
46
        }
47
    )
48
    result = df.pivot(on="bar", index="foo", aggregate_function=None)
49
    expected = pl.DataFrame(
50
        {
51
            "foo": ["A", "B", "C"],
52
            "N1_k": [1, None, None],
53
            "N1_l": [2, None, None],
54
            "N1_m": [None, 2, None],
55
            "N1_n": [None, 4, None],
56
            "N1_o": [None, None, 2],
57
            "N2_k": [1, None, None],
58
            "N2_l": [2, None, None],
59
            "N2_m": [None, 2, None],
60
            "N2_n": [None, 4, None],
61
            "N2_o": [None, None, 2],
62
        }
63
    )
64

65
    assert_frame_equal(result, expected)
66

67

68
def test_pivot_list() -> None:
69
    df = pl.DataFrame({"a": [1, 2, 3], "b": [[1, 1], [2, 2], [3, 3]]})
70

71
    expected = pl.DataFrame(
72
        {
73
            "a": [1, 2, 3],
74
            "1": [[1, 1], None, None],
75
            "2": [None, [2, 2], None],
76
            "3": [None, None, [3, 3]],
77
        }
78
    )
79
    out = df.pivot(
80
        index="a",
81
        on="a",
82
        values="b",
83
        aggregate_function="first",
84
        sort_columns=True,
85
    )
86
    assert_frame_equal(out, expected)
87

88

89
@pytest.mark.parametrize(
90
    ("agg_fn", "expected_rows"),
91
    [
92
        ("first", [("a", 2, None, None), ("b", None, None, 10)]),
93
        ("len", [("a", 2, 0, 0), ("b", 0, 2, 1)]),
94
        ("min", [("a", 2, None, None), ("b", None, 8, 10)]),
95
        ("max", [("a", 4, None, None), ("b", None, 8, 10)]),
96
        ("sum", [("a", 6, 0, 0), ("b", 0, 8, 10)]),
97
        ("mean", [("a", 3.0, None, None), ("b", None, 8.0, 10.0)]),
98
        ("median", [("a", 3.0, None, None), ("b", None, 8.0, 10.0)]),
99
    ],
100
)
101
def test_pivot_aggregate(agg_fn: PivotAgg, expected_rows: list[tuple[Any]]) -> None:
102
    df = pl.DataFrame(
103
        {
104
            "a": [1, 1, 2, 2, 3],
105
            "b": ["a", "a", "b", "b", "b"],
106
            "c": [2, 4, None, 8, 10],
107
        }
108
    )
109
    result = df.pivot(
110
        index="b", on="a", values="c", aggregate_function=agg_fn, sort_columns=True
111
    )
112
    assert result.rows() == expected_rows
113

114

115
def test_pivot_categorical_3968() -> None:
116
    df = pl.DataFrame(
117
        {
118
            "foo": ["one", "one", "one", "two", "two", "two"],
119
            "bar": ["A", "B", "C", "A", "B", "C"],
120
            "baz": [1, 2, 3, 4, 5, 6],
121
        }
122
    )
123

124
    result = df.with_columns(pl.col("baz").cast(str).cast(pl.Categorical))
125

126
    expected = pl.DataFrame(
127
        {
128
            "foo": ["one", "one", "one", "two", "two", "two"],
129
            "bar": ["A", "B", "C", "A", "B", "C"],
130
            "baz": ["1", "2", "3", "4", "5", "6"],
131
        },
132
        schema_overrides={"baz": pl.Categorical},
133
    )
134
    assert_frame_equal(result, expected, categorical_as_str=True)
135

136

137
def test_pivot_categorical_index() -> None:
138
    df = pl.DataFrame(
139
        {"A": ["Fire", "Water", "Water", "Fire"], "B": ["Car", "Car", "Car", "Ship"]},
140
        schema=[("A", pl.Categorical), ("B", pl.Categorical)],
141
    )
142

143
    result = df.pivot(index=["A"], on="B", values="B", aggregate_function="len")
144
    expected = {"A": ["Fire", "Water"], "Car": [1, 2], "Ship": [1, 0]}
145
    assert result.to_dict(as_series=False) == expected
146

147
    # test expression dispatch
148
    result = df.pivot(index=["A"], on="B", values="B", aggregate_function=pl.len())
149
    assert result.to_dict(as_series=False) == expected
150

151
    df = pl.DataFrame(
152
        {
153
            "A": ["Fire", "Water", "Water", "Fire"],
154
            "B": ["Car", "Car", "Car", "Ship"],
155
            "C": ["Paper", "Paper", "Paper", "Paper"],
156
        },
157
        schema=[("A", pl.Categorical), ("B", pl.Categorical), ("C", pl.Categorical)],
158
    )
159
    result = df.pivot(index=["A", "C"], on="B", values="B", aggregate_function="len")
160
    expected = {
161
        "A": ["Fire", "Water"],
162
        "C": ["Paper", "Paper"],
163
        "Car": [1, 2],
164
        "Ship": [1, 0],
165
    }
166
    assert result.to_dict(as_series=False) == expected
167

168

169
def test_pivot_multiple_values_column_names_5116() -> None:
170
    df = pl.DataFrame(
171
        {
172
            "x1": [1, 2, 3, 4, 5, 6, 7, 8],
173
            "x2": [8, 7, 6, 5, 4, 3, 2, 1],
174
            "c1": ["A", "B"] * 4,
175
            "c2": ["C", "C", "D", "D"] * 2,
176
        }
177
    )
178

179
    with pytest.raises(ComputeError, match="found multiple elements in the same group"):
180
        df.pivot(
181
            index="c1",
182
            on="c2",
183
            values=["x1", "x2"],
184
            separator="|",
185
            aggregate_function=None,
186
        )
187

188
    result = df.pivot(
189
        index="c1",
190
        on="c2",
191
        values=["x1", "x2"],
192
        separator="|",
193
        aggregate_function="first",
194
    )
195
    expected = {
196
        "c1": ["A", "B"],
197
        "x1|C": [1, 2],
198
        "x1|D": [3, 4],
199
        "x2|C": [8, 7],
200
        "x2|D": [6, 5],
201
    }
202
    assert result.to_dict(as_series=False) == expected
203

204

205
def test_pivot_duplicate_names_7731() -> None:
206
    df = pl.DataFrame(
207
        {
208
            "a": [1, 4],
209
            "b": [1.5, 2.5],
210
            "c": ["x", "x"],
211
            "d": [7, 8],
212
            "e": ["x", "y"],
213
        }
214
    )
215
    result = df.pivot(
216
        index=cs.float(),
217
        on=cs.string(),
218
        values=cs.integer(),
219
        aggregate_function="first",
220
    ).to_dict(as_series=False)
221
    expected = {
222
        "b": [1.5, 2.5],
223
        'a_{"x","x"}': [1, None],
224
        'a_{"x","y"}': [None, 4],
225
        'd_{"x","x"}': [7, None],
226
        'd_{"x","y"}': [None, 8],
227
    }
228
    assert result == expected
229

230

231
def test_pivot_duplicate_names_11663() -> None:
232
    df = pl.DataFrame({"a": [1, 2], "b": [1, 2], "c": ["x", "x"], "d": ["x", "y"]})
233
    result = df.pivot(index="b", on=["c", "d"], values="a").to_dict(as_series=False)
234
    expected = {"b": [1, 2], '{"x","x"}': [1, None], '{"x","y"}': [None, 2]}
235
    assert result == expected
236

237

238
def test_pivot_multiple_columns_12407() -> None:
239
    df = pl.DataFrame(
240
        {
241
            "a": ["beep", "bop"],
242
            "b": ["a", "b"],
243
            "c": ["s", "f"],
244
            "d": [7, 8],
245
            "e": ["x", "y"],
246
        }
247
    )
248
    result = df.pivot(
249
        index="b", on=["c", "e"], values=["a"], aggregate_function="len"
250
    ).to_dict(as_series=False)
251
    expected = {"b": ["a", "b"], '{"s","x"}': [1, 0], '{"f","y"}': [0, 1]}
252
    assert result == expected
253

254

255
def test_pivot_struct_13120() -> None:
256
    df = pl.DataFrame(
257
        {
258
            "index": [1, 2, 3, 1, 2, 3],
259
            "item_type": ["a", "a", "a", "b", "b", "b"],
260
            "item_id": [123, 123, 123, 456, 456, 456],
261
            "values": [4, 5, 6, 7, 8, 9],
262
        }
263
    )
264
    df = df.with_columns(pl.struct(["item_type", "item_id"]).alias("columns")).drop(
265
        "item_type", "item_id"
266
    )
267
    result = df.pivot(index="index", on="columns", values="values").to_dict(
268
        as_series=False
269
    )
270
    expected = {"index": [1, 2, 3], '{"a",123}': [4, 5, 6], '{"b",456}': [7, 8, 9]}
271
    assert result == expected
272

273

274
def test_pivot_index_struct_14101() -> None:
275
    df = pl.DataFrame(
276
        {
277
            "a": [1, 2, 1],
278
            "b": [{"a": 1}, {"a": 1}, {"a": 2}],
279
            "c": ["x", "y", "y"],
280
            "d": [1, 1, 3],
281
        }
282
    )
283
    result = df.pivot(index="b", on="c", values="a")
284
    expected = pl.DataFrame({"b": [{"a": 1}, {"a": 2}], "x": [1, None], "y": [2, 1]})
285
    assert_frame_equal(result, expected)
286

287

288
def test_pivot_nested_struct_17065() -> None:
289
    df = pl.DataFrame(
290
        {
291
            "foo": ["one", "two", "one", "two"],
292
            "bar": ["x", "x", "y", "y"],
293
            "baz": [
294
                {"a": 1, "b": {"c": 2}},
295
                {"a": 3, "b": {"c": 4}},
296
                {"a": 5, "b": {"c": 6}},
297
                {"a": 7, "b": {"c": 8}},
298
            ],
299
        }
300
    )
301
    result = df.pivot(on="bar", index="foo", values="baz")
302
    expected = pl.DataFrame(
303
        {
304
            "foo": ["one", "two"],
305
            "x": [
306
                {"a": 1, "b": {"c": 2}},
307
                {"a": 3, "b": {"c": 4}},
308
            ],
309
            "y": [
310
                {"a": 5, "b": {"c": 6}},
311
                {"a": 7, "b": {"c": 8}},
312
            ],
313
        }
314
    )
315
    assert_frame_equal(result, expected)
316

317

318
def test_pivot_name_already_exists() -> None:
319
    # This should be extremely rare...but still, good to check it
320
    df = pl.DataFrame(
321
        {
322
            "a": ["a", "b"],
323
            "b": ["a", "b"],
324
            '{"a","b"}': [1, 2],
325
        }
326
    )
327
    with pytest.raises(ComputeError, match="already exists in the DataFrame"):
328
        df.pivot(
329
            values='{"a","b"}',
330
            index="a",
331
            on=["a", "b"],
332
            aggregate_function="first",
333
        )
334

335

336
def test_pivot_floats() -> None:
337
    df = pl.DataFrame(
338
        {
339
            "article": ["a", "a", "a", "b", "b", "b"],
340
            "weight": [1.0, 1.0, 4.4, 1.0, 8.8, 1.0],
341
            "quantity": [1.0, 5.0, 1.0, 1.0, 1.0, 7.5],
342
            "price": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0],
343
        }
344
    )
345

346
    with pytest.raises(ComputeError, match="found multiple elements in the same group"):
347
        result = df.pivot(
348
            index="weight", on="quantity", values="price", aggregate_function=None
349
        )
350

351
    result = df.pivot(
352
        index="weight", on="quantity", values="price", aggregate_function="first"
353
    )
354
    expected = {
355
        "weight": [1.0, 4.4, 8.8],
356
        "1.0": [1.0, 3.0, 5.0],
357
        "5.0": [2.0, None, None],
358
        "7.5": [6.0, None, None],
359
    }
360
    assert result.to_dict(as_series=False) == expected
361

362
    result = df.pivot(
363
        index=["article", "weight"],
364
        on="quantity",
365
        values="price",
366
        aggregate_function=None,
367
    )
368
    expected = {
369
        "article": ["a", "a", "b", "b"],
370
        "weight": [1.0, 4.4, 1.0, 8.8],
371
        "1.0": [1.0, 3.0, 4.0, 5.0],
372
        "5.0": [2.0, None, None, None],
373
        "7.5": [None, None, 6.0, None],
374
    }
375
    assert result.to_dict(as_series=False) == expected
376

377

378
def test_pivot_reinterpret_5907() -> None:
379
    df = pl.DataFrame(
380
        {
381
            "A": pl.Series([3, -2, 3, -2], dtype=pl.Int32),
382
            "B": ["x", "x", "y", "y"],
383
            "C": [100, 50, 500, -80],
384
        }
385
    )
386

387
    result = df.pivot(
388
        index=["A"], on=["B"], values=["C"], aggregate_function=pl.element().sum()
389
    )
390
    expected = {"A": [3, -2], "x": [100, 50], "y": [500, -80]}
391
    assert result.to_dict(as_series=False) == expected
392

393

394
@pytest.mark.parametrize(
395
    "dtype",
396
    [
397
        pl.Date,
398
        pl.Datetime("ms"),
399
        pl.Datetime("ns"),
400
        pl.Datetime("us"),
401
        pl.Datetime("ms", time_zone="Asia/Shanghai"),
402
        pl.Duration("ms"),
403
        pl.Duration("us"),
404
        pl.Duration("ns"),
405
        pl.Time,
406
    ],
407
)
408
def test_pivot_temporal_logical_types(dtype: PolarsTemporalType) -> None:
409
    idx = pl.Series([7, 8, 9, 0, 1, 2, 3, 4]).cast(dtype)
410
    df = pl.DataFrame(
411
        {
412
            "idx": idx,
413
            "foo": ["a"] * 3 + ["b"] * 5,
414
            "value": [0] * 8,
415
        }
416
    )
417
    assert df.pivot(
418
        index="idx", on="foo", values="value", aggregate_function=None
419
    ).to_dict(as_series=False) == {
420
        "idx": idx.to_list(),
421
        "a": [0, 0, 0, None, None, None, None, None],
422
        "b": [None, None, None, 0, 0, 0, 0, 0],
423
    }
424

425

426
def test_pivot_negative_duration() -> None:
427
    df1 = pl.DataFrame({"root": [date(2020, i, 15) for i in (1, 2)]})
428
    df2 = pl.DataFrame({"delta": [timedelta(days=i) for i in (-2, -1, 0, 1)]})
429

430
    df = df1.join(df2, how="cross", maintain_order="left_right").with_columns(
431
        pl.Series(name="value", values=range(len(df1) * len(df2)))
432
    )
433
    assert df.pivot(
434
        index="delta", on="root", values="value", aggregate_function=None
435
    ).to_dict(as_series=False) == {
436
        "delta": [
437
            timedelta(days=-2),
438
            timedelta(days=-1),
439
            timedelta(0),
440
            timedelta(days=1),
441
        ],
442
        "2020-01-15": [0, 1, 2, 3],
443
        "2020-02-15": [4, 5, 6, 7],
444
    }
445

446

447
def test_aggregate_function_default() -> None:
448
    df = pl.DataFrame({"a": [1, 2], "b": ["foo", "foo"], "c": ["x", "x"]})
449
    with pytest.raises(ComputeError, match="found multiple elements in the same group"):
450
        df.pivot(index="b", on="c", values="a")
451

452

453
def test_pivot_aggregate_function_count_deprecated() -> None:
454
    df = pl.DataFrame(
455
        {
456
            "foo": ["A", "A", "B", "B", "C"],
457
            "N": [1, 2, 2, 4, 2],
458
            "bar": ["k", "l", "m", "n", "o"],
459
        }
460
    )
461
    with pytest.deprecated_call():
462
        df.pivot(index="foo", on="bar", values="N", aggregate_function="count")  # type: ignore[arg-type]
463

464

465
def test_pivot_struct() -> None:
466
    data = {
467
        "id": ["a", "a", "b", "c", "c", "c"],
468
        "week": ["1", "2", "3", "4", "3", "1"],
469
        "num1": [1, 3, 5, 4, 3, 6],
470
        "num2": [4, 5, 3, 4, 6, 6],
471
    }
472
    df = pl.DataFrame(data).with_columns(nums=pl.struct(["num1", "num2"]))
473

474
    assert df.pivot(
475
        values="nums", index="id", on="week", aggregate_function="first"
476
    ).to_dict(as_series=False) == {
477
        "id": ["a", "b", "c"],
478
        "1": [
479
            {"num1": 1, "num2": 4},
480
            None,
481
            {"num1": 6, "num2": 6},
482
        ],
483
        "2": [
484
            {"num1": 3, "num2": 5},
485
            None,
486
            None,
487
        ],
488
        "3": [
489
            None,
490
            {"num1": 5, "num2": 3},
491
            {"num1": 3, "num2": 6},
492
        ],
493
        "4": [
494
            None,
495
            None,
496
            {"num1": 4, "num2": 4},
497
        ],
498
    }
499

500

501
def test_duplicate_column_names_which_should_raise_14305() -> None:
502
    df = pl.DataFrame({"a": [1, 3, 2], "c": ["a", "a", "a"], "d": [7, 8, 9]})
503
    with pytest.raises(DuplicateError, match="has more than one occurrence"):
504
        df.pivot(index="a", on="c", values="d")
505

506

507
def test_multi_index_containing_struct() -> None:
508
    df = pl.DataFrame(
509
        {
510
            "a": [1, 2, 1],
511
            "b": [{"a": 1}, {"a": 1}, {"a": 2}],
512
            "c": ["x", "y", "y"],
513
            "d": [1, 1, 3],
514
        }
515
    )
516
    result = df.pivot(index=("b", "d"), on="c", values="a")
517
    expected = pl.DataFrame(
518
        {"b": [{"a": 1}, {"a": 2}], "d": [1, 3], "x": [1, None], "y": [2, 1]}
519
    )
520
    assert_frame_equal(result, expected)
521

522

523
def test_list_pivot() -> None:
524
    df = pl.DataFrame(
525
        {
526
            "a": [1, 2, 3, 1],
527
            "b": [[1, 2], [3, 4], [5, 6], [1, 2]],
528
            "c": ["x", "x", "y", "y"],
529
            "d": [1, 2, 3, 4],
530
        }
531
    )
532
    assert df.pivot(
533
        index=["a", "b"],
534
        on="c",
535
        values="d",
536
    ).to_dict(as_series=False) == {
537
        "a": [1, 2, 3],
538
        "b": [[1, 2], [3, 4], [5, 6]],
539
        "x": [1, 2, None],
540
        "y": [4, None, 3],
541
    }
542

543

544
def test_pivot_string_17081() -> None:
545
    df = pl.DataFrame(
546
        {
547
            "a": ["1", "2", "3"],
548
            "b": ["4", "5", "6"],
549
            "c": ["7", "8", "9"],
550
        }
551
    )
552
    assert df.pivot(index="a", on="b", values="c", aggregate_function="min").to_dict(
553
        as_series=False
554
    ) == {
555
        "a": ["1", "2", "3"],
556
        "4": ["7", None, None],
557
        "5": [None, "8", None],
558
        "6": [None, None, "9"],
559
    }
560

561

562
def test_pivot_invalid() -> None:
563
    with pytest.raises(
564
        pl.exceptions.InvalidOperationError,
565
        match="`index` and `values` cannot both be None in `pivot` operation",
566
    ):
567
        pl.DataFrame({"a": [1, 2], "b": [2, 3], "c": [3, 4]}).pivot("a")
568

569

570
@pytest.mark.parametrize(
571
    "dtype",
572
    [pl.Int8, pl.Int16, pl.Int32, pl.Int64, pl.UInt8, pl.UInt16, pl.UInt32, pl.UInt64],
573
)
574
def test_pivot_empty_index_dtypes(dtype: PolarsIntegerType) -> None:
575
    index = pl.Series([], dtype=dtype)
576
    df = pl.DataFrame({"index": index, "on": [], "values": []})
577
    result = df.pivot(index="index", on="on", values="values")
578
    expected = pl.DataFrame({"index": index})
579
    assert_frame_equal(result, expected)
580

581

582
def test_pivot_agg_column_ref_invalid_22479() -> None:
583
    df = pl.DataFrame(
584
        {"a": ["x", "x", "x"], "b": [1, 1, 1], "c": [7, 8, 9], "d": [0, 2, 1]}
585
    )
586
    with pytest.raises(
587
        pl.exceptions.InvalidOperationError,
588
        match="explicit column references are not allowed in aggregate_function",
589
    ):
590
        df.pivot(
591
            on="a", index="b", values="c", aggregate_function=pl.element().sort_by("d")
592
        )
593

594

595
def test_pivot_agg_null_methods_23408() -> None:
596
    df = pl.DataFrame(
597
        {
598
            "idx": [0, 0, 1, 1],
599
            "on": ["a", "b", "a", "c"],
600
            "val": ["aa", "bb", "aa", "cc"],
601
        }
602
    )
603
    out = df.pivot(
604
        on="on",
605
        index="idx",
606
        values="val",
607
        aggregate_function=pl.element().first().is_null(),
608
    )
609
    expected = pl.DataFrame(
610
        {"idx": [0, 1], "a": [False, False], "b": [False, True], "c": [True, False]}
611
    )
612
    assert_frame_equal(out, expected)
613

614
    out = df.pivot(
615
        on="on",
616
        index="idx",
617
        values="val",
618
        aggregate_function=pl.element().first().fill_null("xx"),
619
    )
620
    expected = pl.DataFrame(
621
        {"idx": [0, 1], "a": ["aa", "aa"], "b": ["bb", "xx"], "c": ["xx", "cc"]}
622
    )
623
    assert_frame_equal(out, expected)
624

625
Product

Resources

Company