CoCalc -- test

GitHub Repository: pola-rs/polars
Path: blob/main/py-polars/tests/unit/operations/namespaces/array/test_array.py
8353 views
1
from __future__ import annotations
2

3
import datetime
4
from typing import Any
5

6
import pytest
7

8
import polars as pl
9
from polars.exceptions import ComputeError, InvalidOperationError
10
from polars.testing import assert_frame_equal, assert_series_equal
11

12

13
def test_arr_min_max() -> None:
    """Check row-wise ``arr.max``/``arr.min`` with and without nulls."""
    arr_dtype = pl.Array(pl.Int64, 2)

    dense = pl.Series("a", [[1, 2], [4, 3]], dtype=arr_dtype)
    maxima = dense.arr.max().to_list()
    assert maxima == [2, 4]
    minima = dense.arr.min().to_list()
    assert minima == [1, 3]

    # A null element is skipped within a row; a null row produces a null.
    sparse = pl.Series("a", [[None, 2], None, [3, 4]], dtype=arr_dtype)
    maxima = sparse.arr.max().to_list()
    assert maxima == [2, None, 4]
    minima = sparse.arr.min().to_list()
    assert minima == [2, None, 3]
21

22

23
def test_arr_mean_median_var_std() -> None:
    """Check row-wise mean, median, variance and standard deviation.

    Only the first row's standard deviation is compared, rounded to five
    decimals.
    """
    arr_dtype = pl.Array(pl.Int64, 2)

    dense = pl.Series("a", [[1, 2], [4, 3]], dtype=arr_dtype)
    assert dense.arr.mean().to_list() == [1.5, 3.5]
    assert dense.arr.median().to_list() == [1.5, 3.5]
    assert dense.arr.var().to_list() == [0.5, 0.5]
    first_std = dense.arr.std().to_list()[0]
    assert round(first_std, 5) == 0.70711

    # Second row is null, third row holds a single non-null value.
    sparse = pl.Series("a", [[3, 4], None, [None, 2]], dtype=arr_dtype)
    assert sparse.arr.mean().to_list() == [3.5, None, 2.0]
    assert sparse.arr.median().to_list() == [3.5, None, 2.0]
    assert sparse.arr.var().to_list() == [0.5, None, None]
    first_std = sparse.arr.std().to_list()[0]
    assert round(first_std, 5) == 0.70711
35

36

37
def test_array_min_max_dtype_12123() -> None:
    """Check the lazy schema and values of ``arr.max``/``arr.min`` on floats.

    The test name references issue 12123. The reductions over a ``Float64``
    array column must be reported as ``Float64`` by ``collect_schema`` and
    must be usable in arithmetic with another ``Float64`` column.
    """
    lf = pl.LazyFrame(
        [pl.Series("a", [[1.0, 3.0], [2.0, 5.0]]), pl.Series("b", [1.0, 2.0])],
        schema_overrides={"a": pl.Array(pl.Float64, 2)},
    )

    # The keyword argument already names the output column, so an additional
    # ``.alias(...)`` on the expression would be redundant.
    lf = lf.with_columns(
        max=pl.col("a").arr.max(),
        min=pl.col("a").arr.min(),
    )

    assert lf.collect_schema() == {
        "a": pl.Array(pl.Float64, 2),
        "b": pl.Float64,
        "max": pl.Float64,
        "min": pl.Float64,
    }

    out = lf.select(pl.col("max") * pl.col("b"), pl.col("min") * pl.col("b")).collect()

    assert_frame_equal(out, pl.DataFrame({"max": [3.0, 10.0], "min": [1.0, 4.0]}))
60

61

62
@pytest.mark.parametrize(
    ("data", "expected_sum", "dtype"),
    [
        ([[1, 2], [4, 3]], [3, 7], pl.Int64),
        ([[1, None], [None, 3], [None, None]], [1, 3, 0], pl.Int64),
        ([[1.0, 2.0], [4.0, 3.0]], [3.0, 7.0], pl.Float32),
        ([[1.0, None], [None, 3.0], [None, None]], [1.0, 3.0, 0], pl.Float32),
        ([[True, False], [True, True], [False, False]], [1, 2, 0], pl.Boolean),
        ([[True, None], [None, False], [None, None]], [1, 0, 0], pl.Boolean),
    ],
)
def test_arr_sum(
    data: list[list[Any]], expected_sum: list[Any], dtype: pl.DataType
) -> None:
    """Check row-wise ``arr.sum`` for integer, float and boolean arrays.

    The parametrized cases expect nulls to contribute nothing, so an all-null
    row sums to zero.
    """
    series = pl.Series("a", data, dtype=pl.Array(dtype, 2))
    row_sums = series.arr.sum().to_list()
    assert row_sums == expected_sum
78

79

80
@pytest.mark.may_fail_cloud
def test_array_lengths_zwa() -> None:
    """Check ``arr.len`` on arrays of width zero, including null rows."""
    zero_width = pl.Array(pl.Null, 0)
    # (input rows, expected lengths)
    cases: list[tuple[list[Any], list[Any]]] = [
        ([[], []], [0, 0]),
        ([None, []], [None, 0]),
        ([None], [None]),
        ([], []),
    ]
    for rows, expected in cases:
        lengths = pl.Series("a", rows, zero_width).arr.len().to_list()
        assert lengths == expected
90

91

92
def test_array_lengths() -> None:
    """Check that ``arr.len`` returns the array width in the index dtype."""
    col_a = pl.Series("a", [[1, 2, 3]], dtype=pl.Array(pl.Int64, 3))
    col_b = pl.Series("b", [[4, 5]], dtype=pl.Array(pl.Int64, 2))
    frame = pl.DataFrame([col_a, col_b])

    lengths = frame.select(pl.col("a").arr.len(), pl.col("b").arr.len())
    expected_df = pl.DataFrame(
        {"a": [3], "b": [2]},
        schema={"a": pl.get_index_type(), "b": pl.get_index_type()},
    )
    assert_frame_equal(lengths, expected_df)

    # Empty series yields an empty result.
    empty = pl.Series("a", [], pl.Array(pl.Null, 1))
    assert empty.arr.len().to_list() == []

    # A null row yields a null length.
    with_null_row = pl.Series(
        "a", [[1, 2, 3], None, [7, 8, 9]], pl.Array(pl.get_index_type(), 3)
    )
    assert with_null_row.arr.len().to_list() == [3, None, 3]
110

111

112
@pytest.mark.parametrize("as_array", [True, False])
def test_arr_slice(as_array: bool) -> None:
    """Check ``arr.slice`` in a frame for positive and negative offsets.

    Parameters
    ----------
    as_array
        Forwarded to ``arr.slice``; the expected values are the same for both
        settings.
    """
    df = pl.DataFrame(
        {"arr": [[1, 2, 3], [10, 2, 1]]},
        schema={"arr": pl.Array(pl.Int64, 3)},
    )

    # (offset, length, expected rows)
    cases = [
        (0, 1, [[1], [10]]),
        (1, 1, [[2], [2]]),
        (-1, 1, [[3], [1]]),
        (-2, 1, [[2], [2]]),
        (-2, 2, [[2, 3], [2, 1]]),
    ]
    for offset, length, expected in cases:
        out = df.select(pl.col("arr").arr.slice(offset, length, as_array=as_array))
        assert out.to_dict(as_series=False) == {"arr": expected}
140

141

142
@pytest.mark.parametrize("as_array", [True, False])
def test_arr_slice_on_series(as_array: bool) -> None:
    """Check ``arr.head``/``arr.tail``/``arr.slice`` directly on a Series."""
    rows = [[1, 2, 3, 4], [10, 2, 1, 2]]
    series = pl.Series("a", rows, dtype=pl.Array(pl.Int64, 4))

    first_two = series.arr.head(2, as_array=as_array).to_list()
    assert first_two == [[1, 2], [10, 2]]
    last_two = series.arr.tail(2, as_array=as_array).to_list()
    assert last_two == [[3, 4], [1, 2]]

    # Asking for more elements than the width returns the rows unchanged.
    assert series.arr.tail(10, as_array=as_array).to_list() == rows
    assert series.arr.head(10, as_array=as_array).to_list() == rows

    middle = series.arr.slice(1, 2, as_array=as_array).to_list()
    assert middle == [[2, 3], [2, 1]]
    before_start = series.arr.slice(-5, 2, as_array=as_array).to_list()
    assert before_start == [[1], [10]]

    # TODO: there is a bug in list.slice that does not allow negative values for head
    if not as_array:
        return

    negative_tails = {
        -1: [[2, 3, 4], [2, 1, 2]],
        -2: [[3, 4], [1, 2]],
        -3: [[4], [2]],
    }
    for n, expected in negative_tails.items():
        assert series.arr.tail(n, as_array=as_array).to_list() == expected

    negative_heads = {
        -1: [[1, 2, 3], [10, 2, 1]],
        -2: [[1, 2], [10, 2]],
        -3: [[1], [10]],
    }
    for n, expected in negative_heads.items():
        assert series.arr.head(n, as_array=as_array).to_list() == expected
163

164

165
def test_arr_unique() -> None:
    """Check that ``arr.unique(maintain_order=True)`` drops duplicates per row."""
    column = pl.Series("a", [[1, 1], [4, 3]], dtype=pl.Array(pl.Int64, 2))
    frame = pl.DataFrame({"a": column})

    deduped = frame.select(pl.col("a").arr.unique(maintain_order=True))

    assert_frame_equal(deduped, pl.DataFrame({"a": [[1], [4, 3]]}))
173

174

175
def test_array_any_all() -> None:
    """Check ``arr.any``/``arr.all`` on booleans and their error on integers."""
    bools = pl.Series(
        [[True, True], [False, True], [False, False], [None, None], None],
        dtype=pl.Array(pl.Boolean, 2),
    )

    assert_series_equal(
        bools.arr.any(), pl.Series([True, True, False, False, None])
    )
    assert_series_equal(
        bools.arr.all(), pl.Series([True, False, False, True, None])
    )

    # Non-boolean inner dtype must be rejected by both reductions.
    ints = pl.Series([[1, 2], [3, 4], [5, 6]], dtype=pl.Array(pl.Int64, 2))
    for method_name in ("any", "all"):
        with pytest.raises(ComputeError, match="expected boolean elements in array"):
            getattr(ints.arr, method_name)()
192

193

194
def test_array_sort() -> None:
    """Check ``arr.sort`` for both directions and both null placements."""
    u32x3 = pl.Array(pl.UInt32, 3)
    unsorted = pl.Series([[2, None, 1], [1, 3, 2]], dtype=u32x3)

    assert_series_equal(
        unsorted.arr.sort(descending=True),
        pl.Series([[None, 2, 1], [3, 2, 1]], dtype=u32x3),
    )
    assert_series_equal(
        unsorted.arr.sort(descending=False),
        pl.Series([[None, 1, 2], [1, 2, 3]], dtype=u32x3),
    )

    # test nulls_last
    i8x3 = pl.Array(pl.Int8, 3)
    signed = pl.Series([[None, 1, 2], [-1, None, 9]], dtype=i8x3)
    nulls_at_end = pl.Series([[1, 2, None], [-1, 9, None]], dtype=i8x3)
    nulls_at_start = pl.Series([[None, 1, 2], [None, -1, 9]], dtype=i8x3)
    assert_series_equal(signed.arr.sort(nulls_last=True), nulls_at_end)
    assert_series_equal(signed.arr.sort(nulls_last=False), nulls_at_start)
215

216

217
def test_array_reverse() -> None:
    """Check that ``arr.reverse`` flips each row, keeping nulls in place."""
    u32x3 = pl.Array(pl.UInt32, 3)
    original = pl.Series([[2, None, 1], [1, None, 2]], dtype=u32x3)

    flipped = original.arr.reverse()

    assert_series_equal(
        flipped, pl.Series([[1, None, 2], [2, None, 1]], dtype=u32x3)
    )
223

224

225
def test_array_arg_min_max() -> None:
    """Check ``arr.arg_min``/``arr.arg_max`` positions in the index dtype."""
    series = pl.Series("a", [[1, 2, 4], [3, 2, 1]], dtype=pl.Array(pl.UInt32, 3))

    min_positions = pl.Series("a", [0, 2], dtype=pl.get_index_type())
    assert_series_equal(series.arr.arg_min(), min_positions)

    max_positions = pl.Series("a", [2, 0], dtype=pl.get_index_type())
    assert_series_equal(series.arr.arg_max(), max_positions)
231

232

233
def test_array_get() -> None:
    """Exercise ``arr.get`` with ``null_on_oob=False``.

    In-bounds literal, null, negative and per-row indices must return the
    selected element; any out-of-bounds index must raise ``ComputeError``.
    Calls that are expected to raise are not bound to a name, since no value
    is ever produced.
    """
    s = pl.Series(
        "a",
        [[1, 2, 3, 4], [5, 6, None, None], [7, 8, 9, 10]],
        dtype=pl.Array(pl.Int64, 4),
    )

    # Test index literal.
    out = s.arr.get(1, null_on_oob=False)
    expected = pl.Series("a", [2, 6, 8], dtype=pl.Int64)
    assert_series_equal(out, expected)

    # Null index literal.
    out_df = s.to_frame().select(pl.col.a.arr.get(pl.lit(None), null_on_oob=False))
    expected_df = pl.Series("a", [None, None, None], dtype=pl.Int64).to_frame()
    assert_frame_equal(out_df, expected_df)

    # Out-of-bounds index literal.
    with pytest.raises(ComputeError, match="get index is out of bounds"):
        s.arr.get(100, null_on_oob=False)

    # Negative index literal.
    out = s.arr.get(-2, null_on_oob=False)
    expected = pl.Series("a", [3, None, 9], dtype=pl.Int64)
    assert_series_equal(out, expected)

    # Test index expr.
    with pytest.raises(ComputeError, match="get index is out of bounds"):
        s.arr.get(pl.Series([1, -2, 100]), null_on_oob=False)

    out = s.arr.get(pl.Series([1, -2, 0]), null_on_oob=False)
    expected = pl.Series("a", [2, None, 7], dtype=pl.Int64)
    assert_series_equal(out, expected)

    # Test logical type.
    s = pl.Series(
        "a",
        [
            [datetime.date(1999, 1, 1), datetime.date(2000, 1, 1)],
            [datetime.date(2001, 10, 1), None],
            [None, None],
        ],
        dtype=pl.Array(pl.Date, 2),
    )
    with pytest.raises(ComputeError, match="get index is out of bounds"):
        s.arr.get(pl.Series([1, -2, 4]), null_on_oob=False)
279

280

281
def test_array_get_null_on_oob() -> None:
    """Exercise ``arr.get`` with ``null_on_oob=True``.

    Same inputs as the strict variant of this test, but out-of-bounds indices
    are expected to produce nulls instead of raising.
    """
    ints = pl.Series(
        "a",
        [[1, 2, 3, 4], [5, 6, None, None], [7, 8, 9, 10]],
        dtype=pl.Array(pl.Int64, 4),
    )

    # Test index literal.
    assert_series_equal(
        ints.arr.get(1, null_on_oob=True),
        pl.Series("a", [2, 6, 8], dtype=pl.Int64),
    )

    # Null index literal.
    null_index_expr = pl.col.a.arr.get(pl.lit(None), null_on_oob=True)
    assert_frame_equal(
        ints.to_frame().select(null_index_expr),
        pl.Series("a", [None, None, None], dtype=pl.Int64).to_frame(),
    )

    # Out-of-bounds index literal.
    assert_series_equal(
        ints.arr.get(100, null_on_oob=True),
        pl.Series("a", [None, None, None], dtype=pl.Int64),
    )

    # Negative index literal.
    assert_series_equal(
        ints.arr.get(-2, null_on_oob=True),
        pl.Series("a", [3, None, 9], dtype=pl.Int64),
    )

    # Test index expr.
    per_row_index = pl.Series([1, -2, 100])
    assert_series_equal(
        ints.arr.get(per_row_index, null_on_oob=True),
        pl.Series("a", [2, None, None], dtype=pl.Int64),
    )

    # Test logical type.
    dates = pl.Series(
        "a",
        [
            [datetime.date(1999, 1, 1), datetime.date(2000, 1, 1)],
            [datetime.date(2001, 10, 1), None],
            [None, None],
        ],
        dtype=pl.Array(pl.Date, 2),
    )
    picked = dates.arr.get(pl.Series([1, -2, 4]), null_on_oob=True)
    expected_dates = pl.Series(
        "a",
        [datetime.date(2000, 1, 1), datetime.date(2001, 10, 1), None],
        dtype=pl.Date,
    )
    assert_series_equal(picked, expected_dates)
330

331

332
def test_arr_first_last() -> None:
    """Check that ``arr.first``/``arr.last`` return the edge elements as-is.

    A null in the first or last position is returned as null rather than
    being skipped.
    """
    series = pl.Series(
        "a",
        [[1, 2, 3], [None, 5, 6], [None, None, None]],
        dtype=pl.Array(pl.Int64, 3),
    )

    heads = series.arr.first()
    assert_series_equal(heads, pl.Series("a", [1, None, None], dtype=pl.Int64))

    tails = series.arr.last()
    assert_series_equal(tails, pl.Series("a", [3, 6, None], dtype=pl.Int64))
354

355

356
@pytest.mark.parametrize(
    ("data", "candidates", "dtype"),
    [
        ([1, 2], [[1, 2], [3, 4]], pl.Int64),
        ([True, False], [[True, False], [True, True]], pl.Boolean),
        (["a", "b"], [["a", "b"], ["c", "d"]], pl.String),
        ([b"a", b"b"], [[b"a", b"b"], [b"c", b"d"]], pl.Binary),
        (
            [{"a": 1}, {"a": 2}],
            [[{"a": 1}, {"a": 2}], [{"b": 1}, {"a": 3}]],
            pl.Struct([pl.Field("a", pl.Int64)]),
        ),
    ],
)
def test_is_in_array(
    data: list[Any], candidates: list[list[Any]], dtype: pl.DataType
) -> None:
    """Check ``Expr.is_in`` against an Array column of the same inner dtype.

    Parameters
    ----------
    data
        Scalar values, one per row, stored in column ``a``.
    candidates
        Per-row arrays of width 2 to search, stored in column ``arr``. The
        name avoids shadowing the builtin ``set``; it is only bound by the
        parametrize decorator above.
    dtype
        Inner dtype shared by ``a`` and the elements of ``arr``.
    """
    df = pl.DataFrame(
        {"a": data, "arr": candidates},
        schema={"a": dtype, "arr": pl.Array(dtype, 2)},
    )
    out = df.select(is_in=pl.col("a").is_in(pl.col("arr"))).to_series()
    # In every case the first row's value is present and the second is not.
    expected = pl.Series("is_in", [True, False])
    assert_series_equal(out, expected)
378

379

380
def test_array_join() -> None:
    """Check ``arr.join`` with literal and column separators.

    Covers the default call as well as both settings of ``ignore_nulls``.
    """
    elements = pl.col("a")
    separators = pl.col("separator")

    frame = pl.DataFrame(
        {
            "a": [["ab", "c", "d"], ["e", "f", "g"], [None, None, None], None],
            "separator": ["&", None, "*", "_"],
        },
        schema={"a": pl.Array(pl.String, 3), "separator": pl.String},
    )
    joined = frame.select(elements.arr.join("-")).to_dict(as_series=False)
    assert joined == {"a": ["ab-c-d", "e-f-g", "", None]}
    joined = frame.select(elements.arr.join(separators)).to_dict(as_series=False)
    assert joined == {"a": ["ab&c&d", None, "", None]}

    # test ignore_nulls argument
    frame = pl.DataFrame(
        {
            "a": [
                ["a", None, "b", None],
                None,
                [None, None, None, None],
                ["c", "d", "e", "f"],
            ],
            "separator": ["-", "&", " ", "@"],
        },
        schema={"a": pl.Array(pl.String, 4), "separator": pl.String},
    )

    # ignore nulls
    joined = frame.select(elements.arr.join("-", ignore_nulls=True)).to_dict(
        as_series=False
    )
    assert joined == {"a": ["a-b", None, "", "c-d-e-f"]}
    joined = frame.select(elements.arr.join(separators, ignore_nulls=True)).to_dict(
        as_series=False
    )
    assert joined == {"a": ["a-b", None, "", "c@d@e@f"]}

    # propagate nulls
    joined = frame.select(elements.arr.join("-", ignore_nulls=False)).to_dict(
        as_series=False
    )
    assert joined == {"a": [None, None, None, "c-d-e-f"]}
    joined = frame.select(elements.arr.join(separators, ignore_nulls=False)).to_dict(
        as_series=False
    )
    assert joined == {"a": [None, None, None, "c@d@e@f"]}
422

423

424
def test_array_explode() -> None:
    """Check ``arr.explode`` for string, nested-list and date arrays.

    A null row is expected to explode into a single null.
    """
    date = datetime.date

    frame = pl.DataFrame(
        {
            "str": [["a", "b"], ["c", None], None],
            "nested": [[[1, 2], [3]], [[], [4, None]], None],
            "logical": [
                [date(1998, 1, 1), date(2000, 10, 1)],
                [date(2024, 1, 1), None],
                None,
            ],
        },
        schema={
            "str": pl.Array(pl.String, 2),
            "nested": pl.Array(pl.List(pl.Int64), 2),
            "logical": pl.Array(pl.Date, 2),
        },
    )
    exploded = frame.select(pl.all().arr.explode())
    expected_frame = pl.DataFrame(
        {
            "str": ["a", "b", "c", None, None],
            "nested": [[1, 2], [3], [], [4, None], None],
            "logical": [
                date(1998, 1, 1),
                date(2000, 10, 1),
                date(2024, 1, 1),
                None,
                None,
            ],
        }
    )
    assert_frame_equal(exploded, expected_frame)

    # test no-null fast path
    no_nulls = pl.Series(
        [
            [date(1998, 1, 1), date(1999, 1, 3)],
            [date(2000, 1, 1), date(2023, 10, 1)],
        ],
        dtype=pl.Array(pl.Date, 2),
    )
    flattened = no_nulls.arr.explode()
    expected_series = pl.Series(
        [
            date(1998, 1, 1),
            date(1999, 1, 3),
            date(2000, 1, 1),
            date(2023, 10, 1),
        ],
        dtype=pl.Date,
    )
    assert_series_equal(flattened, expected_series)
476

477

478
@pytest.mark.parametrize(
    ("arr", "data", "expected", "dtype"),
    [
        ([[1, 2], [3, None], None], 1, [1, 0, None], pl.Int64),
        ([[True, False], [True, None], None], True, [1, 1, None], pl.Boolean),
        ([["a", "b"], ["c", None], None], "a", [1, 0, None], pl.String),
        ([[b"a", b"b"], [b"c", None], None], b"a", [1, 0, None], pl.Binary),
    ],
)
def test_array_count_matches(
    arr: list[list[Any] | None], data: Any, expected: list[Any], dtype: pl.DataType
) -> None:
    """Check ``arr.count_matches`` per row for several inner dtypes.

    A null row is expected to give a null count.
    """
    frame = pl.DataFrame({"arr": arr}, schema={"arr": pl.Array(dtype, 2)})
    match_expr = pl.col("arr").arr.count_matches(data)
    counts = frame.select(count_matches=match_expr)
    assert counts.to_dict(as_series=False) == {"count_matches": expected}
493

494

495
def test_array_count_matches_wildcard_expansion() -> None:
    """Check that ``pl.all().arr.count_matches`` applies to every column."""
    i64x2 = pl.Array(pl.Int64, 2)
    frame = pl.DataFrame(
        {"a": [[1, 2]], "b": [[3, 4]]},
        schema={"a": i64x2, "b": i64x2},
    )

    counts = frame.select(pl.all().arr.count_matches(3)).to_dict(as_series=False)

    assert counts == {"a": [0], "b": [1]}
504

505

506
def test_array_to_struct() -> None:
    """Check ``arr.to_struct`` with default and callable field names."""
    schema = {"a": pl.Array(pl.Int8, 3)}

    # Default field names are ``field_<index>``.
    frame = pl.DataFrame({"a": [[1, 2, 3], [4, 5, None]]}, schema=schema)
    default_named = frame.select([pl.col("a").arr.to_struct()]).to_series()
    assert default_named.to_list() == [
        {"field_0": 1, "field_1": 2, "field_2": 3},
        {"field_0": 4, "field_1": 5, "field_2": None},
    ]

    # Field names produced by a callable that receives the element index.
    frame = pl.DataFrame({"a": [[1, 2, None], [1, 2, 3]]}, schema=schema)
    custom_named = frame.select(
        pl.col("a").arr.to_struct(fields=lambda idx: f"col_name_{idx}")
    ).to_series()
    assert custom_named.to_list() == [
        {"col_name_0": 1, "col_name_1": 2, "col_name_2": None},
        {"col_name_0": 1, "col_name_1": 2, "col_name_2": 3},
    ]

    # The struct fields must be unnestable in a lazy query.
    unnested = frame.lazy().select(pl.col("a").arr.to_struct()).unnest("a")
    assert unnested.sum().collect().columns == ["field_0", "field_1", "field_2"]
528

529

530
def test_array_shift() -> None:
    """Check ``arr.shift`` with a literal amount and a per-row column.

    A null array row or a null shift amount is expected to give a null row.
    """
    i64x3 = pl.Array(pl.Int64, 3)
    frame = pl.DataFrame(
        {"a": [[1, 2, 3], None, [4, 5, 6], [7, 8, 9]], "n": [None, 1, 1, -2]},
        schema={"a": i64x3, "n": pl.Int64},
    )

    by_literal = pl.col("a").arr.shift(1)
    by_column = pl.col("a").arr.shift(pl.col("n"))
    shifted = frame.select(lit=by_literal, expr=by_column)

    expected = pl.DataFrame(
        {
            "lit": [[None, 1, 2], None, [None, 4, 5], [None, 7, 8]],
            "expr": [None, None, [None, 4, 5], [9, None, None]],
        },
        schema={"lit": i64x3, "expr": i64x3},
    )
    assert_frame_equal(shifted, expected)
547

548

549
def test_array_n_unique() -> None:
    """Check ``arr.n_unique`` per row, returned in the index dtype.

    An all-null row counts as one distinct value; a null row gives null.
    """
    frame = pl.DataFrame(
        {"a": [[1, 1, 2], [3, 3, 3], [None, None, None], None]},
        schema={"a": pl.Array(pl.Int64, 3)},
    )

    distinct_counts = frame.select(n_unique=pl.col("a").arr.n_unique())

    expected = pl.DataFrame(
        {"n_unique": [2, 1, 1, None]},
        schema={"n_unique": pl.get_index_type()},
    )
    assert_frame_equal(distinct_counts, expected)
562

563

564
def test_explode_19049() -> None:
    """Check ``arr.explode`` on an Array column and its error on ``Int64``.

    The test name references issue 19049.
    """
    array_frame = pl.DataFrame(
        {"a": [[1, 2, 3]]}, schema={"a": pl.Array(pl.Int64, 3)}
    )
    flat_frame = pl.DataFrame({"a": [1, 2, 3]}, schema={"a": pl.Int64})

    assert_frame_equal(array_frame.select(pl.col.a.arr.explode()), flat_frame)

    # The same expression on a non-array column must be rejected.
    expected_message = "expected Array datatype for array operation, got: Int64"
    with pytest.raises(InvalidOperationError, match=expected_message):
        flat_frame.select(pl.col.a.arr.explode())
576

577

578
def test_array_join_unequal_lengths_22018() -> None:
    """Check that ``arr.join`` rejects a separator series of mismatched length.

    Three array rows are joined with a two-element separator series.
    """
    rows = [
        ["a", "b", "d"],
        ["ya", "x", "y"],
        ["ya", "x", "y"],
    ]
    frame = pl.DataFrame([pl.Series("a", rows, pl.Array(pl.String, 3))])
    two_separators = pl.Series([",", "-"])

    with pytest.raises(pl.exceptions.ShapeError):
        frame.select(pl.col.a.arr.join(two_separators))
594

595

596
def test_array_shift_unequal_lengths_22018() -> None:
    """Check that ``arr.shift`` rejects a shift series of mismatched length.

    Three array rows are shifted by a two-element series.
    """
    rows = [["a", "b", "d"] for _ in range(3)]

    # Construction stays inside the context manager, as in the original test.
    with pytest.raises(pl.exceptions.ShapeError):
        three_rows = pl.Series("a", rows, pl.Array(pl.String, 3))
        three_rows.arr.shift(pl.Series([1, 2]))
607

608

609
def test_array_shift_self_broadcast_22124() -> None:
    """Check that a single-row array series broadcasts over the shift series.

    One array row shifted by a two-element series gives two output rows.
    """
    str_x3 = pl.Array(pl.String, 3)
    single_row = pl.Series("a", [["a", "b", "d"]], str_x3)

    shifted = single_row.arr.shift(pl.Series([1, 2]))

    expected = pl.Series(
        "a",
        [
            [None, "a", "b"],
            [None, None, "a"],
        ],
        str_x3,
    )
    assert_series_equal(shifted, expected)
627

628

629
def test_arr_contains() -> None:
    """Check ``arr.contains`` for null and non-null items under ``nulls_equal``."""
    series = pl.Series(
        [[1, 2, None], [None, None, None], None], dtype=pl.Array(pl.Int64, 3)
    )

    # (item, nulls_equal, expected result per row)
    cases = [
        (None, False, [None, None, None]),
        (None, True, [True, True, None]),
        (1, False, [True, False, None]),
        (1, True, [True, False, None]),
    ]
    for item, nulls_equal, expected in cases:
        assert_series_equal(
            series.arr.contains(item, nulls_equal=nulls_equal),
            pl.Series(expected, dtype=pl.Boolean),
        )
648

649

650
@pytest.mark.parametrize(
    "expr",
    [
        pl.col("a").arr.contains("z"),
        pl.col("a").arr.explode(),
        pl.col("a").arr.sum(),
        pl.col("a").arr.to_list(),
        pl.col("a").arr.to_struct(),
        pl.col("a").arr.unique(),
        pl.col("a").arr.all(),
        pl.col("a").arr.any(),
        pl.col("a").arr.arg_max(),
        pl.col("a").arr.arg_min(),
        pl.col("a").arr.count_matches("z"),
        pl.col("a").arr.first(),
        pl.col("a").arr.get(0),
        pl.col("a").arr.join(""),
        pl.col("a").arr.last(),
        pl.col("a").arr.len(),
        pl.col("a").arr.max(),
        pl.col("a").arr.mean(),
        pl.col("a").arr.median(),
        pl.col("a").arr.min(),
        pl.col("a").arr.n_unique(),
        pl.col("a").arr.reverse(),
        pl.col("a").arr.shift(1),
        pl.col("a").arr.sort(),
        pl.col("a").arr.std(),
        pl.col("a").arr.var(),
    ],
)
def test_schema_non_array(expr: pl.Expr) -> None:
    """Check that schema resolution rejects ``arr`` operations on a String column.

    The error must already surface from ``collect_schema``, without
    collecting the frame.
    """
    string_frame = pl.LazyFrame({"a": ["a", "b", "c"]})
    expected_message = "expected Array datatype for array operation, got: String"

    with pytest.raises(InvalidOperationError, match=expected_message):
        string_frame.select(expr).collect_schema()
689

690

691
def test_array_get_broadcast_26217() -> None:
    """Check ``arr.get`` on a literal array indexed by a column.

    The single literal array is expected to be looked up once per index row.
    """
    indices = pl.DataFrame({"idx": [0, 1, 2, 1, 2, 0, 1]})
    lookup = pl.lit([42, 13, 37], pl.Array(pl.UInt8, 3))

    picked = indices.select(lookup.arr.get(pl.col.idx))

    expected = pl.DataFrame(
        {"literal": [42, 13, 37, 13, 37, 42, 13]},
        schema={"literal": pl.UInt8},
    )
    assert_frame_equal(picked, expected)
698

699
Product

Resources

Company