CoCalc -- test_getitem.py

GitHub Repository: pola-rs/polars
Path: blob/main/py-polars/tests/unit/dataframe/test_getitem.py
6939 views
1
from __future__ import annotations
2

3
from typing import Any
4

5
import hypothesis.strategies as st
6
import numpy as np
7
import pytest
8
from hypothesis import given
9

10
import polars as pl
11
from polars.testing import assert_frame_equal, assert_series_equal
12
from polars.testing.parametric import column, dataframes
13
from tests.unit.conftest import INTEGER_DTYPES, SIGNED_INTEGER_DTYPES
14

15

16
def _nullable_int8_column(name: str, values: st.SearchStrategy[int]) -> column:
    """Describe a nullable Int8 column called ``name`` that draws from ``values``."""
    return column(name, dtype=pl.Int8, allow_null=True, strategy=values)


@given(
    df=dataframes(
        max_size=10,
        cols=[
            _nullable_int8_column("start", st.integers(min_value=-8, max_value=8)),
            _nullable_int8_column("stop", st.integers(min_value=-6, max_value=6)),
            _nullable_int8_column(
                "step",
                # a slice step of zero is invalid, so it is filtered out
                st.integers(min_value=-4, max_value=4).filter(lambda x: x != 0),
            ),
            column("misc", dtype=pl.Int32),
        ],
    )
    # generated dataframe example -
    # ┌───────┬──────┬──────┬───────┐
    # │ start ┆ stop ┆ step ┆ misc  │
    # │ ---   ┆ ---  ┆ ---  ┆ ---   │
    # │ i8    ┆ i8   ┆ i8   ┆ i32   │
    # ╞═══════╪══════╪══════╪═══════╡
    # │ 2     ┆ -1   ┆ null ┆ -55   │
    # │ -3    ┆ 0    ┆ -2   ┆ 61582 │
    # │ null  ┆ 1    ┆ 2    ┆ 5865  │
    # └───────┴──────┴──────┴───────┘
)
def test_df_getitem_row_slice(df: pl.DataFrame) -> None:
    """Row-slicing a frame must agree with slicing its rows as a Python list.

    Each generated row supplies one (start, stop, step) triple. That triple is
    applied both to the frame itself and to the list returned by ``df.rows()``,
    and the two results are compared.
    """
    reference_rows = df.rows()

    for start, stop, step, _ in reference_rows:
        bounds = slice(start, stop, step)
        expected_rows = reference_rows[bounds]
        actual_rows = df[bounds].rows()

        assert expected_rows == actual_rows, (
            f"slice [{start}:{stop}:{step}] failed on df w/len={df.height}"
        )
72

73

74
def test_df_getitem_col_single_name() -> None:
    """A single column name in the column position yields that column as a Series."""
    frame = pl.DataFrame({"a": [1, 2], "b": [3, 4]})
    assert_series_equal(frame[:, "a"], frame.select("a").to_series())
79

80

81
@pytest.mark.parametrize(
    ("input", "expected_cols"),
    [
        (["a"], ["a"]),
        (["a", "d"], ["a", "d"]),
        (slice("b", "d"), ["b", "c", "d"]),
        (pl.Series(["a", "b"]), ["a", "b"]),
        (np.array(["c", "d"]), ["c", "d"]),
    ],
)
def test_df_getitem_col_multiple_names(input: Any, expected_cols: list[str]) -> None:
    """Several column names (list, slice, Series, NumPy array) select a sub-frame."""
    frame = pl.DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6], "d": [7, 8]})
    assert_frame_equal(frame[:, input], frame.select(expected_cols))
96

97

98
def test_df_getitem_col_single_index() -> None:
    """A single integer in the column position yields that column as a Series."""
    frame = pl.DataFrame({"a": [1, 2], "b": [3, 4]})
    assert_series_equal(frame[:, 1], frame.select("b").to_series())
103

104

105
def test_df_getitem_col_two_entries() -> None:
    """Two-element keys of names or of booleans both return the full two-column frame."""
    frame = pl.DataFrame({"x": [1.0], "y": [1.0]})

    # `frame[a, b]` passes the tuple `(a, b)` as the key, so indexing with the
    # tuple directly is the same operation.
    for key in (("x", "y"), (True, True)):
        assert_frame_equal(frame[key], frame)
110

111

112
@pytest.mark.parametrize(
    ("input", "expected_cols"),
    [
        ([0], ["a"]),
        ([0, 3], ["a", "d"]),
        (slice(1, 4), ["b", "c", "d"]),
        (pl.Series([0, 1]), ["a", "b"]),
        (np.array([2, 3]), ["c", "d"]),
    ],
)
def test_df_getitem_col_multiple_indices(input: Any, expected_cols: list[str]) -> None:
    """Several column positions (list, slice, Series, NumPy array) select a sub-frame."""
    frame = pl.DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6], "d": [7, 8]})
    assert_frame_equal(frame[:, input], frame.select(expected_cols))
127

128

129
@pytest.mark.parametrize(
    "mask",
    [
        [True, False, True],
        pl.Series([True, False, True]),
        np.array([True, False, True]),
    ],
)
def test_df_getitem_col_boolean_mask(mask: Any) -> None:
    """A boolean mask in the column position keeps the columns flagged True."""
    frame = pl.DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6]})
    assert_frame_equal(frame[:, mask], frame.select("a", "c"))
142

143

144
@pytest.mark.parametrize(
    ("rng", "expected_cols"),
    [
        (range(2), ["a", "b"]),
        (range(1, 4), ["b", "c", "d"]),
        (range(3, 0, -2), ["d", "b"]),
    ],
)
def test_df_getitem_col_range(rng: range, expected_cols: list[str]) -> None:
    """A ``range`` in the column position selects columns by position, in range order."""
    frame = pl.DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6], "d": [7, 8]})
    assert_frame_equal(frame[:, rng], frame.select(expected_cols))
157

158

159
@pytest.mark.parametrize(
    "input",
    [
        [],
        (),
        pl.Series(dtype=pl.Int64),
        np.array([], dtype=np.uint32),
    ],
)
def test_df_getitem_col_empty_inputs(input: Any) -> None:
    """An empty column selector yields an empty DataFrame."""
    frame = pl.DataFrame({"a": [1, 2], "b": [3.0, 4.0]})
    assert_frame_equal(frame[:, input], pl.DataFrame())
167

168

169
@pytest.mark.parametrize(
    ("input", "match"),
    [
        (
            [0.0, 1.0],
            "cannot select columns using Sequence with elements of type 'float'",
        ),
        (
            pl.Series([[1, 2], [3, 4]]),
            "cannot select columns using Series of type List\\(Int64\\)",
        ),
        (
            np.array([0.0, 1.0]),
            "cannot select columns using NumPy array of type float64",
        ),
        (object(), "cannot select columns using key of type 'object'"),
    ],
)
def test_df_getitem_col_invalid_inputs(input: Any, match: str) -> None:
    """Column selectors of an unsupported type raise TypeError with a specific message."""
    frame = pl.DataFrame({"a": [1, 2], "b": [3.0, 4.0]})

    # `match` is applied as a regular expression, hence the escaped parentheses
    # in the List(Int64) case above.
    with pytest.raises(TypeError, match=match):
        frame[:, input]
191

192

193
@pytest.mark.parametrize(
    ("input", "match"),
    [
        (["a", 2], "'int' object cannot be converted to 'PyString'"),
        ([1, "c"], "'str' object cannot be interpreted as an integer"),
    ],
)
def test_df_getitem_col_mixed_inputs(input: list[Any], match: str) -> None:
    """A column selector list mixing names and positions raises TypeError."""
    frame = pl.DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6]})

    with pytest.raises(TypeError, match=match):
        frame[:, input]
204

205

206
@pytest.mark.parametrize(
    ("input", "match"),
    [
        ([0.0, 1.0], "unexpected value while building Series of type Int64"),
        (
            pl.Series([[1, 2], [3, 4]]),
            "cannot treat Series of type List\\(Int64\\) as indices",
        ),
        (np.array([0.0, 1.0]), "cannot treat NumPy array of type float64 as indices"),
        (object(), "cannot select rows using key of type 'object'"),
    ],
)
def test_df_getitem_row_invalid_inputs(input: Any, match: str) -> None:
    """Row selectors of an unsupported type raise TypeError with a specific message."""
    frame = pl.DataFrame({"a": [1, 2], "b": [3.0, 4.0]})

    with pytest.raises(TypeError, match=match):
        frame[input, :]
222

223

224
def test_df_getitem_row_range() -> None:
    """A descending ``range`` in the row position picks rows in range order."""
    frame = pl.DataFrame({"a": [1, 2, 3, 4], "b": [5.0, 6.0, 7.0, 8.0]})

    # range(3, 0, -2) visits positions 3 and then 1
    picked = frame[range(3, 0, -2), :]

    assert_frame_equal(picked, pl.DataFrame({"a": [4, 2], "b": [8.0, 6.0]}))
229

230

231
def test_df_getitem_row_range_single_input() -> None:
    """A ``range`` used as the only key selects rows."""
    frame = pl.DataFrame({"a": [1, 2, 3, 4], "b": [5.0, 6.0, 7.0, 8.0]})

    picked = frame[range(1, 3)]

    assert_frame_equal(picked, pl.DataFrame({"a": [2, 3], "b": [6.0, 7.0]}))
236

237

238
def test_df_getitem_row_empty_list_single_input() -> None:
    """An empty list used as the only key matches ``DataFrame.clear()``."""
    frame = pl.DataFrame({"a": [1, 2], "b": [5.0, 6.0]})
    assert_frame_equal(frame[[]], frame.clear())
243

244

245
def test_df_getitem() -> None:
    """Exercise the supported ways of indexing a DataFrame with ``[]``."""
    df = pl.DataFrame({"a": [1.0, 2.0, 3.0, 4.0], "b": [3, 4, 5, 6]})

    # two slices: the first addresses rows, the second addresses columns
    assert_frame_equal(df[:, :], df)

    # a lone string is a column name
    assert_series_equal(df["a"], pl.Series("a", [1.0, 2.0, 3.0, 4.0]))

    # a lone int is a zero-based row index, so 1 picks the second row
    assert_frame_equal(df[1], pl.DataFrame({"a": [2.0], "b": [4]}))

    # (int, int): row index first, column index second
    assert df[2, 1] == 5
    assert df[2, -2] == 3.0

    # a column index outside the frame raises, counting from either end
    for bad_col in (2, -3):
        with pytest.raises(IndexError):
            df[2, bad_col]

    # (int, list[int]): row index first, column indices second
    for col_idxs in ([1, 0], [-1, -2]):
        assert_frame_equal(df[2, col_idxs], pl.DataFrame({"b": [5], "a": [3.0]}))

    # one out-of-bounds entry in the column list is enough to raise
    for bad_cols in ([2, 0], [2, -3]):
        with pytest.raises(IndexError):
            df[2, bad_cols]

    # slice with a step: every second row, starting at index 1
    assert_frame_equal(df[1::2], pl.DataFrame({"a": [2.0, 4.0], "b": [4, 6]}))

    # an empty slice keeps the columns and has no rows
    assert df[:0].columns == ["a", "b"]
    assert len(df[:0]) == 0

    # annotated separately so mypy knows the element type
    empty: list[int] = []

    # an empty row list with a column slice drops rows but keeps columns
    assert_frame_equal(df[empty, :], df[:0])

    # a single list key: strings are column names, integers are row indices
    assert_frame_equal(df[["a", "b"]], df)
    assert_frame_equal(df.select([pl.col("a"), pl.col("b")]), df)
    assert_frame_equal(
        df[[1, -4, -1, 2, 1]],
        pl.DataFrame({"a": [2.0, 1.0, 4.0, 3.0, 2.0], "b": [4, 3, 6, 5, 4]}),
    )

    # a string Series selects columns
    assert_frame_equal(df[pl.Series("", ["a", "b"])], df)

    # an integer Series selects rows: non-negative or empty indices, any int dtype
    for pl_dtype in INTEGER_DTYPES:
        row_idxs = pl.Series("", [1, 0, 3, 2, 3, 0], dtype=pl_dtype)
        assert_frame_equal(
            df[row_idxs],
            pl.DataFrame(
                {"a": [2.0, 1.0, 4.0, 3.0, 4.0, 1.0], "b": [4, 3, 6, 5, 6, 3]}
            ),
        )
        assert df[pl.Series("", [], dtype=pl_dtype)].columns == ["a", "b"]

    # an integer Series selects rows: mixed-sign indices, signed dtypes only
    for pl_dtype in SIGNED_INTEGER_DTYPES:
        row_idxs = pl.Series("", [-1, 0, -3, -2, 3, -4], dtype=pl_dtype)
        assert_frame_equal(
            df[row_idxs],
            pl.DataFrame(
                {"a": [4.0, 1.0, 2.0, 3.0, 4.0, 1.0], "b": [6, 3, 4, 5, 6, 3]}
            ),
        )

    # boolean keys in the row position raise TypeError
    with pytest.raises(TypeError):
        df[[True, False, True], [False, True]]
    with pytest.raises(TypeError):
        df[pl.Series([True, False, True]), "b"]

    # a lone boolean NumPy array gives the same frame as slicing the first column
    assert_frame_equal(df[np.array([True, False])], df[:, :1])

    # a boolean column mask whose length differs from the frame width raises
    with pytest.raises(
        ValueError,
        match=f"expected {df.width} values when selecting columns by boolean mask",
    ):
        df[:, [True, False, True]]
345

346

347
def test_df_getitem_numpy() -> None:
    """Index a DataFrame with NumPy arrays of assorted dtypes and shapes."""
    df = pl.DataFrame({"a": [1.0, 2.0, 3.0, 4.0], "b": [3, 4, 5, 6]})

    signed_dtypes = (np.int8, np.int16, np.int32, np.int64)
    unsigned_dtypes = (np.uint8, np.uint16, np.uint32, np.uint64)

    # integer arrays select rows: non-negative or empty indices, any int dtype
    for np_dtype in (*signed_dtypes, *unsigned_dtypes):
        row_idxs = np.array([1, 0, 3, 2, 3, 0], dtype=np_dtype)
        assert_frame_equal(
            df[row_idxs],
            pl.DataFrame(
                {"a": [2.0, 1.0, 4.0, 3.0, 4.0, 1.0], "b": [4, 3, 6, 5, 6, 3]}
            ),
        )
        assert df[np.array([], dtype=np_dtype)].columns == ["a", "b"]

    # integer arrays select rows: mixed-sign indices, signed dtypes only
    for np_dtype in signed_dtypes:
        row_idxs = np.array([-1, 0, -3, -2, 3, -4], dtype=np_dtype)
        assert_frame_equal(
            df[row_idxs],
            pl.DataFrame(
                {"a": [4.0, 1.0, 2.0, 3.0, 4.0, 1.0], "b": [6, 3, 4, 5, 6, 3]}
            ),
        )

    # a zero-dimensional array selects a single row, like a plain int
    assert_frame_equal(df[np.array(0)], pl.DataFrame({"a": [1.0], "b": [3]}))
    assert_frame_equal(df[np.array(1)], pl.DataFrame({"a": [2.0], "b": [4]}))

    # float arrays are rejected, even when the values are whole numbers
    with pytest.raises(
        TypeError,
        match="cannot select columns using NumPy array of type float",
    ):
        _ = df[np.array([1.0])]

    # arrays with more than one dimension are rejected
    with pytest.raises(
        TypeError,
        match="multi-dimensional NumPy arrays not supported as index",
    ):
        df[np.array([[0], [1]])]
395

396

397
def test_df_getitem_extended() -> None:
    """Cover a broad mix of row/column selections on a small three-column frame."""
    frame = pl.DataFrame(
        {"a": [1, 2, 3], "b": [1.0, 2.0, 3.0], "c": ["a", "b", "c"]}
    )

    # row slice combined with a column slice or a column-name list
    assert frame[:2, :1].rows() == [(1,), (2,)]
    assert frame[:2, ["a"]].rows() == [(1,), (2,)]

    # a lone string selects a column
    for name, values in (
        ("a", [1, 2, 3]),
        ("b", [1.0, 2.0, 3.0]),
        ("c", ["a", "b", "c"]),
    ):
        assert frame[name].to_list() == values

    # a lone integer selects a row
    assert_frame_equal(frame[0], pl.DataFrame({"a": [1], "b": [1.0], "c": ["a"]}))
    assert_frame_equal(frame[-1], pl.DataFrame({"a": [3], "b": [3.0], "c": ["c"]}))

    # two-dimensional keys: rows first, then a single column
    assert frame[:, "a"].to_list() == [1, 2, 3]
    assert frame[:, 1].to_list() == [1.0, 2.0, 3.0]
    assert frame[:2, 2].to_list() == ["a", "b"]

    # a lone list of integers selects rows, in the order given
    assert_frame_equal(
        frame[[1, 2]],
        pl.DataFrame({"a": [2, 3], "b": [2.0, 3.0], "c": ["b", "c"]}),
    )
    assert_frame_equal(
        frame[[-1, -2]],
        pl.DataFrame({"a": [3, 2], "b": [3.0, 2.0], "c": ["c", "b"]}),
    )

    assert frame[["a", "b"]].columns == ["a", "b"]
    assert_frame_equal(
        frame[[1, 2], [1, 2]],
        pl.DataFrame({"b": [2.0, 3.0], "c": ["b", "c"]}),
    )

    # (int, int) returns the single value at that position
    assert frame[1, 2] == "b"
    assert frame[1, 1] == 2.0
    assert frame[2, 0] == 3

    assert frame[[2], ["a", "b"]].rows() == [(3, 3.0)]
    assert frame.to_series(0).name == "a"
    assert (frame["a"] == frame["a"]).sum() == 3
    assert (frame["c"] == frame["a"].cast(str)).sum() == 0

    # column-name slices include both endpoints
    assert frame[:, "a":"b"].rows() == [(1, 1.0), (2, 2.0), (3, 3.0)]  # type: ignore[index, misc]
    assert frame[:, "a":"c"].columns == ["a", "b", "c"]  # type: ignore[index, misc]
    assert frame[:, []].shape == (0, 0)

    # a list in the column position yields a frame, even with an int row key
    assert_frame_equal(frame[1, [2]], pl.DataFrame({"c": ["b"]}))
    assert_frame_equal(frame[[0, 2], [1]], pl.DataFrame({"b": [1.0, 3.0]}))

    # (int, str) returns the single value at that row of the named column
    assert frame[0, "c"] == "a"
    assert frame[1, "c"] == "b"
    assert frame[2, "c"] == "c"
    assert frame[0, "a"] == 1

    # row slices with negative or non-unit steps and bounds
    assert_frame_equal(
        frame[::-1],
        pl.DataFrame({"a": [3, 2, 1], "b": [3.0, 2.0, 1.0], "c": ["c", "b", "a"]}),
    )
    assert_frame_equal(
        frame[:-1],
        pl.DataFrame({"a": [1, 2], "b": [1.0, 2.0], "c": ["a", "b"]}),
    )
    assert_frame_equal(
        frame[::2],
        pl.DataFrame({"a": [1, 3], "b": [1.0, 3.0], "c": ["a", "c"]}),
    )

    # boolean masks in the column position
    int_frame = pl.DataFrame(
        {
            "a": [1, 2],
            "b": [2, 3],
            "c": [3, 4],
        }
    )

    assert int_frame[:, [False, True, True]].columns == ["b", "c"]
    assert int_frame[:, pl.Series([False, True, True])].columns == ["b", "c"]
    assert int_frame[:, pl.Series([False, False, False])].columns == []
470

471

472
def test_df_getitem_5343() -> None:
    """Regression test for (row, column) indexing on a twelve-column frame.

    See https://github.com/pola-rs/polars/issues/5343
    """
    n_rows, n_cols = 5, 12

    # column `foo{col}` holds n**col for n = 0..4, so row 4 holds 4**col
    powers = {
        f"foo{col}": [n**col for n in range(n_rows)] for col in range(n_cols)
    }
    frame = pl.DataFrame(powers)

    assert frame[4, 4] == 256
    assert frame[4, 5] == 1024
    assert_frame_equal(frame[4, [2]], pl.DataFrame({"foo2": [16]}))
    assert_frame_equal(frame[4, [5]], pl.DataFrame({"foo5": [1024]}))
484

485

486
def test_no_deadlock_19358() -> None:
    """Selecting the first and last rows of an Object-dtype column returns them."""
    values = ["text"] * 100 + [1] * 100
    objects = pl.Series(values, dtype=pl.Object)

    # the Series is unnamed, so the resulting frame's only column is named ""
    picked = objects.to_frame()[[0, -1]]

    assert picked[""].to_list() == ["text", 1]
490

491
Product

Resources

Company