Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/py-polars/tests/unit/dataframe/test_getitem.py
6939 views
1
from __future__ import annotations
2
3
from typing import Any
4
5
import hypothesis.strategies as st
6
import numpy as np
7
import pytest
8
from hypothesis import given
9
10
import polars as pl
11
from polars.testing import assert_frame_equal, assert_series_equal
12
from polars.testing.parametric import column, dataframes
13
from tests.unit.conftest import INTEGER_DTYPES, SIGNED_INTEGER_DTYPES
14
15
16
@given(
    df=dataframes(
        max_size=10,
        cols=[
            column(
                "start",
                dtype=pl.Int8,
                allow_null=True,
                strategy=st.integers(min_value=-8, max_value=8),
            ),
            column(
                "stop",
                dtype=pl.Int8,
                allow_null=True,
                strategy=st.integers(min_value=-6, max_value=6),
            ),
            column(
                "step",
                dtype=pl.Int8,
                allow_null=True,
                # a zero step is not a valid slice step, so it is filtered out
                strategy=st.integers(min_value=-4, max_value=4).filter(
                    lambda x: x != 0
                ),
            ),
            column("misc", dtype=pl.Int32),
        ],
    )
    # example of a generated frame:
    # ┌───────┬──────┬──────┬───────┐
    # │ start ┆ stop ┆ step ┆ misc  │
    # │ ---   ┆ ---  ┆ ---  ┆ ---   │
    # │ i8    ┆ i8   ┆ i8   ┆ i32   │
    # ╞═══════╪══════╪══════╪═══════╡
    # │ 2     ┆ -1   ┆ null ┆ -55   │
    # │ -3    ┆ 0    ┆ -2   ┆ 61582 │
    # │ null  ┆ 1    ┆ 2    ┆ 5865  │
    # └───────┴──────┴──────┴───────┘
)
def test_df_getitem_row_slice(df: pl.DataFrame) -> None:
    """Row-slicing a frame must agree with slicing its rows as a Python list.

    Every row of the generated frame supplies one (start, stop, step) triple
    (nulls arrive as ``None``, i.e. an open slice bound). Each triple is used
    to slice both the frame and the plain list of its rows, and the two
    results are compared.

    Per the original author's note: with the typical frame size and the
    configured ``max_examples`` this amounts to roughly 5000 permutations and
    runs in about 1.5 seconds (hardware dependent).
    """
    reference_rows = df.rows()

    for start, stop, step, _ in reference_rows:
        bounds = slice(start, stop, step)
        from_python = reference_rows[bounds]
        from_polars = df[bounds].rows()

        assert from_python == from_polars, (
            f"slice [{start}:{stop}:{step}] failed on df w/len={df.height}"
        )
72
73
74
def test_df_getitem_col_single_name() -> None:
    """A single column name in the column position yields that column as a Series."""
    frame = pl.DataFrame({"a": [1, 2], "b": [3, 4]})
    assert_series_equal(frame[:, "a"], frame.select("a").to_series())
79
80
81
@pytest.mark.parametrize(
    ("input", "expected_cols"),
    [
        (["a"], ["a"]),
        (["a", "d"], ["a", "d"]),
        (slice("b", "d"), ["b", "c", "d"]),
        (pl.Series(["a", "b"]), ["a", "b"]),
        (np.array(["c", "d"]), ["c", "d"]),
    ],
)
def test_df_getitem_col_multiple_names(input: Any, expected_cols: list[str]) -> None:
    """Name-based multi-column selectors in the column position return a frame.

    Exercised selectors: list of names, string slice, Series of names and
    NumPy array of names. Each must equal ``select`` on ``expected_cols``.
    """
    frame = pl.DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6], "d": [7, 8]})
    assert_frame_equal(frame[:, input], frame.select(expected_cols))
96
97
98
def test_df_getitem_col_single_index() -> None:
    """A single integer in the column position yields that column as a Series."""
    frame = pl.DataFrame({"a": [1, 2], "b": [3, 4]})
    # position 1 is the second column, "b"
    assert_series_equal(frame[:, 1], frame.select("b").to_series())
103
104
105
def test_df_getitem_col_two_entries() -> None:
    """Two-element keys of strings or booleans return the whole one-row frame."""
    frame = pl.DataFrame({"x": [1.0], "y": [1.0]})

    by_names = frame["x", "y"]
    assert_frame_equal(by_names, frame)

    by_flags = frame[True, True]
    assert_frame_equal(by_flags, frame)
110
111
112
@pytest.mark.parametrize(
    ("input", "expected_cols"),
    [
        ([0], ["a"]),
        ([0, 3], ["a", "d"]),
        (slice(1, 4), ["b", "c", "d"]),
        (pl.Series([0, 1]), ["a", "b"]),
        (np.array([2, 3]), ["c", "d"]),
    ],
)
def test_df_getitem_col_multiple_indices(input: Any, expected_cols: list[str]) -> None:
    """Index-based multi-column selectors in the column position return a frame.

    Exercised selectors: list of ints, integer slice, integer Series and
    integer NumPy array. Each must equal ``select`` on ``expected_cols``.
    """
    frame = pl.DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6], "d": [7, 8]})
    assert_frame_equal(frame[:, input], frame.select(expected_cols))
127
128
129
@pytest.mark.parametrize(
    "mask",
    [
        [True, False, True],
        pl.Series([True, False, True]),
        np.array([True, False, True]),
    ],
)
def test_df_getitem_col_boolean_mask(mask: Any) -> None:
    """A boolean mask in the column position keeps the columns flagged True.

    The mask is supplied as a list, a Series and a NumPy array; with three
    columns and ``[True, False, True]`` the result is columns "a" and "c".
    """
    frame = pl.DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6]})
    assert_frame_equal(frame[:, mask], frame.select("a", "c"))
142
143
144
@pytest.mark.parametrize(
    ("rng", "expected_cols"),
    [
        (range(2), ["a", "b"]),
        (range(1, 4), ["b", "c", "d"]),
        (range(3, 0, -2), ["d", "b"]),
    ],
)
def test_df_getitem_col_range(rng: range, expected_cols: list[str]) -> None:
    """A ``range`` in the column position selects columns by position, in order.

    Includes a reversed, stepped range to check that the column order of the
    result follows the range.
    """
    frame = pl.DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6], "d": [7, 8]})
    assert_frame_equal(frame[:, rng], frame.select(expected_cols))
157
158
159
@pytest.mark.parametrize(
    "input", [[], (), pl.Series(dtype=pl.Int64), np.array([], dtype=np.uint32)]
)
def test_df_getitem_col_empty_inputs(input: Any) -> None:
    """An empty column selector (list, tuple, Series, array) gives an empty frame."""
    frame = pl.DataFrame({"a": [1, 2], "b": [3.0, 4.0]})
    assert_frame_equal(frame[:, input], pl.DataFrame())
167
168
169
@pytest.mark.parametrize(
    ("input", "match"),
    [
        (
            [0.0, 1.0],
            "cannot select columns using Sequence with elements of type 'float'",
        ),
        (
            pl.Series([[1, 2], [3, 4]]),
            "cannot select columns using Series of type List\\(Int64\\)",
        ),
        (
            np.array([0.0, 1.0]),
            "cannot select columns using NumPy array of type float64",
        ),
        (object(), "cannot select columns using key of type 'object'"),
    ],
)
def test_df_getitem_col_invalid_inputs(input: Any, match: str) -> None:
    """Unsupported column selectors raise ``TypeError`` with a specific message.

    ``match`` is a regex checked against the error text; the cases cover float
    sequences, a list-typed Series, a float NumPy array and an arbitrary object.
    """
    frame = pl.DataFrame({"a": [1, 2], "b": [3.0, 4.0]})
    with pytest.raises(TypeError, match=match):
        frame[:, input]
191
192
193
@pytest.mark.parametrize(
    ("input", "match"),
    [
        (["a", 2], "'int' object cannot be converted to 'PyString'"),
        ([1, "c"], "'str' object cannot be interpreted as an integer"),
    ],
)
def test_df_getitem_col_mixed_inputs(input: list[Any], match: str) -> None:
    """A column selector list mixing names and positions raises ``TypeError``.

    The expected message differs depending on whether the first element is a
    string or an integer, as listed in the parametrization.
    """
    frame = pl.DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6]})
    with pytest.raises(TypeError, match=match):
        frame[:, input]
204
205
206
@pytest.mark.parametrize(
    ("input", "match"),
    [
        ([0.0, 1.0], "unexpected value while building Series of type Int64"),
        (
            pl.Series([[1, 2], [3, 4]]),
            "cannot treat Series of type List\\(Int64\\) as indices",
        ),
        (np.array([0.0, 1.0]), "cannot treat NumPy array of type float64 as indices"),
        (object(), "cannot select rows using key of type 'object'"),
    ],
)
def test_df_getitem_row_invalid_inputs(input: Any, match: str) -> None:
    """Unsupported row selectors raise ``TypeError`` with a specific message.

    ``match`` is a regex checked against the error text; the cases cover float
    sequences, a list-typed Series, a float NumPy array and an arbitrary object.
    """
    frame = pl.DataFrame({"a": [1, 2], "b": [3.0, 4.0]})
    with pytest.raises(TypeError, match=match):
        frame[input, :]
222
223
224
def test_df_getitem_row_range() -> None:
    """A reversed, stepped ``range`` in the row position picks rows 3 then 1."""
    frame = pl.DataFrame({"a": [1, 2, 3, 4], "b": [5.0, 6.0, 7.0, 8.0]})
    picked = frame[range(3, 0, -2), :]
    assert_frame_equal(picked, pl.DataFrame({"a": [4, 2], "b": [8.0, 6.0]}))
229
230
231
def test_df_getitem_row_range_single_input() -> None:
    """A ``range`` given as the only key is interpreted as row positions."""
    frame = pl.DataFrame({"a": [1, 2, 3, 4], "b": [5.0, 6.0, 7.0, 8.0]})
    picked = frame[range(1, 3)]
    assert_frame_equal(picked, pl.DataFrame({"a": [2, 3], "b": [6.0, 7.0]}))
236
237
238
def test_df_getitem_row_empty_list_single_input() -> None:
    """An empty list as the only key gives the same result as ``clear()``."""
    frame = pl.DataFrame({"a": [1, 2], "b": [5.0, 6.0]})
    assert_frame_equal(frame[[]], frame.clear())
243
244
245
def test_df_getitem() -> None:
    """Walk through the supported (and rejected) forms of ``df[...]``."""
    df = pl.DataFrame({"a": [1.0, 2.0, 3.0, 4.0], "b": [3, 4, 5, 6]})

    # two slices: rows first, columns second; full slices return everything
    assert_frame_equal(df[:, :], df)

    # a lone string is a column name
    assert_series_equal(df["a"], pl.Series("a", [1.0, 2.0, 3.0, 4.0]))

    # a lone int is a zero-based row index, so 1 is the second row
    second_row = pl.DataFrame({"a": [2.0], "b": [4]})
    assert_frame_equal(df[1], second_row)

    # (int, int): row then column; negative column positions are accepted
    assert df[2, 1] == 5
    assert df[2, -2] == 3.0

    # column position outside [-width, width) is an IndexError
    for out_of_bounds in ((2, 2), (2, -3)):
        with pytest.raises(IndexError):
            df[out_of_bounds]

    # (int, list[int]): row then columns, column order follows the list
    third_row_swapped = pl.DataFrame({"b": [5], "a": [3.0]})
    assert_frame_equal(df[2, [1, 0]], third_row_swapped)
    assert_frame_equal(df[2, [-1, -2]], pl.DataFrame({"b": [5], "a": [3.0]}))

    # one out-of-bounds entry in the column list is enough to fail
    for out_of_bounds in ((2, [2, 0]), (2, [2, -3])):
        with pytest.raises(IndexError):
            df[out_of_bounds]

    # stepped slice: every second row starting from index 1
    every_other = pl.DataFrame({"a": [2.0, 4.0], "b": [4, 6]})
    assert_frame_equal(df[1::2], every_other)

    # an empty slice keeps the schema but has no rows
    assert df[:0].columns == ["a", "b"]
    assert len(df[:0]) == 0

    # explicit annotation keeps mypy happy about the empty list's element type
    empty: list[int] = []

    # empty row list plus a column selector: no rows, all columns
    assert_frame_equal(df[empty, :], df[:0])

    # sequences (lists, or tuples whose length is not 2):
    #   - strings are column names
    #   - integers are row indices
    assert_frame_equal(df[["a", "b"]], df)
    assert_frame_equal(df.select([pl.col("a"), pl.col("b")]), df)
    reordered_rows = pl.DataFrame(
        {"a": [2.0, 1.0, 4.0, 3.0, 2.0], "b": [4, 3, 6, 5, 4]}
    )
    assert_frame_equal(df[[1, -4, -1, 2, 1]], reordered_rows)

    # a string Series selects columns
    assert_frame_equal(df[pl.Series("", ["a", "b"])], df)

    # an integer Series of any integer dtype selects rows (non-negative or empty)
    for pl_dtype in INTEGER_DTYPES:
        row_idx = pl.Series("", [1, 0, 3, 2, 3, 0], dtype=pl_dtype)
        assert_frame_equal(
            df[row_idx],
            pl.DataFrame(
                {"a": [2.0, 1.0, 4.0, 3.0, 4.0, 1.0], "b": [4, 3, 6, 5, 6, 3]}
            ),
        )
        no_idx = pl.Series("", [], dtype=pl_dtype)
        assert df[no_idx].columns == ["a", "b"]

    # signed integer Series may mix negative and non-negative row indices
    for pl_dtype in SIGNED_INTEGER_DTYPES:
        row_idx = pl.Series("", [-1, 0, -3, -2, 3, -4], dtype=pl_dtype)
        assert_frame_equal(
            df[row_idx],
            pl.DataFrame(
                {"a": [4.0, 1.0, 2.0, 3.0, 4.0, 1.0], "b": [6, 3, 4, 5, 6, 3]}
            ),
        )

    # boolean masks are rejected in the row position
    with pytest.raises(TypeError):
        df[[True, False, True], [False, True]]
    with pytest.raises(TypeError):
        df[pl.Series([True, False, True]), "b"]

    # a lone boolean NumPy array matches selecting the first column by slice
    assert_frame_equal(df[np.array([True, False])], df[:, :1])

    # a column mask must have exactly one entry per column
    with pytest.raises(
        ValueError,
        match=f"expected {df.width} values when selecting columns by boolean mask",
    ):
        df[:, [True, False, True]]
345
346
347
def test_df_getitem_numpy() -> None:
    """Index a frame with NumPy arrays of the supported and unsupported kinds."""
    # integer arrays given as the only key are treated as row indices
    df = pl.DataFrame({"a": [1.0, 2.0, 3.0, 4.0], "b": [3, 4, 5, 6]})

    signed_dtypes = (np.int8, np.int16, np.int32, np.int64)
    unsigned_dtypes = (np.uint8, np.uint16, np.uint32, np.uint64)

    # non-negative indices and the empty array work for every integer dtype
    for np_dtype in (*signed_dtypes, *unsigned_dtypes):
        row_idx = np.array([1, 0, 3, 2, 3, 0], dtype=np_dtype)
        assert_frame_equal(
            df[row_idx],
            pl.DataFrame(
                {"a": [2.0, 1.0, 4.0, 3.0, 4.0, 1.0], "b": [4, 3, 6, 5, 6, 3]}
            ),
        )
        no_idx = np.array([], dtype=np_dtype)
        assert df[no_idx].columns == ["a", "b"]

    # signed dtypes may mix negative and non-negative indices
    for np_dtype in signed_dtypes:
        row_idx = np.array([-1, 0, -3, -2, 3, -4], dtype=np_dtype)
        assert_frame_equal(
            df[row_idx],
            pl.DataFrame(
                {"a": [4.0, 1.0, 2.0, 3.0, 4.0, 1.0], "b": [6, 3, 4, 5, 6, 3]}
            ),
        )

    # a zero-dimensional array acts like a plain int row index
    assert_frame_equal(df[np.array(0)], pl.DataFrame({"a": [1.0], "b": [3]}))
    assert_frame_equal(df[np.array(1)], pl.DataFrame({"a": [2.0], "b": [4]}))

    # floats are rejected even when they would convert to int without loss
    with pytest.raises(
        TypeError,
        match="cannot select columns using NumPy array of type float",
    ):
        _ = df[np.array([1.0])]

    # arrays with more than one dimension are rejected
    with pytest.raises(
        TypeError,
        match="multi-dimensional NumPy arrays not supported as index",
    ):
        df[np.array([[0], [1]])]
395
396
397
def test_df_getitem_extended() -> None:
    """Further ``df[...]`` combinations on a three-column, mixed-dtype frame."""
    df = pl.DataFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0], "c": ["a", "b", "c"]})

    # first two rows of the first column, via a slice and via a name list
    first_col_head = [(1,), (2,)]
    assert df[:2, :1].rows() == first_col_head
    assert df[:2, ["a"]].rows() == first_col_head

    # a lone string selects a column
    assert df["a"].to_list() == [1, 2, 3]
    assert df["b"].to_list() == [1.0, 2.0, 3.0]
    assert df["c"].to_list() == ["a", "b", "c"]

    # a lone int selects a row; negative values count from the end
    assert_frame_equal(df[0], pl.DataFrame({"a": [1], "b": [1.0], "c": ["a"]}))
    assert_frame_equal(df[-1], pl.DataFrame({"a": [3], "b": [3.0], "c": ["c"]}))

    # two-dimensional keys: rows first, then a single column
    assert df[:, "a"].to_list() == [1, 2, 3]
    assert df[:, 1].to_list() == [1.0, 2.0, 3.0]
    assert df[:2, 2].to_list() == ["a", "b"]

    # integer lists select rows, in the order given
    last_two = pl.DataFrame({"a": [2, 3], "b": [2.0, 3.0], "c": ["b", "c"]})
    assert_frame_equal(df[[1, 2]], last_two)
    last_two_reversed = pl.DataFrame(
        {"a": [3, 2], "b": [3.0, 2.0], "c": ["c", "b"]}
    )
    assert_frame_equal(df[[-1, -2]], last_two_reversed)

    assert df[["a", "b"]].columns == ["a", "b"]
    assert_frame_equal(
        df[[1, 2], [1, 2]], pl.DataFrame({"b": [2.0, 3.0], "c": ["b", "c"]})
    )

    # (int, int) returns the scalar at that row/column position
    for (row, col), value in (((1, 2), "b"), ((1, 1), 2.0), ((2, 0), 3)):
        assert df[row, col] == value

    assert df[[2], ["a", "b"]].rows() == [(3, 3.0)]
    assert df.to_series(0).name == "a"
    assert (df["a"] == df["a"]).sum() == 3
    assert (df["c"] == df["a"].cast(str)).sum() == 0

    # string slices over column names include both endpoints
    assert df[:, "a":"b"].rows() == [(1, 1.0), (2, 2.0), (3, 3.0)]  # type: ignore[index, misc]
    assert df[:, "a":"c"].columns == ["a", "b", "c"]  # type: ignore[index, misc]

    # an empty column list leaves nothing
    assert df[:, []].shape == (0, 0)

    # a one-element column list still returns a frame
    assert_frame_equal(df[1, [2]], pl.DataFrame({"c": ["b"]}))
    assert_frame_equal(df[[0, 2], [1]], pl.DataFrame({"b": [1.0, 3.0]}))

    # (int, str) returns the scalar at that row of the named column
    for (row, name), value in (
        ((0, "c"), "a"),
        ((1, "c"), "b"),
        ((2, "c"), "c"),
        ((0, "a"), 1),
    ):
        assert df[row, name] == value

    # more row slicing: reversed, drop-last, every second row
    reversed_rows = pl.DataFrame(
        {"a": [3, 2, 1], "b": [3.0, 2.0, 1.0], "c": ["c", "b", "a"]}
    )
    assert_frame_equal(df[::-1], reversed_rows)
    without_last = pl.DataFrame({"a": [1, 2], "b": [1.0, 2.0], "c": ["a", "b"]})
    assert_frame_equal(df[:-1], without_last)

    every_other = pl.DataFrame({"a": [1, 3], "b": [1.0, 3.0], "c": ["a", "c"]})
    assert_frame_equal(df[::2], every_other)

    # boolean values are only allowed in the column position
    int_frame = pl.DataFrame(
        {
            "a": [1, 2],
            "b": [2, 3],
            "c": [3, 4],
        }
    )

    assert int_frame[:, [False, True, True]].columns == ["b", "c"]
    assert int_frame[:, pl.Series([False, True, True])].columns == ["b", "c"]
    assert int_frame[:, pl.Series([False, False, False])].columns == []
470
471
472
def test_df_getitem_5343() -> None:
    """Regression test for https://github.com/pola-rs/polars/issues/5343.

    Builds a 5-row, 12-column frame where column ``foo{col}`` holds
    ``n**col`` for ``n`` in 0..4, then checks scalar and one-column-list
    lookups in the last row.
    """
    base_values = range(5)  # 5 rows
    exponents = range(12)  # 12 columns
    df = pl.DataFrame(
        {f"foo{col}": [n**col for n in base_values] for col in exponents}
    )

    # last row has n == 4, so the values are 4**col
    assert df[4, 4] == 256
    assert df[4, 5] == 1024
    assert_frame_equal(df[4, [2]], pl.DataFrame({"foo2": [16]}))
    assert_frame_equal(df[4, [5]], pl.DataFrame({"foo5": [1024]}))
484
485
486
def test_no_deadlock_19358() -> None:
    """Row-index an Object-dtype frame; named after issue 19358 (a deadlock).

    Takes the first and last rows of a 200-element Object column and checks
    the values come back unchanged.
    """
    values = ["text"] * 100 + [1] * 100
    frame = pl.Series(values, dtype=pl.Object).to_frame()
    result = frame[[0, -1]]
    # the unnamed Series becomes a column named ""
    assert result[""].to_list() == ["text", 1]
490
491