Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/py-polars/tests/unit/dataframe/test_df.py
8406 views
1
from __future__ import annotations
2
3
import sys
4
import typing
5
from collections import OrderedDict
6
from collections.abc import Iterator, Mapping
7
from datetime import date, datetime, time, timedelta, timezone
8
from decimal import Decimal
9
from io import BytesIO
10
from itertools import chain, repeat
11
from operator import floordiv, truediv
12
from typing import TYPE_CHECKING, Any, cast
13
from zoneinfo import ZoneInfo
14
15
import numpy as np
16
import pyarrow as pa
17
import pytest
18
19
import polars as pl
20
import polars.selectors as cs
21
from polars._plr import PySeries
22
from polars._utils.construction import iterable_to_pydf
23
from polars.datatypes import DTYPE_TEMPORAL_UNITS
24
from polars.exceptions import (
25
ColumnNotFoundError,
26
ComputeError,
27
DuplicateError,
28
InvalidOperationError,
29
OutOfBoundsError,
30
ShapeError,
31
)
32
from polars.testing import (
33
assert_frame_equal,
34
assert_frame_not_equal,
35
assert_series_equal,
36
)
37
from tests.unit.conftest import FLOAT_DTYPES, INTEGER_DTYPES
38
39
if TYPE_CHECKING:
40
from collections.abc import Callable, Iterator, Sequence
41
42
from polars import Expr
43
from polars._typing import JoinStrategy, UniqueKeepStrategy
44
from tests.conftest import PlMonkeyPatch
45
46
47
class MappingObject(Mapping[str, Any]): # noqa: D101
48
def __init__(self, **values: Any) -> None:
49
self._data = {**values}
50
51
def __getitem__(self, key: str) -> Any:
52
return self._data[key]
53
54
def __iter__(self) -> Iterator[str]:
55
yield from self._data
56
57
def __len__(self) -> int:
58
return len(self._data)
59
60
61
def test_version() -> None:
62
isinstance(pl.__version__, str)
63
64
65
def test_null_count() -> None:
66
df = pl.DataFrame({"a": [2, 1, 3], "b": ["a", "b", None]})
67
assert df.null_count().shape == (1, 2)
68
assert df.null_count().row(0) == (0, 1)
69
assert df.null_count().row(np.int64(0)) == (0, 1) # type: ignore[call-overload]
70
71
72
@pytest.mark.parametrize("input", [None, (), [], {}, pa.Table.from_arrays([])])
73
def test_init_empty(input: Any) -> None:
74
# test various flavours of empty init
75
df = pl.DataFrame(input)
76
assert df.shape == (0, 0)
77
assert df.is_empty()
78
79
80
def test_df_bool_ambiguous() -> None:
81
empty_df = pl.DataFrame()
82
with pytest.raises(TypeError, match="ambiguous"):
83
not empty_df
84
85
86
def test_special_char_colname_init() -> None:
87
from string import punctuation
88
89
cols = [(c, pl.Int8) for c in punctuation]
90
df = pl.DataFrame(schema=cols)
91
92
assert len(cols) == df.width
93
assert len(df.rows()) == 0
94
assert df.is_empty()
95
96
97
def test_comparisons() -> None:
98
df = pl.DataFrame({"a": [1, 2], "b": [3, 4]})
99
100
# Constants
101
assert_frame_equal(df == 2, pl.DataFrame({"a": [False, True], "b": [False, False]}))
102
assert_frame_equal(df != 2, pl.DataFrame({"a": [True, False], "b": [True, True]}))
103
assert_frame_equal(df < 3.0, pl.DataFrame({"a": [True, True], "b": [False, False]}))
104
assert_frame_equal(df >= 2, pl.DataFrame({"a": [False, True], "b": [True, True]}))
105
assert_frame_equal(df <= 2, pl.DataFrame({"a": [True, True], "b": [False, False]}))
106
107
with pytest.raises(ComputeError):
108
df > "2" # noqa: B015
109
110
# Series
111
s = pl.Series([3, 1])
112
assert_frame_equal(df >= s, pl.DataFrame({"a": [False, True], "b": [True, True]}))
113
114
# DataFrame
115
other = pl.DataFrame({"a": [1, 2], "b": [2, 3]})
116
assert_frame_equal(
117
df == other, pl.DataFrame({"a": [True, True], "b": [False, False]})
118
)
119
assert_frame_equal(
120
df != other, pl.DataFrame({"a": [False, False], "b": [True, True]})
121
)
122
assert_frame_equal(
123
df > other, pl.DataFrame({"a": [False, False], "b": [True, True]})
124
)
125
assert_frame_equal(
126
df < other, pl.DataFrame({"a": [False, False], "b": [False, False]})
127
)
128
assert_frame_equal(
129
df >= other, pl.DataFrame({"a": [True, True], "b": [True, True]})
130
)
131
assert_frame_equal(
132
df <= other, pl.DataFrame({"a": [True, True], "b": [False, False]})
133
)
134
135
# DataFrame columns mismatch
136
with pytest.raises(ValueError):
137
df == pl.DataFrame({"a": [1, 2], "c": [3, 4]}) # noqa: B015
138
with pytest.raises(ValueError):
139
df == pl.DataFrame({"b": [3, 4], "a": [1, 2]}) # noqa: B015
140
141
# DataFrame shape mismatch
142
with pytest.raises(ValueError):
143
df == pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) # noqa: B015
144
145
# Type mismatch
146
with pytest.raises(ComputeError):
147
df == pl.DataFrame({"a": [1, 2], "b": ["x", "y"]}) # noqa: B015
148
149
150
def test_column_selection() -> None:
151
df = pl.DataFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0], "c": ["a", "b", "c"]})
152
153
# get column by name
154
b = pl.Series("b", [1.0, 2.0, 3.0])
155
assert_series_equal(df["b"], b)
156
assert_series_equal(df.get_column("b"), b)
157
158
with pytest.raises(ColumnNotFoundError, match="x"):
159
df.get_column("x")
160
161
default_series = pl.Series("x", ["?", "?", "?"])
162
assert_series_equal(df.get_column("x", default=default_series), default_series)
163
164
assert df.get_column("x", default=None) is None
165
166
# get column by index
167
assert_series_equal(df.to_series(1), pl.Series("b", [1.0, 2.0, 3.0]))
168
assert_series_equal(df.to_series(-1), pl.Series("c", ["a", "b", "c"]))
169
170
171
def test_mixed_sequence_selection() -> None:
172
df = pl.DataFrame({"a": [1, 2], "b": [3, 4]})
173
result = df.select(["a", pl.col("b"), pl.lit("c")])
174
expected = pl.DataFrame({"a": [1, 2], "b": [3, 4], "literal": ["c", "c"]})
175
assert_frame_equal(result, expected)
176
177
178
def test_from_arrow(plmonkeypatch: PlMonkeyPatch) -> None:
179
tbl = pa.table(
180
{
181
"a": pa.array([1, 2], pa.timestamp("s")),
182
"b": pa.array([1, 2], pa.timestamp("ms")),
183
"c": pa.array([1, 2], pa.timestamp("us")),
184
"d": pa.array([1, 2], pa.timestamp("ns")),
185
"e": pa.array([1, 2], pa.int32()),
186
"decimal1": pa.array([1, 2], pa.decimal128(2, 1)),
187
"struct": pa.array(
188
[{"a": 1}, {"a": 2}], pa.struct([pa.field("a", pa.int32())])
189
),
190
}
191
)
192
record_batches = tbl.to_batches(max_chunksize=1)
193
expected_schema = {
194
"a": pl.Datetime("ms"),
195
"b": pl.Datetime("ms"),
196
"c": pl.Datetime("us"),
197
"d": pl.Datetime("ns"),
198
"e": pl.Int32,
199
"decimal1": pl.Decimal(2, 1),
200
"struct": pl.Struct({"a": pl.Int32()}),
201
}
202
expected_data = [
203
(
204
datetime(1970, 1, 1, 0, 0, 1),
205
datetime(1970, 1, 1, 0, 0, 0, 1000),
206
datetime(1970, 1, 1, 0, 0, 0, 1),
207
datetime(1970, 1, 1, 0, 0),
208
1,
209
Decimal("1.0"),
210
{"a": 1},
211
),
212
(
213
datetime(1970, 1, 1, 0, 0, 2),
214
datetime(1970, 1, 1, 0, 0, 0, 2000),
215
datetime(1970, 1, 1, 0, 0, 0, 2),
216
datetime(1970, 1, 1, 0, 0),
217
2,
218
Decimal("2.0"),
219
{"a": 2},
220
),
221
]
222
for arrow_data in (tbl, record_batches, (rb for rb in record_batches)):
223
df = cast("pl.DataFrame", pl.from_arrow(arrow_data))
224
assert df.schema == expected_schema
225
assert df.rows() == expected_data
226
227
# record batches (inc. empty)
228
for b, n_expected in (
229
(record_batches[0], 1),
230
(record_batches[0][:0], 0),
231
):
232
df = cast("pl.DataFrame", pl.from_arrow(b))
233
assert df.schema == expected_schema
234
assert df.rows() == expected_data[:n_expected]
235
236
empty_tbl = tbl[:0] # no rows
237
df = cast("pl.DataFrame", pl.from_arrow(empty_tbl))
238
assert df.schema == expected_schema
239
assert df.rows() == []
240
241
# try a single column dtype override
242
for t in (tbl, empty_tbl):
243
df = pl.DataFrame(t, schema_overrides={"e": pl.Int8})
244
override_schema = expected_schema.copy()
245
override_schema["e"] = pl.Int8
246
assert df.schema == override_schema
247
assert df.rows() == expected_data[: (df.height)]
248
249
# init from record batches with overrides
250
df = pl.DataFrame(
251
{
252
"id": ["a123", "b345", "c567", "d789", "e101"],
253
"points": [99, 45, 50, 85, 35],
254
}
255
)
256
tbl = df.to_arrow()
257
batches = tbl.to_batches(max_chunksize=3)
258
259
df0: pl.DataFrame = pl.from_arrow(batches) # type: ignore[assignment]
260
df1: pl.DataFrame = pl.from_arrow( # type: ignore[assignment]
261
data=batches,
262
schema=["x", "y"],
263
schema_overrides={"y": pl.Int32},
264
)
265
df2: pl.DataFrame = pl.from_arrow( # type: ignore[assignment]
266
data=batches[0],
267
schema=["x", "y"],
268
schema_overrides={"y": pl.Int32},
269
)
270
271
assert df0.rows() == df.rows()
272
assert df1.rows() == df.rows()
273
assert df2.rows() == df.rows()[:3]
274
275
assert df0.schema == {"id": pl.String, "points": pl.Int64}
276
print(df1.schema)
277
assert df1.schema == {"x": pl.String, "y": pl.Int32}
278
assert df2.schema == {"x": pl.String, "y": pl.Int32}
279
280
with pytest.raises(TypeError, match="Cannot convert str"):
281
pl.from_arrow(data="xyz")
282
283
with pytest.raises(TypeError, match="Cannot convert int"):
284
pl.from_arrow(data=(x for x in (1, 2, 3)))
285
286
287
@pytest.mark.parametrize(
288
"data",
289
[
290
pa.Table.from_pydict(
291
{
292
"struct": pa.array(
293
[{"a": 1}, {"a": 2}], pa.struct([pa.field("a", pa.int32())])
294
),
295
}
296
),
297
pa.Table.from_pydict(
298
{
299
"struct": pa.chunked_array(
300
[[{"a": 1}], [{"a": 2}]], pa.struct([pa.field("a", pa.int32())])
301
),
302
}
303
),
304
],
305
)
306
def test_from_arrow_struct_column(data: pa.Table) -> None:
307
df = cast("pl.DataFrame", pl.from_arrow(data=data))
308
expected_schema = pl.Schema({"struct": pl.Struct({"a": pl.Int32()})})
309
expected_data = [({"a": 1},), ({"a": 2},)]
310
assert df.schema == expected_schema
311
assert df.rows() == expected_data
312
313
314
def test_dataframe_membership_operator() -> None:
315
# cf. issue #4032
316
df = pl.DataFrame({"name": ["Jane", "John"], "age": [20, 30]})
317
assert "name" in df
318
assert "phone" not in df
319
assert df._ipython_key_completions_() == ["name", "age"]
320
321
322
def test_sort() -> None:
323
df = pl.DataFrame({"a": [2, 1, 3], "b": [1, 2, 3]})
324
expected = pl.DataFrame({"a": [1, 2, 3], "b": [2, 1, 3]})
325
assert_frame_equal(df.sort("a"), expected)
326
assert_frame_equal(df.sort(["a", "b"]), expected)
327
328
329
def test_sort_multi_output_exprs_01() -> None:
330
df = pl.DataFrame(
331
{
332
"dts": [date(2077, 10, 3), date(2077, 10, 2), date(2077, 10, 2)],
333
"strs": ["abc", "def", "ghi"],
334
"vals": [10.5, 20.3, 15.7],
335
}
336
)
337
338
expected = pl.DataFrame(
339
{
340
"dts": [date(2077, 10, 2), date(2077, 10, 2), date(2077, 10, 3)],
341
"strs": ["ghi", "def", "abc"],
342
"vals": [15.7, 20.3, 10.5],
343
}
344
)
345
assert_frame_equal(expected, df.sort(pl.col("^(d|v).*$")))
346
assert_frame_equal(expected, df.sort(cs.temporal() | cs.numeric()))
347
assert_frame_equal(expected, df.sort(cs.temporal(), cs.numeric(), cs.binary()))
348
349
expected = pl.DataFrame(
350
{
351
"dts": [date(2077, 10, 3), date(2077, 10, 2), date(2077, 10, 2)],
352
"strs": ["abc", "def", "ghi"],
353
"vals": [10.5, 20.3, 15.7],
354
}
355
)
356
assert_frame_equal(
357
expected,
358
df.sort(pl.col("^(d|v).*$"), descending=[True]),
359
)
360
assert_frame_equal(
361
expected,
362
df.sort(cs.temporal() | cs.numeric(), descending=[True]),
363
)
364
assert_frame_equal(
365
expected,
366
df.sort(cs.temporal(), cs.numeric(), descending=[True, True]),
367
)
368
369
with pytest.raises(
370
ValueError,
371
match=r"the length of `descending` \(2\) does not match the length of `by` \(1\)",
372
):
373
df.sort(by=[cs.temporal()], descending=[True, False])
374
375
with pytest.raises(
376
ValueError,
377
match=r"the length of `nulls_last` \(3\) does not match the length of `by` \(2\)",
378
):
379
df.sort("dts", "strs", nulls_last=[True, False, True])
380
381
# No columns selected - return original input.
382
assert_frame_equal(df, df.sort(pl.col("^xxx$")))
383
384
385
@pytest.mark.parametrize(
386
("by_explicit", "desc_explicit", "by_multi", "desc_multi"),
387
[
388
(
389
["w", "x", "y", "z"],
390
[False, False, True, True],
391
[cs.integer(), cs.string()],
392
[False, True],
393
),
394
(
395
["w", "y", "z"],
396
[True, True, False],
397
[pl.col("^(w|y)$"), pl.col("^z.*$")],
398
[True, False],
399
),
400
(
401
["z", "w", "x"],
402
[True, False, False],
403
[pl.col("z"), cs.numeric()],
404
[True, False],
405
),
406
],
407
)
408
def test_sort_multi_output_exprs_02(
409
by_explicit: list[str],
410
desc_explicit: list[bool],
411
by_multi: list[Expr],
412
desc_multi: list[bool],
413
) -> None:
414
df = pl.DataFrame(
415
{
416
"w": [100, 100, 100, 100, 200, 200, 200, 200],
417
"x": [888, 888, 444, 444, 888, 888, 444, 888],
418
"y": ["b", "b", "a", "a", "b", "b", "a", "a"],
419
"z": ["x", "y", "x", "y", "x", "y", "x", "y"],
420
}
421
)
422
res1 = df.sort(*by_explicit, descending=desc_explicit)
423
res2 = df.sort(*by_multi, descending=desc_multi)
424
assert_frame_equal(res1, res2)
425
426
427
def test_sort_maintain_order() -> None:
428
l1 = (
429
pl.LazyFrame({"A": [1] * 4, "B": ["A", "B", "C", "D"]})
430
.sort("A", maintain_order=True)
431
.slice(0, 3)
432
.collect()["B"]
433
.to_list()
434
)
435
l2 = (
436
pl.LazyFrame({"A": [1] * 4, "B": ["A", "B", "C", "D"]})
437
.sort("A")
438
.collect()
439
.slice(0, 3)["B"]
440
.to_list()
441
)
442
assert l1 == l2 == ["A", "B", "C"]
443
444
445
@pytest.mark.parametrize("nulls_last", [False, True], ids=["nulls_first", "nulls_last"])
446
def test_sort_maintain_order_descending_repeated_nulls(nulls_last: bool) -> None:
447
got = (
448
pl.LazyFrame({"A": [None, -1, 1, 1, None], "B": [1, 2, 3, 4, 5]})
449
.sort("A", descending=True, maintain_order=True, nulls_last=nulls_last)
450
.collect()
451
)
452
if nulls_last:
453
expect = pl.DataFrame({"A": [1, 1, -1, None, None], "B": [3, 4, 2, 1, 5]})
454
else:
455
expect = pl.DataFrame({"A": [None, None, 1, 1, -1], "B": [1, 5, 3, 4, 2]})
456
assert_frame_equal(got, expect)
457
458
459
def test_replace() -> None:
460
df = pl.DataFrame({"a": [2, 1, 3], "b": [1, 2, 3]})
461
s = pl.Series("c", [True, False, True])
462
df._replace("a", s)
463
assert_frame_equal(df, pl.DataFrame({"a": [True, False, True], "b": [1, 2, 3]}))
464
465
466
def test_assignment() -> None:
467
df = pl.DataFrame({"foo": [1, 2, 3], "bar": [2, 3, 4]})
468
df = df.with_columns(pl.col("foo").alias("foo"))
469
# make sure that assignment does not change column order
470
assert df.columns == ["foo", "bar"]
471
df = df.with_columns(
472
pl.when(pl.col("foo") > 1).then(9).otherwise(pl.col("foo")).alias("foo")
473
)
474
assert df["foo"].to_list() == [1, 9, 9]
475
476
477
def test_insert_column() -> None:
478
# insert series
479
df = (
480
pl.DataFrame({"z": [3, 4, 5]})
481
.insert_column(0, pl.Series("x", [1, 2, 3]))
482
.insert_column(-1, pl.Series("y", [2, 3, 4]))
483
)
484
expected_df = pl.DataFrame({"x": [1, 2, 3], "y": [2, 3, 4], "z": [3, 4, 5]})
485
assert_frame_equal(expected_df, df)
486
487
# insert expressions
488
df = pl.DataFrame(
489
{
490
"id": ["xx", "yy", "zz"],
491
"v1": [5, 4, 6],
492
"v2": [7, 3, 3],
493
}
494
)
495
df.insert_column(3, (pl.col("v1") * pl.col("v2")).alias("v3"))
496
df.insert_column(1, (pl.col("v2") - pl.col("v1")).alias("v0"))
497
498
expected = pl.DataFrame(
499
{
500
"id": ["xx", "yy", "zz"],
501
"v0": [2, -1, -3],
502
"v1": [5, 4, 6],
503
"v2": [7, 3, 3],
504
"v3": [35, 12, 18],
505
}
506
)
507
assert_frame_equal(df, expected)
508
509
# check that we raise suitable index errors
510
for idx, column in (
511
(10, pl.col("v1").sqrt().alias("v1_sqrt")),
512
(-10, pl.Series("foo", [1, 2, 3])),
513
):
514
with pytest.raises(
515
IndexError,
516
match=rf"column index {idx} is out of range \(frame has 5 columns\)",
517
):
518
df.insert_column(idx, column)
519
520
521
def test_replace_column() -> None:
522
df = (
523
pl.DataFrame({"x": [1, 2, 3], "y": [2, 3, 4], "z": [3, 4, 5]})
524
.replace_column(0, pl.Series("a", [4, 5, 6]))
525
.replace_column(-2, pl.Series("b", [5, 6, 7]))
526
.replace_column(-1, pl.Series("c", [6, 7, 8]))
527
)
528
expected_df = pl.DataFrame({"a": [4, 5, 6], "b": [5, 6, 7], "c": [6, 7, 8]})
529
assert_frame_equal(expected_df, df)
530
531
532
def test_to_series() -> None:
533
df = pl.DataFrame({"x": [1, 2, 3], "y": [2, 3, 4], "z": [3, 4, 5]})
534
535
assert_series_equal(df.to_series(), df["x"])
536
assert_series_equal(df.to_series(0), df["x"])
537
assert_series_equal(df.to_series(-3), df["x"])
538
539
assert_series_equal(df.to_series(1), df["y"])
540
assert_series_equal(df.to_series(-2), df["y"])
541
542
assert_series_equal(df.to_series(2), df["z"])
543
assert_series_equal(df.to_series(-1), df["z"])
544
545
546
def test_to_series_bad_inputs() -> None:
547
df = pl.DataFrame({"x": [1, 2, 3], "y": [2, 3, 4], "z": [3, 4, 5]})
548
549
with pytest.raises(IndexError, match="index 5 is out of bounds"):
550
df.to_series(5)
551
552
with pytest.raises(IndexError, match="index -100 is out of bounds"):
553
df.to_series(-100)
554
555
with pytest.raises(
556
TypeError, match="'str' object cannot be interpreted as an integer"
557
):
558
df.to_series("x") # type: ignore[arg-type]
559
560
561
def test_gather_every() -> None:
562
df = pl.DataFrame({"a": [1, 2, 3, 4], "b": ["w", "x", "y", "z"]})
563
expected_df = pl.DataFrame({"a": [1, 3], "b": ["w", "y"]})
564
assert_frame_equal(expected_df, df.gather_every(2))
565
566
expected_df = pl.DataFrame({"a": [2, 4], "b": ["x", "z"]})
567
assert_frame_equal(expected_df, df.gather_every(2, offset=1))
568
569
570
def test_gather_every_agg() -> None:
571
df = pl.DataFrame(
572
{
573
"g": [1, 1, 1, 2, 2, 2],
574
"a": ["a", "b", "c", "d", "e", "f"],
575
}
576
)
577
out = df.group_by(pl.col("g")).agg(pl.col("a").gather_every(2)).sort("g")
578
expected = pl.DataFrame(
579
{
580
"g": [1, 2],
581
"a": [["a", "c"], ["d", "f"]],
582
}
583
)
584
assert_frame_equal(out, expected)
585
586
587
def test_take_misc(fruits_cars: pl.DataFrame) -> None:
588
df = fruits_cars
589
590
# Out of bounds error.
591
with pytest.raises(OutOfBoundsError):
592
df.sort("fruits").select(
593
pl.col("B").reverse().gather([1, 2]).implode().over("fruits"),
594
"fruits",
595
)
596
597
# Null indices.
598
assert_frame_equal(
599
df.select(pl.col("fruits").gather(pl.Series([0, None]))),
600
pl.DataFrame({"fruits": ["banana", None]}),
601
)
602
603
for index in [[0, 1], pl.Series([0, 1]), np.array([0, 1])]:
604
out = df.sort("fruits").select(
605
pl.col("B")
606
.reverse()
607
.gather(index) # type: ignore[arg-type]
608
.over("fruits", mapping_strategy="join"),
609
"fruits",
610
)
611
612
assert out[0, "B"].to_list() == [2, 3]
613
assert out[4, "B"].to_list() == [1, 4]
614
615
out = df.sort("fruits").select(
616
pl.col("B").reverse().get(pl.lit(1)).over("fruits"),
617
"fruits",
618
)
619
assert out[0, "B"] == 3
620
assert out[4, "B"] == 4
621
622
623
def test_pipe() -> None:
624
df = pl.DataFrame({"foo": [1, 2, 3], "bar": [6, None, 8]})
625
626
def _multiply(data: pl.DataFrame, mul: int) -> pl.DataFrame:
627
return data * mul
628
629
result = df.pipe(_multiply, mul=3)
630
631
assert_frame_equal(result, df * 3)
632
633
634
def test_explode() -> None:
635
df = pl.DataFrame({"letters": ["c", "a"], "nrs": [[1, 2], [1, 3]]})
636
out = df.explode("nrs")
637
assert out["letters"].to_list() == ["c", "c", "a", "a"]
638
assert out["nrs"].to_list() == [1, 2, 1, 3]
639
640
641
@pytest.mark.parametrize(
642
("stack", "exp_shape", "exp_columns"),
643
[
644
([pl.Series("stacked", [-1, -1, -1])], (3, 3), ["a", "b", "stacked"]),
645
(
646
[pl.Series("stacked2", [-1, -1, -1]), pl.Series("stacked3", [-1, -1, -1])],
647
(3, 4),
648
["a", "b", "stacked2", "stacked3"],
649
),
650
],
651
)
652
@pytest.mark.parametrize("in_place", [True, False])
653
def test_hstack_list_of_series(
654
stack: list[pl.Series],
655
exp_shape: tuple[int, int],
656
exp_columns: list[str],
657
in_place: bool,
658
) -> None:
659
df = pl.DataFrame({"a": [2, 1, 3], "b": ["a", "b", "c"]})
660
if in_place:
661
df.hstack(stack, in_place=True)
662
assert df.shape == exp_shape
663
assert df.columns == exp_columns
664
else:
665
df_out = df.hstack(stack, in_place=False)
666
assert df_out.shape == exp_shape
667
assert df_out.columns == exp_columns
668
669
670
@pytest.mark.parametrize("in_place", [True, False])
671
def test_hstack_dataframe(in_place: bool) -> None:
672
df = pl.DataFrame({"a": [2, 1, 3], "b": ["a", "b", "c"]})
673
df2 = pl.DataFrame({"c": [2, 1, 3], "d": ["a", "b", "c"]})
674
expected = pl.DataFrame(
675
{"a": [2, 1, 3], "b": ["a", "b", "c"], "c": [2, 1, 3], "d": ["a", "b", "c"]}
676
)
677
if in_place:
678
df.hstack(df2, in_place=True)
679
assert_frame_equal(df, expected)
680
else:
681
df_out = df.hstack(df2, in_place=False)
682
assert_frame_equal(df_out, expected)
683
684
685
@pytest.mark.may_fail_cloud
686
def test_file_buffer() -> None:
687
f = BytesIO()
688
f.write(b"1,2,3,4,5,6\n7,8,9,10,11,12")
689
f.seek(0)
690
df = pl.read_csv(f, has_header=False)
691
assert df.shape == (2, 6)
692
693
f = BytesIO()
694
f.write(b"1,2,3,4,5,6\n7,8,9,10,11,12")
695
f.seek(0)
696
# check if not fails on TryClone and Length impl in file.rs
697
with pytest.raises(ComputeError):
698
pl.read_parquet(f)
699
700
701
def test_shift() -> None:
702
df = pl.DataFrame({"A": ["a", "b", "c"], "B": [1, 3, 5]})
703
a = df.shift(1)
704
b = pl.DataFrame(
705
{"A": [None, "a", "b"], "B": [None, 1, 3]},
706
)
707
assert_frame_equal(a, b)
708
709
710
def test_multiple_columns_drop() -> None:
711
df = pl.DataFrame({"a": [2, 1, 3], "b": [1, 2, 3], "c": [1, 2, 3]})
712
# List input
713
out = df.drop(["a", "b"])
714
assert out.columns == ["c"]
715
# Positional input
716
out = df.drop("b", "c")
717
assert out.columns == ["a"]
718
719
720
def test_arg_where() -> None:
721
s = pl.Series([True, False, True, False])
722
assert_series_equal(
723
pl.arg_where(s, eager=True).cast(int),
724
pl.Series([0, 2]),
725
)
726
727
728
def test_to_dummies() -> None:
729
df = pl.DataFrame({"A": ["a", "b", "c"], "B": [1, 3, 5]})
730
dummies = df.to_dummies()
731
732
assert dummies["A_a"].to_list() == [1, 0, 0]
733
assert dummies["A_b"].to_list() == [0, 1, 0]
734
assert dummies["A_c"].to_list() == [0, 0, 1]
735
736
df = pl.DataFrame({"a": [1, 2, 3]})
737
res = df.to_dummies()
738
739
expected = pl.DataFrame(
740
{"a_1": [1, 0, 0], "a_2": [0, 1, 0], "a_3": [0, 0, 1]}
741
).with_columns(pl.all().cast(pl.UInt8))
742
assert_frame_equal(res, expected)
743
744
df = pl.DataFrame(
745
{
746
"i": [1, 2, 3],
747
"category": ["dog", "cat", "cat"],
748
},
749
schema={"i": pl.Int32, "category": pl.Categorical()},
750
)
751
expected = pl.DataFrame(
752
{
753
"i": [1, 2, 3],
754
"category|cat": [0, 1, 1],
755
"category|dog": [1, 0, 0],
756
},
757
schema={"i": pl.Int32, "category|cat": pl.UInt8, "category|dog": pl.UInt8},
758
)
759
for _cols in ("category", cs.string()):
760
result = df.to_dummies(columns=["category"], separator="|")
761
assert_frame_equal(result, expected)
762
763
# test sorted fast path
764
result = pl.DataFrame({"x": pl.arange(0, 3, eager=True)}).to_dummies("x")
765
expected = pl.DataFrame(
766
{"x_0": [1, 0, 0], "x_1": [0, 1, 0], "x_2": [0, 0, 1]}
767
).with_columns(pl.all().cast(pl.UInt8))
768
assert_frame_equal(result, expected)
769
770
771
def test_to_dummies_drop_first() -> None:
772
df = pl.DataFrame(
773
{
774
"foo": [0, 1, 2],
775
"bar": [3, 4, 5],
776
"baz": ["x", "y", "z"],
777
}
778
)
779
dm = df.to_dummies()
780
dd = df.to_dummies(drop_first=True)
781
782
assert dd.columns == ["foo_1", "foo_2", "bar_4", "bar_5", "baz_y", "baz_z"]
783
assert set(dm.columns) - set(dd.columns) == {"foo_0", "bar_3", "baz_x"}
784
assert_frame_equal(dm.select(dd.columns), dd)
785
assert dd.rows() == [
786
(0, 0, 0, 0, 0, 0),
787
(1, 0, 1, 0, 1, 0),
788
(0, 1, 0, 1, 0, 1),
789
]
790
791
792
def test_to_dummies_drop_nulls() -> None:
793
df = pl.DataFrame(
794
{
795
"foo": [0, 1, None],
796
"bar": [3, None, 5],
797
"baz": [None, "y", "z"],
798
}
799
)
800
801
dm = df.to_dummies(drop_nulls=True)
802
803
expected = pl.DataFrame(
804
{
805
"foo_0": [1, 0, 0],
806
"foo_1": [0, 1, 0],
807
"bar_3": [1, 0, 0],
808
"bar_5": [0, 0, 1],
809
"baz_y": [0, 1, 0],
810
"baz_z": [0, 0, 1],
811
},
812
schema={
813
"foo_0": pl.UInt8,
814
"foo_1": pl.UInt8,
815
"bar_3": pl.UInt8,
816
"bar_5": pl.UInt8,
817
"baz_y": pl.UInt8,
818
"baz_z": pl.UInt8,
819
},
820
)
821
assert_frame_equal(dm, expected)
822
823
824
def test_to_pandas(df: pl.DataFrame) -> None:
825
# pyarrow cannot deal with unsigned dictionary integer yet.
826
# pyarrow cannot convert a time64 w/ non-zero nanoseconds
827
df = df.drop(["cat", "time", "enum"])
828
df.to_arrow()
829
df.to_pandas()
830
# test shifted df
831
df.shift(2).to_pandas()
832
df = pl.DataFrame({"col": pl.Series([True, False, True])})
833
df.shift(2).to_pandas()
834
835
836
def test_from_arrow_table() -> None:
837
data = {"a": [1, 2], "b": [1, 2]}
838
tbl = pa.table(data)
839
840
df = cast("pl.DataFrame", pl.from_arrow(tbl))
841
assert_frame_equal(df, pl.DataFrame(data))
842
843
844
def test_df_stats(df: pl.DataFrame) -> None:
845
df.var()
846
df.std()
847
df.min()
848
df.max()
849
df.sum()
850
df.mean()
851
df.median()
852
df.quantile(0.4, "nearest")
853
854
855
def test_df_fold() -> None:
856
df = pl.DataFrame({"a": [2, 1, 3], "b": [1, 2, 3], "c": [1.0, 2.0, 3.0]})
857
858
assert_series_equal(
859
df.fold(lambda s1, s2: s1 + s2), pl.Series("a", [4.0, 5.0, 9.0])
860
)
861
assert_series_equal(
862
df.fold(lambda s1, s2: s1.zip_with(s1 < s2, s2)),
863
pl.Series("a", [1.0, 1.0, 3.0]),
864
)
865
866
df = pl.DataFrame({"a": ["foo", "bar", "2"], "b": [1, 2, 3], "c": [1.0, 2.0, 3.0]})
867
out = df.fold(lambda s1, s2: s1 + s2)
868
assert_series_equal(out, pl.Series("a", ["foo11.0", "bar22.0", "233.0"]))
869
870
df = pl.DataFrame({"a": [3, 2, 1], "b": [1, 2, 3], "c": [1.0, 2.0, 3.0]})
871
# just check dispatch. values are tested on rust side.
872
assert len(df.sum_horizontal()) == 3
873
assert len(df.mean_horizontal()) == 3
874
assert len(df.min_horizontal()) == 3
875
assert len(df.max_horizontal()) == 3
876
877
df_width_one = df[["a"]]
878
assert_series_equal(df_width_one.fold(lambda s1, s2: s1), df["a"])
879
880
881
@pytest.mark.may_fail_cloud # TODO: make pickleable
882
def test_fold_filter() -> None:
883
df = pl.DataFrame({"a": [1, 2, 3], "b": [0, 1, 2]})
884
885
out = df.filter(
886
pl.fold(
887
acc=pl.lit(True),
888
function=lambda a, b: a & b,
889
exprs=[pl.col(c) > 1 for c in df.columns],
890
)
891
)
892
893
assert out.shape == (1, 2)
894
assert out.rows() == [(3, 2)]
895
896
out = df.filter(
897
pl.fold(
898
acc=pl.lit(True),
899
function=lambda a, b: a | b,
900
exprs=[pl.col(c) > 1 for c in df.columns],
901
)
902
)
903
904
assert out.shape == (3, 2)
905
assert out.rows() == [(1, 0), (2, 1), (3, 2)]
906
907
908
def test_column_names() -> None:
909
tbl = pa.table(
910
{
911
"a": pa.array([1, 2, 3, 4, 5], pa.decimal128(38, 2)),
912
"b": pa.array([1, 2, 3, 4, 5], pa.int64()),
913
}
914
)
915
for a in (tbl, tbl[:0]):
916
df = cast("pl.DataFrame", pl.from_arrow(a))
917
assert df.columns == ["a", "b"]
918
919
920
def test_init_series_edge_cases() -> None:
921
# confirm that we don't modify the name of the input series in-place
922
s1 = pl.Series("X", [1, 2, 3])
923
df1 = pl.DataFrame({"A": s1}, schema_overrides={"A": pl.UInt8})
924
assert s1.name == "X"
925
assert df1["A"].name == "A"
926
927
# init same series object under different names
928
df2 = pl.DataFrame({"A": s1, "B": s1})
929
assert df2.rows(named=True) == [
930
{"A": 1, "B": 1},
931
{"A": 2, "B": 2},
932
{"A": 3, "B": 3},
933
]
934
935
# empty series names should not be overwritten
936
s2 = pl.Series([1, 2, 3])
937
s3 = pl.Series([2, 3, 4])
938
df3 = pl.DataFrame([s2, s3])
939
assert s2.name == s3.name == ""
940
assert df3.columns == ["column_0", "column_1"]
941
942
943
def test_head_group_by() -> None:
944
commodity_prices = {
945
"commodity": [
946
"Wheat",
947
"Wheat",
948
"Wheat",
949
"Wheat",
950
"Corn",
951
"Corn",
952
"Corn",
953
"Corn",
954
"Corn",
955
],
956
"location": [
957
"StPaul",
958
"StPaul",
959
"StPaul",
960
"Chicago",
961
"Chicago",
962
"Chicago",
963
"Chicago",
964
"Chicago",
965
"Chicago",
966
],
967
"seller": [
968
"Bob",
969
"Charlie",
970
"Susan",
971
"Paul",
972
"Ed",
973
"Mary",
974
"Paul",
975
"Charlie",
976
"Norman",
977
],
978
"price": [1.0, 0.7, 0.8, 0.55, 2.0, 3.0, 2.4, 1.8, 2.1],
979
}
980
df = pl.DataFrame(commodity_prices)
981
982
# this query flexes the wildcard exclusion quite a bit.
983
keys = ["commodity", "location"]
984
out = (
985
df.sort(by="price", descending=True)
986
.group_by(keys, maintain_order=True)
987
.agg([pl.col("*").exclude(keys).head(2).name.keep()])
988
.explode(cs.all().exclude(keys))
989
)
990
991
assert out.shape == (5, 4)
992
assert out.rows() == [
993
("Corn", "Chicago", "Mary", 3.0),
994
("Corn", "Chicago", "Paul", 2.4),
995
("Wheat", "StPaul", "Bob", 1.0),
996
("Wheat", "StPaul", "Susan", 0.8),
997
("Wheat", "Chicago", "Paul", 0.55),
998
]
999
1000
df = pl.DataFrame(
1001
{"letters": ["c", "c", "a", "c", "a", "b"], "nrs": [1, 2, 3, 4, 5, 6]}
1002
)
1003
out = df.group_by("letters").tail(2).sort("letters")
1004
assert_frame_equal(
1005
out,
1006
pl.DataFrame({"letters": ["a", "a", "b", "c", "c"], "nrs": [3, 5, 6, 2, 4]}),
1007
)
1008
out = df.group_by("letters").head(2).sort("letters")
1009
assert_frame_equal(
1010
out,
1011
pl.DataFrame({"letters": ["a", "a", "b", "c", "c"], "nrs": [3, 5, 6, 1, 2]}),
1012
)
1013
1014
1015
def test_is_null_is_not_null() -> None:
1016
df = pl.DataFrame({"nrs": [1, 2, None]})
1017
assert df.select(pl.col("nrs").is_null())["nrs"].to_list() == [False, False, True]
1018
assert df.select(pl.col("nrs").is_not_null())["nrs"].to_list() == [
1019
True,
1020
True,
1021
False,
1022
]
1023
1024
1025
def test_is_nan_is_not_nan() -> None:
1026
df = pl.DataFrame({"nrs": np.array([1, 2, np.nan])})
1027
assert df.select(pl.col("nrs").is_nan())["nrs"].to_list() == [False, False, True]
1028
assert df.select(pl.col("nrs").is_not_nan())["nrs"].to_list() == [True, True, False]
1029
1030
1031
def test_is_finite_is_infinite() -> None:
1032
df = pl.DataFrame({"nrs": np.array([1, 2, np.inf])})
1033
assert df.select(pl.col("nrs").is_infinite())["nrs"].to_list() == [
1034
False,
1035
False,
1036
True,
1037
]
1038
assert df.select(pl.col("nrs").is_finite())["nrs"].to_list() == [True, True, False]
1039
1040
1041
def test_is_finite_is_infinite_null_series() -> None:
1042
df = pl.DataFrame({"a": pl.Series([None, None, None], dtype=pl.Null)})
1043
result = df.select(
1044
pl.col("a").is_finite().alias("finite"),
1045
pl.col("a").is_infinite().alias("infinite"),
1046
)
1047
expected = pl.DataFrame(
1048
{
1049
"finite": pl.Series([None, None, None], dtype=pl.Boolean),
1050
"infinite": pl.Series([None, None, None], dtype=pl.Boolean),
1051
}
1052
)
1053
assert_frame_equal(result, expected)
1054
1055
1056
def test_is_nan_null_series() -> None:
1057
df = pl.DataFrame({"a": pl.Series([None, None, None], dtype=pl.Null)})
1058
result = df.select(pl.col("a").is_nan())
1059
expected = pl.DataFrame({"a": pl.Series([None, None, None], dtype=pl.Boolean)})
1060
assert_frame_equal(result, expected)
1061
1062
1063
def test_len() -> None:
1064
df = pl.DataFrame({"nrs": [1, 2, 3]})
1065
assert cast("int", df.select(pl.col("nrs").len()).item()) == 3
1066
assert len(pl.DataFrame()) == 0
1067
1068
1069
def test_multiple_column_sort() -> None:
1070
df = pl.DataFrame({"a": ["foo", "bar", "2"], "b": [2, 2, 3], "c": [1.0, 2.0, 3.0]})
1071
out = df.sort([pl.col("b"), pl.col("c").reverse()])
1072
assert list(out["c"]) == [2.0, 1.0, 3.0]
1073
assert list(out["b"]) == [2, 2, 3]
1074
1075
# Explicitly specify numpy dtype because of different defaults on Windows
1076
df = pl.DataFrame({"a": np.arange(1, 4, dtype=np.int64), "b": ["a", "a", "b"]})
1077
1078
assert_frame_equal(
1079
df.sort("a", descending=True),
1080
pl.DataFrame({"a": [3, 2, 1], "b": ["b", "a", "a"]}),
1081
)
1082
assert_frame_equal(
1083
df.sort("b", descending=True, maintain_order=True),
1084
pl.DataFrame({"a": [3, 1, 2], "b": ["b", "a", "a"]}),
1085
)
1086
assert_frame_equal(
1087
df.sort(["b", "a"], descending=[False, True]),
1088
pl.DataFrame({"a": [2, 1, 3], "b": ["a", "a", "b"]}),
1089
)
1090
1091
1092
def test_cast_frame() -> None:
1093
df = pl.DataFrame(
1094
{
1095
"a": [1.0, 2.5, 3.0],
1096
"b": [4, 5, None],
1097
"c": [True, False, True],
1098
"d": [date(2020, 1, 2), date(2021, 3, 4), date(2022, 5, 6)],
1099
}
1100
)
1101
1102
# cast via col:dtype map
1103
assert df.cast(
1104
dtypes={"b": pl.Float32, "c": pl.String, "d": pl.Datetime("ms")},
1105
).schema == {
1106
"a": pl.Float64,
1107
"b": pl.Float32,
1108
"c": pl.String,
1109
"d": pl.Datetime("ms"),
1110
}
1111
1112
# cast via col:pytype map
1113
assert df.cast(
1114
dtypes={"b": float, "c": str, "d": datetime},
1115
).schema == {
1116
"a": pl.Float64,
1117
"b": pl.Float64,
1118
"c": pl.String,
1119
"d": pl.Datetime("us"),
1120
}
1121
1122
# cast via selector:dtype map
1123
assert df.cast(
1124
{
1125
cs.numeric(): pl.UInt8,
1126
cs.temporal(): pl.String,
1127
}
1128
).rows() == [
1129
(1, 4, True, "2020-01-02"),
1130
(2, 5, False, "2021-03-04"),
1131
(3, None, True, "2022-05-06"),
1132
]
1133
1134
# cast all fields to a single type
1135
assert df.cast(pl.String).to_dict(as_series=False) == {
1136
"a": ["1.0", "2.5", "3.0"],
1137
"b": ["4", "5", None],
1138
"c": ["true", "false", "true"],
1139
"d": ["2020-01-02", "2021-03-04", "2022-05-06"],
1140
}
1141
1142
1143
def test_duration_arithmetic() -> None:
1144
df = pl.DataFrame(
1145
{"a": [datetime(2022, 1, 1, 0, 0, 0), datetime(2022, 1, 2, 0, 0, 0)]}
1146
)
1147
d1 = pl.duration(days=3, microseconds=987000)
1148
d2 = pl.duration(days=6, milliseconds=987)
1149
1150
assert_frame_equal(
1151
df.with_columns(
1152
b=(df["a"] + d1),
1153
c=(pl.col("a") + d2),
1154
),
1155
pl.DataFrame(
1156
{
1157
"a": [
1158
datetime(2022, 1, 1, 0, 0, 0),
1159
datetime(2022, 1, 2, 0, 0, 0),
1160
],
1161
"b": [
1162
datetime(2022, 1, 4, 0, 0, 0, 987000),
1163
datetime(2022, 1, 5, 0, 0, 0, 987000),
1164
],
1165
"c": [
1166
datetime(2022, 1, 7, 0, 0, 0, 987000),
1167
datetime(2022, 1, 8, 0, 0, 0, 987000),
1168
],
1169
}
1170
),
1171
)
1172
1173
1174
def test_assign() -> None:
1175
# check if can assign in case of a single column
1176
df = pl.DataFrame({"a": [1, 2, 3]})
1177
# test if we can assign in case of single column
1178
df = df.with_columns(pl.col("a") * 2)
1179
assert list(df["a"]) == [2, 4, 6]
1180
1181
1182
def test_arg_sort_by(df: pl.DataFrame) -> None:
1183
idx_df = df.select(
1184
pl.arg_sort_by(["int_nulls", "floats"], descending=[False, True]).alias("idx")
1185
)
1186
assert (idx_df["idx"] == [1, 0, 2]).all()
1187
1188
idx_df = df.select(
1189
pl.arg_sort_by(["int_nulls", "floats"], descending=False).alias("idx")
1190
)
1191
assert (idx_df["idx"] == [1, 0, 2]).all()
1192
1193
df = pl.DataFrame({"x": [0, 0, 0, 1, 1, 2], "y": [9, 9, 8, 7, 6, 6]})
1194
for expr, expected in (
1195
(pl.arg_sort_by(["x", "y"]), [2, 0, 1, 4, 3, 5]),
1196
(pl.arg_sort_by(["x", "y"], descending=[True, True]), [5, 3, 4, 0, 1, 2]),
1197
(pl.arg_sort_by(["x", "y"], descending=[True, False]), [5, 4, 3, 2, 0, 1]),
1198
(pl.arg_sort_by(["x", "y"], descending=[False, True]), [0, 1, 2, 3, 4, 5]),
1199
):
1200
assert (df.select(expr.alias("idx"))["idx"] == expected).all()
1201
1202
1203
def test_literal_series() -> None:
1204
df = pl.DataFrame(
1205
{
1206
"a": np.array([21.7, 21.8, 21], dtype=np.float32),
1207
"b": np.array([1, 3, 2], dtype=np.int8),
1208
"c": ["reg1", "reg2", "reg3"],
1209
"d": np.array(
1210
[datetime(2022, 8, 16), datetime(2022, 8, 17), datetime(2022, 8, 18)],
1211
dtype="<M8[ns]",
1212
),
1213
},
1214
schema_overrides={"a": pl.Float64},
1215
)
1216
out = (
1217
df.lazy()
1218
.with_columns(pl.Series("e", [2, 1, 3], pl.Int32))
1219
.with_columns(pl.col("e").cast(pl.Float32))
1220
.collect()
1221
)
1222
expected_schema = {
1223
"a": pl.Float64,
1224
"b": pl.Int8,
1225
"c": pl.String,
1226
"d": pl.Datetime("ns"),
1227
"e": pl.Float32,
1228
}
1229
assert_frame_equal(
1230
pl.DataFrame(
1231
[
1232
(21.7, 1, "reg1", datetime(2022, 8, 16, 0), 2),
1233
(21.8, 3, "reg2", datetime(2022, 8, 17, 0), 1),
1234
(21.0, 2, "reg3", datetime(2022, 8, 18, 0), 3),
1235
],
1236
schema=expected_schema, # type: ignore[arg-type]
1237
orient="row",
1238
),
1239
out,
1240
abs_tol=0.00001,
1241
)
1242
1243
1244
def test_write_csv() -> None:
1245
df = pl.DataFrame(
1246
{
1247
"foo": [1, 2, 3, 4, 5],
1248
"bar": [6, 7, 8, 9, 10],
1249
"ham": ["a", "b", "c", "d", "e"],
1250
}
1251
)
1252
expected = "foo,bar,ham\n1,6,a\n2,7,b\n3,8,c\n4,9,d\n5,10,e\n"
1253
1254
# if no file argument is supplied, write_csv() will return the string
1255
s = df.write_csv()
1256
assert s == expected
1257
1258
# otherwise it will write to the file/iobuffer
1259
file = BytesIO()
1260
df.write_csv(file)
1261
file.seek(0)
1262
s = file.read().decode("utf8")
1263
assert s == expected
1264
1265
1266
def test_from_generator_or_iterable() -> None:
1267
# generator function
1268
def gen(n: int, *, strkey: bool = True) -> Iterator[Any]:
1269
for i in range(n):
1270
yield (str(i) if strkey else i), 1 * i, 2**i, 3**i
1271
1272
def gen_named(n: int, *, strkey: bool = True) -> Iterator[Any]:
1273
for i in range(n):
1274
yield {"a": (str(i) if strkey else i), "b": 1 * i, "c": 2**i, "d": 3**i}
1275
1276
# iterable object
1277
class Rows:
1278
def __init__(self, n: int, *, strkey: bool = True) -> None:
1279
self._n = n
1280
self._strkey = strkey
1281
1282
def __iter__(self) -> Iterator[Any]:
1283
yield from gen(self._n, strkey=self._strkey)
1284
1285
# check init from column-oriented generator
1286
assert_frame_equal(
1287
pl.DataFrame(data=gen(4, strkey=False), orient="col"),
1288
pl.DataFrame(
1289
data=[(0, 0, 1, 1), (1, 1, 2, 3), (2, 2, 4, 9), (3, 3, 8, 27)], orient="col"
1290
),
1291
)
1292
# check init from row-oriented generators (more common)
1293
expected = pl.DataFrame(
1294
data=list(gen(4)), schema=["a", "b", "c", "d"], orient="row"
1295
)
1296
for generated_frame in (
1297
pl.DataFrame(data=gen(4), schema=["a", "b", "c", "d"]),
1298
pl.DataFrame(data=Rows(4), schema=["a", "b", "c", "d"]),
1299
pl.DataFrame(data=(x for x in Rows(4)), schema=["a", "b", "c", "d"]),
1300
):
1301
assert_frame_equal(expected, generated_frame)
1302
assert generated_frame.schema == {
1303
"a": pl.String,
1304
"b": pl.Int64,
1305
"c": pl.Int64,
1306
"d": pl.Int64,
1307
}
1308
1309
# test 'iterable_to_pydf' directly to validate 'chunk_size' behaviour
1310
cols = ["a", "b", ("c", pl.Int8), "d"]
1311
1312
expected_data = [("0", 0, 1, 1), ("1", 1, 2, 3), ("2", 2, 4, 9), ("3", 3, 8, 27)]
1313
expected_schema = [
1314
("a", pl.String),
1315
("b", pl.Int64),
1316
("c", pl.Int8),
1317
("d", pl.Int64),
1318
]
1319
1320
for params in (
1321
{"data": Rows(4)},
1322
{"data": gen(4), "chunk_size": 2},
1323
{"data": Rows(4), "chunk_size": 3},
1324
{"data": gen(4), "infer_schema_length": None},
1325
{"data": Rows(4), "infer_schema_length": 1},
1326
{"data": gen(4), "chunk_size": 2},
1327
{"data": Rows(4), "infer_schema_length": 5},
1328
{"data": gen(4), "infer_schema_length": 3, "chunk_size": 2},
1329
{"data": gen(4), "infer_schema_length": None, "chunk_size": 3},
1330
):
1331
d = iterable_to_pydf(schema=cols, **params) # type: ignore[arg-type]
1332
assert expected_data == d.row_tuples()
1333
assert expected_schema == list(zip(d.columns(), d.dtypes(), strict=True))
1334
1335
# ref: issue #6489 (initial chunk_size cannot be smaller than 'infer_schema_length')
1336
df = pl.DataFrame(
1337
data=iter(([{"col": None}] * 1000) + [{"col": ["a", "b", "c"]}]),
1338
infer_schema_length=1001,
1339
)
1340
assert df.schema == {"col": pl.List(pl.String)}
1341
assert df[-2:]["col"].to_list() == [None, ["a", "b", "c"]]
1342
1343
# ref: issue #23404 (infer_schema_length=None should always scan all data)
1344
d = iterable_to_pydf(
1345
data=chain(repeat({"col": 1}, length_minus_1 := 100), repeat({"col": 1.1}, 1)),
1346
infer_schema_length=None,
1347
chunk_size=length_minus_1,
1348
)
1349
assert d.dtypes() == [pl.Float64()]
1350
1351
# empty iterator
1352
assert_frame_equal(
1353
pl.DataFrame(data=gen(0), schema=["a", "b", "c", "d"]),
1354
pl.DataFrame(schema=["a", "b", "c", "d"]),
1355
)
1356
1357
# schema overrides
1358
assert_frame_equal(
1359
pl.DataFrame(
1360
data=gen_named(1),
1361
schema_overrides={"a": pl.Float64(), "c": pl.Float64()},
1362
),
1363
pl.DataFrame([{"a": 0.0, "b": 0, "c": 1.0, "d": 1}]),
1364
)
1365
1366
1367
def test_from_rows() -> None:
1368
df = pl.from_records([[1, 2, "foo"], [2, 3, "bar"]], orient="row")
1369
assert_frame_equal(
1370
df,
1371
pl.DataFrame(
1372
{"column_0": [1, 2], "column_1": [2, 3], "column_2": ["foo", "bar"]}
1373
),
1374
)
1375
df = pl.from_records(
1376
[[1, datetime.fromtimestamp(100)], [2, datetime.fromtimestamp(2398754908)]],
1377
schema_overrides={"column_0": pl.UInt32},
1378
orient="row",
1379
)
1380
assert df.dtypes == [pl.UInt32, pl.Datetime]
1381
1382
# auto-inference with same num rows/cols
1383
data = [(1, 2, "foo"), (2, 3, "bar"), (3, 4, "baz")]
1384
df = pl.from_records(data, orient="row")
1385
assert data == df.rows()
1386
1387
1388
@pytest.mark.parametrize(
1389
"records",
1390
[
1391
[
1392
{"id": 1, "value": 100, "_meta": "a"},
1393
{"id": 2, "value": 101, "_meta": "b"},
1394
],
1395
[
1396
None,
1397
{"id": 1, "value": 100, "_meta": "a"},
1398
{"id": 2, "value": 101, "_meta": "b"},
1399
],
1400
[
1401
{"id": 1, "value": 100, "_meta": "a"},
1402
{"id": 2, "value": 101, "_meta": "b"},
1403
None,
1404
],
1405
[
1406
MappingObject(id=1, value=100, _meta="a"),
1407
MappingObject(id=2, value=101, _meta="b"),
1408
],
1409
[
1410
None,
1411
MappingObject(id=1, value=100, _meta="a"),
1412
MappingObject(id=2, value=101, _meta="b"),
1413
],
1414
[
1415
MappingObject(id=1, value=100, _meta="a"),
1416
MappingObject(id=2, value=101, _meta="b"),
1417
None,
1418
],
1419
],
1420
)
1421
def test_from_rows_of_dicts(records: Sequence[Mapping[str, Any]]) -> None:
1422
for df_init in (pl.from_dicts, pl.DataFrame):
1423
df1 = df_init(records).remove(pl.col("id").is_null())
1424
assert df1.rows() == [(1, 100, "a"), (2, 101, "b")]
1425
1426
overrides = {
1427
"id": pl.Int16,
1428
"value": pl.Int32,
1429
}
1430
df2 = df_init(records, schema_overrides=overrides).remove(
1431
pl.col("id").is_null()
1432
)
1433
assert df2.rows() == [(1, 100, "a"), (2, 101, "b")]
1434
assert df2.schema == {"id": pl.Int16, "value": pl.Int32, "_meta": pl.String}
1435
1436
df3 = df_init(records, schema=overrides).remove(pl.col("id").is_null())
1437
assert df3.rows() == [(1, 100), (2, 101)]
1438
assert df3.schema == {"id": pl.Int16, "value": pl.Int32}
1439
1440
# explicitly check "anyvalue" conversion for dict/mapping dtypes
1441
py_s = PySeries.new_from_any_values("s", records, True)
1442
assert py_s.dtype() == pl.Struct(
1443
{
1444
"id": pl.Int64,
1445
"value": pl.Int64,
1446
"_meta": pl.String,
1447
}
1448
)
1449
1450
1451
def test_from_records_with_schema_overrides_12032() -> None:
1452
# the 'id' fields contains an int value that exceeds Int64 and doesn't have an exact
1453
# Float64 representation; confirm that the override is applied *during* inference,
1454
# not as a post-inference cast, so we maintain the accuracy of the original value.
1455
rec = [
1456
{"id": 9187643043065364490, "x": 333, "y": None},
1457
{"id": 9223671840084328467, "x": 666.5, "y": 1698177261953686},
1458
{"id": 9187643043065364505, "x": 999, "y": 9223372036854775807},
1459
]
1460
df = pl.from_records(rec, schema_overrides={"x": pl.Float32, "id": pl.UInt64})
1461
assert df.schema == OrderedDict(
1462
[
1463
("id", pl.UInt64),
1464
("x", pl.Float32),
1465
("y", pl.Int64),
1466
]
1467
)
1468
assert rec == df.rows(named=True)
1469
1470
1471
def test_from_large_uint64_misc() -> None:
1472
uint_data = [[9187643043065364490, 9223671840084328467, 9187643043065364505]]
1473
1474
df = pl.DataFrame(uint_data, orient="col", schema_overrides={"column_0": pl.UInt64})
1475
assert df["column_0"].dtype == pl.UInt64
1476
assert df["column_0"].to_list() == uint_data[0]
1477
1478
for overrides in ({}, {"column_1": pl.UInt64}):
1479
df = pl.DataFrame(
1480
uint_data,
1481
orient="row",
1482
schema_overrides=overrides,
1483
)
1484
assert df.schema == OrderedDict(
1485
[
1486
("column_0", pl.Int64),
1487
("column_1", pl.Int128 if overrides == {} else pl.UInt64),
1488
("column_2", pl.Int64),
1489
]
1490
)
1491
assert df.row(0) == tuple(uint_data[0])
1492
1493
1494
def test_repeat_by_unequal_lengths_panic() -> None:
1495
df = pl.DataFrame(
1496
{
1497
"a": ["x", "y", "z"],
1498
}
1499
)
1500
with pytest.raises(ShapeError):
1501
df.select(pl.col("a").repeat_by(pl.Series([2, 2])))
1502
1503
1504
@pytest.mark.parametrize(
1505
("value", "values_expect"),
1506
[
1507
(1.2, [[1.2], [1.2, 1.2], [1.2, 1.2, 1.2]]),
1508
(True, [[True], [True, True], [True, True, True]]),
1509
("x", [["x"], ["x", "x"], ["x", "x", "x"]]),
1510
(b"a", [[b"a"], [b"a", b"a"], [b"a", b"a", b"a"]]),
1511
],
1512
)
1513
def test_repeat_by_broadcast_left(
1514
value: float | bool | str, values_expect: list[list[float | bool | str]]
1515
) -> None:
1516
df = pl.DataFrame(
1517
{
1518
"n": [1, 2, 3],
1519
}
1520
)
1521
expected = pl.DataFrame({"values": values_expect})
1522
result = df.select(pl.lit(value).repeat_by(pl.col("n")).alias("values"))
1523
assert_frame_equal(result, expected)
1524
1525
1526
@pytest.mark.parametrize(
1527
("a", "a_expected"),
1528
[
1529
([1.2, 2.2, 3.3], [[1.2, 1.2, 1.2], [2.2, 2.2, 2.2], [3.3, 3.3, 3.3]]),
1530
([True, False], [[True, True, True], [False, False, False]]),
1531
(["x", "y", "z"], [["x", "x", "x"], ["y", "y", "y"], ["z", "z", "z"]]),
1532
(
1533
[b"a", b"b", b"c"],
1534
[[b"a", b"a", b"a"], [b"b", b"b", b"b"], [b"c", b"c", b"c"]],
1535
),
1536
],
1537
)
1538
def test_repeat_by_broadcast_right(
1539
a: list[float | bool | str], a_expected: list[list[float | bool | str]]
1540
) -> None:
1541
df = pl.DataFrame(
1542
{
1543
"a": a,
1544
}
1545
)
1546
expected = pl.DataFrame({"a": a_expected})
1547
result = df.select(pl.col("a").repeat_by(3))
1548
assert_frame_equal(result, expected)
1549
result = df.select(pl.col("a").repeat_by(pl.lit(3)))
1550
assert_frame_equal(result, expected)
1551
1552
1553
@pytest.mark.parametrize(
1554
("a", "a_expected"),
1555
[
1556
(["foo", "bar"], [["foo", "foo"], ["bar", "bar", "bar"]]),
1557
([1, 2], [[1, 1], [2, 2, 2]]),
1558
([True, False], [[True, True], [False, False, False]]),
1559
(
1560
[b"a", b"b"],
1561
[[b"a", b"a"], [b"b", b"b", b"b"]],
1562
),
1563
],
1564
)
1565
def test_repeat_by(
1566
a: list[float | bool | str], a_expected: list[list[float | bool | str]]
1567
) -> None:
1568
df = pl.DataFrame({"a": a, "n": [2, 3]})
1569
expected = pl.DataFrame({"a": a_expected})
1570
result = df.select(pl.col("a").repeat_by("n"))
1571
assert_frame_equal(result, expected)
1572
1573
1574
def test_join_dates() -> None:
1575
dts_in = pl.datetime_range(
1576
datetime(2021, 6, 24),
1577
datetime(2021, 6, 24, 10, 0, 0),
1578
interval=timedelta(hours=1),
1579
closed="left",
1580
eager=True,
1581
)
1582
dts = (
1583
dts_in.cast(int)
1584
.map_elements(lambda x: x + np.random.randint(1_000 * 60, 60_000 * 60))
1585
.cast(pl.Datetime)
1586
)
1587
1588
# some df with sensor id, (randomish) datetime and some value
1589
df = pl.DataFrame(
1590
{
1591
"sensor": ["a"] * 5 + ["b"] * 5,
1592
"datetime": dts,
1593
"value": [2, 3, 4, 1, 2, 3, 5, 1, 2, 3],
1594
}
1595
)
1596
out = df.join(df, on="datetime")
1597
assert out.height == df.height
1598
1599
1600
def test_asof_cross_join() -> None:
1601
left = pl.DataFrame({"a": [-10, 5, 10], "left_val": ["a", "b", "c"]}).with_columns(
1602
pl.col("a").set_sorted()
1603
)
1604
right = pl.DataFrame(
1605
{"a": [1, 2, 3, 6, 7], "right_val": [1, 2, 3, 6, 7]}
1606
).with_columns(pl.col("a").set_sorted())
1607
1608
# only test dispatch of asof join
1609
out = left.join_asof(right, on="a")
1610
assert out.shape == (3, 3)
1611
1612
left.lazy().join_asof(right.lazy(), on="a").collect()
1613
assert out.shape == (3, 3)
1614
1615
# only test dispatch of cross join
1616
out = left.join(right, how="cross")
1617
assert out.shape == (15, 4)
1618
1619
left.lazy().join(right.lazy(), how="cross").collect()
1620
assert out.shape == (15, 4)
1621
1622
1623
def test_join_bad_input_type() -> None:
1624
left = pl.DataFrame({"a": [1, 2, 3]})
1625
right = pl.DataFrame({"a": [1, 2, 3]})
1626
1627
with pytest.raises(
1628
TypeError,
1629
match=r"expected `other` .*to be a 'DataFrame'.* not 'LazyFrame'",
1630
):
1631
left.join(right.lazy(), on="a") # type: ignore[arg-type]
1632
1633
with pytest.raises(
1634
TypeError,
1635
match=r"expected `other` .*to be a 'DataFrame'.* not 'Series'",
1636
):
1637
left.join(pl.Series([1, 2, 3]), on="a") # type: ignore[arg-type]
1638
1639
class DummyDataFrameSubclass(pl.DataFrame):
1640
pass
1641
1642
right = DummyDataFrameSubclass(right)
1643
1644
left.join(right, on="a")
1645
1646
1647
def test_join_where() -> None:
1648
east = pl.DataFrame(
1649
{
1650
"id": [100, 101, 102],
1651
"dur": [120, 140, 160],
1652
"rev": [12, 14, 16],
1653
"cores": [2, 8, 4],
1654
}
1655
)
1656
west = pl.DataFrame(
1657
{
1658
"t_id": [404, 498, 676, 742],
1659
"time": [90, 130, 150, 170],
1660
"cost": [9, 13, 15, 16],
1661
"cores": [4, 2, 1, 4],
1662
}
1663
)
1664
out = east.join_where(
1665
west,
1666
pl.col("dur") < pl.col("time"),
1667
pl.col("rev") < pl.col("cost"),
1668
)
1669
1670
expected = pl.DataFrame(
1671
{
1672
"id": [100, 100, 100, 101, 101],
1673
"dur": [120, 120, 120, 140, 140],
1674
"rev": [12, 12, 12, 14, 14],
1675
"cores": [2, 2, 2, 8, 8],
1676
"t_id": [498, 676, 742, 676, 742],
1677
"time": [130, 150, 170, 150, 170],
1678
"cost": [13, 15, 16, 15, 16],
1679
"cores_right": [2, 1, 4, 1, 4],
1680
}
1681
)
1682
1683
assert_frame_equal(out, expected)
1684
1685
1686
def test_join_where_bad_input_type() -> None:
1687
east = pl.DataFrame(
1688
{
1689
"id": [100, 101, 102],
1690
"dur": [120, 140, 160],
1691
"rev": [12, 14, 16],
1692
"cores": [2, 8, 4],
1693
}
1694
)
1695
west = pl.DataFrame(
1696
{
1697
"t_id": [404, 498, 676, 742],
1698
"time": [90, 130, 150, 170],
1699
"cost": [9, 13, 15, 16],
1700
"cores": [4, 2, 1, 4],
1701
}
1702
)
1703
with pytest.raises(
1704
TypeError,
1705
match=r"expected `other` .*to be a 'DataFrame'.* not 'LazyFrame'",
1706
):
1707
east.join_where(
1708
west.lazy(), # type: ignore[arg-type]
1709
pl.col("dur") < pl.col("time"),
1710
pl.col("rev") < pl.col("cost"),
1711
)
1712
1713
with pytest.raises(
1714
TypeError,
1715
match=r"expected `other` .*to be a 'DataFrame'.* not 'Series'",
1716
):
1717
east.join_where(
1718
pl.Series(west), # type: ignore[arg-type]
1719
pl.col("dur") < pl.col("time"),
1720
pl.col("rev") < pl.col("cost"),
1721
)
1722
1723
class DummyDataFrameSubclass(pl.DataFrame):
1724
pass
1725
1726
west = DummyDataFrameSubclass(west)
1727
1728
east.join_where(
1729
west,
1730
pl.col("dur") < pl.col("time"),
1731
pl.col("rev") < pl.col("cost"),
1732
)
1733
1734
1735
def test_str_concat() -> None:
1736
df = pl.DataFrame(
1737
{
1738
"nrs": [1, 2, 3, 4],
1739
"name": ["ham", "spam", "foo", None],
1740
}
1741
)
1742
out = df.with_columns((pl.lit("Dr. ") + pl.col("name")).alias("graduated_name"))
1743
assert out["graduated_name"][0] == "Dr. ham"
1744
assert out["graduated_name"][1] == "Dr. spam"
1745
1746
1747
def test_dot_product() -> None:
1748
df = pl.DataFrame({"a": [1, 2, 3, 4], "b": [2, 2, 2, 2]})
1749
1750
assert df["a"].dot(df["b"]) == 20
1751
assert typing.cast("int", df.select([pl.col("a").dot("b")])[0, "a"]) == 20
1752
1753
result = pl.Series([1, 2, 3]) @ pl.Series([4, 5, 6])
1754
assert isinstance(result, int)
1755
assert result == 32
1756
1757
result = pl.Series([1, 2, 3]) @ pl.Series([4.0, 5.0, 6.0])
1758
assert isinstance(result, float)
1759
assert result == 32.0
1760
1761
result = pl.Series([1.0, 2.0, 3.0]) @ pl.Series([4.0, 5.0, 6.0])
1762
assert isinstance(result, float)
1763
assert result == 32.0
1764
1765
with pytest.raises(
1766
InvalidOperationError, match="`dot` operation not supported for dtype `bool`"
1767
):
1768
pl.Series([True, False, False, True]) @ pl.Series([4, 5, 6, 7])
1769
1770
with pytest.raises(
1771
InvalidOperationError, match="`dot` operation not supported for dtype `str`"
1772
):
1773
pl.Series([1, 2, 3, 4]) @ pl.Series(["True", "False", "False", "True"])
1774
1775
1776
def test_hash_rows() -> None:
1777
df = pl.DataFrame({"a": [1, 2, 3, 4], "b": [2, 2, 2, 2]})
1778
assert df.hash_rows().dtype == pl.UInt64
1779
assert df["a"].hash().dtype == pl.UInt64
1780
assert df.select([pl.col("a").hash().alias("foo")])["foo"].dtype == pl.UInt64
1781
1782
1783
def test_reproducible_hash_with_seeds() -> None:
1784
"""
1785
Test the reproducibility of DataFrame.hash_rows, Series.hash, and Expr.hash.
1786
1787
cf. issue #3966, hashes must always be reproducible across sessions when using
1788
the same seeds.
1789
"""
1790
df = pl.DataFrame({"s": [1234, None, 5678]})
1791
seeds = (11, 22, 33, 44)
1792
expected = pl.Series(
1793
"s",
1794
[7829205897147972687, 10151361788274345728, 17508017346787321581],
1795
dtype=pl.UInt64,
1796
)
1797
result = df.hash_rows(*seeds)
1798
assert_series_equal(expected, result, check_names=False, check_exact=True)
1799
result = df["s"].hash(*seeds)
1800
assert_series_equal(expected, result, check_names=False, check_exact=True)
1801
result = df.select([pl.col("s").hash(*seeds)])["s"]
1802
assert_series_equal(expected, result, check_names=False, check_exact=True)
1803
1804
1805
@pytest.mark.slow
1806
@pytest.mark.parametrize(
1807
"e",
1808
[
1809
pl.int_range(1_000_000),
1810
# Test code path for null_count > 0
1811
pl.when(pl.int_range(1_000_000) != 0).then(pl.int_range(1_000_000)),
1812
],
1813
)
1814
def test_hash_collision_multiple_columns_equal_values_15390(e: pl.Expr) -> None:
1815
df = pl.select(e.alias("a"))
1816
1817
for n_columns in (1, 2, 3, 4):
1818
s = df.select(pl.col("a").alias(f"x{i}") for i in range(n_columns)).hash_rows()
1819
1820
vc = s.sort().value_counts(sort=True)
1821
max_bucket_size = vc["count"][0]
1822
1823
assert max_bucket_size == 1
1824
1825
1826
@pytest.mark.may_fail_auto_streaming # Python objects not yet supported in row encoding
1827
@pytest.mark.may_fail_cloud
1828
def test_hashing_on_python_objects() -> None:
1829
# see if we can do a group_by, drop_duplicates on a DataFrame with objects.
1830
# this requires that the hashing and aggregations are done on python objects
1831
1832
df = pl.DataFrame({"a": [1, 1, 3, 4], "b": [1, 1, 2, 2]})
1833
1834
class Foo:
1835
def __hash__(self) -> int:
1836
return 0
1837
1838
def __eq__(self, other: object) -> bool:
1839
return True
1840
1841
df = df.with_columns(pl.col("a").map_elements(lambda x: Foo()).alias("foo"))
1842
assert df.group_by(["foo"]).first().shape == (1, 3)
1843
assert df.unique().shape == (3, 3)
1844
1845
1846
def test_unique_unit_rows() -> None:
1847
df = pl.DataFrame({"a": [1], "b": [None]}, schema={"a": pl.Int64, "b": pl.Float32})
1848
1849
# 'unique' one-row frame should be equal to the original frame
1850
assert_frame_equal(df, df.unique(subset="a"))
1851
for col in df.columns:
1852
assert df.n_unique(subset=[col]) == 1
1853
1854
1855
def test_panic() -> None:
1856
# may contain some tests that yielded a panic in polars or pl_arrow
1857
# https://github.com/pola-rs/polars/issues/1110
1858
a = pl.DataFrame(
1859
{
1860
"col1": ["a"] * 500 + ["b"] * 500,
1861
}
1862
)
1863
a.filter(pl.col("col1") != "b")
1864
1865
1866
def test_horizontal_agg() -> None:
1867
df = pl.DataFrame({"a": [1, None, 3], "b": [1, 2, 3]})
1868
1869
assert_series_equal(df.sum_horizontal(), pl.Series("sum", [2, 2, 6]))
1870
assert_series_equal(
1871
df.sum_horizontal(ignore_nulls=False), pl.Series("sum", [2, None, 6])
1872
)
1873
assert_series_equal(
1874
df.mean_horizontal(ignore_nulls=False), pl.Series("mean", [1.0, None, 3.0])
1875
)
1876
1877
1878
def test_slicing() -> None:
1879
# https://github.com/pola-rs/polars/issues/1322
1880
n = 20
1881
1882
df = pl.DataFrame(
1883
{
1884
"d": ["u", "u", "d", "c", "c", "d", "d"] * n,
1885
"v1": [None, "help", None, None, None, None, None] * n,
1886
}
1887
)
1888
1889
assert (df.filter(pl.col("d") != "d").select([pl.col("v1").unique()])).shape == (
1890
2,
1891
1,
1892
)
1893
1894
1895
def test_group_by_cat_list() -> None:
1896
grouped = (
1897
pl.DataFrame(
1898
[
1899
pl.Series("str_column", ["a", "b", "b", "a", "b"]),
1900
pl.Series("int_column", [1, 1, 2, 2, 3]),
1901
]
1902
)
1903
.with_columns(pl.col("str_column").cast(pl.Categorical).alias("cat_column"))
1904
.group_by("int_column", maintain_order=True)
1905
.agg([pl.col("cat_column")])["cat_column"]
1906
)
1907
1908
out = grouped.explode()
1909
assert out.dtype == pl.Categorical
1910
assert out[0] == "a"
1911
1912
1913
@pytest.mark.parametrize("dtype", FLOAT_DTYPES)
1914
def test_group_by_agg_n_unique_floats(dtype: pl.DataType) -> None:
1915
# tests proper dispatch
1916
df = pl.DataFrame({"a": [1, 1, 3], "b": [1.0, 2.0, 2.0]})
1917
1918
out = df.group_by("a", maintain_order=True).agg(
1919
[pl.col("b").cast(dtype).n_unique()]
1920
)
1921
assert out["b"].to_list() == [2, 1]
1922
1923
1924
def test_group_by_agg_n_unique_empty_group_idx_path() -> None:
1925
df = pl.DataFrame(
1926
{
1927
"key": [1, 1, 1, 2, 2, 2],
1928
"value": [1, 2, 3, 4, 5, 6],
1929
"filt": [True, True, True, False, False, False],
1930
}
1931
)
1932
out = df.group_by("key", maintain_order=True).agg(
1933
pl.col("value").filter("filt").n_unique().alias("n_unique")
1934
)
1935
expected = pl.DataFrame(
1936
{
1937
"key": [1, 2],
1938
"n_unique": pl.Series([3, 0], dtype=pl.get_index_type()),
1939
}
1940
)
1941
assert_frame_equal(out, expected)
1942
1943
1944
def test_group_by_agg_n_unique_empty_group_slice_path() -> None:
1945
df = pl.DataFrame(
1946
{
1947
"key": [1, 1, 1, 2, 2, 2],
1948
"value": [1, 2, 3, 4, 5, 6],
1949
"filt": [False, False, False, False, False, False],
1950
}
1951
)
1952
out = df.group_by("key", maintain_order=True).agg(
1953
pl.col("value").filter("filt").n_unique().alias("n_unique")
1954
)
1955
expected = pl.DataFrame(
1956
{
1957
"key": [1, 2],
1958
"n_unique": pl.Series([0, 0], dtype=pl.get_index_type()),
1959
}
1960
)
1961
assert_frame_equal(out, expected)
1962
1963
1964
def test_select_by_dtype(df: pl.DataFrame) -> None:
1965
out = df.select(pl.col(pl.String))
1966
assert out.columns == ["strings", "strings_nulls"]
1967
out = df.select(pl.col([pl.String, pl.Boolean]))
1968
assert out.columns == ["bools", "bools_nulls", "strings", "strings_nulls"]
1969
out = df.select(pl.col(INTEGER_DTYPES))
1970
assert out.columns == ["int", "int_nulls"]
1971
1972
out = df.select(ints=pl.struct(pl.col(INTEGER_DTYPES)))
1973
assert out.schema == {
1974
"ints": pl.Struct([pl.Field("int", pl.Int64), pl.Field("int_nulls", pl.Int64)])
1975
}
1976
1977
1978
def test_with_row_index() -> None:
1979
df = pl.DataFrame({"a": [1, 1, 3], "b": [1.0, 2.0, 2.0]})
1980
1981
out = df.with_row_index()
1982
assert out["index"].to_list() == [0, 1, 2]
1983
1984
out = df.lazy().with_row_index().collect()
1985
assert out["index"].to_list() == [0, 1, 2]
1986
1987
1988
def test_with_row_index_bad_offset() -> None:
1989
df = pl.DataFrame({"a": [1, 1, 3], "b": [1.0, 2.0, 2.0]})
1990
1991
with pytest.raises(ValueError, match="cannot be negative"):
1992
df.with_row_index(offset=-1)
1993
with pytest.raises(
1994
ValueError, match="cannot be greater than the maximum index value"
1995
):
1996
df.with_row_index(offset=2**64)
1997
1998
1999
def test_with_row_index_bad_offset_lazy() -> None:
2000
lf = pl.LazyFrame({"a": [1, 1, 3], "b": [1.0, 2.0, 2.0]})
2001
2002
with pytest.raises(ValueError, match="cannot be negative"):
2003
lf.with_row_index(offset=-1)
2004
with pytest.raises(
2005
ValueError, match="cannot be greater than the maximum index value"
2006
):
2007
lf.with_row_index(offset=2**64)
2008
2009
2010
def test_with_row_count_deprecated() -> None:
2011
df = pl.DataFrame({"a": [1, 1, 3], "b": [1.0, 2.0, 2.0]})
2012
2013
with pytest.deprecated_call():
2014
out = df.with_row_count()
2015
assert out["row_nr"].to_list() == [0, 1, 2]
2016
2017
with pytest.deprecated_call():
2018
out = df.lazy().with_row_count().collect()
2019
assert out["row_nr"].to_list() == [0, 1, 2]
2020
2021
2022
@pytest.mark.may_fail_cloud
2023
def test_filter_with_all_expansion() -> None:
2024
df = pl.DataFrame(
2025
{
2026
"b": [1, 2, None],
2027
"c": [1, 2, None],
2028
"a": [None, None, None],
2029
}
2030
)
2031
out = df.filter(~pl.fold(True, lambda acc, s: acc & s.is_null(), pl.all()))
2032
assert out.shape == (2, 3)
2033
2034
2035
# TODO: investigate this discrepancy in auto streaming
2036
@pytest.mark.may_fail_auto_streaming
2037
@pytest.mark.may_fail_cloud
2038
def test_extension() -> None:
2039
class Foo:
2040
def __init__(self, value: Any) -> None:
2041
self.value = value
2042
2043
def __repr__(self) -> str:
2044
return f"foo({self.value})"
2045
2046
foos = [Foo(1), Foo(2), Foo(3)]
2047
2048
# foos and sys.getrefcount both have a reference.
2049
base_count = 2
2050
2051
# We compute the refcount on a separate line otherwise pytest's assert magic
2052
# might add reference counts.
2053
rc = sys.getrefcount(foos[0])
2054
assert rc == base_count
2055
2056
df = pl.DataFrame({"groups": [1, 1, 2], "a": foos})
2057
rc = sys.getrefcount(foos[0])
2058
assert rc == base_count + 1
2059
del df
2060
rc = sys.getrefcount(foos[0])
2061
assert rc == base_count
2062
2063
df = pl.DataFrame({"groups": [1, 1, 2], "a": foos})
2064
rc = sys.getrefcount(foos[0])
2065
assert rc == base_count + 1
2066
2067
out = df.group_by("groups", maintain_order=True).agg(pl.col("a").alias("a"))
2068
rc = sys.getrefcount(foos[0])
2069
assert rc == base_count + 2
2070
s = out["a"].list.explode()
2071
rc = sys.getrefcount(foos[0])
2072
assert rc == base_count + 3
2073
del s
2074
rc = sys.getrefcount(foos[0])
2075
assert rc == base_count + 2
2076
2077
assert out["a"].list.explode().to_list() == foos
2078
rc = sys.getrefcount(foos[0])
2079
assert rc == base_count + 2
2080
del out
2081
rc = sys.getrefcount(foos[0])
2082
assert rc == base_count + 1
2083
del df
2084
rc = sys.getrefcount(foos[0])
2085
assert rc == base_count
2086
2087
2088
@pytest.mark.parametrize("name", [None, "n", ""])
2089
def test_group_by_order_dispatch(name: str | None) -> None:
2090
df = pl.DataFrame({"x": list("bab"), "y": range(3)})
2091
lf = df.lazy()
2092
2093
result = df.group_by("x", maintain_order=True).len(name=name)
2094
lazy_result = lf.group_by("x").len(name=name).sort(by="x", descending=True)
2095
2096
name = "len" if name is None else name
2097
expected = pl.DataFrame(
2098
data={"x": ["b", "a"], name: [2, 1]},
2099
schema_overrides={name: pl.get_index_type()},
2100
)
2101
assert_frame_equal(result, expected)
2102
assert_frame_equal(lazy_result.collect(), expected)
2103
2104
result = df.group_by("x", maintain_order=True).all()
2105
expected = pl.DataFrame({"x": ["b", "a"], "y": [[0, 2], [1]]})
2106
assert_frame_equal(result, expected)
2107
2108
2109
def test_partitioned_group_by_order() -> None:
2110
# check if group ordering is maintained.
2111
# we only have 30 groups, so this triggers a partitioned group by
2112
df = pl.DataFrame({"x": [chr(v) for v in range(33, 63)], "y": range(30)})
2113
out = df.group_by("x", maintain_order=True).agg(pl.all().implode())
2114
assert_series_equal(out["x"], df["x"])
2115
2116
2117
def test_schema() -> None:
2118
df = pl.DataFrame(
2119
{"foo": [1, 2, 3], "bar": [6.0, 7.0, 8.0], "ham": ["a", "b", "c"]}
2120
)
2121
expected = {"foo": pl.Int64, "bar": pl.Float64, "ham": pl.String}
2122
assert df.schema == expected
2123
2124
2125
def test_schema_equality() -> None:
2126
lf = pl.LazyFrame({"foo": [1, 2, 3], "bar": [6.0, 7.0, 8.0]})
2127
lf_rev = lf.select("bar", "foo")
2128
2129
assert lf.collect_schema() != lf_rev.collect_schema()
2130
assert lf.collect().schema != lf_rev.collect().schema
2131
2132
2133
def test_df_schema_unique() -> None:
2134
df = pl.DataFrame({"a": [1, 2], "b": [3, 4]})
2135
with pytest.raises(DuplicateError):
2136
df.columns = ["a", "a"]
2137
2138
with pytest.raises(DuplicateError):
2139
df.rename({"b": "a"})
2140
2141
2142
def test_empty_projection() -> None:
2143
empty_df = pl.DataFrame({"a": [1, 2], "b": [3, 4]}).select([])
2144
assert empty_df.rows() == []
2145
assert empty_df.schema == {}
2146
assert empty_df.shape == (0, 0)
2147
2148
2149
def test_fill_null() -> None:
2150
df = pl.DataFrame({"a": [1, 2], "b": [3, None]})
2151
assert_frame_equal(df.fill_null(4), pl.DataFrame({"a": [1, 2], "b": [3, 4]}))
2152
assert_frame_equal(
2153
df.fill_null(strategy="max"), pl.DataFrame({"a": [1, 2], "b": [3, 3]})
2154
)
2155
2156
# string and list data
2157
# string goes via binary
2158
df = pl.DataFrame(
2159
{
2160
"c": [
2161
["Apple", "Orange"],
2162
["Apple", "Orange"],
2163
None,
2164
["Carrot"],
2165
None,
2166
None,
2167
],
2168
"b": ["Apple", "Orange", None, "Carrot", None, None],
2169
}
2170
)
2171
2172
assert df.select(
2173
pl.all().fill_null(strategy="forward").name.suffix("_forward"),
2174
pl.all().fill_null(strategy="backward").name.suffix("_backward"),
2175
).to_dict(as_series=False) == {
2176
"c_forward": [
2177
["Apple", "Orange"],
2178
["Apple", "Orange"],
2179
["Apple", "Orange"],
2180
["Carrot"],
2181
["Carrot"],
2182
["Carrot"],
2183
],
2184
"b_forward": ["Apple", "Orange", "Orange", "Carrot", "Carrot", "Carrot"],
2185
"c_backward": [
2186
["Apple", "Orange"],
2187
["Apple", "Orange"],
2188
["Carrot"],
2189
["Carrot"],
2190
None,
2191
None,
2192
],
2193
"b_backward": ["Apple", "Orange", "Carrot", "Carrot", None, None],
2194
}
2195
# categoricals
2196
df = pl.DataFrame(pl.Series("cat", ["a", None], dtype=pl.Categorical))
2197
s = df.select(pl.col("cat").fill_null(strategy="forward"))["cat"]
2198
assert s.dtype == pl.Categorical
2199
assert s.to_list() == ["a", "a"]
2200
2201
2202
def test_fill_nan() -> None:
2203
df = pl.DataFrame({"a": [1, 2], "b": [3.0, float("nan")]})
2204
assert_frame_equal(
2205
df.fill_nan(4),
2206
pl.DataFrame({"a": [1, 2], "b": [3.0, 4.0]}),
2207
)
2208
assert_frame_equal(
2209
df.fill_nan(None),
2210
pl.DataFrame({"a": [1, 2], "b": [3.0, None]}),
2211
)
2212
assert df["b"].fill_nan(5.0).to_list() == [3.0, 5.0]
2213
df = pl.DataFrame(
2214
{
2215
"a": [1.0, np.nan, 3.0],
2216
"b": [datetime(1, 2, 2), datetime(2, 2, 2), datetime(3, 2, 2)],
2217
}
2218
)
2219
assert df.fill_nan(2.0).dtypes == [pl.Float64, pl.Datetime]
2220
2221
2222
#
2223
def test_forward_fill() -> None:
2224
df = pl.DataFrame({"a": [1.0, None, 3.0]})
2225
fill = df.select(pl.col("a").forward_fill())["a"]
2226
assert_series_equal(fill, pl.Series("a", [1, 1, 3]).cast(pl.Float64))
2227
2228
df = pl.DataFrame({"a": [None, 1, None]})
2229
fill = df.select(pl.col("a").forward_fill())["a"]
2230
assert_series_equal(fill, pl.Series("a", [None, 1, 1]).cast(pl.Int64))
2231
2232
2233
def test_backward_fill() -> None:
2234
df = pl.DataFrame({"a": [1.0, None, 3.0]})
2235
fill = df.select(pl.col("a").backward_fill())["a"]
2236
assert_series_equal(fill, pl.Series("a", [1, 3, 3]).cast(pl.Float64))
2237
2238
df = pl.DataFrame({"a": [None, 1, None]})
2239
fill = df.select(pl.col("a").backward_fill())["a"]
2240
assert_series_equal(fill, pl.Series("a", [1, 1, None]).cast(pl.Int64))
2241
2242
2243
def test_shrink_to_fit() -> None:
2244
df = pl.DataFrame({"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]})
2245
2246
assert df.shrink_to_fit(in_place=True) is df
2247
assert df.shrink_to_fit(in_place=False) is not df
2248
assert_frame_equal(df.shrink_to_fit(in_place=False), df)
2249
2250
2251
def test_add_string() -> None:
2252
df = pl.DataFrame({"a": ["hi", "there"], "b": ["hello", "world"]})
2253
expected = pl.DataFrame(
2254
{"a": ["hi hello", "there hello"], "b": ["hello hello", "world hello"]}
2255
)
2256
assert_frame_equal((df + " hello"), expected)
2257
2258
expected = pl.DataFrame(
2259
{"a": ["hello hi", "hello there"], "b": ["hello hello", "hello world"]}
2260
)
2261
assert_frame_equal(("hello " + df), expected)
2262
2263
2264
def test_df_broadcast() -> None:
2265
df = pl.DataFrame({"a": [1, 2, 3]}, schema_overrides={"a": pl.UInt8})
2266
out = df.with_columns(pl.lit(pl.Series("s", [[1, 2]])).first())
2267
assert out.shape == (3, 2)
2268
assert out.schema == {"a": pl.UInt8, "s": pl.List(pl.Int64)}
2269
assert out.rows() == [(1, [1, 2]), (2, [1, 2]), (3, [1, 2])]
2270
2271
2272
@pytest.mark.may_fail_cloud # not a lazyframe method
2273
def test_product() -> None:
2274
df = pl.DataFrame(
2275
{
2276
"int": [1, 2, 3],
2277
"flt": [-1.0, 12.0, 9.0],
2278
"bool_0": [True, False, True],
2279
"bool_1": [True, True, True],
2280
"str": ["a", "b", "c"],
2281
},
2282
schema_overrides={
2283
"int": pl.UInt16,
2284
"flt": pl.Float32,
2285
},
2286
)
2287
out = df.product()
2288
expected = pl.DataFrame(
2289
{"int": [6], "flt": [-108.0], "bool_0": [0], "bool_1": [1], "str": [None]}
2290
)
2291
assert_frame_not_equal(out, expected, check_dtypes=True)
2292
assert_frame_equal(out, expected, check_dtypes=False)
2293
2294
2295
def test_first_last_nth_expressions(fruits_cars: pl.DataFrame) -> None:
2296
df = fruits_cars
2297
out = df.select(pl.first())
2298
assert out.columns == ["A"]
2299
2300
out = df.select(pl.last())
2301
assert out.columns == ["cars"]
2302
2303
out = df.select(pl.nth(0))
2304
assert out.columns == ["A"]
2305
2306
out = df.select(pl.nth(1))
2307
assert out.columns == ["fruits"]
2308
2309
out = df.select(pl.nth(-2))
2310
assert out.columns == ["B"]
2311
2312
2313
def test_is_between(fruits_cars: pl.DataFrame) -> None:
2314
result = fruits_cars.select(pl.col("A").is_between(2, 4)).to_series()
2315
assert_series_equal(result, pl.Series("A", [False, True, True, True, False]))
2316
2317
result = fruits_cars.select(pl.col("A").is_between(2, 4, closed="none")).to_series()
2318
assert_series_equal(result, pl.Series("A", [False, False, True, False, False]))
2319
2320
result = fruits_cars.select(pl.col("A").is_between(2, 4, closed="both")).to_series()
2321
assert_series_equal(result, pl.Series("A", [False, True, True, True, False]))
2322
2323
result = fruits_cars.select(
2324
pl.col("A").is_between(2, 4, closed="right")
2325
).to_series()
2326
assert_series_equal(result, pl.Series("A", [False, False, True, True, False]))
2327
2328
result = fruits_cars.select(pl.col("A").is_between(2, 4, closed="left")).to_series()
2329
assert_series_equal(result, pl.Series("A", [False, True, True, False, False]))
2330
2331
2332
def test_is_between_data_types() -> None:
2333
df = pl.DataFrame(
2334
{
2335
"flt": [1.4, 1.2, 2.5],
2336
"int": [2, 3, 4],
2337
"str": ["xyz", "str", "abc"],
2338
"date": [date(2020, 1, 1), date(2020, 2, 2), date(2020, 3, 3)],
2339
"datetime": [
2340
datetime(2020, 1, 1, 0, 0, 0),
2341
datetime(2020, 1, 1, 10, 0, 0),
2342
datetime(2020, 1, 1, 12, 0, 0),
2343
],
2344
"tm": [time(10, 30), time(0, 45), time(15, 15)],
2345
}
2346
)
2347
2348
# on purpose, for float and int, we pass in a mixture of bound data types
2349
assert_series_equal(
2350
df.select(pl.col("flt").is_between(1, 2.3))[:, 0],
2351
pl.Series("flt", [True, True, False]),
2352
)
2353
assert_series_equal(
2354
df.select(pl.col("int").is_between(1.5, 3))[:, 0],
2355
pl.Series("int", [True, True, False]),
2356
)
2357
assert_series_equal(
2358
df.select(pl.col("date").is_between(date(2019, 1, 1), date(2020, 2, 5)))[:, 0],
2359
pl.Series("date", [True, True, False]),
2360
)
2361
assert_series_equal(
2362
df.select(
2363
pl.col("datetime").is_between(
2364
datetime(2020, 1, 1, 5, 0, 0), datetime(2020, 1, 1, 11, 0, 0)
2365
)
2366
)[:, 0],
2367
pl.Series("datetime", [False, True, False]),
2368
)
2369
assert_series_equal(
2370
df.select(
2371
pl.col("str").is_between(pl.lit("str"), pl.lit("zzz"), closed="left")
2372
)[:, 0],
2373
pl.Series("str", [True, True, False]),
2374
)
2375
assert_series_equal(
2376
df.select(
2377
pl.col("tm")
2378
.is_between(time(0, 45), time(10, 30), closed="right")
2379
.alias("tm_between")
2380
)[:, 0],
2381
pl.Series("tm_between", [True, False, False]),
2382
)
2383
2384
2385
def test_empty_is_in() -> None:
2386
df_empty_isin = pl.DataFrame({"foo": ["a", "b", "c", "d"]}).filter(
2387
pl.col("foo").is_in([])
2388
)
2389
assert df_empty_isin.shape == (0, 1)
2390
assert df_empty_isin.rows() == []
2391
assert df_empty_isin.schema == {"foo": pl.String}
2392
2393
2394
def test_group_by_slice_expression_args() -> None:
2395
df = pl.DataFrame({"groups": ["a"] * 10 + ["b"] * 20, "vals": range(30)})
2396
2397
out = (
2398
df.group_by("groups", maintain_order=True)
2399
.agg([pl.col("vals").slice((pl.len() * 0.1).cast(int), (pl.len() // 5))])
2400
.explode("vals")
2401
)
2402
2403
expected = pl.DataFrame(
2404
{"groups": ["a", "a", "b", "b", "b", "b"], "vals": [1, 2, 12, 13, 14, 15]}
2405
)
2406
assert_frame_equal(out, expected)
2407
2408
2409
def test_join_suffixes() -> None:
2410
df_a = pl.DataFrame({"A": [1], "B": [1]})
2411
df_b = pl.DataFrame({"A": [1], "B": [1]})
2412
2413
join_strategies: list[JoinStrategy] = ["left", "inner", "full", "cross"]
2414
for how in join_strategies:
2415
# no need for an assert, we error if wrong
2416
df_a.join(df_b, on="A" if how != "cross" else None, suffix="_y", how=how)["B_y"]
2417
2418
df_a.join_asof(df_b, on=pl.col("A").set_sorted(), suffix="_y")["B_y"]
2419
2420
2421
def test_explode_empty() -> None:
2422
df = (
2423
pl.DataFrame({"x": ["a", "a", "b", "b"], "y": [1, 1, 2, 2]})
2424
.group_by("x", maintain_order=True)
2425
.agg(pl.col("y").gather([]))
2426
)
2427
assert df.explode("y").to_dict(as_series=False) == {
2428
"x": ["a", "b"],
2429
"y": [None, None],
2430
}
2431
2432
df = pl.DataFrame({"x": ["1", "2", "4"], "y": [["a", "b", "c"], ["d"], []]})
2433
assert_frame_equal(
2434
df.explode("y"),
2435
pl.DataFrame({"x": ["1", "1", "1", "2", "4"], "y": ["a", "b", "c", "d", None]}),
2436
)
2437
2438
df = pl.DataFrame(
2439
{
2440
"letters": ["a"],
2441
"numbers": [[]],
2442
}
2443
)
2444
assert df.explode("numbers").to_dict(as_series=False) == {
2445
"letters": ["a"],
2446
"numbers": [None],
2447
}
2448
2449
2450
def test_asof_by_multiple_keys() -> None:
2451
lhs = pl.DataFrame(
2452
{
2453
"a": [-20, -19, 8, 12, 14],
2454
"by": [1, 1, 2, 2, 2],
2455
"by2": [1, 1, 2, 2, 2],
2456
}
2457
)
2458
2459
rhs = pl.DataFrame(
2460
{
2461
"a": [-19, -15, 3, 5, 13],
2462
"by": [1, 1, 2, 2, 2],
2463
"by2": [1, 1, 2, 2, 2],
2464
}
2465
)
2466
2467
result = lhs.join_asof(
2468
rhs, on=pl.col("a").set_sorted(), by=["by", "by2"], strategy="backward"
2469
).select(["a", "by"])
2470
expected = pl.DataFrame({"a": [-20, -19, 8, 12, 14], "by": [1, 1, 2, 2, 2]})
2471
assert_frame_equal(
2472
result.group_by("by").agg("a"),
2473
expected.group_by("by").agg("a"),
2474
check_row_order=False,
2475
)
2476
2477
2478
def test_asof_bad_input_type() -> None:
2479
lhs = pl.DataFrame({"a": [1, 2, 3]})
2480
rhs = pl.DataFrame({"a": [1, 2, 3]})
2481
2482
with pytest.raises(
2483
TypeError,
2484
match=r"expected `other` .*to be a 'DataFrame'.* not 'LazyFrame'",
2485
):
2486
lhs.join_asof(rhs.lazy(), on="a") # type: ignore[arg-type]
2487
2488
with pytest.raises(
2489
TypeError,
2490
match=r"expected `other` .*to be a 'DataFrame'.* not 'Series'",
2491
):
2492
lhs.join_asof(pl.Series([1, 2, 3]), on="a") # type: ignore[arg-type]
2493
2494
class DummyDataFrameSubclass(pl.DataFrame):
2495
pass
2496
2497
rhs = DummyDataFrameSubclass(rhs)
2498
2499
lhs.join_asof(rhs, on="a")
2500
2501
2502
def test_list_of_list_of_struct() -> None:
2503
expected = [{"list_of_list_of_struct": [[{"a": 1}, {"a": 2}]]}]
2504
pa_df = pa.Table.from_pylist(expected)
2505
2506
df = pl.from_arrow(pa_df)
2507
assert df.rows() == [([[{"a": 1}, {"a": 2}]],)] # type: ignore[union-attr]
2508
assert df.to_dicts() == expected # type: ignore[union-attr]
2509
2510
df = pl.from_arrow(pa_df[:0])
2511
assert df.to_dicts() == [] # type: ignore[union-attr]
2512
2513
2514
def test_fill_null_limits() -> None:
2515
assert pl.DataFrame(
2516
{
2517
"a": [1, None, None, None, 5, 6, None, None, None, 10],
2518
"b": ["a", None, None, None, "b", "c", None, None, None, "d"],
2519
"c": [True, None, None, None, False, True, None, None, None, False],
2520
}
2521
).select(
2522
pl.all().fill_null(strategy="forward", limit=2),
2523
pl.all().fill_null(strategy="backward", limit=2).name.suffix("_backward"),
2524
).to_dict(as_series=False) == {
2525
"a": [1, 1, 1, None, 5, 6, 6, 6, None, 10],
2526
"b": ["a", "a", "a", None, "b", "c", "c", "c", None, "d"],
2527
"c": [True, True, True, None, False, True, True, True, None, False],
2528
"a_backward": [1, None, 5, 5, 5, 6, None, 10, 10, 10],
2529
"b_backward": ["a", None, "b", "b", "b", "c", None, "d", "d", "d"],
2530
"c_backward": [
2531
True,
2532
None,
2533
False,
2534
False,
2535
False,
2536
True,
2537
None,
2538
False,
2539
False,
2540
False,
2541
],
2542
}
2543
2544
2545
def test_lower_bound_upper_bound(fruits_cars: pl.DataFrame) -> None:
2546
res_expr = fruits_cars.select(pl.col("A").lower_bound())
2547
assert res_expr.item() == -9223372036854775808
2548
2549
res_expr = fruits_cars.select(pl.col("B").upper_bound())
2550
assert res_expr.item() == 9223372036854775807
2551
2552
with pytest.raises(ComputeError):
2553
fruits_cars.select(pl.col("fruits").upper_bound())
2554
2555
2556
def test_selection_misc() -> None:
2557
df = pl.DataFrame({"x": "abc"}, schema={"x": pl.String})
2558
2559
# literal values (as scalar/list)
2560
for zero in (0, [0]):
2561
assert df.select(zero)["literal"].to_list() == [0]
2562
assert df.select(literal=0)["literal"].to_list() == [0]
2563
2564
# expect string values to be interpreted as cols
2565
for x in ("x", ["x"], pl.col("x")):
2566
assert df.select(x).rows() == [("abc",)]
2567
2568
# string col + lit
2569
assert df.with_columns(["x", 0]).to_dicts() == [{"x": "abc", "literal": 0}]
2570
2571
2572
def test_selection_regex_and_multicol() -> None:
2573
test_df = pl.DataFrame(
2574
{
2575
"a": [1, 2, 3, 4],
2576
"b": [5, 6, 7, 8],
2577
"c": [9, 10, 11, 12],
2578
"foo": [13, 14, 15, 16],
2579
},
2580
schema_overrides={"foo": pl.UInt8},
2581
)
2582
2583
# Selection only
2584
test_df.select(
2585
pl.col(["a", "b", "c"]).name.suffix("_list"),
2586
pl.all().exclude("foo").name.suffix("_wild"),
2587
pl.col("^\\w$").name.suffix("_regex"),
2588
)
2589
2590
# Multi * Single
2591
assert test_df.select(pl.col(["a", "b", "c"]) * pl.col("foo")).to_dict(
2592
as_series=False
2593
) == {
2594
"a": [13, 28, 45, 64],
2595
"b": [65, 84, 105, 128],
2596
"c": [117, 140, 165, 192],
2597
}
2598
assert test_df.select(pl.all().exclude("foo") * pl.col("foo")).to_dict(
2599
as_series=False
2600
) == {
2601
"a": [13, 28, 45, 64],
2602
"b": [65, 84, 105, 128],
2603
"c": [117, 140, 165, 192],
2604
}
2605
2606
assert test_df.select(pl.col("^\\w$") * pl.col("foo")).to_dict(as_series=False) == {
2607
"a": [13, 28, 45, 64],
2608
"b": [65, 84, 105, 128],
2609
"c": [117, 140, 165, 192],
2610
}
2611
2612
# Multi * Multi
2613
result = test_df.select(pl.col(["a", "b", "c"]) * pl.col(["a", "b", "c"]))
2614
expected = {"a": [1, 4, 9, 16], "b": [25, 36, 49, 64], "c": [81, 100, 121, 144]}
2615
2616
assert result.to_dict(as_series=False) == expected
2617
assert test_df.select(pl.exclude("foo") * pl.exclude("foo")).to_dict(
2618
as_series=False
2619
) == {
2620
"a": [1, 4, 9, 16],
2621
"b": [25, 36, 49, 64],
2622
"c": [81, 100, 121, 144],
2623
}
2624
assert test_df.select(pl.col("^\\w$") * pl.col("^\\w$")).to_dict(
2625
as_series=False
2626
) == {
2627
"a": [1, 4, 9, 16],
2628
"b": [25, 36, 49, 64],
2629
"c": [81, 100, 121, 144],
2630
}
2631
2632
df = test_df.select(
2633
re=pl.struct(pl.col("^\\w$")),
2634
odd=pl.struct((pl.col(INTEGER_DTYPES) % 2).name.suffix("_is_odd")),
2635
maxes=pl.struct(pl.all().max().name.suffix("_max")),
2636
).head(2)
2637
# ┌───────────┬───────────┬─────────────┐
2638
# │ re ┆ odd ┆ maxes │
2639
# │ --- ┆ --- ┆ --- │
2640
# │ struct[3] ┆ struct[4] ┆ struct[4] │
2641
# ╞═══════════╪═══════════╪═════════════╡
2642
# │ {1,5,9} ┆ {1,1,1,1} ┆ {4,8,12,16} │
2643
# │ {2,6,10} ┆ {0,0,0,0} ┆ {4,8,12,16} │
2644
# └───────────┴───────────┴─────────────┘
2645
assert df.rows() == [
2646
(
2647
{"a": 1, "b": 5, "c": 9},
2648
{"a_is_odd": 1, "b_is_odd": 1, "c_is_odd": 1, "foo_is_odd": 1},
2649
{"a_max": 4, "b_max": 8, "c_max": 12, "foo_max": 16},
2650
),
2651
(
2652
{"a": 2, "b": 6, "c": 10},
2653
{"a_is_odd": 0, "b_is_odd": 0, "c_is_odd": 0, "foo_is_odd": 0},
2654
{"a_max": 4, "b_max": 8, "c_max": 12, "foo_max": 16},
2655
),
2656
]
2657
2658
2659
@pytest.mark.parametrize("subset", ["a", cs.starts_with("x", "a")])
2660
@pytest.mark.may_fail_auto_streaming # Flaky in CI, see https://github.com/pola-rs/polars/issues/20943
2661
@pytest.mark.may_fail_cloud
2662
def test_unique_on_sorted(subset: Any) -> None:
2663
df = pl.DataFrame(data={"a": [1, 1, 3], "b": [1, 2, 3]})
2664
2665
result = df.with_columns([pl.col("a").set_sorted()]).unique(
2666
subset=subset,
2667
keep="last",
2668
)
2669
2670
expected = pl.DataFrame({"a": [1, 3], "b": [2, 3]})
2671
assert_frame_equal(result, expected)
2672
2673
2674
def test_len_compute(df: pl.DataFrame) -> None:
2675
df = df.with_columns(pl.struct(["list_bool", "cat"]).alias("struct"))
2676
filtered = df.filter(pl.col("bools"))
2677
for col in filtered.columns:
2678
assert len(filtered[col]) == 1
2679
2680
taken = df[[1, 2], :]
2681
for col in taken.columns:
2682
assert len(taken[col]) == 2
2683
2684
2685
def test_filter_sequence() -> None:
2686
df = pl.DataFrame({"a": [1, 2, 3]})
2687
assert df.filter([True, False, True])["a"].to_list() == [1, 3]
2688
assert df.filter(np.array([True, False, True]))["a"].to_list() == [1, 3]
2689
2690
2691
def test_filter_multiple_predicates() -> None:
2692
df = pl.DataFrame(
2693
{
2694
"a": [1, 1, 1, 2, 2],
2695
"b": [1, 1, 2, 2, 2],
2696
"c": [1, 1, 2, 3, 4],
2697
}
2698
)
2699
2700
# multiple predicates
2701
expected = pl.DataFrame({"a": [1, 1, 1], "b": [1, 1, 2], "c": [1, 1, 2]})
2702
for out in (
2703
df.filter(pl.col("a") == 1, pl.col("b") <= 2), # positional/splat
2704
df.filter([pl.col("a") == 1, pl.col("b") <= 2]), # as list
2705
):
2706
assert_frame_equal(out, expected)
2707
2708
# multiple kwargs
2709
assert_frame_equal(
2710
df.filter(a=1, b=2),
2711
pl.DataFrame({"a": [1], "b": [2], "c": [2]}),
2712
)
2713
2714
# both positional and keyword args
2715
assert_frame_equal(
2716
pl.DataFrame({"a": [2], "b": [2], "c": [3]}),
2717
df.filter(pl.col("c") < 4, a=2, b=2),
2718
)
2719
2720
# boolean mask
2721
out = df.filter([True, False, False, False, True])
2722
expected = pl.DataFrame({"a": [1, 2], "b": [1, 2], "c": [1, 4]})
2723
assert_frame_equal(out, expected)
2724
2725
# multiple boolean masks
2726
out = df.filter(
2727
np.array([True, True, False, True, False]),
2728
np.array([True, False, True, True, False]),
2729
)
2730
expected = pl.DataFrame({"a": [1, 2], "b": [1, 2], "c": [1, 3]})
2731
assert_frame_equal(out, expected)
2732
2733
2734
def test_indexing_set() -> None:
2735
df = pl.DataFrame({"bool": [True, True], "str": ["N/A", "N/A"], "nr": [1, 2]})
2736
2737
df[0, "bool"] = False
2738
df[0, "nr"] = 100
2739
df[0, "str"] = "foo"
2740
2741
assert df.to_dict(as_series=False) == {
2742
"bool": [False, True],
2743
"str": ["foo", "N/A"],
2744
"nr": [100, 2],
2745
}
2746
2747
2748
def test_set() -> None:
2749
# Setting a dataframe using indices is deprecated.
2750
# We keep these tests because we only generate a warning.
2751
np.random.seed(1)
2752
df = pl.DataFrame(
2753
{"foo": np.random.rand(10), "bar": np.arange(10), "ham": ["h"] * 10}
2754
)
2755
with pytest.raises(
2756
TypeError,
2757
match=r"DataFrame object does not support `Series` assignment by index"
2758
r"\n\nUse `DataFrame.with_columns`.",
2759
):
2760
df["new"] = np.random.rand(10)
2761
2762
with pytest.raises(
2763
TypeError,
2764
match=r"not allowed to set DataFrame by boolean mask in the row position"
2765
r"\n\nConsider using `DataFrame.with_columns`.",
2766
):
2767
df[df["ham"] > 0.5, "ham"] = "a"
2768
with pytest.raises(
2769
TypeError,
2770
match=r"not allowed to set DataFrame by boolean mask in the row position"
2771
r"\n\nConsider using `DataFrame.with_columns`.",
2772
):
2773
df[[True, False], "ham"] = "a"
2774
2775
# set 2D
2776
df = pl.DataFrame({"b": [0, 0]})
2777
df[["A", "B"]] = [[1, 2], [1, 2]]
2778
2779
with pytest.raises(ValueError):
2780
df[["C", "D"]] = 1
2781
with pytest.raises(ValueError):
2782
df[["C", "D"]] = [1, 1]
2783
with pytest.raises(ValueError):
2784
df[["C", "D"]] = [[1, 2, 3], [1, 2, 3]]
2785
2786
# set tuple
2787
df = pl.DataFrame({"b": [0, 0]})
2788
df[0, "b"] = 1
2789
assert df[0, "b"] == 1
2790
2791
df[0, 0] = 2
2792
assert df[0, "b"] == 2
2793
2794
# row and col selection have to be int or str
2795
with pytest.raises(TypeError):
2796
df[:, [1]] = 1 # type: ignore[index]
2797
with pytest.raises(TypeError):
2798
df[True, :] = 1 # type: ignore[index]
2799
2800
# needs to be a 2 element tuple
2801
with pytest.raises(ValueError):
2802
df[1, 2, 3] = 1
2803
2804
# we cannot index with any type, such as bool
2805
with pytest.raises(TypeError):
2806
df[True] = 1 # type: ignore[index]
2807
2808
2809
def test_series_iter_over_frame() -> None:
2810
df = pl.DataFrame({"a": [1, 2, 3], "b": [2, 3, 4], "c": [3, 4, 5]})
2811
2812
expected = {
2813
0: pl.Series("a", [1, 2, 3]),
2814
1: pl.Series("b", [2, 3, 4]),
2815
2: pl.Series("c", [3, 4, 5]),
2816
}
2817
for idx, s in enumerate(df):
2818
assert_series_equal(s, expected[idx])
2819
2820
expected = {
2821
0: pl.Series("c", [3, 4, 5]),
2822
1: pl.Series("b", [2, 3, 4]),
2823
2: pl.Series("a", [1, 2, 3]),
2824
}
2825
for idx, s in enumerate(reversed(df)):
2826
assert_series_equal(s, expected[idx])
2827
2828
2829
def test_union_with_aliases_4770() -> None:
2830
lf = pl.DataFrame(
2831
{
2832
"a": [1, None],
2833
"b": [3, 4],
2834
}
2835
).lazy()
2836
2837
lf = pl.concat(
2838
[
2839
lf.select([pl.col("a").alias("x")]),
2840
lf.select([pl.col("b").alias("x")]),
2841
]
2842
).filter(pl.col("x").is_not_null())
2843
2844
assert lf.collect()["x"].to_list() == [1, 3, 4]
2845
2846
2847
def test_init_datetimes_with_timezone() -> None:
2848
tz_us = "America/New_York"
2849
tz_europe = "Europe/Amsterdam"
2850
2851
dtm = datetime(2022, 10, 12, 12, 30)
2852
for time_unit in DTYPE_TEMPORAL_UNITS:
2853
for type_overrides in (
2854
{
2855
"schema": [
2856
("d1", pl.Datetime(time_unit, tz_us)),
2857
("d2", pl.Datetime(time_unit, tz_europe)),
2858
]
2859
},
2860
{
2861
"schema_overrides": {
2862
"d1": pl.Datetime(time_unit, tz_us),
2863
"d2": pl.Datetime(time_unit, tz_europe),
2864
}
2865
},
2866
):
2867
result = pl.DataFrame(
2868
data={
2869
"d1": [dtm.replace(tzinfo=ZoneInfo(tz_us))],
2870
"d2": [dtm.replace(tzinfo=ZoneInfo(tz_europe))],
2871
},
2872
**type_overrides,
2873
)
2874
expected = pl.DataFrame(
2875
{"d1": ["2022-10-12 12:30"], "d2": ["2022-10-12 12:30"]}
2876
).with_columns(
2877
pl.col("d1").str.to_datetime(time_unit=time_unit, time_zone=tz_us),
2878
pl.col("d2").str.to_datetime(time_unit=time_unit, time_zone=tz_europe),
2879
)
2880
assert_frame_equal(result, expected)
2881
2882
2883
@pytest.mark.parametrize(
2884
(
2885
"tzinfo",
2886
"offset",
2887
"dtype_time_zone",
2888
"expected_time_zone",
2889
"expected_item",
2890
),
2891
[
2892
(None, "", None, None, datetime(2020, 1, 1)),
2893
(
2894
timezone(timedelta(hours=-8)),
2895
"-08:00",
2896
"UTC",
2897
"UTC",
2898
datetime(2020, 1, 1, 8, tzinfo=timezone.utc),
2899
),
2900
(
2901
timezone(timedelta(hours=-8)),
2902
"-08:00",
2903
None,
2904
"UTC",
2905
datetime(2020, 1, 1, 8, tzinfo=timezone.utc),
2906
),
2907
],
2908
)
2909
@pytest.mark.may_fail_cloud
2910
def test_init_vs_strptime_consistency(
2911
tzinfo: timezone | None,
2912
offset: str,
2913
dtype_time_zone: str | None,
2914
expected_time_zone: str,
2915
expected_item: datetime,
2916
) -> None:
2917
result_init = pl.Series(
2918
[datetime(2020, 1, 1, tzinfo=tzinfo)],
2919
dtype=pl.Datetime("us", dtype_time_zone),
2920
)
2921
result_strptime = pl.Series([f"2020-01-01 00:00{offset}"]).str.strptime(
2922
pl.Datetime("us", dtype_time_zone)
2923
)
2924
assert result_init.dtype == pl.Datetime("us", expected_time_zone)
2925
assert result_init.item() == expected_item
2926
assert_series_equal(result_init, result_strptime)
2927
2928
2929
def test_init_vs_strptime_consistency_converts() -> None:
2930
result = pl.Series(
2931
[datetime(2020, 1, 1, tzinfo=timezone(timedelta(hours=-8)))],
2932
dtype=pl.Datetime("us", "America/Los_Angeles"),
2933
).item()
2934
assert result == datetime(
2935
2020, 1, 1, 0, 0, tzinfo=ZoneInfo(key="America/Los_Angeles")
2936
)
2937
result = (
2938
pl.Series(["2020-01-01 00:00-08:00"])
2939
.str.strptime(pl.Datetime("us", "America/Los_Angeles"))
2940
.item()
2941
)
2942
assert result == datetime(
2943
2020, 1, 1, 0, 0, tzinfo=ZoneInfo(key="America/Los_Angeles")
2944
)
2945
2946
2947
def test_init_physical_with_timezone() -> None:
2948
tz_uae = "Asia/Dubai"
2949
tz_asia = "Asia/Tokyo"
2950
2951
dtm_us = 1665577800000000
2952
for time_unit in DTYPE_TEMPORAL_UNITS:
2953
dtm = {"ms": dtm_us // 1_000, "ns": dtm_us * 1_000}.get(str(time_unit), dtm_us)
2954
df = pl.DataFrame(
2955
data={"d1": [dtm], "d2": [dtm]},
2956
schema=[
2957
("d1", pl.Datetime(time_unit, tz_uae)),
2958
("d2", pl.Datetime(time_unit, tz_asia)),
2959
],
2960
)
2961
assert (df["d1"].to_physical() == df["d2"].to_physical()).all()
2962
assert df.rows() == [
2963
(
2964
datetime(2022, 10, 12, 16, 30, tzinfo=ZoneInfo(tz_uae)),
2965
datetime(2022, 10, 12, 21, 30, tzinfo=ZoneInfo(tz_asia)),
2966
)
2967
]
2968
2969
2970
@pytest.mark.parametrize("divop", [floordiv, truediv])
2971
def test_floordiv_truediv(divop: Callable[..., Any]) -> None:
2972
# validate truediv/floordiv dataframe ops against python
2973
df1 = pl.DataFrame(
2974
data={
2975
"x": [0, -1, -2, -3],
2976
"y": [-0.0, -3.0, 5.0, -7.0],
2977
"z": [10, 3, -5, 7],
2978
}
2979
)
2980
2981
# scalar
2982
for df in [df1, df1.slice(0, 0)]:
2983
for n in (3, 3.0, -3, -3.0):
2984
py_div = [tuple(divop(elem, n) for elem in row) for row in df.rows()]
2985
df_div = divop(df, n).rows()
2986
assert py_div == df_div
2987
2988
# series
2989
xdf, s = df1["x"].to_frame(), pl.Series([2] * 4)
2990
assert list(divop(xdf, s)["x"]) == [divop(x, 2) for x in list(df1["x"])]
2991
2992
# frame
2993
df2 = pl.DataFrame(
2994
data={
2995
"x": [2, -2, 2, 3],
2996
"y": [4, 4, -4, 8],
2997
"z": [0.5, 2.0, -2.0, -3],
2998
}
2999
)
3000
df_div = divop(df1, df2).rows()
3001
for i, (row1, row2) in enumerate(zip(df1.rows(), df2.rows(), strict=True)):
3002
for j, (elem1, elem2) in enumerate(zip(row1, row2, strict=True)):
3003
assert divop(elem1, elem2) == df_div[i][j]
3004
3005
3006
@pytest.mark.parametrize(
3007
("subset", "keep", "expected_mask"),
3008
[
3009
(None, "first", [True, True, True, False]),
3010
("a", "first", [True, True, False, False]),
3011
(["a", "b"], "first", [True, True, False, False]),
3012
(("a", "b"), "last", [True, False, False, True]),
3013
(("a", "b"), "none", [True, False, False, False]),
3014
],
3015
)
3016
def test_unique(
3017
subset: str | Sequence[str], keep: UniqueKeepStrategy, expected_mask: list[bool]
3018
) -> None:
3019
df = pl.DataFrame({"a": [1, 2, 2, 2], "b": [3, 4, 4, 4], "c": [5, 6, 7, 7]})
3020
3021
result = df.unique(maintain_order=True, subset=subset, keep=keep).sort(pl.all())
3022
expected = df.filter(expected_mask).sort(pl.all())
3023
assert_frame_equal(result, expected)
3024
3025
3026
def test_iter_slices() -> None:
3027
df = pl.DataFrame(
3028
{
3029
"a": range(95),
3030
"b": date(2023, 1, 1),
3031
"c": "klmnopqrstuvwxyz",
3032
}
3033
)
3034
batches = list(df.iter_slices(n_rows=50))
3035
3036
assert len(batches[0]) == 50
3037
assert len(batches[1]) == 45
3038
assert batches[1].rows() == df[50:].rows()
3039
3040
3041
def test_format_empty_df() -> None:
3042
df = pl.DataFrame(
3043
[
3044
pl.Series("val1", [], dtype=pl.Categorical),
3045
pl.Series("val2", [], dtype=pl.Categorical),
3046
]
3047
).select(
3048
pl.format("{}:{}", pl.col("val1"), pl.col("val2")).alias("cat"),
3049
)
3050
assert df.shape == (0, 1)
3051
assert df.dtypes == [pl.String]
3052
3053
3054
def test_deadlocks_3409() -> None:
3055
assert (
3056
pl.DataFrame({"col1": [[1, 2, 3]]})
3057
.with_columns(
3058
pl.col("col1").list.eval(
3059
pl.element().map_elements(lambda x: x, return_dtype=pl.Int64)
3060
)
3061
)
3062
.to_dict(as_series=False)
3063
) == {"col1": [[1, 2, 3]]}
3064
3065
assert (
3066
pl.DataFrame({"col1": [1, 2, 3]})
3067
.with_columns(
3068
pl.col("col1").cumulative_eval(
3069
pl.element().map_batches(lambda x: 0, pl.Int64, returns_scalar=True)
3070
)
3071
)
3072
.to_dict(as_series=False)
3073
) == {"col1": [0, 0, 0]}
3074
3075
3076
def test_ceil() -> None:
3077
df = pl.DataFrame({"a": [1.8, 1.2, 3.0]})
3078
result = df.select(pl.col("a").ceil())
3079
assert_frame_equal(result, pl.DataFrame({"a": [2.0, 2.0, 3.0]}))
3080
3081
df = pl.DataFrame({"a": [1, 2, 3]})
3082
result = df.select(pl.col("a").ceil())
3083
assert_frame_equal(df, result)
3084
3085
3086
def test_floor() -> None:
3087
df = pl.DataFrame({"a": [1.8, 1.2, 3.0]})
3088
result = df.select(pl.col("a").floor())
3089
assert_frame_equal(result, pl.DataFrame({"a": [1.0, 1.0, 3.0]}))
3090
3091
df = pl.DataFrame({"a": [1, 2, 3]})
3092
result = df.select(pl.col("a").floor())
3093
assert_frame_equal(df, result)
3094
3095
3096
def test_floor_divide() -> None:
3097
x = 10.4
3098
step = 0.5
3099
df = pl.DataFrame({"x": [x]})
3100
assert df.with_columns(pl.col("x") // step)[0, 0] == x // step
3101
3102
3103
def test_round() -> None:
3104
df = pl.DataFrame({"a": [1.8, 1.2, 3.0]})
3105
col_a_rounded = df.select(pl.col("a").round(decimals=0))["a"]
3106
assert_series_equal(col_a_rounded, pl.Series("a", [2, 1, 3]).cast(pl.Float64))
3107
3108
3109
def test_dot() -> None:
3110
df = pl.DataFrame({"a": [1.8, 1.2, 3.0], "b": [3.2, 1, 2]})
3111
assert df.select(pl.col("a").dot(pl.col("b"))).item() == 12.96
3112
3113
3114
def test_unstack() -> None:
3115
from string import ascii_uppercase
3116
3117
df = pl.DataFrame(
3118
{
3119
"col1": list(ascii_uppercase[0:9]),
3120
"col2": pl.int_range(0, 9, eager=True),
3121
"col3": pl.int_range(-9, 0, eager=True),
3122
}
3123
)
3124
assert df.unstack(step=3, how="vertical").to_dict(as_series=False) == {
3125
"col1_0": ["A", "B", "C"],
3126
"col1_1": ["D", "E", "F"],
3127
"col1_2": ["G", "H", "I"],
3128
"col2_0": [0, 1, 2],
3129
"col2_1": [3, 4, 5],
3130
"col2_2": [6, 7, 8],
3131
"col3_0": [-9, -8, -7],
3132
"col3_1": [-6, -5, -4],
3133
"col3_2": [-3, -2, -1],
3134
}
3135
3136
assert df.unstack(step=3, how="horizontal").to_dict(as_series=False) == {
3137
"col1_0": ["A", "D", "G"],
3138
"col1_1": ["B", "E", "H"],
3139
"col1_2": ["C", "F", "I"],
3140
"col2_0": [0, 3, 6],
3141
"col2_1": [1, 4, 7],
3142
"col2_2": [2, 5, 8],
3143
"col3_0": [-9, -6, -3],
3144
"col3_1": [-8, -5, -2],
3145
"col3_2": [-7, -4, -1],
3146
}
3147
3148
for column_subset in (("col2", "col3"), cs.integer()):
3149
assert df.unstack(
3150
step=3,
3151
how="horizontal",
3152
columns=column_subset,
3153
).to_dict(as_series=False) == {
3154
"col2_0": [0, 3, 6],
3155
"col2_1": [1, 4, 7],
3156
"col2_2": [2, 5, 8],
3157
"col3_0": [-9, -6, -3],
3158
"col3_1": [-8, -5, -2],
3159
"col3_2": [-7, -4, -1],
3160
}
3161
3162
3163
def test_window_deadlock() -> None:
3164
np.random.seed(12)
3165
3166
df = pl.DataFrame(
3167
{
3168
"nrs": [1, 2, 3, None, 5],
3169
"names": ["foo", "ham", "spam", "egg", None],
3170
"random": np.random.rand(5),
3171
"groups": ["A", "A", "B", "C", "B"],
3172
}
3173
)
3174
3175
_df = df.select(
3176
pl.col("*"), # select all
3177
pl.col("random").sum().over("groups").alias("sum[random]/groups"),
3178
pl.col("random").implode().over("names").alias("random/name"),
3179
)
3180
3181
3182
def test_sum_empty_column_names() -> None:
3183
df = pl.DataFrame({"x": [], "y": []}, schema={"x": pl.Boolean, "y": pl.Boolean})
3184
expected = pl.DataFrame(
3185
{"x": [0], "y": [0]},
3186
schema={"x": pl.get_index_type(), "y": pl.get_index_type()},
3187
)
3188
assert_frame_equal(df.sum(), expected)
3189
3190
3191
def test_flags() -> None:
3192
df = pl.DataFrame({"a": [1, 2, 3], "b": [9, 5, 6]})
3193
assert df.flags == {
3194
"a": {"SORTED_ASC": False, "SORTED_DESC": False},
3195
"b": {"SORTED_ASC": False, "SORTED_DESC": False},
3196
}
3197
assert df.set_sorted("a").flags == {
3198
"a": {"SORTED_ASC": True, "SORTED_DESC": False},
3199
"b": {"SORTED_ASC": False, "SORTED_DESC": False},
3200
}
3201
3202
3203
def test_interchange() -> None:
3204
df = pl.DataFrame({"a": [1, 2], "b": [3.0, 4.0], "c": ["foo", "bar"]})
3205
dfi = df.__dataframe__()
3206
3207
# Testing some random properties to make sure conversion happened correctly
3208
assert dfi.num_rows() == 2
3209
assert dfi.get_column(0).dtype[1] == 64
3210
assert dfi.get_column_by_name("c").get_buffers()["data"][0].bufsize == 6
3211
3212
3213
def test_from_dicts_undeclared_column_dtype() -> None:
3214
data = [{"a": 1, "b": 2}]
3215
result = pl.from_dicts(data, schema=["x"])
3216
assert result.schema == {"x": pl.Null}
3217
3218
3219
def test_from_dicts_with_override() -> None:
3220
data = [
3221
{"a": "1", "b": str(2**64 - 1), "c": "1"},
3222
{"a": "1", "b": "1", "c": "-5.0"},
3223
]
3224
override = {"a": pl.Int32, "b": pl.UInt64, "c": pl.Float32}
3225
result = pl.from_dicts(data, schema_overrides=override)
3226
assert_frame_equal(
3227
result,
3228
pl.DataFrame(
3229
{
3230
"a": pl.Series([1, 1], dtype=pl.Int32),
3231
"b": pl.Series([2**64 - 1, 1], dtype=pl.UInt64),
3232
"c": pl.Series([1.0, -5.0], dtype=pl.Float32),
3233
}
3234
),
3235
)
3236
3237
3238
def test_from_records_u64_12329() -> None:
3239
s = pl.from_records([{"a": 9908227375760408577}])
3240
assert s.dtypes == [pl.Int128]
3241
assert s["a"][0] == 9908227375760408577
3242
3243
3244
def test_negative_slice_12642() -> None:
3245
df = pl.DataFrame({"x": range(5)})
3246
assert_frame_equal(df.slice(-2, 1), df.tail(2).head(1))
3247
3248
3249
def test_iter_columns() -> None:
3250
df = pl.DataFrame({"a": [1, 1, 2], "b": [4, 5, 6]})
3251
iter_columns = df.iter_columns()
3252
assert_series_equal(next(iter_columns), pl.Series("a", [1, 1, 2]))
3253
assert_series_equal(next(iter_columns), pl.Series("b", [4, 5, 6]))
3254
3255
3256
def test_get_column_index() -> None:
3257
df = pl.DataFrame({"actual": [1001], "expected": [1000]})
3258
3259
assert df.get_column_index("actual") == 0
3260
assert df.get_column_index("expected") == 1
3261
3262
with pytest.raises(ColumnNotFoundError, match="missing"):
3263
df.get_column_index("missing")
3264
3265
3266
def test_dataframe_creation_with_different_series_lengths_19795() -> None:
3267
with pytest.raises(
3268
ShapeError,
3269
match=r"could not create a new DataFrame: height of column 'b' \(1\) does not match height of column 'a' \(2\)",
3270
):
3271
pl.DataFrame({"a": [1, 2], "b": [1]})
3272
3273
3274
def test_get_column_after_drop_20119() -> None:
3275
df = pl.DataFrame({"a": ["A"], "b": ["B"], "c": ["C"]})
3276
df.drop_in_place("a")
3277
c = df.get_column("c")
3278
assert_series_equal(c, pl.Series("c", ["C"]))
3279
3280
3281
def test_select_oob_row_20775() -> None:
3282
df = pl.DataFrame({"a": [1, 2, 3]})
3283
with pytest.raises(
3284
IndexError,
3285
match="index 99 is out of bounds for DataFrame of height 3",
3286
):
3287
df[99]
3288
3289
3290
@pytest.mark.parametrize("idx", [3, 99, -4, -99])
3291
def test_select_oob_element_20775_too_large(idx: int) -> None:
3292
df = pl.DataFrame({"a": [1, 2, 3]})
3293
with pytest.raises(
3294
IndexError,
3295
match=f"index {idx} is out of bounds for sequence of length 3",
3296
):
3297
df[idx, "a"]
3298
3299
3300
def test_nan_to_null() -> None:
3301
a = np.array([np.nan, 1])
3302
3303
df1 = pl.DataFrame(a, nan_to_null=True)
3304
df2 = pl.DataFrame(
3305
(a,),
3306
nan_to_null=True,
3307
)
3308
3309
assert_frame_equal(df1, df2)
3310
3311
3312
# Below 3 tests for https://github.com/pola-rs/polars/issues/17879
3313
3314
3315
def test_with_columns_dict_direct_typeerror() -> None:
3316
data = {"a": pl.col("a") * 2}
3317
df = pl.select(a=1)
3318
with pytest.raises(
3319
TypeError, match="Cannot pass a dictionary as a single positional argument"
3320
):
3321
df.with_columns(data)
3322
3323
3324
def test_with_columns_dict_unpacking() -> None:
3325
data = {"a": pl.col("a") * 2}
3326
df = pl.select(a=1).with_columns(**data)
3327
expected = pl.DataFrame({"a": [2]})
3328
assert df.equals(expected)
3329
3330
3331
def test_with_columns_generator_alias() -> None:
3332
data = {"a": pl.col("a") * 2}
3333
df = pl.select(a=1).with_columns(expr.alias(name) for name, expr in data.items())
3334
expected = pl.DataFrame({"a": [2]})
3335
assert df.equals(expected)
3336
3337