Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/py-polars/tests/unit/dataframe/test_df.py
6939 views
1
from __future__ import annotations
2
3
import sys
4
import typing
5
from collections import OrderedDict
6
from collections.abc import Iterator, Mapping
7
from datetime import date, datetime, time, timedelta, timezone
8
from decimal import Decimal
9
from io import BytesIO
10
from operator import floordiv, truediv
11
from typing import TYPE_CHECKING, Any, Callable, cast
12
from zoneinfo import ZoneInfo
13
14
import numpy as np
15
import pyarrow as pa
16
import pytest
17
18
import polars as pl
19
import polars.selectors as cs
20
from polars._plr import PySeries
21
from polars._utils.construction import iterable_to_pydf
22
from polars.datatypes import DTYPE_TEMPORAL_UNITS
23
from polars.exceptions import (
24
ColumnNotFoundError,
25
ComputeError,
26
DuplicateError,
27
InvalidOperationError,
28
OutOfBoundsError,
29
ShapeError,
30
)
31
from polars.testing import (
32
assert_frame_equal,
33
assert_frame_not_equal,
34
assert_series_equal,
35
)
36
from tests.unit.conftest import INTEGER_DTYPES
37
38
if TYPE_CHECKING:
39
from collections.abc import Iterator, Sequence
40
41
from polars import Expr
42
from polars._typing import JoinStrategy, UniqueKeepStrategy
43
44
45
class MappingObject(Mapping[str, Any]): # noqa: D101
46
def __init__(self, **values: Any) -> None:
47
self._data = {**values}
48
49
def __getitem__(self, key: str) -> Any:
50
return self._data[key]
51
52
def __iter__(self) -> Iterator[str]:
53
yield from self._data
54
55
def __len__(self) -> int:
56
return len(self._data)
57
58
59
def test_version() -> None:
60
isinstance(pl.__version__, str)
61
62
63
def test_null_count() -> None:
64
df = pl.DataFrame({"a": [2, 1, 3], "b": ["a", "b", None]})
65
assert df.null_count().shape == (1, 2)
66
assert df.null_count().row(0) == (0, 1)
67
assert df.null_count().row(np.int64(0)) == (0, 1) # type: ignore[call-overload]
68
69
70
@pytest.mark.parametrize("input", [None, (), [], {}, pa.Table.from_arrays([])])
71
def test_init_empty(input: Any) -> None:
72
# test various flavours of empty init
73
df = pl.DataFrame(input)
74
assert df.shape == (0, 0)
75
assert df.is_empty()
76
77
78
def test_df_bool_ambiguous() -> None:
79
empty_df = pl.DataFrame()
80
with pytest.raises(TypeError, match="ambiguous"):
81
not empty_df
82
83
84
def test_special_char_colname_init() -> None:
85
from string import punctuation
86
87
cols = [(c, pl.Int8) for c in punctuation]
88
df = pl.DataFrame(schema=cols)
89
90
assert len(cols) == df.width
91
assert len(df.rows()) == 0
92
assert df.is_empty()
93
94
95
def test_comparisons() -> None:
96
df = pl.DataFrame({"a": [1, 2], "b": [3, 4]})
97
98
# Constants
99
assert_frame_equal(df == 2, pl.DataFrame({"a": [False, True], "b": [False, False]}))
100
assert_frame_equal(df != 2, pl.DataFrame({"a": [True, False], "b": [True, True]}))
101
assert_frame_equal(df < 3.0, pl.DataFrame({"a": [True, True], "b": [False, False]}))
102
assert_frame_equal(df >= 2, pl.DataFrame({"a": [False, True], "b": [True, True]}))
103
assert_frame_equal(df <= 2, pl.DataFrame({"a": [True, True], "b": [False, False]}))
104
105
with pytest.raises(ComputeError):
106
df > "2" # noqa: B015
107
108
# Series
109
s = pl.Series([3, 1])
110
assert_frame_equal(df >= s, pl.DataFrame({"a": [False, True], "b": [True, True]}))
111
112
# DataFrame
113
other = pl.DataFrame({"a": [1, 2], "b": [2, 3]})
114
assert_frame_equal(
115
df == other, pl.DataFrame({"a": [True, True], "b": [False, False]})
116
)
117
assert_frame_equal(
118
df != other, pl.DataFrame({"a": [False, False], "b": [True, True]})
119
)
120
assert_frame_equal(
121
df > other, pl.DataFrame({"a": [False, False], "b": [True, True]})
122
)
123
assert_frame_equal(
124
df < other, pl.DataFrame({"a": [False, False], "b": [False, False]})
125
)
126
assert_frame_equal(
127
df >= other, pl.DataFrame({"a": [True, True], "b": [True, True]})
128
)
129
assert_frame_equal(
130
df <= other, pl.DataFrame({"a": [True, True], "b": [False, False]})
131
)
132
133
# DataFrame columns mismatch
134
with pytest.raises(ValueError):
135
df == pl.DataFrame({"a": [1, 2], "c": [3, 4]}) # noqa: B015
136
with pytest.raises(ValueError):
137
df == pl.DataFrame({"b": [3, 4], "a": [1, 2]}) # noqa: B015
138
139
# DataFrame shape mismatch
140
with pytest.raises(ValueError):
141
df == pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) # noqa: B015
142
143
# Type mismatch
144
with pytest.raises(ComputeError):
145
df == pl.DataFrame({"a": [1, 2], "b": ["x", "y"]}) # noqa: B015
146
147
148
def test_column_selection() -> None:
149
df = pl.DataFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0], "c": ["a", "b", "c"]})
150
151
# get column by name
152
b = pl.Series("b", [1.0, 2.0, 3.0])
153
assert_series_equal(df["b"], b)
154
assert_series_equal(df.get_column("b"), b)
155
156
with pytest.raises(ColumnNotFoundError, match="x"):
157
df.get_column("x")
158
159
default_series = pl.Series("x", ["?", "?", "?"])
160
assert_series_equal(df.get_column("x", default=default_series), default_series)
161
162
assert df.get_column("x", default=None) is None
163
164
# get column by index
165
assert_series_equal(df.to_series(1), pl.Series("b", [1.0, 2.0, 3.0]))
166
assert_series_equal(df.to_series(-1), pl.Series("c", ["a", "b", "c"]))
167
168
169
def test_mixed_sequence_selection() -> None:
170
df = pl.DataFrame({"a": [1, 2], "b": [3, 4]})
171
result = df.select(["a", pl.col("b"), pl.lit("c")])
172
expected = pl.DataFrame({"a": [1, 2], "b": [3, 4], "literal": ["c", "c"]})
173
assert_frame_equal(result, expected)
174
175
176
def test_from_arrow(monkeypatch: Any) -> None:
177
tbl = pa.table(
178
{
179
"a": pa.array([1, 2], pa.timestamp("s")),
180
"b": pa.array([1, 2], pa.timestamp("ms")),
181
"c": pa.array([1, 2], pa.timestamp("us")),
182
"d": pa.array([1, 2], pa.timestamp("ns")),
183
"e": pa.array([1, 2], pa.int32()),
184
"decimal1": pa.array([1, 2], pa.decimal128(2, 1)),
185
"struct": pa.array(
186
[{"a": 1}, {"a": 2}], pa.struct([pa.field("a", pa.int32())])
187
),
188
}
189
)
190
record_batches = tbl.to_batches(max_chunksize=1)
191
expected_schema = {
192
"a": pl.Datetime("ms"),
193
"b": pl.Datetime("ms"),
194
"c": pl.Datetime("us"),
195
"d": pl.Datetime("ns"),
196
"e": pl.Int32,
197
"decimal1": pl.Decimal(2, 1),
198
"struct": pl.Struct({"a": pl.Int32()}),
199
}
200
expected_data = [
201
(
202
datetime(1970, 1, 1, 0, 0, 1),
203
datetime(1970, 1, 1, 0, 0, 0, 1000),
204
datetime(1970, 1, 1, 0, 0, 0, 1),
205
datetime(1970, 1, 1, 0, 0),
206
1,
207
Decimal("1.0"),
208
{"a": 1},
209
),
210
(
211
datetime(1970, 1, 1, 0, 0, 2),
212
datetime(1970, 1, 1, 0, 0, 0, 2000),
213
datetime(1970, 1, 1, 0, 0, 0, 2),
214
datetime(1970, 1, 1, 0, 0),
215
2,
216
Decimal("2.0"),
217
{"a": 2},
218
),
219
]
220
for arrow_data in (tbl, record_batches, (rb for rb in record_batches)):
221
df = cast("pl.DataFrame", pl.from_arrow(arrow_data))
222
assert df.schema == expected_schema
223
assert df.rows() == expected_data
224
225
# record batches (inc. empty)
226
for b, n_expected in (
227
(record_batches[0], 1),
228
(record_batches[0][:0], 0),
229
):
230
df = cast("pl.DataFrame", pl.from_arrow(b))
231
assert df.schema == expected_schema
232
assert df.rows() == expected_data[:n_expected]
233
234
empty_tbl = tbl[:0] # no rows
235
df = cast("pl.DataFrame", pl.from_arrow(empty_tbl))
236
assert df.schema == expected_schema
237
assert df.rows() == []
238
239
# try a single column dtype override
240
for t in (tbl, empty_tbl):
241
df = pl.DataFrame(t, schema_overrides={"e": pl.Int8})
242
override_schema = expected_schema.copy()
243
override_schema["e"] = pl.Int8
244
assert df.schema == override_schema
245
assert df.rows() == expected_data[: (df.height)]
246
247
# init from record batches with overrides
248
df = pl.DataFrame(
249
{
250
"id": ["a123", "b345", "c567", "d789", "e101"],
251
"points": [99, 45, 50, 85, 35],
252
}
253
)
254
tbl = df.to_arrow()
255
batches = tbl.to_batches(max_chunksize=3)
256
257
df0: pl.DataFrame = pl.from_arrow(batches) # type: ignore[assignment]
258
df1: pl.DataFrame = pl.from_arrow( # type: ignore[assignment]
259
data=batches,
260
schema=["x", "y"],
261
schema_overrides={"y": pl.Int32},
262
)
263
df2: pl.DataFrame = pl.from_arrow( # type: ignore[assignment]
264
data=batches[0],
265
schema=["x", "y"],
266
schema_overrides={"y": pl.Int32},
267
)
268
269
assert df0.rows() == df.rows()
270
assert df1.rows() == df.rows()
271
assert df2.rows() == df.rows()[:3]
272
273
assert df0.schema == {"id": pl.String, "points": pl.Int64}
274
print(df1.schema)
275
assert df1.schema == {"x": pl.String, "y": pl.Int32}
276
assert df2.schema == {"x": pl.String, "y": pl.Int32}
277
278
with pytest.raises(TypeError, match="Cannot convert str"):
279
pl.from_arrow(data="xyz")
280
281
with pytest.raises(TypeError, match="Cannot convert int"):
282
pl.from_arrow(data=(x for x in (1, 2, 3)))
283
284
285
@pytest.mark.parametrize(
286
"data",
287
[
288
pa.Table.from_pydict(
289
{
290
"struct": pa.array(
291
[{"a": 1}, {"a": 2}], pa.struct([pa.field("a", pa.int32())])
292
),
293
}
294
),
295
pa.Table.from_pydict(
296
{
297
"struct": pa.chunked_array(
298
[[{"a": 1}], [{"a": 2}]], pa.struct([pa.field("a", pa.int32())])
299
),
300
}
301
),
302
],
303
)
304
def test_from_arrow_struct_column(data: pa.Table) -> None:
305
df = cast("pl.DataFrame", pl.from_arrow(data=data))
306
expected_schema = pl.Schema({"struct": pl.Struct({"a": pl.Int32()})})
307
expected_data = [({"a": 1},), ({"a": 2},)]
308
assert df.schema == expected_schema
309
assert df.rows() == expected_data
310
311
312
def test_dataframe_membership_operator() -> None:
313
# cf. issue #4032
314
df = pl.DataFrame({"name": ["Jane", "John"], "age": [20, 30]})
315
assert "name" in df
316
assert "phone" not in df
317
assert df._ipython_key_completions_() == ["name", "age"]
318
319
320
def test_sort() -> None:
321
df = pl.DataFrame({"a": [2, 1, 3], "b": [1, 2, 3]})
322
expected = pl.DataFrame({"a": [1, 2, 3], "b": [2, 1, 3]})
323
assert_frame_equal(df.sort("a"), expected)
324
assert_frame_equal(df.sort(["a", "b"]), expected)
325
326
327
def test_sort_multi_output_exprs_01() -> None:
328
df = pl.DataFrame(
329
{
330
"dts": [date(2077, 10, 3), date(2077, 10, 2), date(2077, 10, 2)],
331
"strs": ["abc", "def", "ghi"],
332
"vals": [10.5, 20.3, 15.7],
333
}
334
)
335
336
expected = pl.DataFrame(
337
{
338
"dts": [date(2077, 10, 2), date(2077, 10, 2), date(2077, 10, 3)],
339
"strs": ["ghi", "def", "abc"],
340
"vals": [15.7, 20.3, 10.5],
341
}
342
)
343
assert_frame_equal(expected, df.sort(pl.col("^(d|v).*$")))
344
assert_frame_equal(expected, df.sort(cs.temporal() | cs.numeric()))
345
assert_frame_equal(expected, df.sort(cs.temporal(), cs.numeric(), cs.binary()))
346
347
expected = pl.DataFrame(
348
{
349
"dts": [date(2077, 10, 3), date(2077, 10, 2), date(2077, 10, 2)],
350
"strs": ["abc", "def", "ghi"],
351
"vals": [10.5, 20.3, 15.7],
352
}
353
)
354
assert_frame_equal(
355
expected,
356
df.sort(pl.col("^(d|v).*$"), descending=[True]),
357
)
358
assert_frame_equal(
359
expected,
360
df.sort(cs.temporal() | cs.numeric(), descending=[True]),
361
)
362
assert_frame_equal(
363
expected,
364
df.sort(cs.temporal(), cs.numeric(), descending=[True, True]),
365
)
366
367
with pytest.raises(
368
ValueError,
369
match=r"the length of `descending` \(2\) does not match the length of `by` \(1\)",
370
):
371
df.sort(by=[cs.temporal()], descending=[True, False])
372
373
with pytest.raises(
374
ValueError,
375
match=r"the length of `nulls_last` \(3\) does not match the length of `by` \(2\)",
376
):
377
df.sort("dts", "strs", nulls_last=[True, False, True])
378
379
# No columns selected - return original input.
380
assert_frame_equal(df, df.sort(pl.col("^xxx$")))
381
382
383
@pytest.mark.parametrize(
384
("by_explicit", "desc_explicit", "by_multi", "desc_multi"),
385
[
386
(
387
["w", "x", "y", "z"],
388
[False, False, True, True],
389
[cs.integer(), cs.string()],
390
[False, True],
391
),
392
(
393
["w", "y", "z"],
394
[True, True, False],
395
[pl.col("^(w|y)$"), pl.col("^z.*$")],
396
[True, False],
397
),
398
(
399
["z", "w", "x"],
400
[True, False, False],
401
[pl.col("z"), cs.numeric()],
402
[True, False],
403
),
404
],
405
)
406
def test_sort_multi_output_exprs_02(
407
by_explicit: list[str],
408
desc_explicit: list[bool],
409
by_multi: list[Expr],
410
desc_multi: list[bool],
411
) -> None:
412
df = pl.DataFrame(
413
{
414
"w": [100, 100, 100, 100, 200, 200, 200, 200],
415
"x": [888, 888, 444, 444, 888, 888, 444, 888],
416
"y": ["b", "b", "a", "a", "b", "b", "a", "a"],
417
"z": ["x", "y", "x", "y", "x", "y", "x", "y"],
418
}
419
)
420
res1 = df.sort(*by_explicit, descending=desc_explicit)
421
res2 = df.sort(*by_multi, descending=desc_multi)
422
assert_frame_equal(res1, res2)
423
424
425
def test_sort_maintain_order() -> None:
426
l1 = (
427
pl.LazyFrame({"A": [1] * 4, "B": ["A", "B", "C", "D"]})
428
.sort("A", maintain_order=True)
429
.slice(0, 3)
430
.collect()["B"]
431
.to_list()
432
)
433
l2 = (
434
pl.LazyFrame({"A": [1] * 4, "B": ["A", "B", "C", "D"]})
435
.sort("A")
436
.collect()
437
.slice(0, 3)["B"]
438
.to_list()
439
)
440
assert l1 == l2 == ["A", "B", "C"]
441
442
443
@pytest.mark.parametrize("nulls_last", [False, True], ids=["nulls_first", "nulls_last"])
444
def test_sort_maintain_order_descending_repeated_nulls(nulls_last: bool) -> None:
445
got = (
446
pl.LazyFrame({"A": [None, -1, 1, 1, None], "B": [1, 2, 3, 4, 5]})
447
.sort("A", descending=True, maintain_order=True, nulls_last=nulls_last)
448
.collect()
449
)
450
if nulls_last:
451
expect = pl.DataFrame({"A": [1, 1, -1, None, None], "B": [3, 4, 2, 1, 5]})
452
else:
453
expect = pl.DataFrame({"A": [None, None, 1, 1, -1], "B": [1, 5, 3, 4, 2]})
454
assert_frame_equal(got, expect)
455
456
457
def test_replace() -> None:
458
df = pl.DataFrame({"a": [2, 1, 3], "b": [1, 2, 3]})
459
s = pl.Series("c", [True, False, True])
460
df._replace("a", s)
461
assert_frame_equal(df, pl.DataFrame({"a": [True, False, True], "b": [1, 2, 3]}))
462
463
464
def test_assignment() -> None:
465
df = pl.DataFrame({"foo": [1, 2, 3], "bar": [2, 3, 4]})
466
df = df.with_columns(pl.col("foo").alias("foo"))
467
# make sure that assignment does not change column order
468
assert df.columns == ["foo", "bar"]
469
df = df.with_columns(
470
pl.when(pl.col("foo") > 1).then(9).otherwise(pl.col("foo")).alias("foo")
471
)
472
assert df["foo"].to_list() == [1, 9, 9]
473
474
475
def test_insert_column() -> None:
476
# insert series
477
df = (
478
pl.DataFrame({"z": [3, 4, 5]})
479
.insert_column(0, pl.Series("x", [1, 2, 3]))
480
.insert_column(-1, pl.Series("y", [2, 3, 4]))
481
)
482
expected_df = pl.DataFrame({"x": [1, 2, 3], "y": [2, 3, 4], "z": [3, 4, 5]})
483
assert_frame_equal(expected_df, df)
484
485
# insert expressions
486
df = pl.DataFrame(
487
{
488
"id": ["xx", "yy", "zz"],
489
"v1": [5, 4, 6],
490
"v2": [7, 3, 3],
491
}
492
)
493
df.insert_column(3, (pl.col("v1") * pl.col("v2")).alias("v3"))
494
df.insert_column(1, (pl.col("v2") - pl.col("v1")).alias("v0"))
495
496
expected = pl.DataFrame(
497
{
498
"id": ["xx", "yy", "zz"],
499
"v0": [2, -1, -3],
500
"v1": [5, 4, 6],
501
"v2": [7, 3, 3],
502
"v3": [35, 12, 18],
503
}
504
)
505
assert_frame_equal(df, expected)
506
507
# check that we raise suitable index errors
508
for idx, column in (
509
(10, pl.col("v1").sqrt().alias("v1_sqrt")),
510
(-10, pl.Series("foo", [1, 2, 3])),
511
):
512
with pytest.raises(
513
IndexError,
514
match=rf"column index {idx} is out of range \(frame has 5 columns\)",
515
):
516
df.insert_column(idx, column)
517
518
519
def test_replace_column() -> None:
520
df = (
521
pl.DataFrame({"x": [1, 2, 3], "y": [2, 3, 4], "z": [3, 4, 5]})
522
.replace_column(0, pl.Series("a", [4, 5, 6]))
523
.replace_column(-2, pl.Series("b", [5, 6, 7]))
524
.replace_column(-1, pl.Series("c", [6, 7, 8]))
525
)
526
expected_df = pl.DataFrame({"a": [4, 5, 6], "b": [5, 6, 7], "c": [6, 7, 8]})
527
assert_frame_equal(expected_df, df)
528
529
530
def test_to_series() -> None:
531
df = pl.DataFrame({"x": [1, 2, 3], "y": [2, 3, 4], "z": [3, 4, 5]})
532
533
assert_series_equal(df.to_series(), df["x"])
534
assert_series_equal(df.to_series(0), df["x"])
535
assert_series_equal(df.to_series(-3), df["x"])
536
537
assert_series_equal(df.to_series(1), df["y"])
538
assert_series_equal(df.to_series(-2), df["y"])
539
540
assert_series_equal(df.to_series(2), df["z"])
541
assert_series_equal(df.to_series(-1), df["z"])
542
543
544
def test_to_series_bad_inputs() -> None:
545
df = pl.DataFrame({"x": [1, 2, 3], "y": [2, 3, 4], "z": [3, 4, 5]})
546
547
with pytest.raises(IndexError, match="index 5 is out of bounds"):
548
df.to_series(5)
549
550
with pytest.raises(IndexError, match="index -100 is out of bounds"):
551
df.to_series(-100)
552
553
with pytest.raises(
554
TypeError, match="'str' object cannot be interpreted as an integer"
555
):
556
df.to_series("x") # type: ignore[arg-type]
557
558
559
def test_gather_every() -> None:
560
df = pl.DataFrame({"a": [1, 2, 3, 4], "b": ["w", "x", "y", "z"]})
561
expected_df = pl.DataFrame({"a": [1, 3], "b": ["w", "y"]})
562
assert_frame_equal(expected_df, df.gather_every(2))
563
564
expected_df = pl.DataFrame({"a": [2, 4], "b": ["x", "z"]})
565
assert_frame_equal(expected_df, df.gather_every(2, offset=1))
566
567
568
def test_gather_every_agg() -> None:
569
df = pl.DataFrame(
570
{
571
"g": [1, 1, 1, 2, 2, 2],
572
"a": ["a", "b", "c", "d", "e", "f"],
573
}
574
)
575
out = df.group_by(pl.col("g")).agg(pl.col("a").gather_every(2)).sort("g")
576
expected = pl.DataFrame(
577
{
578
"g": [1, 2],
579
"a": [["a", "c"], ["d", "f"]],
580
}
581
)
582
assert_frame_equal(out, expected)
583
584
585
def test_take_misc(fruits_cars: pl.DataFrame) -> None:
586
df = fruits_cars
587
588
# Out of bounds error.
589
with pytest.raises(OutOfBoundsError):
590
df.sort("fruits").select(
591
pl.col("B").reverse().gather([1, 2]).implode().over("fruits"),
592
"fruits",
593
)
594
595
# Null indices.
596
assert_frame_equal(
597
df.select(pl.col("fruits").gather(pl.Series([0, None]))),
598
pl.DataFrame({"fruits": ["banana", None]}),
599
)
600
601
for index in [[0, 1], pl.Series([0, 1]), np.array([0, 1])]:
602
out = df.sort("fruits").select(
603
pl.col("B")
604
.reverse()
605
.gather(index) # type: ignore[arg-type]
606
.over("fruits", mapping_strategy="join"),
607
"fruits",
608
)
609
610
assert out[0, "B"].to_list() == [2, 3]
611
assert out[4, "B"].to_list() == [1, 4]
612
613
out = df.sort("fruits").select(
614
pl.col("B").reverse().get(pl.lit(1)).over("fruits"),
615
"fruits",
616
)
617
assert out[0, "B"] == 3
618
assert out[4, "B"] == 4
619
620
621
def test_pipe() -> None:
622
df = pl.DataFrame({"foo": [1, 2, 3], "bar": [6, None, 8]})
623
624
def _multiply(data: pl.DataFrame, mul: int) -> pl.DataFrame:
625
return data * mul
626
627
result = df.pipe(_multiply, mul=3)
628
629
assert_frame_equal(result, df * 3)
630
631
632
def test_explode() -> None:
633
df = pl.DataFrame({"letters": ["c", "a"], "nrs": [[1, 2], [1, 3]]})
634
out = df.explode("nrs")
635
assert out["letters"].to_list() == ["c", "c", "a", "a"]
636
assert out["nrs"].to_list() == [1, 2, 1, 3]
637
638
639
@pytest.mark.parametrize(
640
("stack", "exp_shape", "exp_columns"),
641
[
642
([pl.Series("stacked", [-1, -1, -1])], (3, 3), ["a", "b", "stacked"]),
643
(
644
[pl.Series("stacked2", [-1, -1, -1]), pl.Series("stacked3", [-1, -1, -1])],
645
(3, 4),
646
["a", "b", "stacked2", "stacked3"],
647
),
648
],
649
)
650
@pytest.mark.parametrize("in_place", [True, False])
651
def test_hstack_list_of_series(
652
stack: list[pl.Series],
653
exp_shape: tuple[int, int],
654
exp_columns: list[str],
655
in_place: bool,
656
) -> None:
657
df = pl.DataFrame({"a": [2, 1, 3], "b": ["a", "b", "c"]})
658
if in_place:
659
df.hstack(stack, in_place=True)
660
assert df.shape == exp_shape
661
assert df.columns == exp_columns
662
else:
663
df_out = df.hstack(stack, in_place=False)
664
assert df_out.shape == exp_shape
665
assert df_out.columns == exp_columns
666
667
668
@pytest.mark.parametrize("in_place", [True, False])
669
def test_hstack_dataframe(in_place: bool) -> None:
670
df = pl.DataFrame({"a": [2, 1, 3], "b": ["a", "b", "c"]})
671
df2 = pl.DataFrame({"c": [2, 1, 3], "d": ["a", "b", "c"]})
672
expected = pl.DataFrame(
673
{"a": [2, 1, 3], "b": ["a", "b", "c"], "c": [2, 1, 3], "d": ["a", "b", "c"]}
674
)
675
if in_place:
676
df.hstack(df2, in_place=True)
677
assert_frame_equal(df, expected)
678
else:
679
df_out = df.hstack(df2, in_place=False)
680
assert_frame_equal(df_out, expected)
681
682
683
@pytest.mark.may_fail_cloud
684
def test_file_buffer() -> None:
685
f = BytesIO()
686
f.write(b"1,2,3,4,5,6\n7,8,9,10,11,12")
687
f.seek(0)
688
df = pl.read_csv(f, has_header=False)
689
assert df.shape == (2, 6)
690
691
f = BytesIO()
692
f.write(b"1,2,3,4,5,6\n7,8,9,10,11,12")
693
f.seek(0)
694
# check if not fails on TryClone and Length impl in file.rs
695
with pytest.raises(ComputeError):
696
pl.read_parquet(f)
697
698
699
def test_shift() -> None:
700
df = pl.DataFrame({"A": ["a", "b", "c"], "B": [1, 3, 5]})
701
a = df.shift(1)
702
b = pl.DataFrame(
703
{"A": [None, "a", "b"], "B": [None, 1, 3]},
704
)
705
assert_frame_equal(a, b)
706
707
708
def test_multiple_columns_drop() -> None:
709
df = pl.DataFrame({"a": [2, 1, 3], "b": [1, 2, 3], "c": [1, 2, 3]})
710
# List input
711
out = df.drop(["a", "b"])
712
assert out.columns == ["c"]
713
# Positional input
714
out = df.drop("b", "c")
715
assert out.columns == ["a"]
716
717
718
def test_arg_where() -> None:
719
s = pl.Series([True, False, True, False])
720
assert_series_equal(
721
pl.arg_where(s, eager=True).cast(int),
722
pl.Series([0, 2]),
723
)
724
725
726
def test_to_dummies() -> None:
727
df = pl.DataFrame({"A": ["a", "b", "c"], "B": [1, 3, 5]})
728
dummies = df.to_dummies()
729
730
assert dummies["A_a"].to_list() == [1, 0, 0]
731
assert dummies["A_b"].to_list() == [0, 1, 0]
732
assert dummies["A_c"].to_list() == [0, 0, 1]
733
734
df = pl.DataFrame({"a": [1, 2, 3]})
735
res = df.to_dummies()
736
737
expected = pl.DataFrame(
738
{"a_1": [1, 0, 0], "a_2": [0, 1, 0], "a_3": [0, 0, 1]}
739
).with_columns(pl.all().cast(pl.UInt8))
740
assert_frame_equal(res, expected)
741
742
df = pl.DataFrame(
743
{
744
"i": [1, 2, 3],
745
"category": ["dog", "cat", "cat"],
746
},
747
schema={"i": pl.Int32, "category": pl.Categorical("lexical")},
748
)
749
expected = pl.DataFrame(
750
{
751
"i": [1, 2, 3],
752
"category|cat": [0, 1, 1],
753
"category|dog": [1, 0, 0],
754
},
755
schema={"i": pl.Int32, "category|cat": pl.UInt8, "category|dog": pl.UInt8},
756
)
757
for _cols in ("category", cs.string()):
758
result = df.to_dummies(columns=["category"], separator="|")
759
assert_frame_equal(result, expected)
760
761
# test sorted fast path
762
result = pl.DataFrame({"x": pl.arange(0, 3, eager=True)}).to_dummies("x")
763
expected = pl.DataFrame(
764
{"x_0": [1, 0, 0], "x_1": [0, 1, 0], "x_2": [0, 0, 1]}
765
).with_columns(pl.all().cast(pl.UInt8))
766
assert_frame_equal(result, expected)
767
768
769
def test_to_dummies_drop_first() -> None:
770
df = pl.DataFrame(
771
{
772
"foo": [0, 1, 2],
773
"bar": [3, 4, 5],
774
"baz": ["x", "y", "z"],
775
}
776
)
777
dm = df.to_dummies()
778
dd = df.to_dummies(drop_first=True)
779
780
assert dd.columns == ["foo_1", "foo_2", "bar_4", "bar_5", "baz_y", "baz_z"]
781
assert set(dm.columns) - set(dd.columns) == {"foo_0", "bar_3", "baz_x"}
782
assert_frame_equal(dm.select(dd.columns), dd)
783
assert dd.rows() == [
784
(0, 0, 0, 0, 0, 0),
785
(1, 0, 1, 0, 1, 0),
786
(0, 1, 0, 1, 0, 1),
787
]
788
789
790
def test_to_dummies_drop_nulls() -> None:
791
df = pl.DataFrame(
792
{
793
"foo": [0, 1, None],
794
"bar": [3, None, 5],
795
"baz": [None, "y", "z"],
796
}
797
)
798
799
dm = df.to_dummies(drop_nulls=True)
800
801
expected = pl.DataFrame(
802
{
803
"foo_0": [1, 0, 0],
804
"foo_1": [0, 1, 0],
805
"bar_3": [1, 0, 0],
806
"bar_5": [0, 0, 1],
807
"baz_y": [0, 1, 0],
808
"baz_z": [0, 0, 1],
809
},
810
schema={
811
"foo_0": pl.UInt8,
812
"foo_1": pl.UInt8,
813
"bar_3": pl.UInt8,
814
"bar_5": pl.UInt8,
815
"baz_y": pl.UInt8,
816
"baz_z": pl.UInt8,
817
},
818
)
819
assert_frame_equal(dm, expected)
820
821
822
def test_to_pandas(df: pl.DataFrame) -> None:
823
# pyarrow cannot deal with unsigned dictionary integer yet.
824
# pyarrow cannot convert a time64 w/ non-zero nanoseconds
825
df = df.drop(["cat", "time", "enum"])
826
df.to_arrow()
827
df.to_pandas()
828
# test shifted df
829
df.shift(2).to_pandas()
830
df = pl.DataFrame({"col": pl.Series([True, False, True])})
831
df.shift(2).to_pandas()
832
833
834
def test_from_arrow_table() -> None:
835
data = {"a": [1, 2], "b": [1, 2]}
836
tbl = pa.table(data)
837
838
df = cast("pl.DataFrame", pl.from_arrow(tbl))
839
assert_frame_equal(df, pl.DataFrame(data))
840
841
842
def test_df_stats(df: pl.DataFrame) -> None:
843
df.var()
844
df.std()
845
df.min()
846
df.max()
847
df.sum()
848
df.mean()
849
df.median()
850
df.quantile(0.4, "nearest")
851
852
853
def test_df_fold() -> None:
854
df = pl.DataFrame({"a": [2, 1, 3], "b": [1, 2, 3], "c": [1.0, 2.0, 3.0]})
855
856
assert_series_equal(
857
df.fold(lambda s1, s2: s1 + s2), pl.Series("a", [4.0, 5.0, 9.0])
858
)
859
assert_series_equal(
860
df.fold(lambda s1, s2: s1.zip_with(s1 < s2, s2)),
861
pl.Series("a", [1.0, 1.0, 3.0]),
862
)
863
864
df = pl.DataFrame({"a": ["foo", "bar", "2"], "b": [1, 2, 3], "c": [1.0, 2.0, 3.0]})
865
out = df.fold(lambda s1, s2: s1 + s2)
866
assert_series_equal(out, pl.Series("a", ["foo11.0", "bar22.0", "233.0"]))
867
868
df = pl.DataFrame({"a": [3, 2, 1], "b": [1, 2, 3], "c": [1.0, 2.0, 3.0]})
869
# just check dispatch. values are tested on rust side.
870
assert len(df.sum_horizontal()) == 3
871
assert len(df.mean_horizontal()) == 3
872
assert len(df.min_horizontal()) == 3
873
assert len(df.max_horizontal()) == 3
874
875
df_width_one = df[["a"]]
876
assert_series_equal(df_width_one.fold(lambda s1, s2: s1), df["a"])
877
878
879
@pytest.mark.may_fail_cloud # TODO: make pickleable
880
def test_fold_filter() -> None:
881
df = pl.DataFrame({"a": [1, 2, 3], "b": [0, 1, 2]})
882
883
out = df.filter(
884
pl.fold(
885
acc=pl.lit(True),
886
function=lambda a, b: a & b,
887
exprs=[pl.col(c) > 1 for c in df.columns],
888
)
889
)
890
891
assert out.shape == (1, 2)
892
assert out.rows() == [(3, 2)]
893
894
out = df.filter(
895
pl.fold(
896
acc=pl.lit(True),
897
function=lambda a, b: a | b,
898
exprs=[pl.col(c) > 1 for c in df.columns],
899
)
900
)
901
902
assert out.shape == (3, 2)
903
assert out.rows() == [(1, 0), (2, 1), (3, 2)]
904
905
906
def test_column_names() -> None:
907
tbl = pa.table(
908
{
909
"a": pa.array([1, 2, 3, 4, 5], pa.decimal128(38, 2)),
910
"b": pa.array([1, 2, 3, 4, 5], pa.int64()),
911
}
912
)
913
for a in (tbl, tbl[:0]):
914
df = cast("pl.DataFrame", pl.from_arrow(a))
915
assert df.columns == ["a", "b"]
916
917
918
def test_init_series_edge_cases() -> None:
919
# confirm that we don't modify the name of the input series in-place
920
s1 = pl.Series("X", [1, 2, 3])
921
df1 = pl.DataFrame({"A": s1}, schema_overrides={"A": pl.UInt8})
922
assert s1.name == "X"
923
assert df1["A"].name == "A"
924
925
# init same series object under different names
926
df2 = pl.DataFrame({"A": s1, "B": s1})
927
assert df2.rows(named=True) == [
928
{"A": 1, "B": 1},
929
{"A": 2, "B": 2},
930
{"A": 3, "B": 3},
931
]
932
933
# empty series names should not be overwritten
934
s2 = pl.Series([1, 2, 3])
935
s3 = pl.Series([2, 3, 4])
936
df3 = pl.DataFrame([s2, s3])
937
assert s2.name == s3.name == ""
938
assert df3.columns == ["column_0", "column_1"]
939
940
941
def test_head_group_by() -> None:
942
commodity_prices = {
943
"commodity": [
944
"Wheat",
945
"Wheat",
946
"Wheat",
947
"Wheat",
948
"Corn",
949
"Corn",
950
"Corn",
951
"Corn",
952
"Corn",
953
],
954
"location": [
955
"StPaul",
956
"StPaul",
957
"StPaul",
958
"Chicago",
959
"Chicago",
960
"Chicago",
961
"Chicago",
962
"Chicago",
963
"Chicago",
964
],
965
"seller": [
966
"Bob",
967
"Charlie",
968
"Susan",
969
"Paul",
970
"Ed",
971
"Mary",
972
"Paul",
973
"Charlie",
974
"Norman",
975
],
976
"price": [1.0, 0.7, 0.8, 0.55, 2.0, 3.0, 2.4, 1.8, 2.1],
977
}
978
df = pl.DataFrame(commodity_prices)
979
980
# this query flexes the wildcard exclusion quite a bit.
981
keys = ["commodity", "location"]
982
out = (
983
df.sort(by="price", descending=True)
984
.group_by(keys, maintain_order=True)
985
.agg([pl.col("*").exclude(keys).head(2).name.keep()])
986
.explode(cs.all().exclude(keys))
987
)
988
989
assert out.shape == (5, 4)
990
assert out.rows() == [
991
("Corn", "Chicago", "Mary", 3.0),
992
("Corn", "Chicago", "Paul", 2.4),
993
("Wheat", "StPaul", "Bob", 1.0),
994
("Wheat", "StPaul", "Susan", 0.8),
995
("Wheat", "Chicago", "Paul", 0.55),
996
]
997
998
df = pl.DataFrame(
999
{"letters": ["c", "c", "a", "c", "a", "b"], "nrs": [1, 2, 3, 4, 5, 6]}
1000
)
1001
out = df.group_by("letters").tail(2).sort("letters")
1002
assert_frame_equal(
1003
out,
1004
pl.DataFrame({"letters": ["a", "a", "b", "c", "c"], "nrs": [3, 5, 6, 2, 4]}),
1005
)
1006
out = df.group_by("letters").head(2).sort("letters")
1007
assert_frame_equal(
1008
out,
1009
pl.DataFrame({"letters": ["a", "a", "b", "c", "c"], "nrs": [3, 5, 6, 1, 2]}),
1010
)
1011
1012
1013
def test_is_null_is_not_null() -> None:
1014
df = pl.DataFrame({"nrs": [1, 2, None]})
1015
assert df.select(pl.col("nrs").is_null())["nrs"].to_list() == [False, False, True]
1016
assert df.select(pl.col("nrs").is_not_null())["nrs"].to_list() == [
1017
True,
1018
True,
1019
False,
1020
]
1021
1022
1023
def test_is_nan_is_not_nan() -> None:
1024
df = pl.DataFrame({"nrs": np.array([1, 2, np.nan])})
1025
assert df.select(pl.col("nrs").is_nan())["nrs"].to_list() == [False, False, True]
1026
assert df.select(pl.col("nrs").is_not_nan())["nrs"].to_list() == [True, True, False]
1027
1028
1029
def test_is_finite_is_infinite() -> None:
1030
df = pl.DataFrame({"nrs": np.array([1, 2, np.inf])})
1031
assert df.select(pl.col("nrs").is_infinite())["nrs"].to_list() == [
1032
False,
1033
False,
1034
True,
1035
]
1036
assert df.select(pl.col("nrs").is_finite())["nrs"].to_list() == [True, True, False]
1037
1038
1039
def test_is_finite_is_infinite_null_series() -> None:
1040
df = pl.DataFrame({"a": pl.Series([None, None, None], dtype=pl.Null)})
1041
result = df.select(
1042
pl.col("a").is_finite().alias("finite"),
1043
pl.col("a").is_infinite().alias("infinite"),
1044
)
1045
expected = pl.DataFrame(
1046
{
1047
"finite": pl.Series([None, None, None], dtype=pl.Boolean),
1048
"infinite": pl.Series([None, None, None], dtype=pl.Boolean),
1049
}
1050
)
1051
assert_frame_equal(result, expected)
1052
1053
1054
def test_is_nan_null_series() -> None:
1055
df = pl.DataFrame({"a": pl.Series([None, None, None], dtype=pl.Null)})
1056
result = df.select(pl.col("a").is_nan())
1057
expected = pl.DataFrame({"a": pl.Series([None, None, None], dtype=pl.Boolean)})
1058
assert_frame_equal(result, expected)
1059
1060
1061
def test_len() -> None:
1062
df = pl.DataFrame({"nrs": [1, 2, 3]})
1063
assert cast("int", df.select(pl.col("nrs").len()).item()) == 3
1064
assert len(pl.DataFrame()) == 0
1065
1066
1067
def test_multiple_column_sort() -> None:
1068
df = pl.DataFrame({"a": ["foo", "bar", "2"], "b": [2, 2, 3], "c": [1.0, 2.0, 3.0]})
1069
out = df.sort([pl.col("b"), pl.col("c").reverse()])
1070
assert list(out["c"]) == [2.0, 1.0, 3.0]
1071
assert list(out["b"]) == [2, 2, 3]
1072
1073
# Explicitly specify numpy dtype because of different defaults on Windows
1074
df = pl.DataFrame({"a": np.arange(1, 4, dtype=np.int64), "b": ["a", "a", "b"]})
1075
1076
assert_frame_equal(
1077
df.sort("a", descending=True),
1078
pl.DataFrame({"a": [3, 2, 1], "b": ["b", "a", "a"]}),
1079
)
1080
assert_frame_equal(
1081
df.sort("b", descending=True, maintain_order=True),
1082
pl.DataFrame({"a": [3, 1, 2], "b": ["b", "a", "a"]}),
1083
)
1084
assert_frame_equal(
1085
df.sort(["b", "a"], descending=[False, True]),
1086
pl.DataFrame({"a": [2, 1, 3], "b": ["a", "a", "b"]}),
1087
)
1088
1089
1090
def test_cast_frame() -> None:
1091
df = pl.DataFrame(
1092
{
1093
"a": [1.0, 2.5, 3.0],
1094
"b": [4, 5, None],
1095
"c": [True, False, True],
1096
"d": [date(2020, 1, 2), date(2021, 3, 4), date(2022, 5, 6)],
1097
}
1098
)
1099
1100
# cast via col:dtype map
1101
assert df.cast(
1102
dtypes={"b": pl.Float32, "c": pl.String, "d": pl.Datetime("ms")},
1103
).schema == {
1104
"a": pl.Float64,
1105
"b": pl.Float32,
1106
"c": pl.String,
1107
"d": pl.Datetime("ms"),
1108
}
1109
1110
# cast via col:pytype map
1111
assert df.cast(
1112
dtypes={"b": float, "c": str, "d": datetime},
1113
).schema == {
1114
"a": pl.Float64,
1115
"b": pl.Float64,
1116
"c": pl.String,
1117
"d": pl.Datetime("us"),
1118
}
1119
1120
# cast via selector:dtype map
1121
assert df.cast(
1122
{
1123
cs.numeric(): pl.UInt8,
1124
cs.temporal(): pl.String,
1125
}
1126
).rows() == [
1127
(1, 4, True, "2020-01-02"),
1128
(2, 5, False, "2021-03-04"),
1129
(3, None, True, "2022-05-06"),
1130
]
1131
1132
# cast all fields to a single type
1133
assert df.cast(pl.String).to_dict(as_series=False) == {
1134
"a": ["1.0", "2.5", "3.0"],
1135
"b": ["4", "5", None],
1136
"c": ["true", "false", "true"],
1137
"d": ["2020-01-02", "2021-03-04", "2022-05-06"],
1138
}
1139
1140
1141
def test_duration_arithmetic() -> None:
1142
df = pl.DataFrame(
1143
{"a": [datetime(2022, 1, 1, 0, 0, 0), datetime(2022, 1, 2, 0, 0, 0)]}
1144
)
1145
d1 = pl.duration(days=3, microseconds=987000)
1146
d2 = pl.duration(days=6, milliseconds=987)
1147
1148
assert_frame_equal(
1149
df.with_columns(
1150
b=(df["a"] + d1),
1151
c=(pl.col("a") + d2),
1152
),
1153
pl.DataFrame(
1154
{
1155
"a": [
1156
datetime(2022, 1, 1, 0, 0, 0),
1157
datetime(2022, 1, 2, 0, 0, 0),
1158
],
1159
"b": [
1160
datetime(2022, 1, 4, 0, 0, 0, 987000),
1161
datetime(2022, 1, 5, 0, 0, 0, 987000),
1162
],
1163
"c": [
1164
datetime(2022, 1, 7, 0, 0, 0, 987000),
1165
datetime(2022, 1, 8, 0, 0, 0, 987000),
1166
],
1167
}
1168
),
1169
)
1170
1171
1172
def test_assign() -> None:
1173
# check if can assign in case of a single column
1174
df = pl.DataFrame({"a": [1, 2, 3]})
1175
# test if we can assign in case of single column
1176
df = df.with_columns(pl.col("a") * 2)
1177
assert list(df["a"]) == [2, 4, 6]
1178
1179
1180
def test_arg_sort_by(df: pl.DataFrame) -> None:
1181
idx_df = df.select(
1182
pl.arg_sort_by(["int_nulls", "floats"], descending=[False, True]).alias("idx")
1183
)
1184
assert (idx_df["idx"] == [1, 0, 2]).all()
1185
1186
idx_df = df.select(
1187
pl.arg_sort_by(["int_nulls", "floats"], descending=False).alias("idx")
1188
)
1189
assert (idx_df["idx"] == [1, 0, 2]).all()
1190
1191
df = pl.DataFrame({"x": [0, 0, 0, 1, 1, 2], "y": [9, 9, 8, 7, 6, 6]})
1192
for expr, expected in (
1193
(pl.arg_sort_by(["x", "y"]), [2, 0, 1, 4, 3, 5]),
1194
(pl.arg_sort_by(["x", "y"], descending=[True, True]), [5, 3, 4, 0, 1, 2]),
1195
(pl.arg_sort_by(["x", "y"], descending=[True, False]), [5, 4, 3, 2, 0, 1]),
1196
(pl.arg_sort_by(["x", "y"], descending=[False, True]), [0, 1, 2, 3, 4, 5]),
1197
):
1198
assert (df.select(expr.alias("idx"))["idx"] == expected).all()
1199
1200
1201
def test_literal_series() -> None:
1202
df = pl.DataFrame(
1203
{
1204
"a": np.array([21.7, 21.8, 21], dtype=np.float32),
1205
"b": np.array([1, 3, 2], dtype=np.int8),
1206
"c": ["reg1", "reg2", "reg3"],
1207
"d": np.array(
1208
[datetime(2022, 8, 16), datetime(2022, 8, 17), datetime(2022, 8, 18)],
1209
dtype="<M8[ns]",
1210
),
1211
},
1212
schema_overrides={"a": pl.Float64},
1213
)
1214
out = (
1215
df.lazy()
1216
.with_columns(pl.Series("e", [2, 1, 3], pl.Int32))
1217
.with_columns(pl.col("e").cast(pl.Float32))
1218
.collect()
1219
)
1220
expected_schema = {
1221
"a": pl.Float64,
1222
"b": pl.Int8,
1223
"c": pl.String,
1224
"d": pl.Datetime("ns"),
1225
"e": pl.Float32,
1226
}
1227
assert_frame_equal(
1228
pl.DataFrame(
1229
[
1230
(21.7, 1, "reg1", datetime(2022, 8, 16, 0), 2),
1231
(21.8, 3, "reg2", datetime(2022, 8, 17, 0), 1),
1232
(21.0, 2, "reg3", datetime(2022, 8, 18, 0), 3),
1233
],
1234
schema=expected_schema, # type: ignore[arg-type]
1235
orient="row",
1236
),
1237
out,
1238
abs_tol=0.00001,
1239
)
1240
1241
1242
def test_write_csv() -> None:
1243
df = pl.DataFrame(
1244
{
1245
"foo": [1, 2, 3, 4, 5],
1246
"bar": [6, 7, 8, 9, 10],
1247
"ham": ["a", "b", "c", "d", "e"],
1248
}
1249
)
1250
expected = "foo,bar,ham\n1,6,a\n2,7,b\n3,8,c\n4,9,d\n5,10,e\n"
1251
1252
# if no file argument is supplied, write_csv() will return the string
1253
s = df.write_csv()
1254
assert s == expected
1255
1256
# otherwise it will write to the file/iobuffer
1257
file = BytesIO()
1258
df.write_csv(file)
1259
file.seek(0)
1260
s = file.read().decode("utf8")
1261
assert s == expected
1262
1263
1264
def test_from_generator_or_iterable() -> None:
1265
# generator function
1266
def gen(n: int, *, strkey: bool = True) -> Iterator[Any]:
1267
for i in range(n):
1268
yield (str(i) if strkey else i), 1 * i, 2**i, 3**i
1269
1270
# iterable object
1271
class Rows:
1272
def __init__(self, n: int, *, strkey: bool = True) -> None:
1273
self._n = n
1274
self._strkey = strkey
1275
1276
def __iter__(self) -> Iterator[Any]:
1277
yield from gen(self._n, strkey=self._strkey)
1278
1279
# check init from column-oriented generator
1280
assert_frame_equal(
1281
pl.DataFrame(data=gen(4, strkey=False), orient="col"),
1282
pl.DataFrame(
1283
data=[(0, 0, 1, 1), (1, 1, 2, 3), (2, 2, 4, 9), (3, 3, 8, 27)], orient="col"
1284
),
1285
)
1286
# check init from row-oriented generators (more common)
1287
expected = pl.DataFrame(
1288
data=list(gen(4)), schema=["a", "b", "c", "d"], orient="row"
1289
)
1290
for generated_frame in (
1291
pl.DataFrame(data=gen(4), schema=["a", "b", "c", "d"]),
1292
pl.DataFrame(data=Rows(4), schema=["a", "b", "c", "d"]),
1293
pl.DataFrame(data=(x for x in Rows(4)), schema=["a", "b", "c", "d"]),
1294
):
1295
assert_frame_equal(expected, generated_frame)
1296
assert generated_frame.schema == {
1297
"a": pl.String,
1298
"b": pl.Int64,
1299
"c": pl.Int64,
1300
"d": pl.Int64,
1301
}
1302
1303
# test 'iterable_to_pydf' directly to validate 'chunk_size' behaviour
1304
cols = ["a", "b", ("c", pl.Int8), "d"]
1305
1306
expected_data = [("0", 0, 1, 1), ("1", 1, 2, 3), ("2", 2, 4, 9), ("3", 3, 8, 27)]
1307
expected_schema = [
1308
("a", pl.String),
1309
("b", pl.Int64),
1310
("c", pl.Int8),
1311
("d", pl.Int64),
1312
]
1313
1314
for params in (
1315
{"data": Rows(4)},
1316
{"data": gen(4), "chunk_size": 2},
1317
{"data": Rows(4), "chunk_size": 3},
1318
{"data": gen(4), "infer_schema_length": None},
1319
{"data": Rows(4), "infer_schema_length": 1},
1320
{"data": gen(4), "chunk_size": 2},
1321
{"data": Rows(4), "infer_schema_length": 5},
1322
{"data": gen(4), "infer_schema_length": 3, "chunk_size": 2},
1323
{"data": gen(4), "infer_schema_length": None, "chunk_size": 3},
1324
):
1325
d = iterable_to_pydf(schema=cols, **params) # type: ignore[arg-type]
1326
assert expected_data == d.row_tuples()
1327
assert expected_schema == list(zip(d.columns(), d.dtypes()))
1328
1329
# ref: issue #6489 (initial chunk_size cannot be smaller than 'infer_schema_length')
1330
df = pl.DataFrame(
1331
data=iter(([{"col": None}] * 1000) + [{"col": ["a", "b", "c"]}]),
1332
infer_schema_length=1001,
1333
)
1334
assert df.schema == {"col": pl.List(pl.String)}
1335
assert df[-2:]["col"].to_list() == [None, ["a", "b", "c"]]
1336
1337
# empty iterator
1338
assert_frame_equal(
1339
pl.DataFrame(data=gen(0), schema=["a", "b", "c", "d"]),
1340
pl.DataFrame(schema=["a", "b", "c", "d"]),
1341
)
1342
1343
1344
def test_from_rows() -> None:
1345
df = pl.from_records([[1, 2, "foo"], [2, 3, "bar"]], orient="row")
1346
assert_frame_equal(
1347
df,
1348
pl.DataFrame(
1349
{"column_0": [1, 2], "column_1": [2, 3], "column_2": ["foo", "bar"]}
1350
),
1351
)
1352
df = pl.from_records(
1353
[[1, datetime.fromtimestamp(100)], [2, datetime.fromtimestamp(2398754908)]],
1354
schema_overrides={"column_0": pl.UInt32},
1355
orient="row",
1356
)
1357
assert df.dtypes == [pl.UInt32, pl.Datetime]
1358
1359
# auto-inference with same num rows/cols
1360
data = [(1, 2, "foo"), (2, 3, "bar"), (3, 4, "baz")]
1361
df = pl.from_records(data, orient="row")
1362
assert data == df.rows()
1363
1364
1365
@pytest.mark.parametrize(
1366
"records",
1367
[
1368
[
1369
{"id": 1, "value": 100, "_meta": "a"},
1370
{"id": 2, "value": 101, "_meta": "b"},
1371
],
1372
[
1373
None,
1374
{"id": 1, "value": 100, "_meta": "a"},
1375
{"id": 2, "value": 101, "_meta": "b"},
1376
],
1377
[
1378
{"id": 1, "value": 100, "_meta": "a"},
1379
{"id": 2, "value": 101, "_meta": "b"},
1380
None,
1381
],
1382
[
1383
MappingObject(id=1, value=100, _meta="a"),
1384
MappingObject(id=2, value=101, _meta="b"),
1385
],
1386
[
1387
None,
1388
MappingObject(id=1, value=100, _meta="a"),
1389
MappingObject(id=2, value=101, _meta="b"),
1390
],
1391
[
1392
MappingObject(id=1, value=100, _meta="a"),
1393
MappingObject(id=2, value=101, _meta="b"),
1394
None,
1395
],
1396
],
1397
)
1398
def test_from_rows_of_dicts(records: list[dict[str, Any]]) -> None:
1399
for df_init in (pl.from_dicts, pl.DataFrame):
1400
df1 = df_init(records).remove(pl.col("id").is_null())
1401
assert df1.rows() == [(1, 100, "a"), (2, 101, "b")]
1402
1403
overrides = {
1404
"id": pl.Int16,
1405
"value": pl.Int32,
1406
}
1407
df2 = df_init(records, schema_overrides=overrides).remove(
1408
pl.col("id").is_null()
1409
)
1410
assert df2.rows() == [(1, 100, "a"), (2, 101, "b")]
1411
assert df2.schema == {"id": pl.Int16, "value": pl.Int32, "_meta": pl.String}
1412
1413
df3 = df_init(records, schema=overrides).remove(pl.col("id").is_null())
1414
assert df3.rows() == [(1, 100), (2, 101)]
1415
assert df3.schema == {"id": pl.Int16, "value": pl.Int32}
1416
1417
# explicitly check "anyvalue" conversion for dict/mapping dtypes
1418
py_s = PySeries.new_from_any_values("s", records, True)
1419
assert py_s.dtype() == pl.Struct(
1420
{
1421
"id": pl.Int64,
1422
"value": pl.Int64,
1423
"_meta": pl.String,
1424
}
1425
)
1426
1427
1428
def test_from_records_with_schema_overrides_12032() -> None:
1429
# the 'id' fields contains an int value that exceeds Int64 and doesn't have an exact
1430
# Float64 representation; confirm that the override is applied *during* inference,
1431
# not as a post-inference cast, so we maintain the accuracy of the original value.
1432
rec = [
1433
{"id": 9187643043065364490, "x": 333, "y": None},
1434
{"id": 9223671840084328467, "x": 666.5, "y": 1698177261953686},
1435
{"id": 9187643043065364505, "x": 999, "y": 9223372036854775807},
1436
]
1437
df = pl.from_records(rec, schema_overrides={"x": pl.Float32, "id": pl.UInt64})
1438
assert df.schema == OrderedDict(
1439
[
1440
("id", pl.UInt64),
1441
("x", pl.Float32),
1442
("y", pl.Int64),
1443
]
1444
)
1445
assert rec == df.rows(named=True)
1446
1447
1448
def test_from_large_uint64_misc() -> None:
1449
uint_data = [[9187643043065364490, 9223671840084328467, 9187643043065364505]]
1450
1451
df = pl.DataFrame(uint_data, orient="col", schema_overrides={"column_0": pl.UInt64})
1452
assert df["column_0"].dtype == pl.UInt64
1453
assert df["column_0"].to_list() == uint_data[0]
1454
1455
for overrides in ({}, {"column_1": pl.UInt64}):
1456
df = pl.DataFrame(
1457
uint_data,
1458
orient="row",
1459
schema_overrides=overrides,
1460
)
1461
assert df.schema == OrderedDict(
1462
[
1463
("column_0", pl.Int64),
1464
("column_1", pl.Int128 if overrides == {} else pl.UInt64),
1465
("column_2", pl.Int64),
1466
]
1467
)
1468
assert df.row(0) == tuple(uint_data[0])
1469
1470
1471
def test_repeat_by_unequal_lengths_panic() -> None:
1472
df = pl.DataFrame(
1473
{
1474
"a": ["x", "y", "z"],
1475
}
1476
)
1477
with pytest.raises(ShapeError):
1478
df.select(pl.col("a").repeat_by(pl.Series([2, 2])))
1479
1480
1481
@pytest.mark.parametrize(
1482
("value", "values_expect"),
1483
[
1484
(1.2, [[1.2], [1.2, 1.2], [1.2, 1.2, 1.2]]),
1485
(True, [[True], [True, True], [True, True, True]]),
1486
("x", [["x"], ["x", "x"], ["x", "x", "x"]]),
1487
(b"a", [[b"a"], [b"a", b"a"], [b"a", b"a", b"a"]]),
1488
],
1489
)
1490
def test_repeat_by_broadcast_left(
1491
value: float | bool | str, values_expect: list[list[float | bool | str]]
1492
) -> None:
1493
df = pl.DataFrame(
1494
{
1495
"n": [1, 2, 3],
1496
}
1497
)
1498
expected = pl.DataFrame({"values": values_expect})
1499
result = df.select(pl.lit(value).repeat_by(pl.col("n")).alias("values"))
1500
assert_frame_equal(result, expected)
1501
1502
1503
@pytest.mark.parametrize(
1504
("a", "a_expected"),
1505
[
1506
([1.2, 2.2, 3.3], [[1.2, 1.2, 1.2], [2.2, 2.2, 2.2], [3.3, 3.3, 3.3]]),
1507
([True, False], [[True, True, True], [False, False, False]]),
1508
(["x", "y", "z"], [["x", "x", "x"], ["y", "y", "y"], ["z", "z", "z"]]),
1509
(
1510
[b"a", b"b", b"c"],
1511
[[b"a", b"a", b"a"], [b"b", b"b", b"b"], [b"c", b"c", b"c"]],
1512
),
1513
],
1514
)
1515
def test_repeat_by_broadcast_right(
1516
a: list[float | bool | str], a_expected: list[list[float | bool | str]]
1517
) -> None:
1518
df = pl.DataFrame(
1519
{
1520
"a": a,
1521
}
1522
)
1523
expected = pl.DataFrame({"a": a_expected})
1524
result = df.select(pl.col("a").repeat_by(3))
1525
assert_frame_equal(result, expected)
1526
result = df.select(pl.col("a").repeat_by(pl.lit(3)))
1527
assert_frame_equal(result, expected)
1528
1529
1530
@pytest.mark.parametrize(
1531
("a", "a_expected"),
1532
[
1533
(["foo", "bar"], [["foo", "foo"], ["bar", "bar", "bar"]]),
1534
([1, 2], [[1, 1], [2, 2, 2]]),
1535
([True, False], [[True, True], [False, False, False]]),
1536
(
1537
[b"a", b"b"],
1538
[[b"a", b"a"], [b"b", b"b", b"b"]],
1539
),
1540
],
1541
)
1542
def test_repeat_by(
1543
a: list[float | bool | str], a_expected: list[list[float | bool | str]]
1544
) -> None:
1545
df = pl.DataFrame({"a": a, "n": [2, 3]})
1546
expected = pl.DataFrame({"a": a_expected})
1547
result = df.select(pl.col("a").repeat_by("n"))
1548
assert_frame_equal(result, expected)
1549
1550
1551
def test_join_dates() -> None:
1552
dts_in = pl.datetime_range(
1553
datetime(2021, 6, 24),
1554
datetime(2021, 6, 24, 10, 0, 0),
1555
interval=timedelta(hours=1),
1556
closed="left",
1557
eager=True,
1558
)
1559
dts = (
1560
dts_in.cast(int)
1561
.map_elements(lambda x: x + np.random.randint(1_000 * 60, 60_000 * 60))
1562
.cast(pl.Datetime)
1563
)
1564
1565
# some df with sensor id, (randomish) datetime and some value
1566
df = pl.DataFrame(
1567
{
1568
"sensor": ["a"] * 5 + ["b"] * 5,
1569
"datetime": dts,
1570
"value": [2, 3, 4, 1, 2, 3, 5, 1, 2, 3],
1571
}
1572
)
1573
out = df.join(df, on="datetime")
1574
assert out.height == df.height
1575
1576
1577
def test_asof_cross_join() -> None:
1578
left = pl.DataFrame({"a": [-10, 5, 10], "left_val": ["a", "b", "c"]}).with_columns(
1579
pl.col("a").set_sorted()
1580
)
1581
right = pl.DataFrame(
1582
{"a": [1, 2, 3, 6, 7], "right_val": [1, 2, 3, 6, 7]}
1583
).with_columns(pl.col("a").set_sorted())
1584
1585
# only test dispatch of asof join
1586
out = left.join_asof(right, on="a")
1587
assert out.shape == (3, 3)
1588
1589
left.lazy().join_asof(right.lazy(), on="a").collect()
1590
assert out.shape == (3, 3)
1591
1592
# only test dispatch of cross join
1593
out = left.join(right, how="cross")
1594
assert out.shape == (15, 4)
1595
1596
left.lazy().join(right.lazy(), how="cross").collect()
1597
assert out.shape == (15, 4)
1598
1599
1600
def test_join_bad_input_type() -> None:
1601
left = pl.DataFrame({"a": [1, 2, 3]})
1602
right = pl.DataFrame({"a": [1, 2, 3]})
1603
1604
with pytest.raises(
1605
TypeError,
1606
match="expected `other` .*to be a 'DataFrame'.* not 'LazyFrame'",
1607
):
1608
left.join(right.lazy(), on="a") # type: ignore[arg-type]
1609
1610
with pytest.raises(
1611
TypeError,
1612
match="expected `other` .*to be a 'DataFrame'.* not 'Series'",
1613
):
1614
left.join(pl.Series([1, 2, 3]), on="a") # type: ignore[arg-type]
1615
1616
class DummyDataFrameSubclass(pl.DataFrame):
1617
pass
1618
1619
right = DummyDataFrameSubclass(right)
1620
1621
left.join(right, on="a")
1622
1623
1624
def test_join_where() -> None:
1625
east = pl.DataFrame(
1626
{
1627
"id": [100, 101, 102],
1628
"dur": [120, 140, 160],
1629
"rev": [12, 14, 16],
1630
"cores": [2, 8, 4],
1631
}
1632
)
1633
west = pl.DataFrame(
1634
{
1635
"t_id": [404, 498, 676, 742],
1636
"time": [90, 130, 150, 170],
1637
"cost": [9, 13, 15, 16],
1638
"cores": [4, 2, 1, 4],
1639
}
1640
)
1641
out = east.join_where(
1642
west,
1643
pl.col("dur") < pl.col("time"),
1644
pl.col("rev") < pl.col("cost"),
1645
)
1646
1647
expected = pl.DataFrame(
1648
{
1649
"id": [100, 100, 100, 101, 101],
1650
"dur": [120, 120, 120, 140, 140],
1651
"rev": [12, 12, 12, 14, 14],
1652
"cores": [2, 2, 2, 8, 8],
1653
"t_id": [498, 676, 742, 676, 742],
1654
"time": [130, 150, 170, 150, 170],
1655
"cost": [13, 15, 16, 15, 16],
1656
"cores_right": [2, 1, 4, 1, 4],
1657
}
1658
)
1659
1660
assert_frame_equal(out, expected)
1661
1662
1663
def test_join_where_bad_input_type() -> None:
1664
east = pl.DataFrame(
1665
{
1666
"id": [100, 101, 102],
1667
"dur": [120, 140, 160],
1668
"rev": [12, 14, 16],
1669
"cores": [2, 8, 4],
1670
}
1671
)
1672
west = pl.DataFrame(
1673
{
1674
"t_id": [404, 498, 676, 742],
1675
"time": [90, 130, 150, 170],
1676
"cost": [9, 13, 15, 16],
1677
"cores": [4, 2, 1, 4],
1678
}
1679
)
1680
with pytest.raises(
1681
TypeError,
1682
match="expected `other` .*to be a 'DataFrame'.* not 'LazyFrame'",
1683
):
1684
east.join_where(
1685
west.lazy(), # type: ignore[arg-type]
1686
pl.col("dur") < pl.col("time"),
1687
pl.col("rev") < pl.col("cost"),
1688
)
1689
1690
with pytest.raises(
1691
TypeError,
1692
match="expected `other` .*to be a 'DataFrame'.* not 'Series'",
1693
):
1694
east.join_where(
1695
pl.Series(west), # type: ignore[arg-type]
1696
pl.col("dur") < pl.col("time"),
1697
pl.col("rev") < pl.col("cost"),
1698
)
1699
1700
class DummyDataFrameSubclass(pl.DataFrame):
1701
pass
1702
1703
west = DummyDataFrameSubclass(west)
1704
1705
east.join_where(
1706
west,
1707
pl.col("dur") < pl.col("time"),
1708
pl.col("rev") < pl.col("cost"),
1709
)
1710
1711
1712
def test_str_concat() -> None:
1713
df = pl.DataFrame(
1714
{
1715
"nrs": [1, 2, 3, 4],
1716
"name": ["ham", "spam", "foo", None],
1717
}
1718
)
1719
out = df.with_columns((pl.lit("Dr. ") + pl.col("name")).alias("graduated_name"))
1720
assert out["graduated_name"][0] == "Dr. ham"
1721
assert out["graduated_name"][1] == "Dr. spam"
1722
1723
1724
def test_dot_product() -> None:
1725
df = pl.DataFrame({"a": [1, 2, 3, 4], "b": [2, 2, 2, 2]})
1726
1727
assert df["a"].dot(df["b"]) == 20
1728
assert typing.cast("int", df.select([pl.col("a").dot("b")])[0, "a"]) == 20
1729
1730
result = pl.Series([1, 2, 3]) @ pl.Series([4, 5, 6])
1731
assert isinstance(result, int)
1732
assert result == 32
1733
1734
result = pl.Series([1, 2, 3]) @ pl.Series([4.0, 5.0, 6.0])
1735
assert isinstance(result, float)
1736
assert result == 32.0
1737
1738
result = pl.Series([1.0, 2.0, 3.0]) @ pl.Series([4.0, 5.0, 6.0])
1739
assert isinstance(result, float)
1740
assert result == 32.0
1741
1742
with pytest.raises(
1743
InvalidOperationError, match="`dot` operation not supported for dtype `bool`"
1744
):
1745
pl.Series([True, False, False, True]) @ pl.Series([4, 5, 6, 7])
1746
1747
with pytest.raises(
1748
InvalidOperationError, match="`dot` operation not supported for dtype `str`"
1749
):
1750
pl.Series([1, 2, 3, 4]) @ pl.Series(["True", "False", "False", "True"])
1751
1752
1753
def test_hash_rows() -> None:
1754
df = pl.DataFrame({"a": [1, 2, 3, 4], "b": [2, 2, 2, 2]})
1755
assert df.hash_rows().dtype == pl.UInt64
1756
assert df["a"].hash().dtype == pl.UInt64
1757
assert df.select([pl.col("a").hash().alias("foo")])["foo"].dtype == pl.UInt64
1758
1759
1760
def test_reproducible_hash_with_seeds() -> None:
1761
"""
1762
Test the reproducibility of DataFrame.hash_rows, Series.hash, and Expr.hash.
1763
1764
cf. issue #3966, hashes must always be reproducible across sessions when using
1765
the same seeds.
1766
"""
1767
df = pl.DataFrame({"s": [1234, None, 5678]})
1768
seeds = (11, 22, 33, 44)
1769
expected = pl.Series(
1770
"s",
1771
[10832467230526607564, 3044502640115867787, 17228373233104406792],
1772
dtype=pl.UInt64,
1773
)
1774
result = df.hash_rows(*seeds)
1775
assert_series_equal(expected, result, check_names=False, check_exact=True)
1776
result = df["s"].hash(*seeds)
1777
assert_series_equal(expected, result, check_names=False, check_exact=True)
1778
result = df.select([pl.col("s").hash(*seeds)])["s"]
1779
assert_series_equal(expected, result, check_names=False, check_exact=True)
1780
1781
1782
@pytest.mark.slow
1783
@pytest.mark.parametrize(
1784
"e",
1785
[
1786
pl.int_range(1_000_000),
1787
# Test code path for null_count > 0
1788
pl.when(pl.int_range(1_000_000) != 0).then(pl.int_range(1_000_000)),
1789
],
1790
)
1791
def test_hash_collision_multiple_columns_equal_values_15390(e: pl.Expr) -> None:
1792
df = pl.select(e.alias("a"))
1793
1794
for n_columns in (1, 2, 3, 4):
1795
s = df.select(pl.col("a").alias(f"x{i}") for i in range(n_columns)).hash_rows()
1796
1797
vc = s.sort().value_counts(sort=True)
1798
max_bucket_size = vc["count"][0]
1799
1800
assert max_bucket_size == 1
1801
1802
1803
@pytest.mark.may_fail_auto_streaming # Python objects not yet supported in row encoding
1804
@pytest.mark.may_fail_cloud
1805
def test_hashing_on_python_objects() -> None:
1806
# see if we can do a group_by, drop_duplicates on a DataFrame with objects.
1807
# this requires that the hashing and aggregations are done on python objects
1808
1809
df = pl.DataFrame({"a": [1, 1, 3, 4], "b": [1, 1, 2, 2]})
1810
1811
class Foo:
1812
def __hash__(self) -> int:
1813
return 0
1814
1815
def __eq__(self, other: object) -> bool:
1816
return True
1817
1818
df = df.with_columns(pl.col("a").map_elements(lambda x: Foo()).alias("foo"))
1819
assert df.group_by(["foo"]).first().shape == (1, 3)
1820
assert df.unique().shape == (3, 3)
1821
1822
1823
def test_unique_unit_rows() -> None:
1824
df = pl.DataFrame({"a": [1], "b": [None]}, schema={"a": pl.Int64, "b": pl.Float32})
1825
1826
# 'unique' one-row frame should be equal to the original frame
1827
assert_frame_equal(df, df.unique(subset="a"))
1828
for col in df.columns:
1829
assert df.n_unique(subset=[col]) == 1
1830
1831
1832
def test_panic() -> None:
1833
# may contain some tests that yielded a panic in polars or pl_arrow
1834
# https://github.com/pola-rs/polars/issues/1110
1835
a = pl.DataFrame(
1836
{
1837
"col1": ["a"] * 500 + ["b"] * 500,
1838
}
1839
)
1840
a.filter(pl.col("col1") != "b")
1841
1842
1843
def test_horizontal_agg() -> None:
1844
df = pl.DataFrame({"a": [1, None, 3], "b": [1, 2, 3]})
1845
1846
assert_series_equal(df.sum_horizontal(), pl.Series("sum", [2, 2, 6]))
1847
assert_series_equal(
1848
df.sum_horizontal(ignore_nulls=False), pl.Series("sum", [2, None, 6])
1849
)
1850
assert_series_equal(
1851
df.mean_horizontal(ignore_nulls=False), pl.Series("mean", [1.0, None, 3.0])
1852
)
1853
1854
1855
def test_slicing() -> None:
1856
# https://github.com/pola-rs/polars/issues/1322
1857
n = 20
1858
1859
df = pl.DataFrame(
1860
{
1861
"d": ["u", "u", "d", "c", "c", "d", "d"] * n,
1862
"v1": [None, "help", None, None, None, None, None] * n,
1863
}
1864
)
1865
1866
assert (df.filter(pl.col("d") != "d").select([pl.col("v1").unique()])).shape == (
1867
2,
1868
1,
1869
)
1870
1871
1872
def test_group_by_cat_list() -> None:
1873
grouped = (
1874
pl.DataFrame(
1875
[
1876
pl.Series("str_column", ["a", "b", "b", "a", "b"]),
1877
pl.Series("int_column", [1, 1, 2, 2, 3]),
1878
]
1879
)
1880
.with_columns(pl.col("str_column").cast(pl.Categorical).alias("cat_column"))
1881
.group_by("int_column", maintain_order=True)
1882
.agg([pl.col("cat_column")])["cat_column"]
1883
)
1884
1885
out = grouped.explode()
1886
assert out.dtype == pl.Categorical
1887
assert out[0] == "a"
1888
1889
1890
def test_group_by_agg_n_unique_floats() -> None:
1891
# tests proper dispatch
1892
df = pl.DataFrame({"a": [1, 1, 3], "b": [1.0, 2.0, 2.0]})
1893
1894
for dtype in [pl.Float32, pl.Float64]:
1895
out = df.group_by("a", maintain_order=True).agg(
1896
[pl.col("b").cast(dtype).n_unique()]
1897
)
1898
assert out["b"].to_list() == [2, 1]
1899
1900
1901
def test_group_by_agg_n_unique_empty_group_idx_path() -> None:
1902
df = pl.DataFrame(
1903
{
1904
"key": [1, 1, 1, 2, 2, 2],
1905
"value": [1, 2, 3, 4, 5, 6],
1906
"filt": [True, True, True, False, False, False],
1907
}
1908
)
1909
out = df.group_by("key", maintain_order=True).agg(
1910
pl.col("value").filter("filt").n_unique().alias("n_unique")
1911
)
1912
expected = pl.DataFrame(
1913
{
1914
"key": [1, 2],
1915
"n_unique": pl.Series([3, 0], dtype=pl.UInt32),
1916
}
1917
)
1918
assert_frame_equal(out, expected)
1919
1920
1921
def test_group_by_agg_n_unique_empty_group_slice_path() -> None:
1922
df = pl.DataFrame(
1923
{
1924
"key": [1, 1, 1, 2, 2, 2],
1925
"value": [1, 2, 3, 4, 5, 6],
1926
"filt": [False, False, False, False, False, False],
1927
}
1928
)
1929
out = df.group_by("key", maintain_order=True).agg(
1930
pl.col("value").filter("filt").n_unique().alias("n_unique")
1931
)
1932
expected = pl.DataFrame(
1933
{
1934
"key": [1, 2],
1935
"n_unique": pl.Series([0, 0], dtype=pl.UInt32),
1936
}
1937
)
1938
assert_frame_equal(out, expected)
1939
1940
1941
def test_select_by_dtype(df: pl.DataFrame) -> None:
1942
out = df.select(pl.col(pl.String))
1943
assert out.columns == ["strings", "strings_nulls"]
1944
out = df.select(pl.col([pl.String, pl.Boolean]))
1945
assert out.columns == ["bools", "bools_nulls", "strings", "strings_nulls"]
1946
out = df.select(pl.col(INTEGER_DTYPES))
1947
assert out.columns == ["int", "int_nulls"]
1948
1949
out = df.select(ints=pl.struct(pl.col(INTEGER_DTYPES)))
1950
assert out.schema == {
1951
"ints": pl.Struct([pl.Field("int", pl.Int64), pl.Field("int_nulls", pl.Int64)])
1952
}
1953
1954
1955
def test_with_row_index() -> None:
1956
df = pl.DataFrame({"a": [1, 1, 3], "b": [1.0, 2.0, 2.0]})
1957
1958
out = df.with_row_index()
1959
assert out["index"].to_list() == [0, 1, 2]
1960
1961
out = df.lazy().with_row_index().collect()
1962
assert out["index"].to_list() == [0, 1, 2]
1963
1964
1965
def test_with_row_index_bad_offset() -> None:
1966
df = pl.DataFrame({"a": [1, 1, 3], "b": [1.0, 2.0, 2.0]})
1967
1968
with pytest.raises(ValueError, match="cannot be negative"):
1969
df.with_row_index(offset=-1)
1970
with pytest.raises(
1971
ValueError, match="cannot be greater than the maximum index value"
1972
):
1973
df.with_row_index(offset=2**32)
1974
1975
1976
def test_with_row_index_bad_offset_lazy() -> None:
1977
lf = pl.LazyFrame({"a": [1, 1, 3], "b": [1.0, 2.0, 2.0]})
1978
1979
with pytest.raises(ValueError, match="cannot be negative"):
1980
lf.with_row_index(offset=-1)
1981
with pytest.raises(
1982
ValueError, match="cannot be greater than the maximum index value"
1983
):
1984
lf.with_row_index(offset=2**32)
1985
1986
1987
def test_with_row_count_deprecated() -> None:
1988
df = pl.DataFrame({"a": [1, 1, 3], "b": [1.0, 2.0, 2.0]})
1989
1990
with pytest.deprecated_call():
1991
out = df.with_row_count()
1992
assert out["row_nr"].to_list() == [0, 1, 2]
1993
1994
with pytest.deprecated_call():
1995
out = df.lazy().with_row_count().collect()
1996
assert out["row_nr"].to_list() == [0, 1, 2]
1997
1998
1999
@pytest.mark.may_fail_cloud
2000
def test_filter_with_all_expansion() -> None:
2001
df = pl.DataFrame(
2002
{
2003
"b": [1, 2, None],
2004
"c": [1, 2, None],
2005
"a": [None, None, None],
2006
}
2007
)
2008
out = df.filter(~pl.fold(True, lambda acc, s: acc & s.is_null(), pl.all()))
2009
assert out.shape == (2, 3)
2010
2011
2012
# TODO: investigate this discrepancy in auto streaming
2013
@pytest.mark.may_fail_auto_streaming
2014
@pytest.mark.may_fail_cloud
2015
def test_extension() -> None:
2016
class Foo:
2017
def __init__(self, value: Any) -> None:
2018
self.value = value
2019
2020
def __repr__(self) -> str:
2021
return f"foo({self.value})"
2022
2023
foos = [Foo(1), Foo(2), Foo(3)]
2024
2025
# foos and sys.getrefcount both have a reference.
2026
base_count = 2
2027
2028
# We compute the refcount on a separate line otherwise pytest's assert magic
2029
# might add reference counts.
2030
rc = sys.getrefcount(foos[0])
2031
assert rc == base_count
2032
2033
df = pl.DataFrame({"groups": [1, 1, 2], "a": foos})
2034
rc = sys.getrefcount(foos[0])
2035
assert rc == base_count + 1
2036
del df
2037
rc = sys.getrefcount(foos[0])
2038
assert rc == base_count
2039
2040
df = pl.DataFrame({"groups": [1, 1, 2], "a": foos})
2041
rc = sys.getrefcount(foos[0])
2042
assert rc == base_count + 1
2043
2044
out = df.group_by("groups", maintain_order=True).agg(pl.col("a").alias("a"))
2045
rc = sys.getrefcount(foos[0])
2046
assert rc == base_count + 2
2047
s = out["a"].list.explode()
2048
rc = sys.getrefcount(foos[0])
2049
assert rc == base_count + 3
2050
del s
2051
rc = sys.getrefcount(foos[0])
2052
assert rc == base_count + 2
2053
2054
assert out["a"].list.explode().to_list() == foos
2055
rc = sys.getrefcount(foos[0])
2056
assert rc == base_count + 2
2057
del out
2058
rc = sys.getrefcount(foos[0])
2059
assert rc == base_count + 1
2060
del df
2061
rc = sys.getrefcount(foos[0])
2062
assert rc == base_count
2063
2064
2065
@pytest.mark.parametrize("name", [None, "n", ""])
2066
def test_group_by_order_dispatch(name: str | None) -> None:
2067
df = pl.DataFrame({"x": list("bab"), "y": range(3)})
2068
lf = df.lazy()
2069
2070
result = df.group_by("x", maintain_order=True).len(name=name)
2071
lazy_result = lf.group_by("x").len(name=name).sort(by="x", descending=True)
2072
2073
name = "len" if name is None else name
2074
expected = pl.DataFrame(
2075
data={"x": ["b", "a"], name: [2, 1]},
2076
schema_overrides={name: pl.UInt32},
2077
)
2078
assert_frame_equal(result, expected)
2079
assert_frame_equal(lazy_result.collect(), expected)
2080
2081
result = df.group_by("x", maintain_order=True).all()
2082
expected = pl.DataFrame({"x": ["b", "a"], "y": [[0, 2], [1]]})
2083
assert_frame_equal(result, expected)
2084
2085
2086
def test_partitioned_group_by_order() -> None:
2087
# check if group ordering is maintained.
2088
# we only have 30 groups, so this triggers a partitioned group by
2089
df = pl.DataFrame({"x": [chr(v) for v in range(33, 63)], "y": range(30)})
2090
out = df.group_by("x", maintain_order=True).agg(pl.all().implode())
2091
assert_series_equal(out["x"], df["x"])
2092
2093
2094
def test_schema() -> None:
2095
df = pl.DataFrame(
2096
{"foo": [1, 2, 3], "bar": [6.0, 7.0, 8.0], "ham": ["a", "b", "c"]}
2097
)
2098
expected = {"foo": pl.Int64, "bar": pl.Float64, "ham": pl.String}
2099
assert df.schema == expected
2100
2101
2102
def test_schema_equality() -> None:
2103
lf = pl.LazyFrame({"foo": [1, 2, 3], "bar": [6.0, 7.0, 8.0]})
2104
lf_rev = lf.select("bar", "foo")
2105
2106
assert lf.collect_schema() != lf_rev.collect_schema()
2107
assert lf.collect().schema != lf_rev.collect().schema
2108
2109
2110
def test_df_schema_unique() -> None:
2111
df = pl.DataFrame({"a": [1, 2], "b": [3, 4]})
2112
with pytest.raises(DuplicateError):
2113
df.columns = ["a", "a"]
2114
2115
with pytest.raises(DuplicateError):
2116
df.rename({"b": "a"})
2117
2118
2119
def test_empty_projection() -> None:
2120
empty_df = pl.DataFrame({"a": [1, 2], "b": [3, 4]}).select([])
2121
assert empty_df.rows() == []
2122
assert empty_df.schema == {}
2123
assert empty_df.shape == (0, 0)
2124
2125
2126
def test_fill_null() -> None:
2127
df = pl.DataFrame({"a": [1, 2], "b": [3, None]})
2128
assert_frame_equal(df.fill_null(4), pl.DataFrame({"a": [1, 2], "b": [3, 4]}))
2129
assert_frame_equal(
2130
df.fill_null(strategy="max"), pl.DataFrame({"a": [1, 2], "b": [3, 3]})
2131
)
2132
2133
# string and list data
2134
# string goes via binary
2135
df = pl.DataFrame(
2136
{
2137
"c": [
2138
["Apple", "Orange"],
2139
["Apple", "Orange"],
2140
None,
2141
["Carrot"],
2142
None,
2143
None,
2144
],
2145
"b": ["Apple", "Orange", None, "Carrot", None, None],
2146
}
2147
)
2148
2149
assert df.select(
2150
pl.all().fill_null(strategy="forward").name.suffix("_forward"),
2151
pl.all().fill_null(strategy="backward").name.suffix("_backward"),
2152
).to_dict(as_series=False) == {
2153
"c_forward": [
2154
["Apple", "Orange"],
2155
["Apple", "Orange"],
2156
["Apple", "Orange"],
2157
["Carrot"],
2158
["Carrot"],
2159
["Carrot"],
2160
],
2161
"b_forward": ["Apple", "Orange", "Orange", "Carrot", "Carrot", "Carrot"],
2162
"c_backward": [
2163
["Apple", "Orange"],
2164
["Apple", "Orange"],
2165
["Carrot"],
2166
["Carrot"],
2167
None,
2168
None,
2169
],
2170
"b_backward": ["Apple", "Orange", "Carrot", "Carrot", None, None],
2171
}
2172
# categoricals
2173
df = pl.DataFrame(pl.Series("cat", ["a", None], dtype=pl.Categorical))
2174
s = df.select(pl.col("cat").fill_null(strategy="forward"))["cat"]
2175
assert s.dtype == pl.Categorical
2176
assert s.to_list() == ["a", "a"]
2177
2178
2179
def test_fill_nan() -> None:
2180
df = pl.DataFrame({"a": [1, 2], "b": [3.0, float("nan")]})
2181
assert_frame_equal(
2182
df.fill_nan(4),
2183
pl.DataFrame({"a": [1, 2], "b": [3.0, 4.0]}),
2184
)
2185
assert_frame_equal(
2186
df.fill_nan(None),
2187
pl.DataFrame({"a": [1, 2], "b": [3.0, None]}),
2188
)
2189
assert df["b"].fill_nan(5.0).to_list() == [3.0, 5.0]
2190
df = pl.DataFrame(
2191
{
2192
"a": [1.0, np.nan, 3.0],
2193
"b": [datetime(1, 2, 2), datetime(2, 2, 2), datetime(3, 2, 2)],
2194
}
2195
)
2196
assert df.fill_nan(2.0).dtypes == [pl.Float64, pl.Datetime]
2197
2198
2199
#
2200
def test_forward_fill() -> None:
2201
df = pl.DataFrame({"a": [1.0, None, 3.0]})
2202
fill = df.select(pl.col("a").forward_fill())["a"]
2203
assert_series_equal(fill, pl.Series("a", [1, 1, 3]).cast(pl.Float64))
2204
2205
df = pl.DataFrame({"a": [None, 1, None]})
2206
fill = df.select(pl.col("a").forward_fill())["a"]
2207
assert_series_equal(fill, pl.Series("a", [None, 1, 1]).cast(pl.Int64))
2208
2209
2210
def test_backward_fill() -> None:
2211
df = pl.DataFrame({"a": [1.0, None, 3.0]})
2212
fill = df.select(pl.col("a").backward_fill())["a"]
2213
assert_series_equal(fill, pl.Series("a", [1, 3, 3]).cast(pl.Float64))
2214
2215
df = pl.DataFrame({"a": [None, 1, None]})
2216
fill = df.select(pl.col("a").backward_fill())["a"]
2217
assert_series_equal(fill, pl.Series("a", [1, 1, None]).cast(pl.Int64))
2218
2219
2220
def test_shrink_to_fit() -> None:
2221
df = pl.DataFrame({"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]})
2222
2223
assert df.shrink_to_fit(in_place=True) is df
2224
assert df.shrink_to_fit(in_place=False) is not df
2225
assert_frame_equal(df.shrink_to_fit(in_place=False), df)
2226
2227
2228
def test_add_string() -> None:
2229
df = pl.DataFrame({"a": ["hi", "there"], "b": ["hello", "world"]})
2230
expected = pl.DataFrame(
2231
{"a": ["hi hello", "there hello"], "b": ["hello hello", "world hello"]}
2232
)
2233
assert_frame_equal((df + " hello"), expected)
2234
2235
expected = pl.DataFrame(
2236
{"a": ["hello hi", "hello there"], "b": ["hello hello", "hello world"]}
2237
)
2238
assert_frame_equal(("hello " + df), expected)
2239
2240
2241
def test_df_broadcast() -> None:
2242
df = pl.DataFrame({"a": [1, 2, 3]}, schema_overrides={"a": pl.UInt8})
2243
out = df.with_columns(pl.lit(pl.Series("s", [[1, 2]])).first())
2244
assert out.shape == (3, 2)
2245
assert out.schema == {"a": pl.UInt8, "s": pl.List(pl.Int64)}
2246
assert out.rows() == [(1, [1, 2]), (2, [1, 2]), (3, [1, 2])]
2247
2248
2249
@pytest.mark.may_fail_cloud # not a lazyframe method
2250
def test_product() -> None:
2251
df = pl.DataFrame(
2252
{
2253
"int": [1, 2, 3],
2254
"flt": [-1.0, 12.0, 9.0],
2255
"bool_0": [True, False, True],
2256
"bool_1": [True, True, True],
2257
"str": ["a", "b", "c"],
2258
},
2259
schema_overrides={
2260
"int": pl.UInt16,
2261
"flt": pl.Float32,
2262
},
2263
)
2264
out = df.product()
2265
expected = pl.DataFrame(
2266
{"int": [6], "flt": [-108.0], "bool_0": [0], "bool_1": [1], "str": [None]}
2267
)
2268
assert_frame_not_equal(out, expected, check_dtypes=True)
2269
assert_frame_equal(out, expected, check_dtypes=False)
2270
2271
2272
def test_first_last_nth_expressions(fruits_cars: pl.DataFrame) -> None:
2273
df = fruits_cars
2274
out = df.select(pl.first())
2275
assert out.columns == ["A"]
2276
2277
out = df.select(pl.last())
2278
assert out.columns == ["cars"]
2279
2280
out = df.select(pl.nth(0))
2281
assert out.columns == ["A"]
2282
2283
out = df.select(pl.nth(1))
2284
assert out.columns == ["fruits"]
2285
2286
out = df.select(pl.nth(-2))
2287
assert out.columns == ["B"]
2288
2289
2290
def test_is_between(fruits_cars: pl.DataFrame) -> None:
2291
result = fruits_cars.select(pl.col("A").is_between(2, 4)).to_series()
2292
assert_series_equal(result, pl.Series("A", [False, True, True, True, False]))
2293
2294
result = fruits_cars.select(pl.col("A").is_between(2, 4, closed="none")).to_series()
2295
assert_series_equal(result, pl.Series("A", [False, False, True, False, False]))
2296
2297
result = fruits_cars.select(pl.col("A").is_between(2, 4, closed="both")).to_series()
2298
assert_series_equal(result, pl.Series("A", [False, True, True, True, False]))
2299
2300
result = fruits_cars.select(
2301
pl.col("A").is_between(2, 4, closed="right")
2302
).to_series()
2303
assert_series_equal(result, pl.Series("A", [False, False, True, True, False]))
2304
2305
result = fruits_cars.select(pl.col("A").is_between(2, 4, closed="left")).to_series()
2306
assert_series_equal(result, pl.Series("A", [False, True, True, False, False]))
2307
2308
2309
def test_is_between_data_types() -> None:
2310
df = pl.DataFrame(
2311
{
2312
"flt": [1.4, 1.2, 2.5],
2313
"int": [2, 3, 4],
2314
"str": ["xyz", "str", "abc"],
2315
"date": [date(2020, 1, 1), date(2020, 2, 2), date(2020, 3, 3)],
2316
"datetime": [
2317
datetime(2020, 1, 1, 0, 0, 0),
2318
datetime(2020, 1, 1, 10, 0, 0),
2319
datetime(2020, 1, 1, 12, 0, 0),
2320
],
2321
"tm": [time(10, 30), time(0, 45), time(15, 15)],
2322
}
2323
)
2324
2325
# on purpose, for float and int, we pass in a mixture of bound data types
2326
assert_series_equal(
2327
df.select(pl.col("flt").is_between(1, 2.3))[:, 0],
2328
pl.Series("flt", [True, True, False]),
2329
)
2330
assert_series_equal(
2331
df.select(pl.col("int").is_between(1.5, 3))[:, 0],
2332
pl.Series("int", [True, True, False]),
2333
)
2334
assert_series_equal(
2335
df.select(pl.col("date").is_between(date(2019, 1, 1), date(2020, 2, 5)))[:, 0],
2336
pl.Series("date", [True, True, False]),
2337
)
2338
assert_series_equal(
2339
df.select(
2340
pl.col("datetime").is_between(
2341
datetime(2020, 1, 1, 5, 0, 0), datetime(2020, 1, 1, 11, 0, 0)
2342
)
2343
)[:, 0],
2344
pl.Series("datetime", [False, True, False]),
2345
)
2346
assert_series_equal(
2347
df.select(
2348
pl.col("str").is_between(pl.lit("str"), pl.lit("zzz"), closed="left")
2349
)[:, 0],
2350
pl.Series("str", [True, True, False]),
2351
)
2352
assert_series_equal(
2353
df.select(
2354
pl.col("tm")
2355
.is_between(time(0, 45), time(10, 30), closed="right")
2356
.alias("tm_between")
2357
)[:, 0],
2358
pl.Series("tm_between", [True, False, False]),
2359
)
2360
2361
2362
def test_empty_is_in() -> None:
2363
df_empty_isin = pl.DataFrame({"foo": ["a", "b", "c", "d"]}).filter(
2364
pl.col("foo").is_in([])
2365
)
2366
assert df_empty_isin.shape == (0, 1)
2367
assert df_empty_isin.rows() == []
2368
assert df_empty_isin.schema == {"foo": pl.String}
2369
2370
2371
def test_group_by_slice_expression_args() -> None:
2372
df = pl.DataFrame({"groups": ["a"] * 10 + ["b"] * 20, "vals": range(30)})
2373
2374
out = (
2375
df.group_by("groups", maintain_order=True)
2376
.agg([pl.col("vals").slice((pl.len() * 0.1).cast(int), (pl.len() // 5))])
2377
.explode("vals")
2378
)
2379
2380
expected = pl.DataFrame(
2381
{"groups": ["a", "a", "b", "b", "b", "b"], "vals": [1, 2, 12, 13, 14, 15]}
2382
)
2383
assert_frame_equal(out, expected)
2384
2385
2386
def test_join_suffixes() -> None:
2387
df_a = pl.DataFrame({"A": [1], "B": [1]})
2388
df_b = pl.DataFrame({"A": [1], "B": [1]})
2389
2390
join_strategies: list[JoinStrategy] = ["left", "inner", "full", "cross"]
2391
for how in join_strategies:
2392
# no need for an assert, we error if wrong
2393
df_a.join(df_b, on="A" if how != "cross" else None, suffix="_y", how=how)["B_y"]
2394
2395
df_a.join_asof(df_b, on=pl.col("A").set_sorted(), suffix="_y")["B_y"]
2396
2397
2398
def test_explode_empty() -> None:
2399
df = (
2400
pl.DataFrame({"x": ["a", "a", "b", "b"], "y": [1, 1, 2, 2]})
2401
.group_by("x", maintain_order=True)
2402
.agg(pl.col("y").gather([]))
2403
)
2404
assert df.explode("y").to_dict(as_series=False) == {
2405
"x": ["a", "b"],
2406
"y": [None, None],
2407
}
2408
2409
df = pl.DataFrame({"x": ["1", "2", "4"], "y": [["a", "b", "c"], ["d"], []]})
2410
assert_frame_equal(
2411
df.explode("y"),
2412
pl.DataFrame({"x": ["1", "1", "1", "2", "4"], "y": ["a", "b", "c", "d", None]}),
2413
)
2414
2415
df = pl.DataFrame(
2416
{
2417
"letters": ["a"],
2418
"numbers": [[]],
2419
}
2420
)
2421
assert df.explode("numbers").to_dict(as_series=False) == {
2422
"letters": ["a"],
2423
"numbers": [None],
2424
}
2425
2426
2427
def test_asof_by_multiple_keys() -> None:
2428
lhs = pl.DataFrame(
2429
{
2430
"a": [-20, -19, 8, 12, 14],
2431
"by": [1, 1, 2, 2, 2],
2432
"by2": [1, 1, 2, 2, 2],
2433
}
2434
)
2435
2436
rhs = pl.DataFrame(
2437
{
2438
"a": [-19, -15, 3, 5, 13],
2439
"by": [1, 1, 2, 2, 2],
2440
"by2": [1, 1, 2, 2, 2],
2441
}
2442
)
2443
2444
result = lhs.join_asof(
2445
rhs, on=pl.col("a").set_sorted(), by=["by", "by2"], strategy="backward"
2446
).select(["a", "by"])
2447
expected = pl.DataFrame({"a": [-20, -19, 8, 12, 14], "by": [1, 1, 2, 2, 2]})
2448
assert_frame_equal(
2449
result.group_by("by").agg("a"),
2450
expected.group_by("by").agg("a"),
2451
check_row_order=False,
2452
)
2453
2454
2455
def test_asof_bad_input_type() -> None:
2456
lhs = pl.DataFrame({"a": [1, 2, 3]})
2457
rhs = pl.DataFrame({"a": [1, 2, 3]})
2458
2459
with pytest.raises(
2460
TypeError,
2461
match="expected `other` .*to be a 'DataFrame'.* not 'LazyFrame'",
2462
):
2463
lhs.join_asof(rhs.lazy(), on="a") # type: ignore[arg-type]
2464
2465
with pytest.raises(
2466
TypeError,
2467
match="expected `other` .*to be a 'DataFrame'.* not 'Series'",
2468
):
2469
lhs.join_asof(pl.Series([1, 2, 3]), on="a") # type: ignore[arg-type]
2470
2471
class DummyDataFrameSubclass(pl.DataFrame):
2472
pass
2473
2474
rhs = DummyDataFrameSubclass(rhs)
2475
2476
lhs.join_asof(rhs, on="a")
2477
2478
2479
def test_list_of_list_of_struct() -> None:
2480
expected = [{"list_of_list_of_struct": [[{"a": 1}, {"a": 2}]]}]
2481
pa_df = pa.Table.from_pylist(expected)
2482
2483
df = pl.from_arrow(pa_df)
2484
assert df.rows() == [([[{"a": 1}, {"a": 2}]],)] # type: ignore[union-attr]
2485
assert df.to_dicts() == expected # type: ignore[union-attr]
2486
2487
df = pl.from_arrow(pa_df[:0])
2488
assert df.to_dicts() == [] # type: ignore[union-attr]
2489
2490
2491
def test_fill_null_limits() -> None:
2492
assert pl.DataFrame(
2493
{
2494
"a": [1, None, None, None, 5, 6, None, None, None, 10],
2495
"b": ["a", None, None, None, "b", "c", None, None, None, "d"],
2496
"c": [True, None, None, None, False, True, None, None, None, False],
2497
}
2498
).select(
2499
pl.all().fill_null(strategy="forward", limit=2),
2500
pl.all().fill_null(strategy="backward", limit=2).name.suffix("_backward"),
2501
).to_dict(as_series=False) == {
2502
"a": [1, 1, 1, None, 5, 6, 6, 6, None, 10],
2503
"b": ["a", "a", "a", None, "b", "c", "c", "c", None, "d"],
2504
"c": [True, True, True, None, False, True, True, True, None, False],
2505
"a_backward": [1, None, 5, 5, 5, 6, None, 10, 10, 10],
2506
"b_backward": ["a", None, "b", "b", "b", "c", None, "d", "d", "d"],
2507
"c_backward": [
2508
True,
2509
None,
2510
False,
2511
False,
2512
False,
2513
True,
2514
None,
2515
False,
2516
False,
2517
False,
2518
],
2519
}
2520
2521
2522
def test_lower_bound_upper_bound(fruits_cars: pl.DataFrame) -> None:
2523
res_expr = fruits_cars.select(pl.col("A").lower_bound())
2524
assert res_expr.item() == -9223372036854775808
2525
2526
res_expr = fruits_cars.select(pl.col("B").upper_bound())
2527
assert res_expr.item() == 9223372036854775807
2528
2529
with pytest.raises(ComputeError):
2530
fruits_cars.select(pl.col("fruits").upper_bound())
2531
2532
2533
def test_selection_misc() -> None:
2534
df = pl.DataFrame({"x": "abc"}, schema={"x": pl.String})
2535
2536
# literal values (as scalar/list)
2537
for zero in (0, [0]):
2538
assert df.select(zero)["literal"].to_list() == [0]
2539
assert df.select(literal=0)["literal"].to_list() == [0]
2540
2541
# expect string values to be interpreted as cols
2542
for x in ("x", ["x"], pl.col("x")):
2543
assert df.select(x).rows() == [("abc",)]
2544
2545
# string col + lit
2546
assert df.with_columns(["x", 0]).to_dicts() == [{"x": "abc", "literal": 0}]
2547
2548
2549
def test_selection_regex_and_multicol() -> None:
2550
test_df = pl.DataFrame(
2551
{
2552
"a": [1, 2, 3, 4],
2553
"b": [5, 6, 7, 8],
2554
"c": [9, 10, 11, 12],
2555
"foo": [13, 14, 15, 16],
2556
},
2557
schema_overrides={"foo": pl.UInt8},
2558
)
2559
2560
# Selection only
2561
test_df.select(
2562
pl.col(["a", "b", "c"]).name.suffix("_list"),
2563
pl.all().exclude("foo").name.suffix("_wild"),
2564
pl.col("^\\w$").name.suffix("_regex"),
2565
)
2566
2567
# Multi * Single
2568
assert test_df.select(pl.col(["a", "b", "c"]) * pl.col("foo")).to_dict(
2569
as_series=False
2570
) == {
2571
"a": [13, 28, 45, 64],
2572
"b": [65, 84, 105, 128],
2573
"c": [117, 140, 165, 192],
2574
}
2575
assert test_df.select(pl.all().exclude("foo") * pl.col("foo")).to_dict(
2576
as_series=False
2577
) == {
2578
"a": [13, 28, 45, 64],
2579
"b": [65, 84, 105, 128],
2580
"c": [117, 140, 165, 192],
2581
}
2582
2583
assert test_df.select(pl.col("^\\w$") * pl.col("foo")).to_dict(as_series=False) == {
2584
"a": [13, 28, 45, 64],
2585
"b": [65, 84, 105, 128],
2586
"c": [117, 140, 165, 192],
2587
}
2588
2589
# Multi * Multi
2590
result = test_df.select(pl.col(["a", "b", "c"]) * pl.col(["a", "b", "c"]))
2591
expected = {"a": [1, 4, 9, 16], "b": [25, 36, 49, 64], "c": [81, 100, 121, 144]}
2592
2593
assert result.to_dict(as_series=False) == expected
2594
assert test_df.select(pl.exclude("foo") * pl.exclude("foo")).to_dict(
2595
as_series=False
2596
) == {
2597
"a": [1, 4, 9, 16],
2598
"b": [25, 36, 49, 64],
2599
"c": [81, 100, 121, 144],
2600
}
2601
assert test_df.select(pl.col("^\\w$") * pl.col("^\\w$")).to_dict(
2602
as_series=False
2603
) == {
2604
"a": [1, 4, 9, 16],
2605
"b": [25, 36, 49, 64],
2606
"c": [81, 100, 121, 144],
2607
}
2608
2609
df = test_df.select(
2610
re=pl.struct(pl.col("^\\w$")),
2611
odd=pl.struct((pl.col(INTEGER_DTYPES) % 2).name.suffix("_is_odd")),
2612
maxes=pl.struct(pl.all().max().name.suffix("_max")),
2613
).head(2)
2614
# ┌───────────┬───────────┬─────────────┐
2615
# │ re ┆ odd ┆ maxes │
2616
# │ --- ┆ --- ┆ --- │
2617
# │ struct[3] ┆ struct[4] ┆ struct[4] │
2618
# ╞═══════════╪═══════════╪═════════════╡
2619
# │ {1,5,9} ┆ {1,1,1,1} ┆ {4,8,12,16} │
2620
# │ {2,6,10} ┆ {0,0,0,0} ┆ {4,8,12,16} │
2621
# └───────────┴───────────┴─────────────┘
2622
assert df.rows() == [
2623
(
2624
{"a": 1, "b": 5, "c": 9},
2625
{"a_is_odd": 1, "b_is_odd": 1, "c_is_odd": 1, "foo_is_odd": 1},
2626
{"a_max": 4, "b_max": 8, "c_max": 12, "foo_max": 16},
2627
),
2628
(
2629
{"a": 2, "b": 6, "c": 10},
2630
{"a_is_odd": 0, "b_is_odd": 0, "c_is_odd": 0, "foo_is_odd": 0},
2631
{"a_max": 4, "b_max": 8, "c_max": 12, "foo_max": 16},
2632
),
2633
]
2634
2635
2636
@pytest.mark.parametrize("subset", ["a", cs.starts_with("x", "a")])
2637
@pytest.mark.may_fail_auto_streaming # Flaky in CI, see https://github.com/pola-rs/polars/issues/20943
2638
@pytest.mark.may_fail_cloud
2639
def test_unique_on_sorted(subset: Any) -> None:
2640
df = pl.DataFrame(data={"a": [1, 1, 3], "b": [1, 2, 3]})
2641
2642
result = df.with_columns([pl.col("a").set_sorted()]).unique(
2643
subset=subset,
2644
keep="last",
2645
)
2646
2647
expected = pl.DataFrame({"a": [1, 3], "b": [2, 3]})
2648
assert_frame_equal(result, expected)
2649
2650
2651
def test_len_compute(df: pl.DataFrame) -> None:
2652
df = df.with_columns(pl.struct(["list_bool", "cat"]).alias("struct"))
2653
filtered = df.filter(pl.col("bools"))
2654
for col in filtered.columns:
2655
assert len(filtered[col]) == 1
2656
2657
taken = df[[1, 2], :]
2658
for col in taken.columns:
2659
assert len(taken[col]) == 2
2660
2661
2662
def test_filter_sequence() -> None:
2663
df = pl.DataFrame({"a": [1, 2, 3]})
2664
assert df.filter([True, False, True])["a"].to_list() == [1, 3]
2665
assert df.filter(np.array([True, False, True]))["a"].to_list() == [1, 3]
2666
2667
2668
def test_filter_multiple_predicates() -> None:
2669
df = pl.DataFrame(
2670
{
2671
"a": [1, 1, 1, 2, 2],
2672
"b": [1, 1, 2, 2, 2],
2673
"c": [1, 1, 2, 3, 4],
2674
}
2675
)
2676
2677
# multiple predicates
2678
expected = pl.DataFrame({"a": [1, 1, 1], "b": [1, 1, 2], "c": [1, 1, 2]})
2679
for out in (
2680
df.filter(pl.col("a") == 1, pl.col("b") <= 2), # positional/splat
2681
df.filter([pl.col("a") == 1, pl.col("b") <= 2]), # as list
2682
):
2683
assert_frame_equal(out, expected)
2684
2685
# multiple kwargs
2686
assert_frame_equal(
2687
df.filter(a=1, b=2),
2688
pl.DataFrame({"a": [1], "b": [2], "c": [2]}),
2689
)
2690
2691
# both positional and keyword args
2692
assert_frame_equal(
2693
pl.DataFrame({"a": [2], "b": [2], "c": [3]}),
2694
df.filter(pl.col("c") < 4, a=2, b=2),
2695
)
2696
2697
# boolean mask
2698
out = df.filter([True, False, False, False, True])
2699
expected = pl.DataFrame({"a": [1, 2], "b": [1, 2], "c": [1, 4]})
2700
assert_frame_equal(out, expected)
2701
2702
# multiple boolean masks
2703
out = df.filter(
2704
np.array([True, True, False, True, False]),
2705
np.array([True, False, True, True, False]),
2706
)
2707
expected = pl.DataFrame({"a": [1, 2], "b": [1, 2], "c": [1, 3]})
2708
assert_frame_equal(out, expected)
2709
2710
2711
def test_indexing_set() -> None:
2712
df = pl.DataFrame({"bool": [True, True], "str": ["N/A", "N/A"], "nr": [1, 2]})
2713
2714
df[0, "bool"] = False
2715
df[0, "nr"] = 100
2716
df[0, "str"] = "foo"
2717
2718
assert df.to_dict(as_series=False) == {
2719
"bool": [False, True],
2720
"str": ["foo", "N/A"],
2721
"nr": [100, 2],
2722
}
2723
2724
2725
def test_set() -> None:
2726
# Setting a dataframe using indices is deprecated.
2727
# We keep these tests because we only generate a warning.
2728
np.random.seed(1)
2729
df = pl.DataFrame(
2730
{"foo": np.random.rand(10), "bar": np.arange(10), "ham": ["h"] * 10}
2731
)
2732
with pytest.raises(
2733
TypeError,
2734
match=r"DataFrame object does not support `Series` assignment by index"
2735
r"\n\nUse `DataFrame.with_columns`.",
2736
):
2737
df["new"] = np.random.rand(10)
2738
2739
with pytest.raises(
2740
TypeError,
2741
match=r"not allowed to set DataFrame by boolean mask in the row position"
2742
r"\n\nConsider using `DataFrame.with_columns`.",
2743
):
2744
df[df["ham"] > 0.5, "ham"] = "a"
2745
with pytest.raises(
2746
TypeError,
2747
match=r"not allowed to set DataFrame by boolean mask in the row position"
2748
r"\n\nConsider using `DataFrame.with_columns`.",
2749
):
2750
df[[True, False], "ham"] = "a"
2751
2752
# set 2D
2753
df = pl.DataFrame({"b": [0, 0]})
2754
df[["A", "B"]] = [[1, 2], [1, 2]]
2755
2756
with pytest.raises(ValueError):
2757
df[["C", "D"]] = 1
2758
with pytest.raises(ValueError):
2759
df[["C", "D"]] = [1, 1]
2760
with pytest.raises(ValueError):
2761
df[["C", "D"]] = [[1, 2, 3], [1, 2, 3]]
2762
2763
# set tuple
2764
df = pl.DataFrame({"b": [0, 0]})
2765
df[0, "b"] = 1
2766
assert df[0, "b"] == 1
2767
2768
df[0, 0] = 2
2769
assert df[0, "b"] == 2
2770
2771
# row and col selection have to be int or str
2772
with pytest.raises(TypeError):
2773
df[:, [1]] = 1 # type: ignore[index]
2774
with pytest.raises(TypeError):
2775
df[True, :] = 1 # type: ignore[index]
2776
2777
# needs to be a 2 element tuple
2778
with pytest.raises(ValueError):
2779
df[1, 2, 3] = 1
2780
2781
# we cannot index with any type, such as bool
2782
with pytest.raises(TypeError):
2783
df[True] = 1 # type: ignore[index]
2784
2785
2786
def test_series_iter_over_frame() -> None:
2787
df = pl.DataFrame({"a": [1, 2, 3], "b": [2, 3, 4], "c": [3, 4, 5]})
2788
2789
expected = {
2790
0: pl.Series("a", [1, 2, 3]),
2791
1: pl.Series("b", [2, 3, 4]),
2792
2: pl.Series("c", [3, 4, 5]),
2793
}
2794
for idx, s in enumerate(df):
2795
assert_series_equal(s, expected[idx])
2796
2797
expected = {
2798
0: pl.Series("c", [3, 4, 5]),
2799
1: pl.Series("b", [2, 3, 4]),
2800
2: pl.Series("a", [1, 2, 3]),
2801
}
2802
for idx, s in enumerate(reversed(df)):
2803
assert_series_equal(s, expected[idx])
2804
2805
2806
def test_union_with_aliases_4770() -> None:
2807
lf = pl.DataFrame(
2808
{
2809
"a": [1, None],
2810
"b": [3, 4],
2811
}
2812
).lazy()
2813
2814
lf = pl.concat(
2815
[
2816
lf.select([pl.col("a").alias("x")]),
2817
lf.select([pl.col("b").alias("x")]),
2818
]
2819
).filter(pl.col("x").is_not_null())
2820
2821
assert lf.collect()["x"].to_list() == [1, 3, 4]
2822
2823
2824
def test_init_datetimes_with_timezone() -> None:
2825
tz_us = "America/New_York"
2826
tz_europe = "Europe/Amsterdam"
2827
2828
dtm = datetime(2022, 10, 12, 12, 30)
2829
for time_unit in DTYPE_TEMPORAL_UNITS:
2830
for type_overrides in (
2831
{
2832
"schema": [
2833
("d1", pl.Datetime(time_unit, tz_us)),
2834
("d2", pl.Datetime(time_unit, tz_europe)),
2835
]
2836
},
2837
{
2838
"schema_overrides": {
2839
"d1": pl.Datetime(time_unit, tz_us),
2840
"d2": pl.Datetime(time_unit, tz_europe),
2841
}
2842
},
2843
):
2844
result = pl.DataFrame(
2845
data={
2846
"d1": [dtm.replace(tzinfo=ZoneInfo(tz_us))],
2847
"d2": [dtm.replace(tzinfo=ZoneInfo(tz_europe))],
2848
},
2849
**type_overrides,
2850
)
2851
expected = pl.DataFrame(
2852
{"d1": ["2022-10-12 12:30"], "d2": ["2022-10-12 12:30"]}
2853
).with_columns(
2854
pl.col("d1").str.to_datetime(time_unit=time_unit, time_zone=tz_us),
2855
pl.col("d2").str.to_datetime(time_unit=time_unit, time_zone=tz_europe),
2856
)
2857
assert_frame_equal(result, expected)
2858
2859
2860
@pytest.mark.parametrize(
2861
(
2862
"tzinfo",
2863
"offset",
2864
"dtype_time_zone",
2865
"expected_time_zone",
2866
"expected_item",
2867
),
2868
[
2869
(None, "", None, None, datetime(2020, 1, 1)),
2870
(
2871
timezone(timedelta(hours=-8)),
2872
"-08:00",
2873
"UTC",
2874
"UTC",
2875
datetime(2020, 1, 1, 8, tzinfo=timezone.utc),
2876
),
2877
(
2878
timezone(timedelta(hours=-8)),
2879
"-08:00",
2880
None,
2881
"UTC",
2882
datetime(2020, 1, 1, 8, tzinfo=timezone.utc),
2883
),
2884
],
2885
)
2886
@pytest.mark.may_fail_cloud
2887
def test_init_vs_strptime_consistency(
2888
tzinfo: timezone | None,
2889
offset: str,
2890
dtype_time_zone: str | None,
2891
expected_time_zone: str,
2892
expected_item: datetime,
2893
) -> None:
2894
result_init = pl.Series(
2895
[datetime(2020, 1, 1, tzinfo=tzinfo)],
2896
dtype=pl.Datetime("us", dtype_time_zone),
2897
)
2898
result_strptime = pl.Series([f"2020-01-01 00:00{offset}"]).str.strptime(
2899
pl.Datetime("us", dtype_time_zone)
2900
)
2901
assert result_init.dtype == pl.Datetime("us", expected_time_zone)
2902
assert result_init.item() == expected_item
2903
assert_series_equal(result_init, result_strptime)
2904
2905
2906
def test_init_vs_strptime_consistency_converts() -> None:
2907
result = pl.Series(
2908
[datetime(2020, 1, 1, tzinfo=timezone(timedelta(hours=-8)))],
2909
dtype=pl.Datetime("us", "US/Pacific"),
2910
).item()
2911
assert result == datetime(2020, 1, 1, 0, 0, tzinfo=ZoneInfo(key="US/Pacific"))
2912
result = (
2913
pl.Series(["2020-01-01 00:00-08:00"])
2914
.str.strptime(pl.Datetime("us", "US/Pacific"))
2915
.item()
2916
)
2917
assert result == datetime(2020, 1, 1, 0, 0, tzinfo=ZoneInfo(key="US/Pacific"))
2918
2919
2920
def test_init_physical_with_timezone() -> None:
2921
tz_uae = "Asia/Dubai"
2922
tz_asia = "Asia/Tokyo"
2923
2924
dtm_us = 1665577800000000
2925
for time_unit in DTYPE_TEMPORAL_UNITS:
2926
dtm = {"ms": dtm_us // 1_000, "ns": dtm_us * 1_000}.get(str(time_unit), dtm_us)
2927
df = pl.DataFrame(
2928
data={"d1": [dtm], "d2": [dtm]},
2929
schema=[
2930
("d1", pl.Datetime(time_unit, tz_uae)),
2931
("d2", pl.Datetime(time_unit, tz_asia)),
2932
],
2933
)
2934
assert (df["d1"].to_physical() == df["d2"].to_physical()).all()
2935
assert df.rows() == [
2936
(
2937
datetime(2022, 10, 12, 16, 30, tzinfo=ZoneInfo(tz_uae)),
2938
datetime(2022, 10, 12, 21, 30, tzinfo=ZoneInfo(tz_asia)),
2939
)
2940
]
2941
2942
2943
@pytest.mark.parametrize("divop", [floordiv, truediv])
2944
def test_floordiv_truediv(divop: Callable[..., Any]) -> None:
2945
# validate truediv/floordiv dataframe ops against python
2946
df1 = pl.DataFrame(
2947
data={
2948
"x": [0, -1, -2, -3],
2949
"y": [-0.0, -3.0, 5.0, -7.0],
2950
"z": [10, 3, -5, 7],
2951
}
2952
)
2953
2954
# scalar
2955
for df in [df1, df1.slice(0, 0)]:
2956
for n in (3, 3.0, -3, -3.0):
2957
py_div = [tuple(divop(elem, n) for elem in row) for row in df.rows()]
2958
df_div = divop(df, n).rows()
2959
assert py_div == df_div
2960
2961
# series
2962
xdf, s = df1["x"].to_frame(), pl.Series([2] * 4)
2963
assert list(divop(xdf, s)["x"]) == [divop(x, 2) for x in list(df1["x"])]
2964
2965
# frame
2966
df2 = pl.DataFrame(
2967
data={
2968
"x": [2, -2, 2, 3],
2969
"y": [4, 4, -4, 8],
2970
"z": [0.5, 2.0, -2.0, -3],
2971
}
2972
)
2973
df_div = divop(df1, df2).rows()
2974
for i, (row1, row2) in enumerate(zip(df1.rows(), df2.rows())):
2975
for j, (elem1, elem2) in enumerate(zip(row1, row2)):
2976
assert divop(elem1, elem2) == df_div[i][j]
2977
2978
2979
@pytest.mark.parametrize(
2980
("subset", "keep", "expected_mask"),
2981
[
2982
(None, "first", [True, True, True, False]),
2983
("a", "first", [True, True, False, False]),
2984
(["a", "b"], "first", [True, True, False, False]),
2985
(("a", "b"), "last", [True, False, False, True]),
2986
(("a", "b"), "none", [True, False, False, False]),
2987
],
2988
)
2989
def test_unique(
2990
subset: str | Sequence[str], keep: UniqueKeepStrategy, expected_mask: list[bool]
2991
) -> None:
2992
df = pl.DataFrame({"a": [1, 2, 2, 2], "b": [3, 4, 4, 4], "c": [5, 6, 7, 7]})
2993
2994
result = df.unique(maintain_order=True, subset=subset, keep=keep).sort(pl.all())
2995
expected = df.filter(expected_mask).sort(pl.all())
2996
assert_frame_equal(result, expected)
2997
2998
2999
def test_iter_slices() -> None:
3000
df = pl.DataFrame(
3001
{
3002
"a": range(95),
3003
"b": date(2023, 1, 1),
3004
"c": "klmnopqrstuvwxyz",
3005
}
3006
)
3007
batches = list(df.iter_slices(n_rows=50))
3008
3009
assert len(batches[0]) == 50
3010
assert len(batches[1]) == 45
3011
assert batches[1].rows() == df[50:].rows()
3012
3013
3014
def test_format_empty_df() -> None:
3015
df = pl.DataFrame(
3016
[
3017
pl.Series("val1", [], dtype=pl.Categorical),
3018
pl.Series("val2", [], dtype=pl.Categorical),
3019
]
3020
).select(
3021
pl.format("{}:{}", pl.col("val1"), pl.col("val2")).alias("cat"),
3022
)
3023
assert df.shape == (0, 1)
3024
assert df.dtypes == [pl.String]
3025
3026
3027
def test_deadlocks_3409() -> None:
3028
assert (
3029
pl.DataFrame({"col1": [[1, 2, 3]]})
3030
.with_columns(
3031
pl.col("col1").list.eval(
3032
pl.element().map_elements(lambda x: x, return_dtype=pl.Int64)
3033
)
3034
)
3035
.to_dict(as_series=False)
3036
) == {"col1": [[1, 2, 3]]}
3037
3038
assert (
3039
pl.DataFrame({"col1": [1, 2, 3]})
3040
.with_columns(
3041
pl.col("col1").cumulative_eval(
3042
pl.element().map_batches(lambda x: 0, pl.Int64, returns_scalar=True)
3043
)
3044
)
3045
.to_dict(as_series=False)
3046
) == {"col1": [0, 0, 0]}
3047
3048
3049
def test_ceil() -> None:
3050
df = pl.DataFrame({"a": [1.8, 1.2, 3.0]})
3051
result = df.select(pl.col("a").ceil())
3052
assert_frame_equal(result, pl.DataFrame({"a": [2.0, 2.0, 3.0]}))
3053
3054
df = pl.DataFrame({"a": [1, 2, 3]})
3055
result = df.select(pl.col("a").ceil())
3056
assert_frame_equal(df, result)
3057
3058
3059
def test_floor() -> None:
3060
df = pl.DataFrame({"a": [1.8, 1.2, 3.0]})
3061
result = df.select(pl.col("a").floor())
3062
assert_frame_equal(result, pl.DataFrame({"a": [1.0, 1.0, 3.0]}))
3063
3064
df = pl.DataFrame({"a": [1, 2, 3]})
3065
result = df.select(pl.col("a").floor())
3066
assert_frame_equal(df, result)
3067
3068
3069
def test_floor_divide() -> None:
3070
x = 10.4
3071
step = 0.5
3072
df = pl.DataFrame({"x": [x]})
3073
assert df.with_columns(pl.col("x") // step)[0, 0] == x // step
3074
3075
3076
def test_round() -> None:
3077
df = pl.DataFrame({"a": [1.8, 1.2, 3.0]})
3078
col_a_rounded = df.select(pl.col("a").round(decimals=0))["a"]
3079
assert_series_equal(col_a_rounded, pl.Series("a", [2, 1, 3]).cast(pl.Float64))
3080
3081
3082
def test_dot() -> None:
3083
df = pl.DataFrame({"a": [1.8, 1.2, 3.0], "b": [3.2, 1, 2]})
3084
assert df.select(pl.col("a").dot(pl.col("b"))).item() == 12.96
3085
3086
3087
def test_unstack() -> None:
3088
from string import ascii_uppercase
3089
3090
df = pl.DataFrame(
3091
{
3092
"col1": list(ascii_uppercase[0:9]),
3093
"col2": pl.int_range(0, 9, eager=True),
3094
"col3": pl.int_range(-9, 0, eager=True),
3095
}
3096
)
3097
assert df.unstack(step=3, how="vertical").to_dict(as_series=False) == {
3098
"col1_0": ["A", "B", "C"],
3099
"col1_1": ["D", "E", "F"],
3100
"col1_2": ["G", "H", "I"],
3101
"col2_0": [0, 1, 2],
3102
"col2_1": [3, 4, 5],
3103
"col2_2": [6, 7, 8],
3104
"col3_0": [-9, -8, -7],
3105
"col3_1": [-6, -5, -4],
3106
"col3_2": [-3, -2, -1],
3107
}
3108
3109
assert df.unstack(step=3, how="horizontal").to_dict(as_series=False) == {
3110
"col1_0": ["A", "D", "G"],
3111
"col1_1": ["B", "E", "H"],
3112
"col1_2": ["C", "F", "I"],
3113
"col2_0": [0, 3, 6],
3114
"col2_1": [1, 4, 7],
3115
"col2_2": [2, 5, 8],
3116
"col3_0": [-9, -6, -3],
3117
"col3_1": [-8, -5, -2],
3118
"col3_2": [-7, -4, -1],
3119
}
3120
3121
for column_subset in (("col2", "col3"), cs.integer()):
3122
assert df.unstack(
3123
step=3,
3124
how="horizontal",
3125
columns=column_subset,
3126
).to_dict(as_series=False) == {
3127
"col2_0": [0, 3, 6],
3128
"col2_1": [1, 4, 7],
3129
"col2_2": [2, 5, 8],
3130
"col3_0": [-9, -6, -3],
3131
"col3_1": [-8, -5, -2],
3132
"col3_2": [-7, -4, -1],
3133
}
3134
3135
3136
def test_window_deadlock() -> None:
3137
np.random.seed(12)
3138
3139
df = pl.DataFrame(
3140
{
3141
"nrs": [1, 2, 3, None, 5],
3142
"names": ["foo", "ham", "spam", "egg", None],
3143
"random": np.random.rand(5),
3144
"groups": ["A", "A", "B", "C", "B"],
3145
}
3146
)
3147
3148
_df = df.select(
3149
pl.col("*"), # select all
3150
pl.col("random").sum().over("groups").alias("sum[random]/groups"),
3151
pl.col("random").implode().over("names").alias("random/name"),
3152
)
3153
3154
3155
def test_sum_empty_column_names() -> None:
3156
df = pl.DataFrame({"x": [], "y": []}, schema={"x": pl.Boolean, "y": pl.Boolean})
3157
expected = pl.DataFrame(
3158
{"x": [0], "y": [0]}, schema={"x": pl.UInt32, "y": pl.UInt32}
3159
)
3160
assert_frame_equal(df.sum(), expected)
3161
3162
3163
def test_flags() -> None:
3164
df = pl.DataFrame({"a": [1, 2, 3], "b": [9, 5, 6]})
3165
assert df.flags == {
3166
"a": {"SORTED_ASC": False, "SORTED_DESC": False},
3167
"b": {"SORTED_ASC": False, "SORTED_DESC": False},
3168
}
3169
assert df.set_sorted("a").flags == {
3170
"a": {"SORTED_ASC": True, "SORTED_DESC": False},
3171
"b": {"SORTED_ASC": False, "SORTED_DESC": False},
3172
}
3173
3174
3175
def test_interchange() -> None:
3176
df = pl.DataFrame({"a": [1, 2], "b": [3.0, 4.0], "c": ["foo", "bar"]})
3177
dfi = df.__dataframe__()
3178
3179
# Testing some random properties to make sure conversion happened correctly
3180
assert dfi.num_rows() == 2
3181
assert dfi.get_column(0).dtype[1] == 64
3182
assert dfi.get_column_by_name("c").get_buffers()["data"][0].bufsize == 6
3183
3184
3185
def test_from_dicts_undeclared_column_dtype() -> None:
3186
data = [{"a": 1, "b": 2}]
3187
result = pl.from_dicts(data, schema=["x"])
3188
assert result.schema == {"x": pl.Null}
3189
3190
3191
def test_from_dicts_with_override() -> None:
3192
data = [
3193
{"a": "1", "b": str(2**64 - 1), "c": "1"},
3194
{"a": "1", "b": "1", "c": "-5.0"},
3195
]
3196
override = {"a": pl.Int32, "b": pl.UInt64, "c": pl.Float32}
3197
result = pl.from_dicts(data, schema_overrides=override)
3198
assert_frame_equal(
3199
result,
3200
pl.DataFrame(
3201
{
3202
"a": pl.Series([1, 1], dtype=pl.Int32),
3203
"b": pl.Series([2**64 - 1, 1], dtype=pl.UInt64),
3204
"c": pl.Series([1.0, -5.0], dtype=pl.Float32),
3205
}
3206
),
3207
)
3208
3209
3210
def test_from_records_u64_12329() -> None:
3211
s = pl.from_records([{"a": 9908227375760408577}])
3212
assert s.dtypes == [pl.Int128]
3213
assert s["a"][0] == 9908227375760408577
3214
3215
3216
def test_negative_slice_12642() -> None:
3217
df = pl.DataFrame({"x": range(5)})
3218
assert_frame_equal(df.slice(-2, 1), df.tail(2).head(1))
3219
3220
3221
def test_iter_columns() -> None:
3222
df = pl.DataFrame({"a": [1, 1, 2], "b": [4, 5, 6]})
3223
iter_columns = df.iter_columns()
3224
assert_series_equal(next(iter_columns), pl.Series("a", [1, 1, 2]))
3225
assert_series_equal(next(iter_columns), pl.Series("b", [4, 5, 6]))
3226
3227
3228
def test_get_column_index() -> None:
3229
df = pl.DataFrame({"actual": [1001], "expected": [1000]})
3230
3231
assert df.get_column_index("actual") == 0
3232
assert df.get_column_index("expected") == 1
3233
3234
with pytest.raises(ColumnNotFoundError, match="missing"):
3235
df.get_column_index("missing")
3236
3237
3238
def test_dataframe_creation_with_different_series_lengths_19795() -> None:
3239
with pytest.raises(
3240
ShapeError,
3241
match=r"could not create a new DataFrame: height of column 'b' \(1\) does not match height of column 'a' \(2\)",
3242
):
3243
pl.DataFrame({"a": [1, 2], "b": [1]})
3244
3245
3246
def test_get_column_after_drop_20119() -> None:
3247
df = pl.DataFrame({"a": ["A"], "b": ["B"], "c": ["C"]})
3248
df.drop_in_place("a")
3249
c = df.get_column("c")
3250
assert_series_equal(c, pl.Series("c", ["C"]))
3251
3252
3253
def test_select_oob_row_20775() -> None:
3254
df = pl.DataFrame({"a": [1, 2, 3]})
3255
with pytest.raises(
3256
IndexError,
3257
match="index 99 is out of bounds for DataFrame of height 3",
3258
):
3259
df[99]
3260
3261
3262
@pytest.mark.parametrize("idx", [3, 99, -4, -99])
3263
def test_select_oob_element_20775_too_large(idx: int) -> None:
3264
df = pl.DataFrame({"a": [1, 2, 3]})
3265
with pytest.raises(
3266
IndexError,
3267
match=f"index {idx} is out of bounds for sequence of length 3",
3268
):
3269
df[idx, "a"]
3270
3271
3272
def test_nan_to_null() -> None:
3273
a = np.array([np.nan, 1])
3274
3275
df1 = pl.DataFrame(a, nan_to_null=True)
3276
df2 = pl.DataFrame(
3277
(a,),
3278
nan_to_null=True,
3279
)
3280
3281
assert_frame_equal(df1, df2)
3282
3283
3284
# Below 3 tests for https://github.com/pola-rs/polars/issues/17879
3285
3286
3287
def test_with_columns_dict_direct_typeerror() -> None:
3288
data = {"a": pl.col("a") * 2}
3289
df = pl.select(a=1)
3290
with pytest.raises(
3291
TypeError, match="Cannot pass a dictionary as a single positional argument"
3292
):
3293
df.with_columns(data)
3294
3295
3296
def test_with_columns_dict_unpacking() -> None:
3297
data = {"a": pl.col("a") * 2}
3298
df = pl.select(a=1).with_columns(**data)
3299
expected = pl.DataFrame({"a": [2]})
3300
assert df.equals(expected)
3301
3302
3303
def test_with_columns_generator_alias() -> None:
3304
data = {"a": pl.col("a") * 2}
3305
df = pl.select(a=1).with_columns(expr.alias(name) for name, expr in data.items())
3306
expected = pl.DataFrame({"a": [2]})
3307
assert df.equals(expected)
3308
3309