Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/py-polars/tests/unit/constructors/test_constructors.py
6939 views
1
from __future__ import annotations
2
3
from collections import OrderedDict, namedtuple
4
from datetime import date, datetime, time, timedelta, timezone
5
from decimal import Decimal
6
from random import shuffle
7
from typing import TYPE_CHECKING, Any, Literal, NamedTuple
8
from zoneinfo import ZoneInfo
9
10
import numpy as np
11
import pandas as pd
12
import pyarrow as pa
13
import pytest
14
from packaging.version import parse as parse_version
15
from pydantic import BaseModel, Field, TypeAdapter
16
17
import polars as pl
18
import polars.selectors as cs
19
from polars._utils.construction.utils import try_get_type_hints
20
from polars.datatypes import numpy_char_code_to_dtype
21
from polars.dependencies import dataclasses, pydantic
22
from polars.exceptions import DuplicateError, ShapeError
23
from polars.testing import assert_frame_equal, assert_series_equal
24
from tests.unit.utils.pycapsule_utils import PyCapsuleArrayHolder, PyCapsuleStreamHolder
25
26
if TYPE_CHECKING:
27
import sys
28
from collections.abc import Callable
29
30
from polars._typing import PolarsDataType
31
32
if sys.version_info >= (3, 11):
33
from typing import Self
34
else:
35
from typing_extensions import Self
36
37
from typing_extensions import assert_type
38
39
40
# -----------------------------------------------------------------------------------
41
# nested dataclasses, models, namedtuple classes (can't be defined inside test func)
42
# -----------------------------------------------------------------------------------
43
@dataclasses.dataclass
class _TestBazDC:
    """Innermost dataclass fixture (leaf level of the nested-object tests)."""

    d: datetime
    e: float
    f: str
48
49
50
@dataclasses.dataclass
class _TestBarDC:
    """Middle dataclass fixture; its ``c`` field holds a ``_TestBazDC``."""

    a: str
    b: int
    c: _TestBazDC
55
56
57
@dataclasses.dataclass
class _TestFooDC:
    """Outermost dataclass fixture; its ``y`` field holds a ``_TestBarDC``."""

    x: int
    y: _TestBarDC
61
62
63
class _TestBazPD(pydantic.BaseModel):
    """Innermost pydantic fixture (leaf level of the nested-object tests)."""

    d: datetime
    e: float
    f: str
67
68
69
class _TestBarPD(pydantic.BaseModel):
    """Middle pydantic fixture; its ``c`` field holds a ``_TestBazPD``."""

    a: str
    b: int
    c: _TestBazPD
73
74
75
class _TestFooPD(pydantic.BaseModel):
    """Outermost pydantic fixture; its ``y`` field holds a ``_TestBarPD``."""

    x: int
    y: _TestBarPD
78
79
80
class _TestBazNT(NamedTuple):
    """Innermost named-tuple fixture (leaf level of the nested-object tests)."""

    d: datetime
    e: float
    f: str
84
85
86
class _TestBarNT(NamedTuple):
    """Middle named-tuple fixture; its ``c`` field holds a ``_TestBazNT``."""

    a: str
    b: int
    c: _TestBazNT
90
91
92
class _TestFooNT(NamedTuple):
    """Outermost named-tuple fixture; its ``y`` field holds a ``_TestBarNT``."""

    x: int
    y: _TestBarNT
95
96
97
# --------------------------------------------------------------------------------
98
99
100
def test_init_dict() -> None:
    """Check dict-based construction: empty input, inference, schema handling."""
    # a dict without keys gives a frame without columns
    assert pl.DataFrame({}).shape == (0, 0)

    # keys without values: the columns exist and carry the Null dtype
    valueless = pl.DataFrame({"a": [], "b": []})
    assert valueless.shape == (0, 2)
    assert valueless.schema == {"a": pl.Null, "b": pl.Null}

    # a declared schema fixes names/dtypes, with or without (empty) data
    typed_frames = (
        pl.DataFrame({}, schema={"a": pl.Date, "b": pl.String}),
        pl.DataFrame({"a": [], "b": []}, schema={"a": pl.Date, "b": pl.String}),
    )
    for typed in typed_frames:
        assert typed.shape == (0, 2)
        assert typed.schema == {"a": pl.Date, "b": pl.String}

    # one row whose cells are empty lists
    nested_empty = pl.DataFrame({"a": [[]], "b": [[]]})
    assert nested_empty.schema == {"a": pl.List(pl.Null), "b": pl.List(pl.Null)}
    assert nested_empty.rows() == [([], [])]

    # an int column next to a float column
    mixed = pl.DataFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0]})
    assert mixed.shape == (3, 2)
    assert mixed.columns == ["a", "b"]
    assert mixed.dtypes == [pl.Int64, pl.Float64]

    downcast = pl.DataFrame(
        data={"a": [1, 2, 3], "b": [1.0, 2.0, 3.0]},
        schema=[("a", pl.Int8), ("b", pl.Float32)],
    )
    assert downcast.schema == {"a": pl.Int8, "b": pl.Float32}

    # column values supplied as a tuple
    assert pl.DataFrame({"a": (1, 2, 3), "b": [1.0, 2.0, 3.0]}).shape == (3, 2)

    # Date/Datetime columns, built from python objects and from raw integers
    py_dates = (date(2022, 12, 31),) * 2
    py_datetimes = (datetime(2022, 12, 31, 23, 59, 59),) * 2
    int_dates = [19357] * 2
    int_datetimes = [1672531199000000] * 2
    expected_rows = list(zip(py_dates, py_datetimes))

    temporal_cases = (
        # inferred first, then explicit (python types as well as polars dtypes)
        (py_dates, py_datetimes, None),
        (py_dates, py_datetimes, [("dt", date), ("dtm", datetime)]),
        (py_dates, py_datetimes, [("dt", pl.Date), ("dtm", pl.Datetime)]),
        (int_dates, int_datetimes, [("dt", date), ("dtm", datetime)]),
        (int_dates, int_datetimes, [("dt", pl.Date), ("dtm", pl.Datetime)]),
    )
    for dates, datetimes, coldefs in temporal_cases:
        temporal = pl.DataFrame(
            data={"dt": dates, "dtm": datetimes},
            schema=coldefs,
        )
        assert temporal.schema == {"dt": pl.Date, "dtm": pl.Datetime("us")}
        assert temporal.rows() == expected_rows

    # a schema of bare names renames the dict columns
    renamed = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, schema=["c", "d"])
    assert renamed.columns == ["c", "d"]

    # names with partial type info (allowed, but mypy doesn't like it)
    partially_typed = pl.DataFrame(
        {"a": [1, 2, 3], "b": [4, 5, 6]},
        schema=["c", ("d", pl.Int8)],
    )
    assert partially_typed.schema == {"c": pl.Int64, "d": pl.Int8}

    fully_typed = pl.DataFrame(
        {"a": [1, 2, 3], "b": [4, 5, 6]}, schema=[("c", pl.Int8), ("d", pl.Int16)]
    )
    assert fully_typed.schema == {"c": pl.Int8, "d": pl.Int16}

    # "empty" values nested inside an Object column come back unchanged
    for empty_val in [None, "", {}, []]:  # type: ignore[var-annotated]
        records = [{"field": {"sub_field": empty_val, "sub_field_2": 2}}]
        as_object = pl.DataFrame(records, schema={"field": pl.Object})
        assert as_object["field"][0] == records[0]["field"]
183
184
185
def test_error_string_dtypes() -> None:
    """Dtypes given as plain strings must be rejected with a TypeError."""
    expected_msg = "cannot parse input"

    # string dtypes inside a DataFrame schema
    with pytest.raises(TypeError, match=expected_msg):
        pl.DataFrame(
            data={"x": [1, 2], "y": [3, 4], "z": [5, 6]},
            schema={"x": "i16", "y": "i32", "z": "f32"},  # type: ignore[dict-item]
        )

    # string dtype passed to the Series constructor
    with pytest.raises(TypeError, match=expected_msg):
        pl.Series("n", [1, 2, 3], dtype="f32")  # type: ignore[arg-type]
194
195
196
def test_init_structured_objects() -> None:
    """Build frames from lists of dataclass, namedtuple and pydantic objects."""

    @dataclasses.dataclass
    class TradeDC:
        timestamp: datetime
        ticker: str
        price: Decimal
        size: int | None = None

    class TradePD(pydantic.BaseModel):
        timestamp: datetime
        ticker: str
        price: Decimal
        size: int

    class TradeNT(NamedTuple):
        timestamp: datetime
        ticker: str
        price: Decimal
        size: int | None = None

    columns = ["timestamp", "ticker", "price", "size"]
    raw_data = [
        (datetime(2022, 9, 8, 14, 30, 45), "AAPL", Decimal("157.5"), 125),
        (datetime(2022, 9, 9, 10, 15, 12), "FLSY", Decimal("10.0"), 1500),
        (datetime(2022, 9, 7, 15, 30), "MU", Decimal("55.5"), 400),
    ]

    # schemas expected from each construction variant below
    inferred_schema = {
        "timestamp": pl.Datetime("us"),
        "ticker": pl.String,
        "price": pl.Decimal(scale=1),
        "size": pl.Int64,
    }
    overridden_schema = {
        "timestamp": pl.Datetime("ms"),
        "ticker": pl.String,
        "price": pl.Decimal(scale=1),
        "size": pl.Int32,
    }
    renamed_schema = {
        "ts": pl.Datetime("ms"),
        "tk": pl.Categorical(ordering="lexical"),
        "pc": pl.Decimal(scale=1),
        "sz": pl.UInt16,
    }

    for TradeClass in (TradeDC, TradeNT, TradePD):
        trades = [TradeClass(**dict(zip(columns, values))) for values in raw_data]  # type: ignore[arg-type]

        for constructor in (pl.DataFrame, pl.from_records):
            # plain inference from the object fields
            inferred = constructor(data=trades)
            assert inferred.schema == inferred_schema
            assert inferred.rows() == raw_data

            # override the dtype of only some of the fields
            overridden = constructor(
                data=trades,
                schema_overrides={"timestamp": pl.Datetime("ms"), "size": pl.Int32},
            )
            assert overridden.schema == overridden_schema

        # a full schema both renames and downcasts every column
        renamed = pl.DataFrame(
            data=trades,
            schema=[
                ("ts", pl.Datetime("ms")),
                ("tk", pl.Categorical),
                ("pc", pl.Decimal(scale=1)),
                ("sz", pl.UInt16),
            ],
        )
        assert renamed.schema == renamed_schema
        assert renamed.rows() == raw_data

    # miscellaneous edge-case of the annotation lookup: NoneType has no hints
    assert try_get_type_hints(obj=type(None)) == {}
269
270
271
def test_init_pydantic_2x() -> None:
    """Build a frame from pydantic models that were validated from JSON."""

    class PageView(BaseModel):
        user_id: str
        ts: datetime = Field(alias=["ts", "$date"])  # type: ignore[literal-required, call-overload]
        path: str = Field("?", alias=["url", "path"])  # type: ignore[literal-required, call-overload]
        referer: str = Field("?", alias="referer")
        event: Literal["leave", "enter"] = Field("enter")
        time_on_page: int = Field(0, serialization_alias="top")

    data_json = """
    [{
        "user_id": "x",
        "ts": {"$date": "2021-01-01T00:00:00.000Z"},
        "url": "/latest/foobar",
        "referer": "https://google.com",
        "event": "enter",
        "top": 123
    }]
    """
    adapter: TypeAdapter[Any] = TypeAdapter(list[PageView])
    validated_models = adapter.validate_json(data_json)

    # columns are named after the model fields, not after the aliases
    expected = pl.DataFrame(
        {
            "user_id": ["x"],
            "ts": [datetime(2021, 1, 1, 0, 0)],
            "path": ["?"],
            "referer": ["https://google.com"],
            "event": ["enter"],
            "time_on_page": [0],
        }
    )
    assert_frame_equal(pl.DataFrame(validated_models), expected)
305
306
307
def test_init_structured_objects_unhashable() -> None:
    """Edge-case: namedtuple rows whose fields (dicts) are not hashable."""

    class Test(NamedTuple):
        dt: datetime
        info: dict[str, int]

    test_data = [
        Test(datetime(2017, 1, 1), {"a": 1, "b": 2}),
        Test(datetime(2017, 1, 2), {"a": 2, "b": 2}),
    ]
    frame = pl.DataFrame(test_data)

    # expected layout: a datetime[μs] column "dt" plus a two-field struct
    # column "info" holding {1,2} and {2,2}
    info_dtype = pl.Struct([pl.Field("a", pl.Int64), pl.Field("b", pl.Int64)])
    assert frame.schema == {
        "dt": pl.Datetime(time_unit="us", time_zone=None),
        "info": info_dtype,
    }
    assert frame.rows() == test_data
333
334
335
@pytest.mark.parametrize(
    ("foo", "bar", "baz"),
    [
        (_TestFooDC, _TestBarDC, _TestBazDC),
        (_TestFooPD, _TestBarPD, _TestBazPD),
        (_TestFooNT, _TestBarNT, _TestBazNT),
    ],
)
def test_init_structured_objects_nested(foo: Any, bar: Any, baz: Any) -> None:
    """Nested structured objects load as nested structs; schemas can override."""
    leaf = baz(d=datetime(2023, 4, 12, 10, 30), e=-10.5, f="world")
    data = [foo(x=100, y=bar(a="hello", b=800, c=leaf))]

    # inferred layout: column "x" (i64) and column "y" (struct[3]) whose
    # "c" field is itself a struct[3]
    inferred = pl.DataFrame(data)
    inferred_leaf = pl.Struct(
        [
            pl.Field("d", pl.Datetime("us")),
            pl.Field("e", pl.Float64),
            pl.Field("f", pl.String),
        ]
    )
    assert inferred.schema == {
        "x": pl.Int64,
        "y": pl.Struct(
            [
                pl.Field("a", pl.String),
                pl.Field("b", pl.Int64),
                pl.Field("c", inferred_leaf),
            ]
        ),
    }
    assert inferred.row(0) == (
        100,
        {
            "a": "hello",
            "b": 800,
            "c": {
                "d": datetime(2023, 4, 12, 10, 30),
                "e": -10.5,
                "f": "world",
            },
        },
    )

    # the same nested dtype must apply as `schema` and as `schema_overrides`
    override_leaf = pl.Struct(
        [
            pl.Field("d", pl.Datetime("ms")),
            pl.Field("e", pl.Float32),
            pl.Field("f", pl.String),
        ]
    )
    override_struct_schema: dict[str, PolarsDataType] = {
        "x": pl.Int16,
        "y": pl.Struct(
            [
                pl.Field("a", pl.String),
                pl.Field("b", pl.Int32),
                pl.Field(name="c", dtype=override_leaf),
            ]
        ),
    }
    for schema, schema_overrides in (
        (None, override_struct_schema),
        (override_struct_schema, None),
    ):
        nested = pl.DataFrame(data, schema=schema, schema_overrides=schema_overrides)
        flattened = nested.unnest("y").unnest("c")

        # flattened layout: x (i16), a (str), b (i32), d (datetime[ms]),
        # e (f32), f (str), all in a single row
        assert flattened.schema == {
            "x": pl.Int16,
            "a": pl.String,
            "b": pl.Int32,
            "d": pl.Datetime("ms"),
            "e": pl.Float32,
            "f": pl.String,
        }
        assert flattened.row(0) == (
            100,
            "hello",
            800,
            datetime(2023, 4, 12, 10, 30),
            -10.5,
            "world",
        )
449
450
451
def test_dataclasses_initvar_typing() -> None:
    """A dataclass with an ``InitVar`` loads, and the InitVar is not a column."""

    @dataclasses.dataclass
    class ABC:
        x: date
        y: float
        z: dataclasses.InitVar[list[str]] = None

    # the InitVar annotation has to be parseable...
    instance = ABC(x=date(1999, 12, 31), y=100.0)
    first_row = pl.DataFrame([instance]).rows(named=True)[0]

    # ...while the loaded row only carries what `asdict` reports
    assert dataclasses.asdict(instance) == first_row
464
465
466
@pytest.mark.parametrize(
    "nt",
    [
        namedtuple("TestData", ["id", "info"]),  # noqa: PYI024
        NamedTuple("TestData", [("id", int), ("info", str)]),
    ],
)
def test_collections_namedtuple(nt: type) -> None:
    """Named tuples work as frame rows and as the values of a dict column."""
    nt_data = [nt(1, "a"), nt(2, "b"), nt(3, "c")]

    # as rows: one column per tuple field
    from_rows = pl.DataFrame(nt_data)
    assert_frame_equal(
        from_rows, pl.DataFrame({"id": [1, 2, 3], "info": ["a", "b", "c"]})
    )

    # as column values: equal to building the column from plain dicts
    from_column = pl.DataFrame({"data": nt_data, "misc": ["x", "y", "z"]})
    expected = pl.DataFrame(
        {
            "data": [
                {"id": 1, "info": "a"},
                {"id": 2, "info": "b"},
                {"id": 3, "info": "c"},
            ],
            "misc": ["x", "y", "z"],
        }
    )
    assert_frame_equal(from_column, expected)
492
493
494
def test_init_ndarray() -> None:
    """Build frames from 1D/2D numpy arrays with various schema/orient inputs."""
    # empty array -> empty frame
    assert_frame_equal(pl.DataFrame(np.array([])), pl.DataFrame())

    # 1D array with a column name
    named_1d = pl.DataFrame(np.array([1, 2, 3], dtype=np.int64), schema=["a"])
    assert_frame_equal(named_1d, pl.DataFrame({"a": [1, 2, 3]}))

    # 1D array with a column name and dtype
    typed_1d = pl.DataFrame(np.array([1, 2, 3]), schema=[("a", pl.Int32)])
    expected = pl.DataFrame({"a": [1, 2, 3]}).with_columns(pl.col("a").cast(pl.Int32))
    assert_frame_equal(typed_1d, expected)

    # 2D array (or two 1D arrays) with an explicit column orientation
    column_inputs = (
        np.array([[1, 2], [3, 4]], dtype=np.int64),
        [np.array([1, 2], dtype=np.int64), np.array([3, 4], dtype=np.int64)],
    )
    for data in column_inputs:
        by_column = pl.DataFrame(data, orient="col")
        expected = pl.DataFrame({"column_0": [1, 2], "column_1": [3, 4]})
        assert_frame_equal(by_column, expected)

    # plain lists given as rows, including an all-null row
    by_row = pl.DataFrame([[1, 2.0, "a"], [None, None, None]], orient="row")
    expected = pl.DataFrame(
        {"column_0": [1, None], "column_1": [2.0, None], "column_2": ["a", None]}
    )
    assert_frame_equal(by_row, expected)

    by_row_typed = pl.DataFrame(
        data=[[1, 2.0, "a"], [None, None, None]],
        schema=[("x", pl.Boolean), ("y", pl.Int32), "z"],
        orient="row",
    )
    assert by_row_typed.rows() == [(True, 2, "a"), (None, None, None)]
    assert by_row_typed.schema == {"x": pl.Boolean, "y": pl.Int32, "z": pl.String}

    # 2D array without orient/schema: each inner array becomes a row
    default_2d = pl.DataFrame(np.array([[1, 2], [3, 4]], dtype=np.int64))
    expected = pl.DataFrame({"column_0": [1, 3], "column_1": [2, 4]})
    assert_frame_equal(default_2d, expected)

    # no orientation given: the numpy shape is kept
    assert pl.DataFrame(np.ones((3, 1), dtype=np.int64)).shape == (3, 1)

    # 2D array: three names for a (2, 3) array -> row orientation inferred
    inferred_rows = pl.DataFrame(
        np.array([[1, 2, 3], [4, 5, 6]], dtype=np.int64), schema=["a", "b", "c"]
    )
    expected = pl.DataFrame({"a": [1, 4], "b": [2, 5], "c": [3, 6]})
    assert_frame_equal(inferred_rows, expected)

    # 2D array: two names for a (2, 3) array -> column orientation inferred
    inferred_cols = pl.DataFrame(
        np.array([[1, 2, 3], [4, 5, 6]], dtype=np.int64), schema=["a", "b"]
    )
    expected = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
    assert_frame_equal(inferred_cols, expected)

    # 2D array with a single-column schema: an (n, 1) array gives a flat column
    single_col = pl.DataFrame(
        np.arange(4).reshape(-1, 1).astype(np.int64), schema=["a"]
    )
    assert_frame_equal(single_col, pl.DataFrame({"a": [0, 1, 2, 3]}))
    assert np.array_equal(
        single_col.to_numpy(), np.arange(4).reshape(-1, 1).astype(np.int64)
    )

    # ...while an (n, 2) array gives one fixed-size Array column
    array_col = pl.DataFrame(
        np.arange(4).reshape(-1, 2).astype(np.int64), schema=["a"]
    )
    assert_frame_equal(
        array_col,
        pl.DataFrame(
            {"a": [[0, 1], [2, 3]]}, schema={"a": pl.Array(pl.Int64, shape=2)}
        ),
    )

    # 2D numpy arrays used as dict values
    one_row = pl.DataFrame({"a": np.arange(5, dtype=np.int64).reshape(1, -1)})
    assert one_row.dtypes == [pl.Array(pl.Int64, shape=5)]
    assert one_row.shape == (1, 1)

    two_rows = pl.DataFrame({"a": np.arange(10, dtype=np.int64).reshape(2, -1)})
    assert two_rows.dtypes == [pl.Array(pl.Int64, shape=5)]
    assert two_rows.shape == (2, 1)
    assert two_rows.rows() == [([0, 1, 2, 3, 4],), ([5, 6, 7, 8, 9],)]

    # a list of 1D arrays as rows matches the same rows given as tuples
    test_rows = [(1, 2), (3, 4)]
    from_arrays = pl.DataFrame([np.array(row) for row in test_rows], orient="row")
    assert_frame_equal(from_arrays, pl.DataFrame(test_rows, orient="row"))

    # round trip: export to numpy, then init again
    for shape in ((4, 4), (4, 8), (8, 4)):
        names = [f"c{i}" for i in range(shape[1])]
        original = pl.DataFrame(np.ones(shape=shape, dtype=np.float64), schema=names)
        assert_frame_equal(original, pl.DataFrame(np.asarray(original), schema=names))
589
590
591
def test_init_ndarray_errors() -> None:
    """Invalid ndarray/schema/orient combinations must raise ValueError."""
    matrix = [[1, 2, 3], [4, 5, 6]]

    # 2D array: row orientation conflicts with the number of column names
    with pytest.raises(ValueError):
        pl.DataFrame(np.array(matrix), schema=["a", "b"], orient="row")

    # same conflict, with typed column definitions
    with pytest.raises(ValueError):
        pl.DataFrame(
            np.array(matrix),
            schema=[("a", pl.UInt32), ("b", pl.UInt32)],
            orient="row",
        )

    # unknown orient value
    with pytest.raises(ValueError):
        pl.DataFrame(
            np.array(matrix),
            orient="wrong",  # type: ignore[arg-type]
        )

    # data given together with an empty schema
    with pytest.raises(ValueError):
        _ = pl.DataFrame(np.array([1, 2, 3]), schema=[])

    # 3D arrays cannot be used to init a frame
    with pytest.raises(ValueError):
        _ = pl.DataFrame(np.random.randn(2, 2, 2))
617
618
619
def test_init_ndarray_nan() -> None:
    """NaN in numpy input stays NaN unless ``nan_to_null=True`` is passed."""
    nan = float("nan")

    # frames: python floats vs numpy arrays, then with NaN -> null conversion
    from_python = pl.DataFrame(
        data={"x": [1.0, 2.5, nan], "y": [4.0, nan, 6.5]},
    )
    from_numpy = pl.DataFrame(
        data={"x": np.array([1.0, 2.5, np.nan]), "y": np.array([4.0, np.nan, 6.5])},
    )
    with_nulls = pl.DataFrame(
        data={"x": np.array([1.0, 2.5, np.nan]), "y": np.array([4.0, np.nan, 6.5])},
        nan_to_null=True,
    )
    assert_frame_equal(from_python, from_numpy)
    assert with_nulls.rows() == [(1.0, 4.0), (2.5, None), (None, 6.5)]

    # the same three variants for Series
    s_python = pl.Series("n", [1.0, 2.5, nan])
    s_numpy = pl.Series("n", np.array([1.0, 2.5, nan]))
    s_nulls = pl.Series("n", np.array([1.0, 2.5, nan]), nan_to_null=True)

    assert_series_equal(s_python, s_numpy)
    assert s_nulls.to_list() == [1.0, 2.5, None]
640
641
642
def test_init_ndarray_square() -> None:
    """Square 2D arrays: convention is kept and memory order breaks the tie."""
    # first axis = rows, both with and without an explicit schema
    arr = np.arange(4).reshape(2, 2)
    expected_rows = [(0, 1), (2, 3)]
    assert pl.DataFrame(arr).rows() == expected_rows
    assert pl.DataFrame(arr, schema=["a", "b"]).rows() == expected_rows

    # with a schema, square arrays are tie-broken on fortran vs c-contiguous
    # (row/col major) memory order
    c_ordered = np.array([[1, 2], [3, 4]], dtype=np.int64, order="C")
    df_c = pl.DataFrame(data=c_ordered, schema=["x", "y"])
    assert_frame_equal(df_c, pl.DataFrame({"x": [1, 3], "y": [2, 4]}))

    f_ordered = np.array([[1, 2], [3, 4]], dtype=np.int64, order="F")
    df_f = pl.DataFrame(data=f_ordered, schema=["x", "y"])
    assert_frame_equal(df_f, pl.DataFrame({"x": [1, 2], "y": [3, 4]}))
663
664
665
def test_init_numpy_unavailable(monkeypatch: Any) -> None:
    """With the numpy check patched to fail, ndarray input raises TypeError."""
    # make the frame module's numpy check report False for any input
    monkeypatch.setattr(pl.dataframe.frame, "_check_for_numpy", lambda x: False)

    ndarray_input = np.array([1, 2, 3])
    with pytest.raises(TypeError):
        pl.DataFrame(ndarray_input, schema=["a"])
669
670
671
def test_init_numpy_scalars() -> None:
    """Lists of numpy scalars keep their numpy-derived dtypes."""
    scalar_columns = {
        "bool": [np.bool_(True), np.bool_(False)],
        "i8": [np.int8(16), np.int8(64)],
        "u32": [np.uint32(1234), np.uint32(9876)],
    }
    result = pl.DataFrame(scalar_columns)

    # reference frame built from python values plus an explicit schema
    expected_schema = OrderedDict(
        [("bool", pl.Boolean), ("i8", pl.Int8), ("u32", pl.UInt32)]
    )
    df_expected = pl.from_records(
        data=[(True, 16, 1234), (False, 64, 9876)],
        schema=expected_schema,
        orient="row",
    )
    assert_frame_equal(result, df_expected)
685
686
687
def test_null_array_print_format() -> None:
    """An all-null arrow table loads as a Null column and prints as a table."""
    pa_tbl_null = pa.table({"a": [None, None]})
    df_null = pl.from_arrow(pa_tbl_null)
    assert df_null.shape == (2, 1)
    assert df_null.dtypes == [pl.Null]  # type: ignore[union-attr]
    assert df_null.rows() == [(None,), (None,)]  # type: ignore[union-attr]

    # Every rendered row has to be as wide as the border ("┌──────┐"), so the
    # header cells are padded to the width of the widest cell ("null").
    assert (
        str(df_null) == "shape: (2, 1)\n"
        "┌──────┐\n"
        "│ a    │\n"
        "│ ---  │\n"
        "│ null │\n"
        "╞══════╡\n"
        "│ null │\n"
        "│ null │\n"
        "└──────┘"
    )
705
706
707
def test_init_arrow() -> None:
    """Build frames from pyarrow tables: unnamed columns, renames, bad schema."""
    # a column keyed by None ends up named "None"
    unnamed = pl.DataFrame(pa.table({"a": [1, 2], None: [3, 4]}))
    assert_frame_equal(unnamed, pl.DataFrame({"a": [1, 2], "None": [3, 4]}))

    # a schema of names renames the table columns
    renamed = pl.DataFrame(pa.table({"a": [1, 2], "b": [3, 4]}), schema=["c", "d"])
    assert_frame_equal(renamed, pl.DataFrame({"c": [1, 2], "d": [3, 4]}))

    # a schema of (name, dtype) pairs renames and casts
    recast = pl.DataFrame(
        pa.table({"a": [1, 2], None: [3, 4]}),
        schema=[("c", pl.Int32), ("d", pl.Float32)],
    )
    assert recast.schema == {"c": pl.Int32, "d": pl.Float32}
    assert recast.rows() == [(1, 3.0), (2, 4.0)]

    # more schema names than table columns is an error
    two_column_table = pa.table({"a": [1, 2, 3], "b": [4, 5, 6]})
    with pytest.raises(ValueError):
        pl.DataFrame(two_column_table, schema=["c", "d", "e"])
728
729
730
def test_init_arrow_dupes() -> None:
    """An arrow table with duplicate column names raises DuplicateError."""
    int_type = pa.int32()
    decimal_type = pa.decimal128(38, 10)

    # the decimal column is included as this triggers a panic during
    # construction alongside duplicate fields
    arrays = [
        pa.array([1, 2, 3], type=int_type),
        pa.array([4, 5, 6], type=int_type),
        pa.array([7, 8, 9], type=decimal_type),
    ]
    duplicated_schema = pa.schema(
        [("col", pa.int32()), ("col", pa.int32()), ("col3", pa.decimal128(38, 10))]
    )
    tbl = pa.Table.from_arrays(arrays=arrays, schema=duplicated_schema)

    with pytest.raises(
        DuplicateError,
        match=r"""column appears more than once; names must be unique: \["col"\]""",
    ):
        pl.DataFrame(tbl)
748
749
750
def test_init_from_frame() -> None:
    """Build frames (and series) from an existing DataFrame."""
    base = pl.DataFrame({"id": [0, 1], "misc": ["a", "b"], "val": [-10, 10]})
    assert_frame_equal(base, pl.DataFrame(base))

    # rename the columns via schema
    renamed = pl.DataFrame(base, schema=["a", "b", "c"])
    assert_frame_equal(renamed, pl.DataFrame(renamed))

    # rename, and override a dtype addressed by the original column name
    recast = pl.DataFrame(
        base, schema=["a", "b", "c"], schema_overrides={"val": pl.Int8}
    )
    assert_frame_equal(recast, pl.DataFrame(recast))

    assert base.schema == {"id": pl.Int64, "misc": pl.String, "val": pl.Int64}
    assert renamed.schema == {"a": pl.Int64, "b": pl.String, "c": pl.Int64}
    assert recast.schema == {"a": pl.Int64, "b": pl.String, "c": pl.Int8}
    assert base.rows() == renamed.rows() == recast.rows()

    # series built from a frame: explicit name vs. default empty name
    named_series = pl.Series("s", recast)
    unnamed_series = pl.Series(recast)

    assert named_series.name == "s"
    assert unnamed_series.name == ""
770
771
772
def test_init_series() -> None:
    """Build frames from Series inputs, and Series from nested/frame inputs."""
    ab_expected = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})

    # list of named Series
    from_list = pl.DataFrame([pl.Series("a", [1, 2, 3]), pl.Series("b", [4, 5, 6])])
    assert_frame_equal(from_list, ab_expected)

    # tuple of named Series
    from_tuple = pl.DataFrame((pl.Series("a", (1, 2, 3)), pl.Series("b", (4, 5, 6))))
    assert_frame_equal(from_tuple, ab_expected)

    # tuple of named Series, renamed and cast through the schema
    recast = pl.DataFrame(
        (pl.Series("a", (1, 2, 3)), pl.Series("b", (4, 5, 6))),
        schema=[("x", pl.Float64), ("y", pl.Float64)],
    )
    assert recast.schema == {"x": pl.Float64, "y": pl.Float64}
    assert recast.rows() == [(1.0, 4.0), (2.0, 5.0), (3.0, 6.0)]

    # list of unnamed Series: columns get "column_<i>" names
    unnamed = pl.DataFrame([pl.Series([1, 2, 3]), pl.Series([4, 5, 6])])
    col0 = pl.Series("column_0", [1, 2, 3])
    col1 = pl.Series("column_1", [4, 5, 6])
    assert_frame_equal(unnamed, pl.DataFrame([col0, col1]))

    unnamed_floats = pl.DataFrame([pl.Series([0.0]), pl.Series([1.0])])
    assert unnamed_floats.schema == {"column_0": pl.Float64, "column_1": pl.Float64}
    assert unnamed_floats.rows() == [(0.0, 1.0)]

    unnamed_recast = pl.DataFrame(
        [pl.Series([None]), pl.Series([1.0])],
        schema=[("x", pl.Date), ("y", pl.Boolean)],
    )
    assert unnamed_recast.schema == {"x": pl.Date, "y": pl.Boolean}
    assert unnamed_recast.rows() == [(None, True)]

    # a single Series
    single = pl.DataFrame(pl.Series("a", [1, 2, 3]))
    assert single.schema == {"a": pl.Int64}
    assert_frame_equal(single, pl.DataFrame({"a": [1, 2, 3]}))

    single_typed = pl.DataFrame(pl.Series("a", [1, 2, 3]), schema=[("a", pl.UInt32)])
    assert single_typed.rows() == [(1,), (2,), (3,)]
    assert single_typed.schema == {"a": pl.UInt32}

    # nested list, without and with an explicit dtype
    assert pl.Series([[[2, 2]]]).dtype == pl.List(pl.List(pl.Int64))

    nested_dtype = pl.List(pl.List(pl.UInt8))
    assert pl.Series([[[2, 2]]], dtype=pl.List(pl.List(pl.UInt8))).dtype == nested_dtype

    # an empty nested Series, and a Series built from it
    empty_nested = pl.Series("x", dtype=nested_dtype)
    for s in (empty_nested, pl.Series(empty_nested)):
        assert s.dtype == nested_dtype
        assert s.to_list() == []
        assert s.name == "x"

    # a Series built from a single-column frame, with a dtype override
    from_frame = pl.Series("", single_typed, dtype=pl.Int8)
    assert_series_equal(from_frame, pl.Series("", [1, 2, 3], dtype=pl.Int8))
834
835
836
@pytest.mark.parametrize(
    ("dtype", "expected_dtype"),
    [
        (int, pl.Int64),
        (bytes, pl.Binary),
        (float, pl.Float64),
        (str, pl.String),
        (date, pl.Date),
        (time, pl.Time),
        (datetime, pl.Datetime("us")),
        (timedelta, pl.Duration("us")),
        (Decimal, pl.Decimal(precision=None, scale=0)),
    ],
)
def test_init_py_dtype(dtype: Any, expected_dtype: PolarsDataType) -> None:
    """Python types given as dtypes map onto the expected polars dtypes."""
    # Series: a single null value, then no values at all
    series_variants = (
        pl.Series("s", [None], dtype=dtype),
        pl.Series("s", [], dtype=dtype),
    )
    for series in series_variants:
        assert series.dtype == expected_dtype

    # DataFrame: the same two inputs, with the python type in the schema
    frame_variants = (
        pl.DataFrame({"col": [None]}, schema={"col": dtype}),
        pl.DataFrame({"col": []}, schema={"col": dtype}),
    )
    for frame in frame_variants:
        assert frame.schema == {"col": expected_dtype}
862
863
864
def test_init_py_dtype_misc_float() -> None:
    """The python ``float`` type as a dtype yields Float64 for int/float/None."""
    int_as_float = pl.Series([100], dtype=float)  # type: ignore[arg-type]
    assert int_as_float.dtype == pl.Float64

    # float, int and null inputs, all declared as python float
    frame = pl.DataFrame(
        {"x": [100.0], "y": [200], "z": [None]},
        schema={"x": float, "y": float, "z": float},
    )
    assert frame.schema == {"x": pl.Float64, "y": pl.Float64, "z": pl.Float64}
    assert frame.rows() == [(100.0, 200.0, None)]
873
874
875
def test_init_seq_of_seq() -> None:
    """Build frames from nested sequences (lists of lists, tuples of tuples)."""
    # List of lists, given as rows
    df = pl.DataFrame([[1, 2, 3], [4, 5, 6]], schema=["a", "b", "c"], orient="row")
    expected = pl.DataFrame({"a": [1, 4], "b": [2, 5], "c": [3, 6]})
    assert_frame_equal(df, expected)

    df = pl.DataFrame(
        [[1, 2, 3], [4, 5, 6]],
        schema=[("a", pl.Int8), ("b", pl.Int16), ("c", pl.Int32)],
        orient="row",
    )
    assert df.schema == {"a": pl.Int8, "b": pl.Int16, "c": pl.Int32}
    assert df.rows() == [(1, 2, 3), (4, 5, 6)]

    # Tuple of tuples, default to column orientation
    df = pl.DataFrame(((1, 2, 3), (4, 5, 6)))
    expected = pl.DataFrame({"column_0": [1, 2, 3], "column_1": [4, 5, 6]})
    assert_frame_equal(df, expected)

    # Row orientation
    df = pl.DataFrame(((1, 2), (3, 4)), schema=("a", "b"), orient="row")
    expected = pl.DataFrame({"a": [1, 3], "b": [2, 4]})
    assert_frame_equal(df, expected)

    df = pl.DataFrame(
        ((1, 2), (3, 4)), schema=(("a", pl.Float32), ("b", pl.Float32)), orient="row"
    )
    assert df.schema == {"a": pl.Float32, "b": pl.Float32}
    assert df.rows() == [(1.0, 2.0), (3.0, 4.0)]

    # Wrong orient value: the constructor is expected to raise, so its result
    # is deliberately not bound to a name
    with pytest.raises(ValueError):
        pl.DataFrame(((1, 2), (3, 4)), orient="wrong")  # type: ignore[arg-type]
908
909
910
def test_init_1d_sequence() -> None:
    """Build single-column frames from flat sequences."""
    # empty list -> empty frame
    assert_frame_equal(pl.DataFrame([]), pl.DataFrame())

    # strings, as a python list and as a numpy array
    data = ["a", "b", "c"]
    for strings in (data, np.array(data)):
        from_strings = pl.DataFrame(strings, schema=["s"])
        assert_frame_equal(from_strings, pl.DataFrame({"s": data}))

    # None/bool values with an Int8 column definition
    bools_as_int = pl.DataFrame([None, True, False], schema=[("xx", pl.Int8)])
    assert bools_as_int.schema == {"xx": pl.Int8}
    assert bools_as_int.rows() == [(None,), (1,), (0,)]

    # a plain string is treated as a sequence of characters
    from_str = pl.DataFrame("abc", schema=["s"])
    assert_frame_equal(from_str, pl.DataFrame({"s": ["a", "b", "c"]}))

    # datetimes: the declared time unit is applied, and any tzinfo on the
    # values determines the time zone of the resulting dtype
    ms_schema = {"ts": pl.Datetime("ms")}

    naive = pl.DataFrame([datetime(2020, 1, 1)], schema=ms_schema)
    assert naive.schema == {"ts": pl.Datetime("ms")}

    utc = pl.DataFrame([datetime(2020, 1, 1, tzinfo=timezone.utc)], schema=ms_schema)
    assert utc.schema == {"ts": pl.Datetime("ms", "UTC")}

    fixed_offset = pl.DataFrame(
        [datetime(2020, 1, 1, tzinfo=timezone(timedelta(hours=1)))],
        schema=ms_schema,
    )
    assert fixed_offset.schema == {"ts": pl.Datetime("ms", "UTC")}

    named_zone = pl.DataFrame(
        [datetime(2020, 1, 1, tzinfo=ZoneInfo("Asia/Kathmandu"))],
        schema=ms_schema,
    )
    assert named_zone.schema == {"ts": pl.Datetime("ms", "Asia/Kathmandu")}
948
949
950
def test_init_pandas(monkeypatch: Any) -> None:
    """Build frames from pandas objects, then with the pandas check disabled."""
    pandas_df = pd.DataFrame([[1, 2], [3, 4]], columns=[1, 2])

    # integer column names become strings
    expected = pl.DataFrame({"1": [1, 3], "2": [2, 4]})
    converted = pl.DataFrame(pandas_df)
    assert_frame_equal(converted, expected)
    assert converted.schema == {"1": pl.Int64, "2": pl.Int64}

    # override column names and types
    recast = pl.DataFrame(pandas_df, schema=[("x", pl.Float64), ("y", pl.Float64)])
    assert recast.schema == {"x": pl.Float64, "y": pl.Float64}
    assert recast.rows() == [(1.0, 2.0), (3.0, 4.0)]

    # subclassed pandas object, with/without data & overrides
    # type error fixed in pandas-stubs 2.3.0.250703, which doesn't support Python3.9
    class XSeries(pd.Series):  # type: ignore[type-arg, unused-ignore]
        @property
        def _constructor(self) -> type:
            return XSeries

    empty_subclassed = pl.DataFrame(
        data=[
            XSeries(name="x", data=[], dtype=np.dtype("<M8[ns]")),
            XSeries(name="y", data=[], dtype=np.dtype("f8")),
            XSeries(name="z", data=[], dtype=np.dtype("?")),
        ],
    )
    assert empty_subclassed.schema == {
        "x": pl.Datetime("ns"),
        "y": pl.Float64,
        "z": pl.Boolean,
    }
    assert empty_subclassed.rows() == []

    timestamp = datetime(2022, 10, 31, 10, 30, 45, 123456)
    filled_subclassed = pl.DataFrame(
        data=[
            XSeries(
                name="x",
                data=[datetime(2022, 10, 31, 10, 30, 45, 123456)],
                dtype=np.dtype("<M8[ns]"),
            )
        ],
        schema={"colx": pl.Datetime("us")},
    )
    assert filled_subclassed.schema == {"colx": pl.Datetime("us")}
    assert filled_subclassed.rows() == [(timestamp,)]

    # pandas is not available: make the frame module's check report False
    monkeypatch.setattr(pl.dataframe.frame, "_check_for_pandas", lambda x: False)

    # pandas 2.2 and higher implement the Arrow PyCapsule Interface, so the
    # constructor will still work even without using pandas APIs
    supports_pycapsule = parse_version(pd.__version__) >= parse_version("2.2.0")
    if not supports_pycapsule:
        with pytest.raises(TypeError):
            pl.DataFrame(pandas_df)
    else:
        assert_frame_equal(pl.DataFrame(pandas_df), expected)
1006
1007
1008
def test_init_errors() -> None:
    """Malformed constructor input raises the expected exception type."""
    # columns of different lengths
    ragged = {"a": [1, 2, 3], "b": [1.0, 2.0, 3.0, 4.0]}
    with pytest.raises(ShapeError):
        pl.DataFrame(ragged)

    # more column names than the data has dimensions for
    with pytest.raises(ShapeError):
        pl.DataFrame([[1, 2], [3, 4]], schema=["a", "b", "c"])

    # input of a type no constructor path matches
    with pytest.raises(TypeError):
        pl.DataFrame(0)
1020
1021
1022
def test_init_records() -> None:
    """Build frames from lists of dicts, with and without an explicit schema."""
    records = [
        {"a": 1, "b": 2},
        {"b": 1, "a": 2},
        {"a": 1, "b": 2},
    ]
    frame = pl.DataFrame(records)
    assert_frame_equal(frame, pl.DataFrame({"a": [1, 2, 1], "b": [2, 1, 2]}))
    # converting back to dicts reproduces the input records
    assert frame.to_dicts() == records

    # schema names with no matching key in the records come back all-null
    frame_acd = pl.DataFrame(records, schema=["a", "c", "d"])
    assert frame_acd.to_dict(as_series=False) == {
        "a": [1, 2, 1],
        "c": [None, None, None],
        "d": [None, None, None],
    }

    record = {"a": 1, "b": 2, "c": 3}

    single = pl.from_dicts([record])
    assert single.columns == ["a", "b", "c"]

    # column names can be reassigned through the ``columns`` setter
    single.columns = ["x", "y", "z"]
    assert single.columns == ["x", "y", "z"]

    # the declared schema determines the column order
    reordered = pl.from_dicts([record], schema=["c", "b", "a"])
    assert reordered.columns == ["c", "b", "a"]

    # a single-name schema loads only that key
    for key in ("c", "b", "a"):
        picked = pl.from_dicts([record], schema=[key])
        assert picked.to_dict(as_series=False) == {key: [record[key]]}
1056
1057
1058
def test_init_records_schema_order() -> None:
    """Values from dict records must land in the column the schema names."""
    column_names: list[str] = ["a", "b", "c", "d"]
    records: list[dict[str, int]] = [
        {"c": 3, "b": 2, "a": 1},
        {"b": 2, "d": 4},
        {},
        {"a": 1, "b": 2, "c": 3},
        {"d": 4, "b": 2, "a": 1},
        {"c": 3, "b": 2},
    ]
    # the only non-null value each column is allowed to contain
    value_for = {"a": 1, "b": 2, "c": 3, "d": 4, "e": None}

    def check_values(frame: pl.DataFrame) -> None:
        # every cell is either null or the value tied to its column name
        for name in frame.columns:
            allowed = (None, value_for[name])
            assert all(cell in allowed for cell in frame[name].to_list())

    for build in (pl.from_dicts, pl.DataFrame):
        # ensure field values are loaded according to the declared schema order
        for _ in range(8):
            shuffle(records)
            shuffle(column_names)
            check_values(build(records, schema=column_names))

        # have schema override inferred types, omit some columns, add a new one
        typed_schema = {"a": pl.Int8, "c": pl.Int16, "e": pl.Int32}
        typed = build(records, schema=typed_schema)

        assert typed.schema == typed_schema
        check_values(typed)
1087
1088
1089
def test_init_only_columns() -> None:
    """Construct frames from a schema alone, with no data or with empty data."""
    untyped = pl.DataFrame(schema=["a", "b", "c"])
    assert_frame_equal(untyped, pl.DataFrame({"a": [], "b": [], "c": []}))

    # Validate construction with various flavours of no/empty data
    no_data: Any
    for no_data in (None, {}, []):
        df = pl.DataFrame(
            data=no_data,
            schema=[
                ("a", pl.Date),
                ("b", pl.UInt64),
                ("c", pl.Int8),
                ("d", pl.List(pl.UInt8)),
            ],
        )

        # reference frame: cast three empty columns, then add "d" at index 3
        casts = [
            pl.col("a").cast(pl.Date),
            pl.col("b").cast(pl.UInt64),
            pl.col("c").cast(pl.Int8),
        ]
        expected = pl.DataFrame({"a": [], "b": [], "c": []}).with_columns(*casts)
        expected.insert_column(3, pl.Series("d", [], pl.List(pl.UInt8)))

        assert df.shape == (0, 4)
        assert_frame_equal(df, expected)
        assert df.dtypes == [pl.Date, pl.UInt64, pl.Int8, pl.List]

        list_dtype_matches = pl.List(pl.UInt8).is_(df.schema["d"])
        assert list_dtype_matches

        # static-typing check only; never executed at runtime
        if TYPE_CHECKING:
            assert_type(pl.List(pl.UInt8).is_(df.schema["d"]), bool)

        # clearing keeps the schema and the (already zero-row) shape
        cleared = df.clear()
        assert len(cleared) == 0
        assert df.schema == cleared.schema
        assert cleared.shape == df.shape
1125
1126
1127
def test_from_dicts_list_without_dtype() -> None:
    """List values, including a list holding only None, load without dtype hints."""
    records = [{"id": 1, "hint": ["some_text_here"]}, {"id": 2, "hint": [None]}]
    loaded = pl.from_dicts(records)
    by_column = pl.DataFrame({"id": [1, 2], "hint": [["some_text_here"], [None]]})
    assert_frame_equal(loaded, by_column)
1133
1134
1135
def test_from_dicts_list_struct_without_inner_dtype() -> None:
    """A list-of-struct column with an all-null second row round-trips via to_dict."""

    def build_columns() -> dict[str, list[Any]]:
        # fresh literal on each call so input and expectation share no objects
        return {
            "users": [
                [{"category": "A"}, {"category": "B"}],
                [{"category": None}, {"category": None}],
            ],
            "days_of_week": [1, 2],
        }

    frame = pl.DataFrame(build_columns())
    assert frame.to_dict(as_series=False) == build_columns()
1153
1154
1155
def test_from_dicts_list_struct_without_inner_dtype_5611() -> None:
    """An empty list followed by a list of structs loads the same as column input."""
    records = [
        {"a": []},
        {"a": [{"b": 1}]},
    ]
    from_records = pl.from_dicts(records)
    from_columns = pl.DataFrame({"a": [[], [{"b": 1}]]})
    assert_frame_equal(from_records, from_columns)
1164
1165
1166
def test_from_dict_upcast_primitive() -> None:
    """With ``strict=False``, columns mixing ints and floats come out as Float64."""
    mixed = {"a": [1, 2.1, 3], "b": [4, 5, 6.4]}
    frame = pl.from_dict(mixed, strict=False)
    assert frame.dtypes == [pl.Float64, pl.Float64]
1169
1170
1171
def test_u64_lit_5031() -> None:
    """Compare a UInt64 column against a literal just below 2**64."""
    frame = pl.DataFrame({"foo": [1, 2, 3]}).with_columns(
        pl.col("foo").cast(pl.UInt64)
    )
    upper_bound = (1 << 64) - 20
    kept = frame.filter(pl.col("foo") < upper_bound)
    # all three rows are below the bound
    assert kept.shape == (3, 1)
    assert frame["foo"].to_list() == [1, 2, 3]
1175
1176
1177
def test_from_dicts_missing_columns() -> None:
    """Keys missing from a record become nulls; a partial schema limits the columns."""
    # missing columns from some of the data dicts
    sparse_records = [{"a": 1}, {"b": 2}]
    assert_frame_equal(
        pl.from_dicts(sparse_records),
        pl.DataFrame({"a": [1, None], "b": [None, 2]}),
    )

    # partial schema with some columns missing; only load the declared keys
    full_records = [{"a": 1, "b": 2}]
    assert_frame_equal(
        pl.from_dicts(full_records, schema=["a"]),
        pl.DataFrame({"a": [1]}),
    )
1189
1190
1191
def test_from_dicts_schema_columns_do_not_match() -> None:
    """A schema name found in no record produces a single null column."""
    records = [{"a": 1, "b": 2}]
    loaded = pl.from_dicts(records, schema=["x"])
    assert_frame_equal(loaded, pl.DataFrame({"x": [None]}))
1196
1197
1198
def test_from_dicts_infer_integer_types() -> None:
    """Check integer dtype inference at the signed 8/16/32/64/128-bit maxima."""
    record = {
        "a": 2**7 - 1,
        "b": 2**15 - 1,
        "c": 2**31 - 1,
        "d": 2**63 - 1,
        "e": 2**127 - 1,
    }
    inferred = pl.from_dicts([record]).schema

    # all values inferred as i64 except for values too large for i64
    expected = dict.fromkeys(("a", "b", "c", "d"), pl.Int64)
    expected["e"] = pl.Int128
    assert inferred == expected

    # one past the largest value accepted above
    with pytest.raises(OverflowError):
        pl.from_dicts([{"too_big": 2**127}])
1221
1222
1223
def test_from_dicts_list_large_int_17006() -> None:
    """2**64 - 1 loads into UInt64 List and Array columns when a schema is given."""
    records = [{"x": [2**64 - 1]}]

    for nested_dtype in (pl.List(pl.UInt64), pl.Array(pl.UInt64, 1)):
        schema = {"x": nested_dtype}
        loaded = pl.from_dicts(records, schema=schema)
        reference = pl.DataFrame({"x": [[2**64 - 1]]}, schema=schema)
        assert_frame_equal(loaded, reference)
1233
1234
1235
def test_from_rows_dtype() -> None:
    """Declared schema dtypes are kept when late rows differ from the early ones.

    Three scenarios are covered:

    * issue 5182 - a String column whose only non-null value follows 50
      all-null rows;
    * issue 5266 - Object columns where the 51st record's dict has different
      keys from the first 50;
    * a dataclass instance stored as a single Object value.

    The 5266 fixture used to be built and asserted twice verbatim; the
    redundant copy has been dropped and every assertion is retained.
    """
    # 5182: the first 50 rows are all null (the original comment calls 50 the
    # default inference length); the only value arrives in row 51
    df = pl.DataFrame(
        data=[(None, None)] * 50 + [("1.23", None)],
        schema=[("foo", pl.String), ("bar", pl.String)],
        orient="row",
    )
    assert df.dtypes == [pl.String, pl.String]
    assert df.null_count().row(0) == (50, 51)

    # 5266: "c3" holds dicts with keys x1/x2 for 50 records, then a1/a2/a3
    type1 = [{"c1": 206, "c2": "type1", "c3": {"x1": "abcd", "x2": "jkl;"}}]
    type2 = [
        {"c1": 208, "c2": "type2", "c3": {"a1": "abcd", "a2": "jkl;", "a3": "qwerty"}}
    ]

    df = pl.DataFrame(
        data=type1 * 50 + type2,
        schema=[("c1", pl.Int32), ("c2", pl.Object), ("c3", pl.Object)],
    )
    assert df.dtypes == [pl.Int32, pl.Object, pl.Object]
    assert df.null_count().row(0) == (0, 0, 0)

    # a dataclass instance declared as Object comes back as an equal object
    dc = _TestBazDC(d=datetime(2020, 2, 22), e=42.0, f="xyz")
    df = pl.DataFrame([[dc]], schema={"d": pl.Object})
    assert df.schema == {"d": pl.Object}
    assert df.item() == dc
1275
1276
1277
def test_from_dicts_schema() -> None:
    """Exercise partial/unknown schema entries and overrides when loading dicts."""
    records = [{"a": 1, "b": 4}, {"a": 2, "b": 5}, {"a": 3, "b": 6}]
    expected_columns = {
        "a": [1, 2, 3],
        "b": [4, 5, 6],
        "c": [None, None, None],
    }

    # let polars infer the dtypes, but inform it about a 3rd column.
    schema_variants = (
        ({"a": pl.Unknown, "b": pl.Unknown, "c": pl.Int32}, None),
        ({"a": None, "b": None, "c": None}, {"c": pl.Int32}),
        (["a", "b", ("c", pl.Int32)], None),
    )
    for declared, overrides in schema_variants:
        df = pl.from_dicts(
            records,
            schema=declared,  # type: ignore[arg-type]
            schema_overrides=overrides,
        )
        assert df.dtypes == [pl.Int64, pl.Int64, pl.Int32]
        assert df.to_dict(as_series=False) == expected_columns

    # provide data that resolves to an empty frame (ref: scalar
    # expansion shortcut), with schema/override hints
    hinted = {"colx": pl.String, "coly": pl.Int32}

    for keyword in ("schema", "schema_overrides"):
        df = pl.DataFrame({"colx": [], "coly": 0}, **{keyword: hinted})  # type: ignore[arg-type]
        assert df.schema == hinted
1305
1306
1307
def test_nested_read_dicts_4143() -> None:
    """List-of-struct values with a null field and nested lists load from records."""

    def hint(text: str | None, values: list[int]) -> dict[str, Any]:
        # one struct entry; key order matches the original literals
        return {"some_text_here": text, "list_": values}

    loaded = pl.from_dicts(
        [
            {"id": 1, "hint": [hint("text", [1, 2, 4]), hint("text", [1, 2, 4])]},
            {"id": 2, "hint": [hint(None, [1]), hint(None, [2])]},
        ]
    )
    expected = {
        "hint": [
            [hint("text", [1, 2, 4]), hint("text", [1, 2, 4])],
            [hint(None, [1]), hint(None, [2])],
        ],
        "id": [1, 2],
    }
    assert loaded.to_dict(as_series=False) == expected
1340
1341
1342
def test_nested_read_dicts_4143_2() -> None:
    """Empty inner lists in later records keep the Int64 list dtype of earlier ones."""

    def hint(values: list[int]) -> dict[str, Any]:
        # one struct entry; key order matches the original literals
        return {"some_text_here": "text", "list_": values}

    loaded = pl.from_dicts(
        [
            {"id": 1, "hint": [hint([1, 2, 4]), hint([1, 2, 4])]},
            {"id": 2, "hint": [hint([]), hint([])]},
        ]
    )

    hint_struct = pl.Struct({"some_text_here": pl.String, "list_": pl.List(pl.Int64)})
    assert loaded.dtypes == [pl.Int64, pl.List(hint_struct)]

    expected = {
        "id": [1, 2],
        "hint": [
            [hint([1, 2, 4]), hint([1, 2, 4])],
            [hint([]), hint([])],
        ],
    }
    assert loaded.to_dict(as_series=False) == expected
1380
1381
1382
def test_from_records_nullable_structs() -> None:
    """Struct fields that are null in some records load with or without a schema."""
    records = [
        {"id": 1, "items": [{"item_id": 100, "description": None}]},
        {"id": 1, "items": [{"item_id": 100, "description": "hi"}]},
    ]

    item_struct = pl.Struct(
        [pl.Field("item_id", pl.UInt32), pl.Field("description", pl.String)]
    )
    schema: list[tuple[str, PolarsDataType]] = [
        ("id", pl.UInt16),
        ("items", pl.List(item_struct)),
    ]
    expected = {
        "id": [1, 1],
        "items": [
            [{"item_id": 100, "description": None}],
            [{"item_id": 100, "description": "hi"}],
        ],
    }

    # same values expected whether the schema is declared or inferred
    schema_options: list[list[tuple[str, PolarsDataType]] | None] = [schema, None]
    for candidate in schema_options:
        loaded = pl.DataFrame(records, schema=candidate, orient="row")
        assert loaded.to_dict(as_series=False) == expected

    # check initialisation without any records
    dict_schema = dict(schema)
    empty = pl.DataFrame(schema=schema)
    assert empty.to_dict(as_series=False) == {"id": [], "items": []}
    assert empty.schema == dict_schema

    # the same nested dtype also works for an empty Series
    items_dtype: PolarsDataType = dict_schema["items"]
    series = pl.Series("items", dtype=items_dtype)
    assert series.to_frame().to_dict(as_series=False) == {"items": []}
    assert series.dtype == dict_schema["items"]
    assert series.to_list() == []
1423
1424
1425
@pytest.mark.parametrize("unnest_column", ["a", pl.col("a"), cs.by_name("a")])
def test_from_categorical_in_struct_defined_by_schema(unnest_column: Any) -> None:
    """Unnest a schema-declared struct with a Categorical field, eagerly and lazily."""
    struct_dtype = pl.Struct({"value": pl.Categorical, "counts": pl.UInt32})
    df = pl.DataFrame(
        {"a": [{"value": "foo", "counts": 1}, {"value": "bar", "counts": 2}]},
        schema={"a": struct_dtype},
    )

    expected = pl.DataFrame(
        {"value": ["foo", "bar"], "counts": [1, 2]},
        schema={"value": pl.Categorical, "counts": pl.UInt32},
    )

    eager = df.unnest(unnest_column)
    assert_frame_equal(eager, expected, categorical_as_str=True)

    lazy = df.lazy().unnest(unnest_column)
    assert_frame_equal(lazy.collect(), expected, categorical_as_str=True)
1442
1443
1444
def test_nested_schema_construction() -> None:
    """Load partially specified data into a three-level List(Struct) schema."""
    # dtypes assembled innermost-first
    sub_node = pl.Struct(
        [
            pl.Field("internal_id", pl.UInt64),
            pl.Field("value", pl.UInt32),
        ]
    )
    node = pl.Struct(
        [
            pl.Field("name", pl.String),
            pl.Field("sub_nodes", pl.List(sub_node)),
        ]
    )
    node_group = pl.Struct(
        [
            pl.Field("parent_node_group_id", pl.UInt8),
            pl.Field("nodes", pl.List(node)),
        ]
    )
    schema = {"node_groups": pl.List(node_group)}

    # the input omits "parent_node_group_id" entirely
    df = pl.DataFrame(
        {
            "node_groups": [
                [{"nodes": []}, {"nodes": [{"name": "", "sub_nodes": []}]}],
            ]
        },
        schema=schema,
    )

    assert df.schema == schema
    # the omitted field is reported as None in the output
    assert df.to_dict(as_series=False) == {
        "node_groups": [
            [
                {"parent_node_group_id": None, "nodes": []},
                {
                    "parent_node_group_id": None,
                    "nodes": [{"name": "", "sub_nodes": []}],
                },
            ]
        ]
    }
1496
1497
1498
def test_nested_schema_construction2() -> None:
    """Load dict records, one with an empty inner list, into a nested schema."""
    node = pl.Struct(
        [
            pl.Field("name", pl.String),
            pl.Field("time", pl.UInt32),
        ]
    )
    node_group = pl.Struct([pl.Field("nodes", pl.List(node))])
    schema = {"node_groups": pl.List(node_group)}

    records = [
        {"node_groups": [{"nodes": [{"name": "a", "time": 0}]}]},
        {"node_groups": [{"nodes": []}]},
    ]
    df = pl.DataFrame(records, schema=schema)

    assert df.schema == schema
    assert df.to_dict(as_series=False) == {
        "node_groups": [[{"nodes": [{"name": "a", "time": 0}]}], [{"nodes": []}]]
    }
1529
1530
1531
def test_arrow_to_pyseries_with_one_chunk_does_not_copy_data() -> None:
    """The converted series' buffer address must equal the Arrow chunk's address."""
    from polars._utils.construction import arrow_to_pyseries

    source = pa.chunked_array([[1, 2, 3]], type=pa.int64())
    pyseries = arrow_to_pyseries("", source)

    # first item of the buffer info vs. the address of the chunk's second buffer
    polars_address = pyseries.get_chunks()[0]._get_buffer_info()[0]
    arrow_address = source.chunks[0].buffers()[1].address
    assert polars_address == arrow_address
1540
1541
1542
def test_init_with_explicit_binary_schema() -> None:
    """Bytes values keep the Binary dtype when it is declared explicitly."""
    frame = pl.DataFrame({"a": [b"hello", b"world"]}, schema={"a": pl.Binary})
    assert frame.schema == {"a": pl.Binary}
    assert frame["a"].to_list() == [b"hello", b"world"]

    # same check for a Series built with ``dtype=``
    series = pl.Series("a", [b"hello", b"world"], dtype=pl.Binary)
    assert series.dtype == pl.Binary
    assert series.to_list() == [b"hello", b"world"]
1550
1551
1552
def test_nested_categorical() -> None:
    """A List(Categorical) series keeps both its values and its dtype."""
    list_of_cat = pl.List(pl.Categorical)
    series = pl.Series([["a"]], dtype=list_of_cat)
    assert series.to_list() == [["a"]]
    assert series.dtype == pl.List(pl.Categorical)
1556
1557
1558
def test_datetime_date_subclasses() -> None:
    """Subclasses of ``date``/``datetime`` load like the plain stdlib types."""

    class FakeDate(date): ...

    class FakeDateChild(FakeDate): ...

    class FakeDatetime(FakeDate, datetime): ...

    # (subclass instance, equivalent plain value)
    cases = (
        (FakeDate(2020, 1, 1), date(2020, 1, 1)),
        (FakeDateChild(2020, 1, 1), date(2020, 1, 1)),
        (FakeDatetime(2020, 1, 1, 3), datetime(2020, 1, 1, 3)),
    )
    for subclassed, plain in cases:
        assert_series_equal(pl.Series([subclassed]), pl.Series([plain]))
1576
1577
1578
def test_list_null_constructor() -> None:
    """Build List(Null) and nested List(List(Int8)) series that include empty lists."""
    null_lists = pl.Series("a", [[None], [None]], dtype=pl.List(pl.Null))
    assert null_lists.dtype == pl.List(pl.Null)
    assert null_lists.to_list() == [[None], [None]]

    # nested
    # small order change has influence, so both orderings are exercised
    orderings = (
        [[], [[], []], [[33, 112]]],
        [[[], []], [], [[33, 112]]],
    )
    for values in orderings:
        dtype = pl.List(pl.List(pl.Int8))
        nested = pl.Series(name="colx", values=values, dtype=dtype)
        assert nested.dtype == dtype
        assert nested.to_list() == values
1613
1614
1615
def test_numpy_float_construction_av() -> None:
    """A NumPy float scalar dict value builds the same frame as a Python float."""
    from_numpy = pl.DataFrame({"a": np.float64(1)})
    from_python = pl.DataFrame({"a": 1.0})
    assert_frame_equal(from_numpy, from_python)
1618
1619
1620
def test_df_init_dict_raise_on_expression_input() -> None:
    """Expr objects as dict values are rejected; inside a list they are stored."""
    expected_message = (
        "passing Expr objects to the DataFrame constructor is not supported"
    )
    with pytest.raises(TypeError, match=expected_message):
        pl.DataFrame({"a": pl.int_range(0, 3)})

    # still a TypeError when another column holds ordinary data
    with pytest.raises(TypeError):
        pl.DataFrame({"a": pl.int_range(0, 3), "b": [3, 4, 5]})

    # Passing a list of expressions is allowed
    wrapped = pl.DataFrame({"a": [pl.int_range(0, 3)]})
    column_dtype = wrapped.get_column("a").dtype
    assert column_dtype.is_object()
1632
1633
1634
def test_df_schema_sequences() -> None:
    """A schema given as two-element lists is accepted like (name, dtype) tuples."""
    pairs = [
        ["address", pl.String],
        ["key", pl.Int64],
        ["value", pl.Float32],
    ]
    frame = pl.DataFrame(schema=pairs)  # type: ignore[arg-type]
    assert frame.schema == {
        "address": pl.String,
        "key": pl.Int64,
        "value": pl.Float32,
    }
1642
1643
1644
def test_df_schema_sequences_incorrect_length() -> None:
    """A schema entry with three elements instead of two raises ValueError."""
    malformed = [
        ["address", pl.String, pl.Int8],  # one element too many
        ["key", pl.Int64],
        ["value", pl.Float32],
    ]
    with pytest.raises(ValueError):
        pl.DataFrame(schema=malformed)  # type: ignore[arg-type]
1652
1653
1654
@pytest.mark.parametrize(
    ("input", "infer_func", "expected_dtype"),
    [
        (char_code, numpy_char_code_to_dtype, dtype)
        for char_code, dtype in (
            ("f8", pl.Float64),
            ("f4", pl.Float32),
            ("i4", pl.Int32),
            ("u1", pl.UInt8),
            ("?", pl.Boolean),
            ("m8", pl.Duration("us")),
            ("M8", pl.Datetime("us")),
        )
    ],
)
def test_numpy_inference(
    input: Any,
    infer_func: Callable[[Any], PolarsDataType],
    expected_dtype: PolarsDataType,
) -> None:
    """Map NumPy character codes to the Polars dtype listed alongside each one."""
    inferred = infer_func(input)
    assert inferred == expected_dtype
1673
1674
1675
def test_array_construction() -> None:
    """Build Array columns from Series input, from a schema, and from dict records."""
    # Series input with a null row; the second case uses a narrower inner type
    series_cases = (
        (pl.Array(pl.Int64, 3), [[1, 2, 3], None, [4, 2, 3]]),
        (pl.Array(pl.UInt8, 2), [[1, 2], None, [3, 4]]),
    )
    for dtype, payload in series_cases:
        s = pl.Series(payload, dtype=dtype)
        assert s.dtype == dtype
        assert s.to_list() == payload

    # create using schema
    array_schema = {
        "a": pl.Array(pl.Float32, 3),
        "b": pl.Array(pl.Datetime("ms"), 5),
    }
    empty = pl.DataFrame(schema=array_schema)
    assert empty.dtypes == [
        pl.Array(pl.Float32, 3),
        pl.Array(pl.Datetime("ms"), 5),
    ]
    assert empty.rows() == []

    # from dicts
    records = [
        {"row_id": "a", "data": [1, 2, 3]},
        {"row_id": "b", "data": [2, 3, 4]},
    ]
    record_schema = {
        "row_id": pl.String(),
        "data": pl.Array(inner=pl.Int64, shape=3),
    }
    loaded = pl.from_dicts(records, schema=record_schema)
    assert loaded.schema == record_schema
    assert loaded.rows() == [("a", [1, 2, 3]), ("b", [2, 3, 4])]
1712
1713
1714
@pytest.mark.may_fail_auto_streaming
def test_pycapsule_interface(df: pl.DataFrame) -> None:
    """Round-trip Series and DataFrames through the PyCapsule holder wrappers."""
    df = df.rechunk()
    arrow_table = df.to_arrow()

    # Array via C data interface
    bools_array = arrow_table["bools"].chunk(0)
    series_back = pl.Series(PyCapsuleArrayHolder(bools_array))
    assert df["bools"].equals(series_back, check_dtypes=True, check_names=False)

    # empty Array via C data interface
    empty_array = pa.array([], type=bools_array.type)
    series_back = pl.Series(PyCapsuleArrayHolder(empty_array))
    assert df["bools"].dtype == series_back.dtype

    # RecordBatch via C array interface
    first_batch = arrow_table.to_batches()[0]
    frame_back = pl.DataFrame(PyCapsuleArrayHolder(first_batch))
    assert df.equals(frame_back)

    # ChunkedArray via C stream interface
    bools_chunked = arrow_table["bools"]
    series_back = pl.Series(PyCapsuleStreamHolder(bools_chunked))
    assert df["bools"].equals(series_back, check_dtypes=True, check_names=False)

    # empty ChunkedArray via C stream interface
    empty_chunked = pa.chunked_array([], type=bools_chunked.type)
    series_back = pl.Series(PyCapsuleStreamHolder(empty_chunked))
    assert df["bools"].dtype == series_back.dtype

    # Table via C stream interface
    frame_back = pl.DataFrame(PyCapsuleStreamHolder(arrow_table))
    assert df.equals(frame_back)

    # empty Table via C stream interface
    empty_table = df[:0].to_arrow()
    frame_back = pl.DataFrame(PyCapsuleStreamHolder(empty_table))
    schema_before = df.schema
    schema_after = frame_back.schema

    # The "enum" schema is not preserved because categories are lost via C data
    # interface
    schema_before.pop("enum")
    schema_after.pop("enum")

    assert schema_before == schema_after

    # RecordBatchReader via C stream interface
    reader = pa.RecordBatchReader.from_batches(
        arrow_table.schema, arrow_table.to_batches()
    )
    frame_back = pl.DataFrame(PyCapsuleStreamHolder(reader))
    assert df.equals(frame_back)
1767
1768
1769
@pytest.mark.parametrize(
    "tz",
    [
        None,
        ZoneInfo("Asia/Tokyo"),
        ZoneInfo("Europe/Amsterdam"),
        ZoneInfo("UTC"),
        timezone.utc,
    ],
)
def test_init_list_of_dicts_with_timezone(tz: Any) -> None:
    """Record input and column input agree for naive and tz-aware datetimes."""
    stamp = datetime(2023, 1, 1, tzinfo=tz)

    from_records = pl.DataFrame([{"dt": stamp}, {"dt": stamp}])
    from_columns = pl.DataFrame({"dt": [stamp, stamp]})
    assert_frame_equal(from_records, from_columns)

    # the column dtype carries the same time zone as the input values
    assert from_records.schema == {"dt": pl.Datetime("us", time_zone=tz)}
1787
1788
1789
@pytest.mark.parametrize(
    "tz",
    [
        None,
        ZoneInfo("Asia/Tokyo"),
        ZoneInfo("Europe/Amsterdam"),
        ZoneInfo("UTC"),
        timezone.utc,
    ],
)
def test_init_list_of_nested_dicts_with_timezone(tz: Any) -> None:
    """A datetime nested in a struct keeps its time zone after unnesting."""
    stamp = datetime(2021, 1, 1, tzinfo=tz)
    records = [{"timestamp": {"content": stamp}}]

    unnested = pl.DataFrame(records).unnest("timestamp")
    assert_frame_equal(unnested, pl.DataFrame({"content": [stamp]}))

    assert unnested.schema == {"content": pl.Datetime("us", time_zone=tz)}
1808
1809
1810
def test_init_from_subclassed_types() -> None:
    """Instances of subclasses of basic Python types load like the base type."""
    # more detailed test of one custom subclass...
    import codecs

    class SuperSecretString(str):
        def __new__(cls, value: str) -> Self:
            return super().__new__(cls, value)

        def __repr__(self) -> str:
            # repr is the rot13 encoding of the text; equality is unaffected
            return codecs.encode(self, "rot_13")

    plain = "windmolen"
    secret = SuperSecretString(plain)

    assert secret == plain
    assert isinstance(secret, str)
    assert repr(secret) == "jvaqzbyra"
    assert_series_equal(pl.Series([plain, plain]), pl.Series([secret, secret]))

    # ...then validate across other basic types
    samples = (
        (int, 42),
        (float, 5.5),
        (bytes, b"value"),
        (str, "value"),
    )
    for BaseType, value in samples:

        class SubclassedType(BaseType):  # type: ignore[misc,valid-type]
            def __new__(cls, value: Any) -> Self:
                return super().__new__(cls, value)  # type: ignore[no-any-return]

        from_base = pl.Series([value]).to_list()
        from_subclass = pl.Series([SubclassedType(value)]).to_list()
        assert from_base == from_subclass
1844
1845
1846
def test_series_init_with_python_type_7737() -> None:
    """Python builtin types are accepted as ``dtype`` and map to Polars dtypes.

    ``str`` is checked against ``pl.String`` rather than the ``pl.Utf8``
    alias, matching the name used throughout the rest of this module.
    """
    # empty input: the resulting dtype comes only from the Python type given
    assert pl.Series([], dtype=int).dtype == pl.Int64  # type: ignore[arg-type]
    assert pl.Series([], dtype=float).dtype == pl.Float64  # type: ignore[arg-type]
    assert pl.Series([], dtype=bool).dtype == pl.Boolean  # type: ignore[arg-type]
    assert pl.Series([], dtype=str).dtype == pl.String  # type: ignore[arg-type]

    # values that do not match the requested Python type raise TypeError
    with pytest.raises(TypeError):
        pl.Series(["a"], dtype=int)  # type: ignore[arg-type]

    with pytest.raises(TypeError):
        pl.Series([True], dtype=str)  # type: ignore[arg-type]
1857
1858
1859
def test_init_from_list_shape_6968() -> None:
    """Three two-element lists give shape (2, 3), with or without a leading None."""
    frames = [
        pl.DataFrame([[first, None], [2, None], [3, None]]) for first in (1, None)
    ]
    for frame in frames:
        assert frame.shape == (2, 3)
1864
1865