Book a Demo!
CoCalc Logo Icon
Store | Features | Docs | Share | Support | News | About | Policies | Sign Up | Sign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/py-polars/tests/unit/constructors/test_dataframe.py
8398 views
1
from __future__ import annotations
2
3
import enum
4
import sys
5
from collections import OrderedDict
6
from collections.abc import Mapping
7
from datetime import date, datetime, time
8
from typing import TYPE_CHECKING, Any
9
10
import pytest
11
12
import polars as pl
13
from polars.exceptions import DataOrientationWarning, InvalidOperationError
14
15
if TYPE_CHECKING:
16
from collections.abc import Iterator
17
18
from polars._typing import SchemaDict
19
20
21
def test_df_mixed_dtypes_string() -> None:
    """Check strict vs. non-strict init for a list mixing str, int and float."""
    payload = {"x": [["abc", 12, 34.5]], "y": [1]}

    # Strict construction is expected to reject the heterogeneous inner list.
    with pytest.raises(TypeError, match="unexpected value"):
        pl.DataFrame(payload, strict=True)

    # Non-strict construction is expected to yield a list-of-strings column.
    frame = pl.DataFrame(payload, strict=False)
    expected_schema = {"x": pl.List(pl.String), "y": pl.Int64}
    expected_rows = [(["abc", "12", "34.5"], 1)]
    assert frame.schema == expected_schema
    assert frame.rows() == expected_rows
30
31
32
def test_df_mixed_dtypes_object() -> None:
    """Check strict vs. non-strict init for a list mixing bytes, int and float."""
    payload = {"x": [[b"abc", 12, 34.5]], "y": [1]}

    # Strict construction is expected to reject the heterogeneous inner list.
    with pytest.raises(TypeError):
        pl.DataFrame(payload, strict=True)

    # Non-strict construction is expected to keep the list as a Python object.
    frame = pl.DataFrame(payload, strict=False)
    expected_schema = {"x": pl.Object, "y": pl.Int64}
    expected_rows = [([b"abc", 12, 34.5], 1)]
    assert frame.schema == expected_schema
    assert frame.rows() == expected_rows
41
42
43
def test_df_object() -> None:
    """Check that instances of a plain Python class end up in an object column."""

    class Foo:
        """Small value wrapper with value-based equality and a readable repr."""

        def __init__(self, value: int) -> None:
            self._value = value

        def __eq__(self, other: object) -> bool:
            # Only instances of this class (or a subclass of it) may compare equal.
            if not issubclass(other.__class__, self.__class__):
                return False
            return self._value == other._value  # type: ignore[attr-defined]

        def __repr__(self) -> str:
            return f"{self.__class__.__name__}({self._value})"

    frame = pl.DataFrame({"a": [Foo(1), Foo(2)]})
    column = frame["a"]
    assert column.dtype.is_object()

    # Fresh instances on the right-hand side, so the comparison goes via __eq__.
    expected_rows = [(Foo(1),), (Foo(2),)]
    assert frame.rows() == expected_rows
59
60
61
def test_df_init_from_generator_dict_view() -> None:
    """Check DataFrame init from dict views (keys, values and items)."""
    source = {0: "x", 1: "y", 2: "z"}
    views = {
        "keys": source.keys(),
        "vals": source.values(),
        "items": source.items(),
    }

    # Strict mode is expected to raise (presumably because each item pairs an
    # int key with a str value).
    with pytest.raises(TypeError, match="unexpected value"):
        pl.DataFrame(views, strict=True)

    frame = pl.DataFrame(views, strict=False)

    expected_schema = {
        "keys": pl.Int64,
        "vals": pl.String,
        "items": pl.List(pl.String),
    }
    assert frame.schema == expected_schema

    expected_columns = {
        "keys": [0, 1, 2],
        "vals": ["x", "y", "z"],
        "items": [["0", "x"], ["1", "y"], ["2", "z"]],
    }
    assert frame.to_dict(as_series=False) == expected_columns
82
83
84
@pytest.mark.skipif(
    sys.version_info < (3, 11),
    reason="reversed dict views not supported before Python 3.11",
)
def test_df_init_from_generator_reversed_dict_view() -> None:
    """Check DataFrame init from reversed dict views, items stored as objects."""
    source = {0: "x", 1: "y", 2: "z"}
    # Each value is a one-shot iterator; it must reach the constructor unconsumed.
    reversed_views = {
        "rev_keys": reversed(source.keys()),
        "rev_vals": reversed(source.values()),
        "rev_items": reversed(source.items()),
    }
    overrides = {"rev_items": pl.Object}
    frame = pl.DataFrame(reversed_views, schema_overrides=overrides)

    expected_schema = {
        "rev_keys": pl.Int64,
        "rev_vals": pl.String,
        "rev_items": pl.Object,
    }
    assert frame.schema == expected_schema

    expected_columns = {
        "rev_keys": [2, 1, 0],
        "rev_vals": ["z", "y", "x"],
        "rev_items": [(2, "z"), (1, "y"), (0, "x")],
    }
    assert frame.to_dict(as_series=False) == expected_columns
107
108
109
def test_df_init_strict() -> None:
    """Check how `strict` affects fitting int/float input into an Int8 schema."""
    values = {"a": [1, 2, 3.0]}
    int8_schema = {"a": pl.Int8}

    # With strict=True the float entry is expected to be rejected.
    with pytest.raises(TypeError):
        pl.DataFrame(values, schema=int8_schema, strict=True)

    # With strict=False the values are expected to be cast to the schema dtype.
    frame = pl.DataFrame(values, schema=int8_schema, strict=False)
    column = frame["a"]
    assert column.to_list() == [1, 2, 3]
    assert column.dtype == pl.Int8
118
119
120
def test_df_init_from_series_strict() -> None:
    """Check how `strict` affects casting a signed Series to a UInt8 schema."""
    series = pl.Series("a", [-1, 0, 1])
    uint8_schema = {"a": pl.UInt8}

    # With strict=True the negative value is expected to make the cast fail.
    with pytest.raises(InvalidOperationError):
        pl.DataFrame(series, schema=uint8_schema, strict=True)

    # With strict=False the unrepresentable value is expected to become null.
    frame = pl.DataFrame(series, schema=uint8_schema, strict=False)
    column = frame["a"]
    assert column.to_list() == [None, 0, 1]
    assert column.dtype == pl.UInt8
129
130
131
# https://github.com/pola-rs/polars/issues/15471
132
def test_df_init_rows_overrides_non_existing() -> None:
    """Check that overrides naming absent columns leave the schema unchanged."""
    # Override mentions "b", which the row data does not contain.
    instance_overrides = {"a": pl.Int8(), "b": pl.Boolean()}
    frame = pl.DataFrame([{"a": 1}], schema_overrides=instance_overrides)
    assert frame.schema == OrderedDict({"a": pl.Int8})

    # Override mentions "c", which the row data does not contain; "b" is inferred.
    rows = [{"a": 3, "b": 1.0}]
    class_overrides = {"a": pl.Int8, "c": pl.Utf8}
    frame = pl.DataFrame(rows, schema_overrides=class_overrides)
    assert frame.schema == OrderedDict({"a": pl.Int8, "b": pl.Float64})
141
142
143
# https://github.com/pola-rs/polars/issues/15245
144
def test_df_init_nested_mixed_types() -> None:
    """Check strict vs. non-strict init for structs mixing int and float values."""
    records = [{"key": [{"value": 1}, {"value": 1.0}]}]

    # Strict construction is expected to reject the int/float mix in "value".
    with pytest.raises(TypeError, match="unexpected value"):
        pl.DataFrame(records, strict=True)

    # Non-strict construction is expected to settle on Float64 for "value".
    frame = pl.DataFrame(records, strict=False)
    expected_schema = {"key": pl.List(pl.Struct({"value": pl.Float64}))}
    expected_dicts = [{"key": [{"value": 1.0}, {"value": 1.0}]}]
    assert frame.schema == expected_schema
    assert frame.to_dicts() == expected_dicts
153
154
155
class CustomSchema(Mapping[str, Any]):
    """Dummy schema object for testing compatibility with Mapping.

    Entries are passed as keyword arguments and stored in an ``OrderedDict``;
    lookup, length and iteration all delegate to that backing store.
    """

    # Backing store for the mapping. The annotation names the attribute that
    # ``__init__`` actually assigns and the other methods read.
    _items: OrderedDict[str, Any]

    def __init__(self, **named_entries: Any) -> None:
        """Store the given keyword arguments as the mapping's entries."""
        self._items = OrderedDict(named_entries.items())

    def __getitem__(self, key: str) -> Any:
        """Return the entry stored under `key`; raise `KeyError` if absent."""
        return self._items[key]

    def __len__(self) -> int:
        """Return the number of stored entries."""
        return len(self._items)

    def __iter__(self) -> Iterator[str]:
        """Yield the entry names in the order of the backing store."""
        yield from self._items
171
172
173
def test_custom_schema() -> None:
    """Check that a custom Mapping works as a schema and bad dtypes are rejected."""
    valid_schema = CustomSchema(bool=pl.Boolean, misc=pl.UInt8)
    frame = pl.DataFrame(schema=valid_schema)
    expected_schema = OrderedDict([("bool", pl.Boolean), ("misc", pl.UInt8)])
    assert frame.schema == expected_schema

    # Plain strings in place of dtypes are expected to raise.
    with pytest.raises(TypeError):
        pl.DataFrame(schema=CustomSchema(bool="boolean", misc="unsigned int"))
179
180
181
def test_list_null_constructor_schema() -> None:
    """Check that list columns without an inner type get the List(Null) dtype."""
    expected = pl.List(pl.Null)

    # Inner type inferred from data: a single empty list.
    from_data = pl.DataFrame({"a": [[]]})
    assert from_data.dtypes[0] == expected

    # Inner type left unspecified in the schema.
    from_schema = pl.DataFrame(schema={"a": pl.List})
    assert from_schema.dtypes[0] == expected
185
186
187
def test_df_init_schema_object() -> None:
    """Check that a `pl.Schema` instance can be passed as the `schema` argument."""
    schema = pl.Schema({"a": pl.Int8(), "b": pl.String()})
    columns = {"a": [1, 2, 3], "b": ["x", "y", "z"]}
    frame = pl.DataFrame(columns, schema=schema)

    # The frame's names and dtypes should mirror the schema object's.
    assert frame.columns == schema.names()
    assert frame.dtypes == schema.dtypes()
193
194
195
def test_df_init_data_orientation_inference_warning() -> None:
    """Check that `from_records` without an orientation emits a warning."""
    records = [[1, 2, 3], [4, 5, 6]]
    column_names = ["a", "b", "c"]
    with pytest.warns(DataOrientationWarning):
        pl.from_records(records, schema=column_names)
198
199
200
def test_df_init_enum_dtype() -> None:
    """Check that a Python str-enum class used as a dtype maps to `pl.Enum`."""

    # String-valued enum whose member values are the expected categories.
    class PythonEnum(str, enum.Enum):
        A = "A"
        B = "B"
        C = "C"

    column_name = "Col 1"
    frame = pl.DataFrame(
        {column_name: ["A", "B", "C"]},
        schema={column_name: PythonEnum},
    )
    expected_dtype = pl.Enum(["A", "B", "C"])
    assert frame.dtypes[0] == expected_dtype
208
209
210
@pytest.mark.parametrize(
    "schema_param",
    [
        {
            "schema": {
                "date": pl.Date,
                "time": pl.Time,
                "datetime": pl.Datetime,
            },
        },
        {
            "schema_overrides": {
                "date": pl.Date(),
                "time": pl.Time(),
                "datetime": pl.Datetime(),
            },
        },
    ],
)
def test_temporal_string_schema_overrides(schema_param: dict[str, SchemaDict]) -> None:
    """Check that temporal strings are parsed when temporal dtypes are requested.

    Parametrized to pass the dtypes once via `schema` (dtype classes) and once
    via `schema_overrides` (dtype instances).
    """
    raw_strings = {
        "date": ["2024-01-01", "2025-10-07"],
        "time": ["12:00:00", "13:30:00"],
        "datetime": ["2024-01-01 23:59:59", "2024-01-02T13:30:00.123456"],
    }
    frame = pl.DataFrame(raw_strings, **schema_param)  # type: ignore[arg-type]

    expected_schema = {
        "date": pl.Date,
        "time": pl.Time,
        "datetime": pl.Datetime("us"),
    }
    assert frame.schema == expected_schema

    first_row = {
        "date": date(2024, 1, 1),
        "time": time(12, 0),
        "datetime": datetime(2024, 1, 1, 23, 59, 59),
    }
    second_row = {
        "date": date(2025, 10, 7),
        "time": time(13, 30),
        "datetime": datetime(2024, 1, 2, 13, 30, 0, 123456),
    }
    assert frame.to_dicts() == [first_row, second_row]
255
256