Book a Demo!
CoCalc Logo Icon
Store | Features | Docs | Share | Support | News | About | Policies | Sign Up | Sign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/py-polars/tests/unit/dataframe/test_from_dict.py
6939 views
1
from __future__ import annotations
2
3
from datetime import date, datetime, time, timedelta
4
from typing import Any
5
6
import numpy as np
7
import pytest
8
9
import polars as pl
10
from polars.testing import assert_frame_equal
11
12
13
def test_from_dict_with_column_order() -> None:
    """Check that the schema's column order wins over the dict's key order."""
    dtypes = {"a": pl.UInt8, "b": pl.UInt32}
    payload = {"b": [3, 4], "a": [1, 2]}  # keys deliberately listed as b, a

    # Same data, two ways of stating the order: a full name->dtype mapping,
    # or a plain list of names combined with dtype overrides.
    frames = [
        pl.DataFrame(payload, schema=dtypes),
        pl.DataFrame(payload, schema=["a", "b"], schema_overrides=dtypes),
    ]
    for frame in frames:
        # Expected result: column "a" (u8) first with values 1, 2, followed
        # by column "b" (u32) with values 3, 4.
        assert frame.columns == ["a", "b"]
        assert frame.schema == {"a": pl.UInt8, "b": pl.UInt32}
        assert frame.rows() == [(1, 3), (2, 4)]

    # A schema naming a column ("x") that the data does not contain is
    # expected to be rejected.
    with pytest.raises(ValueError):
        pl.DataFrame(
            {"b": [3, 4], "a": [1, 2]},
            schema={"x": pl.UInt8, "b": pl.UInt32},
        )
37
38
39
def test_from_dict_with_scalars() -> None:
    """Build frames from dicts that mix sized columns with scalar values.

    Exercises scalars next to list columns, an all-scalar dict,
    generator/iterator columns, nested-dict (struct) columns and numpy
    array columns, checking the values and dtypes of each resulting frame.
    """
    # one sized column plus scalars (inc. None): every scalar is expected
    # to be repeated for each row of "key"
    broadcast = pl.DataFrame(
        {"key": ["aa", "bb", "cc"], "misc": "xyz", "other": None, "value": 0}
    )
    expected_broadcast = {
        "key": ["aa", "bb", "cc"],
        "misc": ["xyz"] * 3,
        "other": [None] * 3,
        "value": [0] * 3,
    }
    assert broadcast.to_dict(as_series=False) == expected_broadcast

    # edge-case: all scalars -> a single-row frame
    all_scalars = pl.DataFrame(
        {"key": "aa", "misc": "xyz", "other": None, "value": 0}
    )
    expected_single_row = {
        "key": ["aa"],
        "misc": ["xyz"],
        "other": [None],
        "value": [0],
    }
    assert all_scalars.to_dict(as_series=False) == expected_single_row

    # edge-case: single unsized generator
    from_map = pl.DataFrame({"vals": map(float, [1, 2, 3])})
    assert from_map.to_dict(as_series=False) == {"vals": [1.0, 2.0, 3.0]}

    # ensure we don't accidentally consume or expand map/range/generator
    # cols, and can properly apply schema dtype/ordering directives
    reordered_schema = {
        "value": pl.String,
        "other": pl.Float32,
        "misc": pl.Int32,
        "key": pl.Int8,
    }
    lazy_columns = pl.DataFrame(
        {
            "key": range(1, 4),
            "misc": (x for x in [4, 5, 6]),
            "other": map(float, [7, 8, 9]),
            "value": {0: "x", 1: "y", 2: "z"}.values(),
        },
        schema=reordered_schema,
    )
    assert lazy_columns.columns == ["value", "other", "misc", "key"]
    assert lazy_columns.to_dict(as_series=False) == {
        "value": ["x", "y", "z"],
        "other": [7.0, 8.0, 9.0],
        "misc": [4, 5, 6],
        "key": [1, 2, 3],
    }
    assert lazy_columns.schema == {
        "value": pl.String,
        "other": pl.Float32,
        "misc": pl.Int32,
        "key": pl.Int8,
    }

    # mixed with struct cols: the Int8 dtype for "y" is supplied once via
    # schema_overrides and once via a (name, dtype) entry in schema
    struct_frames = [
        pl.from_dict(
            {"x": {"b": [1, 3], "c": [2, 4]}, "y": [5, 6], "z": "x"},
            schema_overrides={"y": pl.Int8},
        ),
        pl.from_dict(
            {"x": {"b": [1, 3], "c": [2, 4]}, "y": [5, 6], "z": "x"},
            schema=["x", ("y", pl.Int8), "z"],
        ),
    ]
    struct_rows = [({"b": 1, "c": 2}, 5, "x"), ({"b": 3, "c": 4}, 6, "x")]
    struct_schema = {
        "x": pl.Struct([pl.Field("b", pl.Int64), pl.Field("c", pl.Int64)]),
        "y": pl.Int8,
        "z": pl.String,
    }
    for struct_frame in struct_frames:
        assert struct_frame.rows() == struct_rows
        assert struct_frame.schema == struct_schema

    # mixed with numpy cols...
    numpy_small = pl.DataFrame(
        {"x": np.ones(3), "y": np.zeros(3), "z": 1.0},
    )
    assert numpy_small.rows() == [(1.0, 0.0, 1.0)] * 3

    # ...and trigger multithreaded load codepath
    numpy_large = pl.DataFrame(
        {
            "w": np.zeros(1001, dtype=np.uint8),
            "x": np.ones(1001, dtype=np.uint8),
            "y": np.zeros(1001, dtype=np.uint8),
            "z": 1,
        },
        schema_overrides={"z": pl.UInt8},
    )
    # rows 999 and 1000 are the last two of the 1001-row frame
    assert numpy_large[999:].rows() == [(0, 1, 0, 1)] * 2
    assert numpy_large.schema == {
        name: pl.UInt8 for name in ("w", "x", "y", "z")
    }

    # misc generators/iterables
    from_iterators = pl.DataFrame(
        {
            "a": iter([0, 1, 2]),
            "b": (2, 1, 0).__iter__(),
            "c": (v for v in (0, 0, 0)),
            "d": "x",
        }
    )
    assert from_iterators.rows() == [
        (0, 2, 0, "x"),
        (1, 1, 0, "x"),
        (2, 0, 0, "x"),
    ]
146
147
148
@pytest.mark.slow
def test_from_dict_with_values_mixed() -> None:
    """Pair a large range column with one scalar of many different types.

    Each scalar is expected to appear unchanged on every row once the
    frame is built with an explicit schema.
    """
    # a bit of everything
    mixed_dtype_data: dict[str, Any] = {
        "a": 0,
        "b": 8,
        "c": 9.5,
        "d": None,
        "e": True,
        "f": False,
        "g": time(0, 1, 2),
        "h": date(2023, 3, 14),
        "i": timedelta(seconds=3601),
        "j": datetime(2111, 11, 11, 11, 11, 11, 11),
        "k": "「趣味でヒーローをやっている者だ」",
    }
    # note: deliberately set this value large; if all dtypes are
    # on the fast-path it'll only take ~0.03secs. if it becomes
    # even remotely noticeable that will indicate a regression.
    n_range = 1_000_000

    # "idx" comes first, followed by the scalar entries in their own order
    index_and_data: dict[str, Any] = {
        "idx": range(n_range),
        **mixed_dtype_data,
    }
    target_schema = {
        "idx": pl.Int32,
        "a": pl.UInt16,
        "b": pl.UInt32,
        "c": pl.Float64,
        "d": pl.Float32,
        "e": pl.Boolean,
        "f": pl.Boolean,
        "g": pl.Time,
        "h": pl.Date,
        "i": pl.Duration,
        "j": pl.Datetime,
        "k": pl.String,
    }
    df = pl.DataFrame(data=index_and_data, schema=target_schema)

    # drop the index so that only the scalar-derived columns remain
    scalar_cols = df.select(pl.exclude("idx"))

    assert df.height == n_range

    # neighbouring slices at the start and at the end must be identical
    first_five = scalar_cols[:5].rows()
    next_five = scalar_cols[5:10].rows()
    assert first_five == next_five

    penultimate_five = scalar_cols[-10:-5].rows()
    last_five = scalar_cols[-5:].rows()
    assert penultimate_five == last_five

    # a row taken from the middle must equal the original scalar values
    middle_row = scalar_cols.row(n_range // 2, named=True)
    assert middle_row == mixed_dtype_data
193
194
195
def test_from_dict_expand_nested_struct() -> None:
    """Check that a bare nested dict value initialises a struct column.

    The same frame is expected whether the date and the nested dict are
    passed bare or wrapped in a one-element list, and a bare nested dict
    is expected to repeat for each row of a longer sibling column.
    """
    # confirm consistent init of nested struct from dict data
    day = date(2077, 10, 10)
    one_row = pl.DataFrame(
        [
            pl.Series("x", [day]),
            pl.Series("nested", [{"y": -1, "z": 1}]),
        ]
    )
    one_row_variants = [
        pl.DataFrame({"x": day, "nested": {"y": -1, "z": 1}}),
        pl.DataFrame({"x": day, "nested": [{"y": -1, "z": 1}]}),
        pl.DataFrame({"x": [day], "nested": {"y": -1, "z": 1}}),
        pl.DataFrame({"x": [day], "nested": [{"y": -1, "z": 1}]}),
    ]
    for variant in one_row_variants:
        assert_frame_equal(one_row, variant)

    # confirm expansion to 'n' nested values
    repeated_nested = [{"y": -1, "z": 1} for _ in range(3)]
    three_rows = pl.DataFrame(
        [
            pl.Series("x", [0, 1, 2]),
            pl.Series("nested", repeated_nested),
        ]
    )
    three_row_variants = [
        pl.DataFrame({"x": range(3), "nested": {"y": -1, "z": 1}}),
        pl.DataFrame({"x": [0, 1, 2], "nested": {"y": -1, "z": 1}}),
    ]
    for variant in three_row_variants:
        assert_frame_equal(three_rows, variant)
225
226
227
def test_from_dict_duration_subseconds() -> None:
    """Check that a timedelta with a sub-second part survives `from_dict`."""
    # reference frame built from a polars duration expression
    expected = pl.select(duration=pl.duration(seconds=1, microseconds=1000))

    # the same duration supplied as a Python timedelta in dict data
    result = pl.from_dict(
        {"duration": [timedelta(seconds=1, microseconds=1000)]}
    )

    assert_frame_equal(result, expected)
232
233
234
@pytest.mark.parametrize(
    ("dtype", "data"),
    [
        (pl.Date, date(2099, 12, 31)),
        (pl.Datetime("ms"), datetime(1998, 10, 1, 10, 30)),
        (pl.Duration("us"), timedelta(days=1)),
        (pl.Time, time(2, 30, 10)),
    ],
)
def test_from_dict_cast_logical_type(dtype: pl.DataType, data: Any) -> None:
    """Round-trip a temporal value through its Int64 cast and `from_dicts`.

    A one-row frame is built from ``data`` with the given ``dtype``. Its
    column is cast to Int64, the single value is extracted and passed back
    to ``pl.from_dicts`` with the same schema, and the resulting frame must
    equal the original one.
    """
    schema = {"data": dtype}
    original = pl.DataFrame({"data": [data]}, schema=schema)

    # first (and only) value of the column after casting to Int64
    int_value = original.cast(pl.Int64).to_dict()["data"][0]

    rebuilt = pl.from_dicts([{"data": int_value}], schema=schema)

    assert_frame_equal(rebuilt, original)
258
259