CoCalc -- test_from

GitHub Repository: pola-rs/polars
Path: blob/main/py-polars/tests/unit/dataframe/test_from_dict.py
6939 views
1
from __future__ import annotations
2

3
from datetime import date, datetime, time, timedelta
4
from typing import Any
5

6
import numpy as np
7
import pytest
8

9
import polars as pl
10
from polars.testing import assert_frame_equal
11

12

13
def test_from_dict_with_column_order() -> None:
    """Check that an explicit schema dictates column order for dict input.

    The input dict lists "b" before "a". Both a full schema mapping and a
    list of column names combined with ``schema_overrides`` are expected to
    produce columns in schema order with the requested dtypes. A schema that
    names a column missing from the data is expected to raise ``ValueError``.
    """
    # expect schema/columns order to take precedence
    schema = {"a": pl.UInt8, "b": pl.UInt32}
    data = {"b": [3, 4], "a": [1, 2]}
    for df in (
        pl.DataFrame(data, schema=schema),
        pl.DataFrame(data, schema=["a", "b"], schema_overrides=schema),
    ):
        # ┌─────┬─────┐
        # │ a   ┆ b   │
        # │ --- ┆ --- │
        # │ u8  ┆ u32 │
        # ╞═════╪═════╡
        # │ 1   ┆ 3   │
        # │ 2   ┆ 4   │
        # └─────┴─────┘
        assert df.columns == ["a", "b"]
        assert df.schema == {"a": pl.UInt8, "b": pl.UInt32}
        assert df.rows() == [(1, 3), (2, 4)]

    # expect an error; this check does not depend on the loop variable, so
    # it is performed once, after the loop, rather than once per iteration
    mismatched_schema = {"x": pl.UInt8, "b": pl.UInt32}
    with pytest.raises(ValueError):
        pl.DataFrame(data, schema=mismatched_schema)
37

38

39
def test_from_dict_with_scalars() -> None:
    """Build frames from dicts that mix columns with scalars and iterables.

    Covers scalar values next to real columns, all-scalar input, unsized
    iterables (map/range/generator/dict views), struct-like nested dicts,
    numpy arrays, and a larger numpy input.
    """
    # scalars (including None) next to at least one real column
    partly_scalar = pl.DataFrame(
        {"key": ["aa", "bb", "cc"], "misc": "xyz", "other": None, "value": 0}
    )
    expected_partly_scalar = {
        "key": ["aa", "bb", "cc"],
        "misc": ["xyz", "xyz", "xyz"],
        "other": [None, None, None],
        "value": [0, 0, 0],
    }
    assert partly_scalar.to_dict(as_series=False) == expected_partly_scalar

    # edge-case: nothing but scalars
    only_scalars = pl.DataFrame(
        {"key": "aa", "misc": "xyz", "other": None, "value": 0}
    )
    expected_only_scalars = {
        "key": ["aa"],
        "misc": ["xyz"],
        "other": [None],
        "value": [0],
    }
    assert only_scalars.to_dict(as_series=False) == expected_only_scalars

    # edge-case: a lone unsized iterable
    from_map = pl.DataFrame({"vals": map(float, [1, 2, 3])})
    assert from_map.to_dict(as_series=False) == {"vals": [1.0, 2.0, 3.0]}

    # map/range/generator columns must not be accidentally consumed or
    # expanded, and schema dtype/ordering directives must still apply
    reordered_schema = {
        "value": pl.String,
        "other": pl.Float32,
        "misc": pl.Int32,
        "key": pl.Int8,
    }
    from_lazy_iterables = pl.DataFrame(
        {
            "key": range(1, 4),
            "misc": (x for x in [4, 5, 6]),
            "other": map(float, [7, 8, 9]),
            "value": {0: "x", 1: "y", 2: "z"}.values(),
        },
        schema=reordered_schema,
    )
    assert from_lazy_iterables.columns == ["value", "other", "misc", "key"]
    assert from_lazy_iterables.to_dict(as_series=False) == {
        "value": ["x", "y", "z"],
        "other": [7.0, 8.0, 9.0],
        "misc": [4, 5, 6],
        "key": [1, 2, 3],
    }
    assert from_lazy_iterables.schema == {
        "value": pl.String,
        "other": pl.Float32,
        "misc": pl.Int32,
        "key": pl.Int8,
    }

    # struct columns mixed in; the dtype of "y" is given in two different ways
    via_overrides = pl.from_dict(
        {"x": {"b": [1, 3], "c": [2, 4]}, "y": [5, 6], "z": "x"},
        schema_overrides={"y": pl.Int8},
    )
    via_schema = pl.from_dict(
        {"x": {"b": [1, 3], "c": [2, 4]}, "y": [5, 6], "z": "x"},
        schema=["x", ("y", pl.Int8), "z"],
    )
    expected_struct_rows = [
        ({"b": 1, "c": 2}, 5, "x"),
        ({"b": 3, "c": 4}, 6, "x"),
    ]
    expected_struct_schema = {
        "x": pl.Struct([pl.Field("b", pl.Int64), pl.Field("c", pl.Int64)]),
        "y": pl.Int8,
        "z": pl.String,
    }
    for with_struct in (via_overrides, via_schema):
        assert with_struct.rows() == expected_struct_rows
        assert with_struct.schema == expected_struct_schema

    # numpy columns mixed with a scalar...
    small_numpy = pl.DataFrame(
        {"x": np.ones(3), "y": np.zeros(3), "z": 1.0},
    )
    assert small_numpy.rows() == [(1.0, 0.0, 1.0)] * 3

    # ...and an input large enough to trigger the multithreaded load codepath
    n_rows = 1001
    large_numpy = pl.DataFrame(
        {
            "w": np.zeros(n_rows, dtype=np.uint8),
            "x": np.ones(n_rows, dtype=np.uint8),
            "y": np.zeros(n_rows, dtype=np.uint8),
            "z": 1,
        },
        schema_overrides={"z": pl.UInt8},
    )
    assert large_numpy[999:].rows() == [(0, 1, 0, 1), (0, 1, 0, 1)]
    assert large_numpy.schema == {
        "w": pl.UInt8,
        "x": pl.UInt8,
        "y": pl.UInt8,
        "z": pl.UInt8,
    }

    # assorted iterator/generator flavours plus a scalar
    from_iterators = pl.DataFrame(
        {
            "a": iter([0, 1, 2]),
            "b": (2, 1, 0).__iter__(),
            "c": (v for v in (0, 0, 0)),
            "d": "x",
        }
    )
    assert from_iterators.rows() == [
        (0, 2, 0, "x"),
        (1, 1, 0, "x"),
        (2, 0, 0, "x"),
    ]
146

147

148
@pytest.mark.slow
def test_from_dict_with_values_mixed() -> None:
    """Combine a long range column with one scalar per supported value kind."""
    # one scalar for each kind of value being exercised
    scalar_values: dict[str, Any] = {
        "a": 0,
        "b": 8,
        "c": 9.5,
        "d": None,
        "e": True,
        "f": False,
        "g": time(0, 1, 2),
        "h": date(2023, 3, 14),
        "i": timedelta(seconds=3601),
        "j": datetime(2111, 11, 11, 11, 11, 11, 11),
        "k": "「趣味でヒーローをやっている者だ」",
    }
    # note: the row count is deliberately large; with every dtype on the
    # fast-path this only takes ~0.03secs, so if the runtime ever becomes
    # even remotely noticeable that indicates a regression.
    row_count = 1_000_000
    frame_input: dict[str, Any] = {"idx": range(row_count), **scalar_values}
    target_schema = {
        "idx": pl.Int32,
        "a": pl.UInt16,
        "b": pl.UInt32,
        "c": pl.Float64,
        "d": pl.Float32,
        "e": pl.Boolean,
        "f": pl.Boolean,
        "g": pl.Time,
        "h": pl.Date,
        "i": pl.Duration,
        "j": pl.Datetime,
        "k": pl.String,
    }
    df = pl.DataFrame(data=frame_input, schema=target_schema)
    scalars_only = df.select(pl.exclude("idx"))

    assert df.height == row_count

    # apart from "idx", every row should be identical: compare adjacent
    # slices at both ends and one row from the middle against the input
    head_a, head_b = scalars_only[:5], scalars_only[5:10]
    assert head_a.rows() == head_b.rows()
    tail_a, tail_b = scalars_only[-10:-5], scalars_only[-5:]
    assert tail_a.rows() == tail_b.rows()
    middle_row = scalars_only.row(row_count // 2, named=True)
    assert middle_row == scalar_values
193

194

195
def test_from_dict_expand_nested_struct() -> None:
    """Check that a nested dict value is initialised consistently as a struct."""
    # every scalar/list spelling of the same single row must give one frame
    dt = date(2077, 10, 10)
    single_row_expected = pl.DataFrame(
        [
            pl.Series("x", [dt]),
            pl.Series("nested", [{"y": -1, "z": 1}]),
        ]
    )
    single_row_frames = [
        pl.DataFrame({"x": dt, "nested": {"y": -1, "z": 1}}),
        pl.DataFrame({"x": dt, "nested": [{"y": -1, "z": 1}]}),
        pl.DataFrame({"x": [dt], "nested": {"y": -1, "z": 1}}),
        pl.DataFrame({"x": [dt], "nested": [{"y": -1, "z": 1}]}),
    ]
    for frame in single_row_frames:
        assert_frame_equal(single_row_expected, frame)

    # a single nested dict next to a 3-element column expands to 3 values
    three_row_expected = pl.DataFrame(
        [
            pl.Series("x", [0, 1, 2]),
            pl.Series(
                "nested",
                [{"y": -1, "z": 1}, {"y": -1, "z": 1}, {"y": -1, "z": 1}],
            ),
        ]
    )
    three_row_frames = [
        pl.DataFrame({"x": range(3), "nested": {"y": -1, "z": 1}}),
        pl.DataFrame({"x": [0, 1, 2], "nested": {"y": -1, "z": 1}}),
    ]
    for frame in three_row_frames:
        assert_frame_equal(three_row_expected, frame)
225

226

227
def test_from_dict_duration_subseconds() -> None:
    """Compare a sub-second timedelta loaded via from_dict with pl.duration."""
    delta = timedelta(seconds=1, microseconds=1000)
    from_python = pl.from_dict({"duration": [delta]})
    from_expr = pl.select(duration=pl.duration(seconds=1, microseconds=1000))
    assert_frame_equal(from_python, from_expr)
232

233

234
@pytest.mark.parametrize(
    ("dtype", "data"),
    [
        (pl.Date, date(2099, 12, 31)),
        (pl.Datetime("ms"), datetime(1998, 10, 1, 10, 30)),
        (pl.Duration("us"), timedelta(days=1)),
        (pl.Time, time(2, 30, 10)),
    ],
)
def test_from_dict_cast_logical_type(dtype: pl.DataType, data: Any) -> None:
    """Round-trip a temporal value through its Int64 form via from_dicts."""
    schema = {"data": dtype}

    # reference frame built directly from the Python temporal object
    df = pl.DataFrame({"data": [data]}, schema=schema)

    # the same value after casting the column to Int64
    physical_value = df.cast(pl.Int64).to_dict()["data"][0]

    # feeding that integer back with the logical schema must give `df` again
    row = {"data": physical_value}
    df_from_dicts = pl.from_dicts([row], schema=schema)

    assert_frame_equal(df_from_dicts, df)
258

259
Product

Resources

Company