# Path: blob/main/py-polars/tests/unit/dataframe/test_from_dict.py
# 6939 views
from __future__ import annotations

from datetime import date, datetime, time, timedelta
from typing import Any

import numpy as np
import pytest

import polars as pl
from polars.testing import assert_frame_equal


def test_from_dict_with_column_order() -> None:
    """Check that the declared schema order wins over the dict key order."""
    # expect schema/columns order to take precedence
    schema = {"a": pl.UInt8, "b": pl.UInt32}
    data = {"b": [3, 4], "a": [1, 2]}
    for df in (
        pl.DataFrame(data, schema=schema),
        pl.DataFrame(data, schema=["a", "b"], schema_overrides=schema),
    ):
        # ┌─────┬─────┐
        # │ a   ┆ b   │
        # │ --- ┆ --- │
        # │ u8  ┆ u32 │
        # ╞═════╪═════╡
        # │ 1   ┆ 3   │
        # │ 2   ┆ 4   │
        # └─────┴─────┘
        assert df.columns == ["a", "b"]
        assert df.schema == {"a": pl.UInt8, "b": pl.UInt32}
        assert df.rows() == [(1, 3), (2, 4)]

    # expect an error
    mismatched_schema = {"x": pl.UInt8, "b": pl.UInt32}
    with pytest.raises(ValueError):
        pl.DataFrame({"b": [3, 4], "a": [1, 2]}, schema=mismatched_schema)


def test_from_dict_with_scalars() -> None:
    """Check dict init where columns mix sized data, scalars and iterators."""
    # one or more valid arrays, with some scalars (inc. None)
    df1 = pl.DataFrame(
        {"key": ["aa", "bb", "cc"], "misc": "xyz", "other": None, "value": 0}
    )
    assert df1.to_dict(as_series=False) == {
        "key": ["aa", "bb", "cc"],
        "misc": ["xyz", "xyz", "xyz"],
        "other": [None, None, None],
        "value": [0, 0, 0],
    }

    # edge-case: all scalars
    df2 = pl.DataFrame({"key": "aa", "misc": "xyz", "other": None, "value": 0})
    assert df2.to_dict(as_series=False) == {
        "key": ["aa"],
        "misc": ["xyz"],
        "other": [None],
        "value": [0],
    }

    # edge-case: single unsized generator
    df3 = pl.DataFrame({"vals": map(float, [1, 2, 3])})
    assert df3.to_dict(as_series=False) == {"vals": [1.0, 2.0, 3.0]}

    # ensure we don't accidentally consume or expand map/range/generator
    # cols, and can properly apply schema dtype/ordering directives
    df4 = pl.DataFrame(
        {
            "key": range(1, 4),
            "misc": (x for x in [4, 5, 6]),
            "other": map(float, [7, 8, 9]),
            "value": {0: "x", 1: "y", 2: "z"}.values(),
        },
        schema={
            "value": pl.String,
            "other": pl.Float32,
            "misc": pl.Int32,
            "key": pl.Int8,
        },
    )
    assert df4.columns == ["value", "other", "misc", "key"]
    assert df4.to_dict(as_series=False) == {
        "value": ["x", "y", "z"],
        "other": [7.0, 8.0, 9.0],
        "misc": [4, 5, 6],
        "key": [1, 2, 3],
    }
    assert df4.schema == {
        "value": pl.String,
        "other": pl.Float32,
        "misc": pl.Int32,
        "key": pl.Int8,
    }

    # mixed with struct cols
    for df5 in (
        pl.from_dict(
            {"x": {"b": [1, 3], "c": [2, 4]}, "y": [5, 6], "z": "x"},
            schema_overrides={"y": pl.Int8},
        ),
        pl.from_dict(
            {"x": {"b": [1, 3], "c": [2, 4]}, "y": [5, 6], "z": "x"},
            schema=["x", ("y", pl.Int8), "z"],
        ),
    ):
        assert df5.rows() == [({"b": 1, "c": 2}, 5, "x"), ({"b": 3, "c": 4}, 6, "x")]
        assert df5.schema == {
            "x": pl.Struct([pl.Field("b", pl.Int64), pl.Field("c", pl.Int64)]),
            "y": pl.Int8,
            "z": pl.String,
        }

    # mixed with numpy cols...
    df6 = pl.DataFrame(
        {"x": np.ones(3), "y": np.zeros(3), "z": 1.0},
    )
    assert df6.rows() == [(1.0, 0.0, 1.0), (1.0, 0.0, 1.0), (1.0, 0.0, 1.0)]

    # ...and trigger multithreaded load codepath
    df7 = pl.DataFrame(
        {
            "w": np.zeros(1001, dtype=np.uint8),
            "x": np.ones(1001, dtype=np.uint8),
            "y": np.zeros(1001, dtype=np.uint8),
            "z": 1,
        },
        schema_overrides={"z": pl.UInt8},
    )
    assert df7[999:].rows() == [(0, 1, 0, 1), (0, 1, 0, 1)]
    assert df7.schema == {
        "w": pl.UInt8,
        "x": pl.UInt8,
        "y": pl.UInt8,
        "z": pl.UInt8,
    }

    # misc generators/iterables
    df9 = pl.DataFrame(
        {
            "a": iter([0, 1, 2]),
            "b": (2, 1, 0).__iter__(),
            "c": (v for v in (0, 0, 0)),
            "d": "x",
        }
    )
    assert df9.rows() == [(0, 2, 0, "x"), (1, 1, 0, "x"), (2, 0, 0, "x")]


@pytest.mark.slow
def test_from_dict_with_values_mixed() -> None:
    """Check scalars of many dtypes expanded against a large range column."""
    # a bit of everything
    mixed_dtype_data: dict[str, Any] = {
        "a": 0,
        "b": 8,
        "c": 9.5,
        "d": None,
        "e": True,
        "f": False,
        "g": time(0, 1, 2),
        "h": date(2023, 3, 14),
        "i": timedelta(seconds=3601),
        "j": datetime(2111, 11, 11, 11, 11, 11, 11),
        "k": "「趣味でヒーローをやっている者だ」",
    }
    # note: deliberately set this value large; if all dtypes are
    # on the fast-path it'll only take ~0.03secs. if it becomes
    # even remotely noticeable that will indicate a regression.
    n_range = 1_000_000
    index_and_data: dict[str, Any] = {"idx": range(n_range)}
    index_and_data.update(mixed_dtype_data)
    df = pl.DataFrame(
        data=index_and_data,
        schema={
            "idx": pl.Int32,
            "a": pl.UInt16,
            "b": pl.UInt32,
            "c": pl.Float64,
            "d": pl.Float32,
            "e": pl.Boolean,
            "f": pl.Boolean,
            "g": pl.Time,
            "h": pl.Date,
            "i": pl.Duration,
            "j": pl.Datetime,
            "k": pl.String,
        },
    )
    dfx = df.select(pl.exclude("idx"))

    assert df.height == n_range
    # every non-index row should be the same broadcast scalar values
    assert dfx[:5].rows() == dfx[5:10].rows()
    assert dfx[-10:-5].rows() == dfx[-5:].rows()
    assert dfx.row(n_range // 2, named=True) == mixed_dtype_data


def test_from_dict_expand_nested_struct() -> None:
    """Check a dict-valued column yields the same struct however it is given."""
    # confirm consistent init of nested struct from dict data
    dt = date(2077, 10, 10)
    expected = pl.DataFrame(
        [
            pl.Series("x", [dt]),
            pl.Series("nested", [{"y": -1, "z": 1}]),
        ]
    )
    for df in (
        pl.DataFrame({"x": dt, "nested": {"y": -1, "z": 1}}),
        pl.DataFrame({"x": dt, "nested": [{"y": -1, "z": 1}]}),
        pl.DataFrame({"x": [dt], "nested": {"y": -1, "z": 1}}),
        pl.DataFrame({"x": [dt], "nested": [{"y": -1, "z": 1}]}),
    ):
        assert_frame_equal(expected, df)

    # confirm expansion to 'n' nested values
    nested_values = [{"y": -1, "z": 1}, {"y": -1, "z": 1}, {"y": -1, "z": 1}]
    expected = pl.DataFrame(
        [
            pl.Series("x", [0, 1, 2]),
            pl.Series("nested", nested_values),
        ]
    )
    for df in (
        pl.DataFrame({"x": range(3), "nested": {"y": -1, "z": 1}}),
        pl.DataFrame({"x": [0, 1, 2], "nested": {"y": -1, "z": 1}}),
    ):
        assert_frame_equal(expected, df)


def test_from_dict_duration_subseconds() -> None:
    """Check a timedelta with a sub-second part matches `pl.duration`."""
    d = {"duration": [timedelta(seconds=1, microseconds=1000)]}
    result = pl.from_dict(d)
    expected = pl.select(duration=pl.duration(seconds=1, microseconds=1000))
    assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    ("dtype", "data"),
    [
        (pl.Date, date(2099, 12, 31)),
        (pl.Datetime("ms"), datetime(1998, 10, 1, 10, 30)),
        (pl.Duration("us"), timedelta(days=1)),
        (pl.Time, time(2, 30, 10)),
    ],
)
def test_from_dict_cast_logical_type(dtype: pl.DataType, data: Any) -> None:
    """Check `from_dicts` rebuilds a temporal column from its Int64 cast."""
    schema = {"data": dtype}
    df = pl.DataFrame({"data": [data]}, schema=schema)
    # cast the temporal column to Int64 to obtain a plain integer value
    physical_dict = df.cast(pl.Int64).to_dict()

    df_from_dicts = pl.from_dicts(
        [
            {
                "data": physical_dict["data"][0],
            }
        ],
        schema=schema,
    )

    assert_frame_equal(df_from_dicts, df)