# Path: blob/main/py-polars/tests/unit/interop/test_from_pandas.py
# 8430 views
from __future__ import annotations

from datetime import datetime, timedelta
from typing import TYPE_CHECKING, Any

import numpy as np
import pandas as pd
import pytest

import polars as pl
from polars.testing import assert_frame_equal
from polars.testing.asserts.series import assert_series_equal

if TYPE_CHECKING:
    from polars._typing import PolarsDataType
    from tests.conftest import PlMonkeyPatch


def test_index_not_silently_excluded() -> None:
    """An index whose name collides with a column name must raise, not be dropped."""
    ddict = {"a": [1, 2, 3], "b": [4, 5, 6]}
    df = pd.DataFrame(ddict, index=pd.Index([7, 8, 9], name="a"))
    with pytest.raises(ValueError, match="indices and column names must not overlap"):
        pl.from_pandas(df, include_index=True)


def test_nameless_multiindex_doesnt_raise_with_include_index_false_18130() -> None:
    """A frame with an unnamed MultiIndex converts when the index is not included."""
    df = pd.DataFrame(
        range(4),
        columns=["A"],
        index=pd.MultiIndex.from_product((["C", "D"], [3, 4])),
    )
    result = pl.from_pandas(df)
    expected = pl.DataFrame({"A": [0, 1, 2, 3]})
    assert_frame_equal(result, expected)


def test_from_pandas() -> None:
    """Check shape, schema, row values and partial schema overrides on a mixed frame."""
    df = pd.DataFrame(
        {
            "bools": [False, True, False],
            "bools_nulls": [None, True, False],
            "int": [1, 2, 3],
            "int_nulls": [1, None, 3],
            "floats": [1.0, 2.0, 3.0],
            "floats_nulls": [1.0, None, 3.0],
            "strings": ["foo", "bar", "ham"],
            "strings_nulls": ["foo", None, "ham"],
            "strings-cat": ["foo", "bar", "ham"],
        }
    )
    df["strings-cat"] = df["strings-cat"].astype("category")

    out = pl.from_pandas(df)
    assert out.shape == (3, 9)
    assert out.schema == {
        "bools": pl.Boolean,
        "bools_nulls": pl.Boolean,
        "int": pl.Int64,
        "int_nulls": pl.Float64,
        "floats": pl.Float64,
        "floats_nulls": pl.Float64,
        "strings": pl.String,
        "strings_nulls": pl.String,
        "strings-cat": pl.Categorical(),
    }
    assert out.rows() == [
        (False, None, 1, 1.0, 1.0, 1.0, "foo", "foo", "foo"),
        (True, True, 2, None, 2.0, None, "bar", None, "bar"),
        (False, False, 3, 3.0, 3.0, 3.0, "ham", "ham", "ham"),
    ]

    # partial dtype overrides from pandas
    overrides = {"int": pl.Int8, "int_nulls": pl.Int32, "floats": pl.Float32}
    out = pl.from_pandas(df, schema_overrides=overrides)
    for col, dtype in overrides.items():
        assert out.schema[col] == dtype


@pytest.mark.parametrize(
    "nulls",
    [
        [],
        [None],
        [None, None],
        [None, None, None],
    ],
)
def test_from_pandas_nulls(nulls: list[None]) -> None:
    """Empty and all-null pandas Series round-trip to the same list of values."""
    # empty and/or all null values, no pandas dtype
    ps = pd.Series(nulls)
    s = pl.from_pandas(ps)
    assert nulls == s.to_list()


def test_from_pandas_nan_to_null() -> None:
    """Check NaN handling with the default settings and with `nan_to_null=False`."""
    df = pd.DataFrame(
        {
            "bools_nulls": [None, True, False],
            "int_nulls": [1, None, 3],
            "floats_nulls": [1.0, None, 3.0],
            "strings_nulls": ["foo", None, "ham"],
            "nulls": [None, np.nan, np.nan],
        }
    )
    out_true = pl.from_pandas(df)
    out_false = pl.from_pandas(df, nan_to_null=False)
    assert all(val is None for val in out_true["nulls"])
    assert all(np.isnan(val) for val in out_false["nulls"][1:])

    df = pd.Series([2, np.nan, None], name="pd")  # type: ignore[assignment]
    out_true = pl.from_pandas(df)
    out_false = pl.from_pandas(df, nan_to_null=False)
    # Asserting a bare (non-empty) list is always truthy, so reduce with `all`.
    # The first element is the real value 2; only the tail holds missing values.
    assert all(val is None for val in out_true[1:])
    assert all(np.isnan(val) for val in out_false[1:])


def test_from_pandas_datetime() -> None:
    """Datetime Series and a `date_range` index keep their time components."""
    ts = datetime(2021, 1, 1, 20, 20, 20, 20)
    pd_s = pd.Series([ts, ts])
    tmp = pl.from_pandas(pd_s.to_frame("a"))
    s = tmp["a"]
    assert s.dt.hour()[0] == 20
    assert s.dt.minute()[0] == 20
    assert s.dt.second()[0] == 20

    date_times = pd.date_range("2021-06-24 00:00:00", "2021-06-24 09:00:00", freq="1h")
    s = pl.from_pandas(date_times)
    assert s[0] == datetime(2021, 6, 24, 0, 0)
    assert s[-1] == datetime(2021, 6, 24, 9, 0)


@pytest.mark.parametrize(
    ("index_class", "index_data", "index_params", "expected_data", "expected_dtype"),
    [
        (pd.Index, [100, 200, 300], {}, None, pl.Int64),
        (pd.Index, [1, 2, 3], {"dtype": "uint32"}, None, pl.UInt32),
        (pd.RangeIndex, 5, {}, [0, 1, 2, 3, 4], pl.Int64),
        (pd.CategoricalIndex, ["N", "E", "S", "W"], {}, None, pl.Categorical),
        (
            pd.DatetimeIndex,
            [datetime(1960, 12, 31), datetime(2077, 10, 20)],
            {"dtype": "datetime64[ms]"},
            None,
            pl.Datetime("ms"),
        ),
        (
            pd.TimedeltaIndex,
            ["24 hours", "2 days 8 hours", "3 days 42 seconds"],
            {"dtype": "timedelta64[us]"},
            [timedelta(1), timedelta(days=2, hours=8), timedelta(days=3, seconds=42)],
            pl.Duration("us"),
        ),
    ],
)
def test_from_pandas_index(
    index_class: Any,
    index_data: Any,
    index_params: dict[str, Any],
    expected_data: list[Any] | None,
    expected_dtype: PolarsDataType,
) -> None:
    """
    Convert a pandas index object and check its values and dtype.

    When `expected_data` is None the input data itself is the expected output.
    """
    if expected_data is None:
        expected_data = index_data

    s = pl.from_pandas(index_class(index_data, **index_params))
    assert s.to_list() == expected_data
    assert s.dtype == expected_dtype


def test_from_pandas_include_indexes() -> None:
    """Index columns are dropped by default and kept with `include_index=True`."""
    data = {
        "dtm": [datetime(2021, 1, 1), datetime(2021, 1, 2), datetime(2021, 1, 3)],
        "val": [100, 200, 300],
        "misc": ["x", "y", "z"],
    }
    pd_df = pd.DataFrame(data)

    df = pl.from_pandas(pd_df.set_index(["dtm"]))
    assert df.to_dict(as_series=False) == {
        "val": [100, 200, 300],
        "misc": ["x", "y", "z"],
    }

    df = pl.from_pandas(pd_df.set_index(["dtm", "val"]))
    assert df.to_dict(as_series=False) == {"misc": ["x", "y", "z"]}

    df = pl.from_pandas(pd_df.set_index(["dtm"]), include_index=True)
    assert df.to_dict(as_series=False) == data

    df = pl.from_pandas(pd_df.set_index(["dtm", "val"]), include_index=True)
    assert df.to_dict(as_series=False) == data


def test_from_pandas_series_include_indexes() -> None:
    """A Series converted with `include_index=True` yields index and value columns."""
    # no default index
    pd_series = pd.Series({"a": 1, "b": 2}, name="number").rename_axis(["letter"])
    df = pl.from_pandas(pd_series, include_index=True)
    assert df.to_dict(as_series=False) == {"letter": ["a", "b"], "number": [1, 2]}

    # default index
    pd_series = pd.Series(range(2))
    df = pl.from_pandas(pd_series, include_index=True)
    assert df.to_dict(as_series=False) == {"index": [0, 1], "0": [0, 1]}


def test_duplicate_cols_diff_types() -> None:
    """Column labels "0" and 0 (str vs int) are rejected as non-unique."""
    df = pd.DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]], columns=["0", 0, "1", 1])
    with pytest.raises(
        ValueError,
        match="Pandas dataframe contains non-unique indices and/or column names",
    ):
        pl.from_pandas(df)


def test_from_pandas_duplicated_columns() -> None:
    """A frame with a repeated column name is rejected."""
    df = pd.DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]], columns=["a", "b", "c", "b"])
    with pytest.raises(
        ValueError,
        match="Pandas dataframe contains non-unique indices and/or column names",
    ):
        pl.from_pandas(df)


def test_from_pandas_null() -> None:
    """An all-None column passed to `pl.DataFrame` is expected to become String."""
    # an all-None column is built from Python objects; the expected dtype is pl.String
    df = pd.DataFrame([{"a": None}, {"a": None}])
    out = pl.DataFrame(df)
    assert out.dtypes == [pl.String]
    assert out["a"][0] is None

    df = pd.DataFrame([{"a": None, "b": 1}, {"a": None, "b": 2}])
    out = pl.DataFrame(df)
    assert out.dtypes == [pl.String, pl.Int64]


def test_from_pandas_nested_list() -> None:
    """A column of Python lists converts without error and keeps its values."""
    # this panicked in https://github.com/pola-rs/polars/issues/1615
    pddf = pd.DataFrame(
        {"a": [1, 2, 3, 4], "b": [["x", "y"], ["x", "y", "z"], ["x"], ["x", "y"]]}
    )
    pldf = pl.from_pandas(pddf)
    assert pldf.shape == (4, 2)
    assert pldf.rows() == [
        (1, ["x", "y"]),
        (2, ["x", "y", "z"]),
        (3, ["x"]),
        (4, ["x", "y"]),
    ]


def test_from_pandas_categorical_none() -> None:
    """`pd.NA` in a category Series becomes None in a Categorical Series."""
    s = pd.Series(["a", "b", "c", pd.NA], dtype="category")
    out = pl.from_pandas(s)
    assert out.dtype == pl.Categorical
    assert out.to_list() == ["a", "b", "c", None]


def test_from_pandas_dataframe() -> None:
    """A plain integer frame converts; a non-pandas input raises TypeError."""
    pd_df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=["a", "b", "c"])
    df = pl.from_pandas(pd_df)
    assert df.shape == (2, 3)
    assert df.rows() == [(1, 2, 3), (4, 5, 6)]

    # if the input is not a pandas object (here: a list), raise a TypeError
    with pytest.raises(TypeError):
        _ = pl.from_pandas([1, 2])  # type: ignore[call-overload]


def test_from_pandas_series() -> None:
    """A simple integer Series keeps its shape and values."""
    pd_series = pd.Series([1, 2, 3], name="pd")
    s = pl.from_pandas(pd_series)
    assert s.shape == (3,)
    assert list(s) == [1, 2, 3]


def test_from_empty_pandas() -> None:
    """Empty columns keep their names and are expected to be Float64."""
    pandas_df = pd.DataFrame(
        {
            "A": [],
            "fruits": [],
        }
    )
    polars_df = pl.from_pandas(pandas_df)
    assert polars_df.columns == ["A", "fruits"]
    assert polars_df.dtypes == [pl.Float64, pl.Float64]


def test_from_null_column() -> None:
    """A column holding only `pd.NA` is expected to convert to the Null dtype."""
    df = pl.from_pandas(pd.DataFrame(data=[pd.NA, pd.NA], columns=["n/a"]))

    assert df.shape == (2, 1)
    assert df.columns == ["n/a"]
    assert df.dtypes[0] == pl.Null


def test_from_pandas_ns_resolution() -> None:
    """A timestamp with a nanosecond component compares equal to its datetime."""
    df = pd.DataFrame(
        [pd.Timestamp(year=2021, month=1, day=1, hour=1, second=1, nanosecond=1)],
        columns=["date"],
    )
    assert pl.from_pandas(df)[0, 0] == datetime(2021, 1, 1, 1, 0, 1)


def test_pandas_string_none_conversion_3298() -> None:
    """None inside a string column survives conversion via `pl.DataFrame`."""
    data: dict[str, list[str | None]] = {"col_1": ["a", "b", "c", "d"]}
    data["col_1"][0] = None
    df_pd = pd.DataFrame(data)
    df_pl = pl.DataFrame(df_pd)
    assert df_pl.to_series().to_list() == [None, "b", "c", "d"]


def test_from_pandas_null_struct_6412() -> None:
    """A dict column with a null field and a null row converts to struct values."""
    data = [
        {
            "a": {
                "b": None,
            },
        },
        {"a": None},
    ]
    df_pandas = pd.DataFrame(data)
    assert pl.from_pandas(df_pandas).to_dict(as_series=False) == {
        "a": [{"b": None}, None]
    }


def test_untrusted_categorical_input() -> None:
    """Group-by on a categorical column that has an unused category ("y")."""
    df_pd = pd.DataFrame({"x": pd.Categorical(["x"], ["x", "y"])})
    df = pl.from_pandas(df_pd)
    result = df.group_by("x").len()
    expected = pl.DataFrame(
        {"x": ["x"], "len": [1]},
        schema={"x": pl.Categorical, "len": pl.get_index_type()},
    )
    assert_frame_equal(result, expected, categorical_as_str=True)


@pytest.fixture
def _set_pyarrow_unavailable(plmonkeypatch: PlMonkeyPatch) -> None:
    """Patch the `_PYARROW_AVAILABLE` flags of both construction modules to False."""
    plmonkeypatch.setattr(
        "polars._utils.construction.dataframe._PYARROW_AVAILABLE", False
    )
    plmonkeypatch.setattr("polars._utils.construction.series._PYARROW_AVAILABLE", False)


@pytest.mark.usefixtures("_set_pyarrow_unavailable")
def test_from_pandas_pyarrow_not_available_succeeds() -> None:
    """Int, bool, datetime and timedelta data convert with pyarrow flagged off."""
    data: dict[str, Any] = {
        "a": [1, 2],
        "b": [3, 4],
        "c": np.array(["2020-01-01", "2020-01-02"], dtype="datetime64[ns]"),
        "d": np.array(["2020-01-01", "2020-01-02"], dtype="datetime64[us]"),
        "e": np.array(["2020-01-01", "2020-01-02"], dtype="datetime64[ms]"),
        "f": np.array([1, 2], dtype="timedelta64[ns]"),
        "g": np.array([1, 2], dtype="timedelta64[us]"),
        "h": np.array([1, 2], dtype="timedelta64[ms]"),
        "i": [True, False],
    }

    # DataFrame
    result = pl.from_pandas(pd.DataFrame(data))
    expected = pl.DataFrame(data)
    assert_frame_equal(result, expected)

    # Series (only the values are needed, so iterate them directly)
    for values in data.values():
        s_pd = pd.Series(values)
        result_s = pl.from_pandas(s_pd)
        expected_s = pl.Series(values)
        assert_series_equal(result_s, expected_s)


@pytest.mark.usefixtures("_set_pyarrow_unavailable")
def test_from_pandas_pyarrow_not_available_fails() -> None:
    """These inputs must raise ImportError when pyarrow is flagged unavailable."""
    with pytest.raises(ImportError, match="pyarrow is required"):
        pl.from_pandas(pd.DataFrame({"a": [1, 2, 3]}, dtype="Int64"))
    with pytest.raises(ImportError, match="pyarrow is required"):
        pl.from_pandas(pd.Series([1, 2, 3], dtype="Int64"))
    with pytest.raises(ImportError, match="pyarrow is required"):
        pl.from_pandas(
            pd.DataFrame({"a": pd.to_datetime(["2020-01-01T00:00+01:00"]).to_series()})
        )
    with pytest.raises(ImportError, match="pyarrow is required"):
        pl.from_pandas(pd.DataFrame({"a": [None, "foo"]}))


def test_from_pandas_nan_to_null_16453(plmonkeypatch: PlMonkeyPatch) -> None:
    """`nan_to_null=True` with `_MIN_NUMPY_SIZE_FOR_MULTITHREADING` patched to 2."""
    plmonkeypatch.setattr(
        "polars._utils.construction.dataframe._MIN_NUMPY_SIZE_FOR_MULTITHREADING", 2
    )
    df = pd.DataFrame(
        {"a": [np.nan, 1.0, 2], "b": [1.0, 2.0, 3.0], "c": [4.0, 5.0, 6.0]}
    )
    result = pl.from_pandas(df, nan_to_null=True)
    expected = pl.DataFrame(
        {"a": [None, 1.0, 2], "b": [1.0, 2.0, 3.0], "c": [4.0, 5.0, 6.0]}
    )
    assert_frame_equal(result, expected)


@pytest.mark.parametrize("null", [pd.NA, np.nan, None, float("nan")])
def test_from_pandas_string_with_natype_17355(null: Any) -> None:
    """Each missing-value marker in a string column converts to null."""
    # https://github.com/pola-rs/polars/issues/17355

    pd_df = pd.DataFrame({"col": ["a", null]})
    result = pl.from_pandas(pd_df)
    expected = pl.DataFrame({"col": ["a", None]})
    assert_frame_equal(result, expected)