# Path: blob/main/py-polars/tests/unit/interop/test_from_pandas.py
# 6939 views
"""Tests for building polars objects from pandas DataFrame, Series and Index inputs."""

from __future__ import annotations

from datetime import datetime, timedelta
from typing import TYPE_CHECKING, Any

import numpy as np
import pandas as pd
import pytest

import polars as pl
from polars.testing import assert_frame_equal
from polars.testing.asserts.series import assert_series_equal

if TYPE_CHECKING:
    from polars._typing import PolarsDataType


def test_index_not_silently_excluded() -> None:
    """An index whose name collides with a column name must raise."""
    ddict = {"a": [1, 2, 3], "b": [4, 5, 6]}
    df = pd.DataFrame(ddict, index=pd.Index([7, 8, 9], name="a"))
    with pytest.raises(ValueError, match="indices and column names must not overlap"):
        pl.from_pandas(df, include_index=True)


def test_nameless_multiindex_doesnt_raise_with_include_index_false_18130() -> None:
    """An unnamed MultiIndex is dropped when the index is not included."""
    df = pd.DataFrame(
        range(4),
        columns=["A"],
        index=pd.MultiIndex.from_product((["C", "D"], [3, 4])),
    )
    result = pl.from_pandas(df)
    expected = pl.DataFrame({"A": [0, 1, 2, 3]})
    assert_frame_equal(result, expected)


def test_from_pandas() -> None:
    """Check shape, schema and rows for a frame of mixed dtypes, with and without nulls."""
    df = pd.DataFrame(
        {
            "bools": [False, True, False],
            "bools_nulls": [None, True, False],
            "int": [1, 2, 3],
            "int_nulls": [1, None, 3],
            "floats": [1.0, 2.0, 3.0],
            "floats_nulls": [1.0, None, 3.0],
            "strings": ["foo", "bar", "ham"],
            "strings_nulls": ["foo", None, "ham"],
            "strings-cat": ["foo", "bar", "ham"],
        }
    )
    df["strings-cat"] = df["strings-cat"].astype("category")

    out = pl.from_pandas(df)
    assert out.shape == (3, 9)
    assert out.schema == {
        "bools": pl.Boolean,
        "bools_nulls": pl.Boolean,
        "int": pl.Int64,
        "int_nulls": pl.Float64,
        "floats": pl.Float64,
        "floats_nulls": pl.Float64,
        "strings": pl.String,
        "strings_nulls": pl.String,
        "strings-cat": pl.Categorical(ordering="lexical"),
    }
    assert out.rows() == [
        (False, None, 1, 1.0, 1.0, 1.0, "foo", "foo", "foo"),
        (True, True, 2, None, 2.0, None, "bar", None, "bar"),
        (False, False, 3, 3.0, 3.0, 3.0, "ham", "ham", "ham"),
    ]

    # partial dtype overrides from pandas
    overrides = {"int": pl.Int8, "int_nulls": pl.Int32, "floats": pl.Float32}
    out = pl.from_pandas(df, schema_overrides=overrides)
    for col, dtype in overrides.items():
        assert out.schema[col] == dtype


@pytest.mark.parametrize(
    "nulls",
    [
        [],
        [None],
        [None, None],
        [None, None, None],
    ],
)
def test_from_pandas_nulls(nulls: list[None]) -> None:
    """Empty and all-null pandas Series round-trip to the same list."""
    # empty and/or all null values, no pandas dtype
    ps = pd.Series(nulls)
    s = pl.from_pandas(ps)
    assert nulls == s.to_list()


def test_from_pandas_nan_to_null() -> None:
    """NaN becomes null by default and stays NaN with ``nan_to_null=False``."""
    df = pd.DataFrame(
        {
            "bools_nulls": [None, True, False],
            "int_nulls": [1, None, 3],
            "floats_nulls": [1.0, None, 3.0],
            "strings_nulls": ["foo", None, "ham"],
            "nulls": [None, np.nan, np.nan],
        }
    )
    out_true = pl.from_pandas(df)
    out_false = pl.from_pandas(df, nan_to_null=False)
    assert all(val is None for val in out_true["nulls"])
    assert all(np.isnan(val) for val in out_false["nulls"][1:])

    # Same check for a Series. Element 0 is a real value (2), so only the
    # trailing NaN/None entries are inspected. The assertions are wrapped in
    # ``all(...)``: asserting on a bare non-empty list is always true.
    ps = pd.Series([2, np.nan, None], name="pd")
    s_true = pl.from_pandas(ps)
    s_false = pl.from_pandas(ps, nan_to_null=False)
    assert all(val is None for val in s_true[1:])
    assert all(np.isnan(val) for val in s_false[1:])


def test_from_pandas_datetime() -> None:
    """Datetime values from a pandas Series and a DatetimeIndex are preserved."""
    ts = datetime(2021, 1, 1, 20, 20, 20, 20)
    pd_s = pd.Series([ts, ts])
    tmp = pl.from_pandas(pd_s.to_frame("a"))
    s = tmp["a"]
    assert s.dt.hour()[0] == 20
    assert s.dt.minute()[0] == 20
    assert s.dt.second()[0] == 20

    date_times = pd.date_range("2021-06-24 00:00:00", "2021-06-24 09:00:00", freq="1h")
    s = pl.from_pandas(date_times)
    assert s[0] == datetime(2021, 6, 24, 0, 0)
    assert s[-1] == datetime(2021, 6, 24, 9, 0)


@pytest.mark.parametrize(
    ("index_class", "index_data", "index_params", "expected_data", "expected_dtype"),
    [
        (pd.Index, [100, 200, 300], {}, None, pl.Int64),
        (pd.Index, [1, 2, 3], {"dtype": "uint32"}, None, pl.UInt32),
        (pd.RangeIndex, 5, {}, [0, 1, 2, 3, 4], pl.Int64),
        (pd.CategoricalIndex, ["N", "E", "S", "W"], {}, None, pl.Categorical),
        (
            pd.DatetimeIndex,
            [datetime(1960, 12, 31), datetime(2077, 10, 20)],
            {"dtype": "datetime64[ms]"},
            None,
            pl.Datetime("ms"),
        ),
        (
            pd.TimedeltaIndex,
            ["24 hours", "2 days 8 hours", "3 days 42 seconds"],
            {},
            [timedelta(1), timedelta(days=2, hours=8), timedelta(days=3, seconds=42)],
            pl.Duration("ns"),
        ),
    ],
)
def test_from_pandas_index(
    index_class: Any,
    index_data: Any,
    index_params: dict[str, Any],
    expected_data: list[Any] | None,
    expected_dtype: PolarsDataType,
) -> None:
    """
    Convert several pandas Index flavours and check values and dtype.

    When ``expected_data`` is None the input ``index_data`` is the expectation.
    """
    if expected_data is None:
        expected_data = index_data

    s = pl.from_pandas(index_class(index_data, **index_params))
    assert s.to_list() == expected_data
    assert s.dtype == expected_dtype


def test_from_pandas_include_indexes() -> None:
    """Index columns are dropped by default and kept with ``include_index=True``."""
    data = {
        "dtm": [datetime(2021, 1, 1), datetime(2021, 1, 2), datetime(2021, 1, 3)],
        "val": [100, 200, 300],
        "misc": ["x", "y", "z"],
    }
    pd_df = pd.DataFrame(data)

    df = pl.from_pandas(pd_df.set_index(["dtm"]))
    assert df.to_dict(as_series=False) == {
        "val": [100, 200, 300],
        "misc": ["x", "y", "z"],
    }

    df = pl.from_pandas(pd_df.set_index(["dtm", "val"]))
    assert df.to_dict(as_series=False) == {"misc": ["x", "y", "z"]}

    df = pl.from_pandas(pd_df.set_index(["dtm"]), include_index=True)
    assert df.to_dict(as_series=False) == data

    df = pl.from_pandas(pd_df.set_index(["dtm", "val"]), include_index=True)
    assert df.to_dict(as_series=False) == data


def test_from_pandas_series_include_indexes() -> None:
    """A Series converted with ``include_index=True`` yields index and value columns."""
    # no default index
    pd_series = pd.Series({"a": 1, "b": 2}, name="number").rename_axis(["letter"])
    df = pl.from_pandas(pd_series, include_index=True)
    assert df.to_dict(as_series=False) == {"letter": ["a", "b"], "number": [1, 2]}

    # default index
    pd_series = pd.Series(range(2))
    df = pl.from_pandas(pd_series, include_index=True)
    assert df.to_dict(as_series=False) == {"index": [0, 1], "0": [0, 1]}


def test_duplicate_cols_diff_types() -> None:
    """Column labels that differ only by type (``"0"`` vs ``0``) are rejected."""
    df = pd.DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]], columns=["0", 0, "1", 1])
    with pytest.raises(
        ValueError,
        match="Pandas dataframe contains non-unique indices and/or column names",
    ):
        pl.from_pandas(df)


def test_from_pandas_duplicated_columns() -> None:
    """Duplicated column names are rejected."""
    df = pd.DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]], columns=["a", "b", "c", "b"])
    with pytest.raises(
        ValueError,
        match="Pandas dataframe contains non-unique indices and/or column names",
    ):
        pl.from_pandas(df)


def test_from_pandas_null() -> None:
    """All-None pandas columns passed to ``pl.DataFrame`` come out as String."""
    # an all-null pandas column has object dtype, so pl.String is the closest match
    df = pd.DataFrame([{"a": None}, {"a": None}])
    out = pl.DataFrame(df)
    assert out.dtypes == [pl.String]
    assert out["a"][0] is None

    df = pd.DataFrame([{"a": None, "b": 1}, {"a": None, "b": 2}])
    out = pl.DataFrame(df)
    assert out.dtypes == [pl.String, pl.Int64]


def test_from_pandas_nested_list() -> None:
    """A column of variable-length Python lists converts row for row."""
    # this panicked in https://github.com/pola-rs/polars/issues/1615
    pddf = pd.DataFrame(
        {"a": [1, 2, 3, 4], "b": [["x", "y"], ["x", "y", "z"], ["x"], ["x", "y"]]}
    )
    pldf = pl.from_pandas(pddf)
    assert pldf.shape == (4, 2)
    assert pldf.rows() == [
        (1, ["x", "y"]),
        (2, ["x", "y", "z"]),
        (3, ["x"]),
        (4, ["x", "y"]),
    ]


def test_from_pandas_categorical_none() -> None:
    """``pd.NA`` inside a categorical Series becomes None."""
    s = pd.Series(["a", "b", "c", pd.NA], dtype="category")
    out = pl.from_pandas(s)
    assert out.dtype == pl.Categorical
    assert out.to_list() == ["a", "b", "c", None]


def test_from_pandas_dataframe() -> None:
    """A plain integer frame converts; a non-pandas input raises TypeError."""
    pd_df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=["a", "b", "c"])
    df = pl.from_pandas(pd_df)
    assert df.shape == (2, 3)
    assert df.rows() == [(1, 2, 3), (4, 5, 6)]

    # a non-pandas input must raise a TypeError
    with pytest.raises(TypeError):
        _ = pl.from_pandas([1, 2])  # type: ignore[call-overload]


def test_from_pandas_series() -> None:
    """A named integer Series converts with the same length and values."""
    pd_series = pd.Series([1, 2, 3], name="pd")
    s = pl.from_pandas(pd_series)
    assert s.shape == (3,)
    assert list(s) == [1, 2, 3]


def test_from_empty_pandas() -> None:
    """Empty pandas columns keep their names and come out as Float64."""
    pandas_df = pd.DataFrame(
        {
            "A": [],
            "fruits": [],
        }
    )
    polars_df = pl.from_pandas(pandas_df)
    assert polars_df.columns == ["A", "fruits"]
    assert polars_df.dtypes == [pl.Float64, pl.Float64]


def test_from_null_column() -> None:
    """A column holding only ``pd.NA`` converts to the Null dtype."""
    df = pl.from_pandas(pd.DataFrame(data=[pd.NA, pd.NA], columns=["n/a"]))

    assert df.shape == (2, 1)
    assert df.columns == ["n/a"]
    assert df.dtypes[0] == pl.Null


def test_from_pandas_ns_resolution() -> None:
    """A timestamp carrying a nanosecond component yields the expected datetime."""
    df = pd.DataFrame(
        [pd.Timestamp(year=2021, month=1, day=1, hour=1, second=1, nanosecond=1)],
        columns=["date"],
    )
    assert pl.from_pandas(df)[0, 0] == datetime(2021, 1, 1, 1, 0, 1)


def test_pandas_string_none_conversion_3298() -> None:
    """A None inside a pandas string column stays None in ``pl.DataFrame``."""
    data: dict[str, list[str | None]] = {"col_1": ["a", "b", "c", "d"]}
    data["col_1"][0] = None
    df_pd = pd.DataFrame(data)
    df_pl = pl.DataFrame(df_pd)
    assert df_pl.to_series().to_list() == [None, "b", "c", "d"]


def test_from_pandas_null_struct_6412() -> None:
    """Dict values with a null field, and a null dict, convert as given."""
    data = [
        {
            "a": {
                "b": None,
            },
        },
        {"a": None},
    ]
    df_pandas = pd.DataFrame(data)
    assert pl.from_pandas(df_pandas).to_dict(as_series=False) == {
        "a": [{"b": None}, None]
    }


def test_untrusted_categorical_input() -> None:
    """Group-by works on a categorical that declares an unused category."""
    df_pd = pd.DataFrame({"x": pd.Categorical(["x"], ["x", "y"])})
    df = pl.from_pandas(df_pd)
    result = df.group_by("x").len()
    expected = pl.DataFrame(
        {"x": ["x"], "len": [1]}, schema={"x": pl.Categorical, "len": pl.UInt32}
    )
    assert_frame_equal(result, expected, categorical_as_str=True)


@pytest.fixture
def _set_pyarrow_unavailable(monkeypatch: pytest.MonkeyPatch) -> None:
    """Patch the ``_PYARROW_AVAILABLE`` flags of both construction modules to False."""
    monkeypatch.setattr(
        "polars._utils.construction.dataframe._PYARROW_AVAILABLE", False
    )
    monkeypatch.setattr("polars._utils.construction.series._PYARROW_AVAILABLE", False)


@pytest.mark.usefixtures("_set_pyarrow_unavailable")
def test_from_pandas_pyarrow_not_available_succeeds() -> None:
    """NumPy-backed pandas data converts with pyarrow flagged as unavailable."""
    data: dict[str, Any] = {
        "a": [1, 2],
        "b": ["one", "two"],
        "c": np.array(["2020-01-01", "2020-01-02"], dtype="datetime64[ns]"),
        "d": np.array(["2020-01-01", "2020-01-02"], dtype="datetime64[us]"),
        "e": np.array(["2020-01-01", "2020-01-02"], dtype="datetime64[ms]"),
        "f": np.array([1, 2], dtype="timedelta64[ns]"),
        "g": np.array([1, 2], dtype="timedelta64[us]"),
        "h": np.array([1, 2], dtype="timedelta64[ms]"),
        "i": [True, False],
    }

    # DataFrame
    result = pl.from_pandas(pd.DataFrame(data))
    expected = pl.DataFrame(data)
    assert_frame_equal(result, expected)

    # Series
    for values in data.values():
        result_s = pl.from_pandas(pd.Series(values))
        expected_s = pl.Series(values)
        assert_series_equal(result_s, expected_s)


@pytest.mark.usefixtures("_set_pyarrow_unavailable")
def test_from_pandas_pyarrow_not_available_fails() -> None:
    """Inputs that need pyarrow raise ImportError when it is flagged unavailable."""
    with pytest.raises(ImportError, match="pyarrow is required"):
        pl.from_pandas(pd.DataFrame({"a": [1, 2, 3]}, dtype="Int64"))
    with pytest.raises(ImportError, match="pyarrow is required"):
        pl.from_pandas(pd.Series([1, 2, 3], dtype="Int64"))
    with pytest.raises(ImportError, match="pyarrow is required"):
        pl.from_pandas(
            pd.DataFrame({"a": pd.to_datetime(["2020-01-01T00:00+01:00"]).to_series()})
        )
    with pytest.raises(ImportError, match="pyarrow is required"):
        pl.from_pandas(pd.DataFrame({"a": [None, "foo"]}))


def test_from_pandas_nan_to_null_16453(monkeypatch: pytest.MonkeyPatch) -> None:
    """``nan_to_null=True`` still converts NaN with the size threshold lowered."""
    # NOTE(review): presumably the lowered threshold makes this 3-row frame take
    # the multithreaded NumPy conversion path - confirm against the constructor.
    monkeypatch.setattr(
        "polars._utils.construction.dataframe._MIN_NUMPY_SIZE_FOR_MULTITHREADING", 2
    )
    df = pd.DataFrame(
        {"a": [np.nan, 1.0, 2], "b": [1.0, 2.0, 3.0], "c": [4.0, 5.0, 6.0]}
    )
    result = pl.from_pandas(df, nan_to_null=True)
    expected = pl.DataFrame(
        {"a": [None, 1.0, 2], "b": [1.0, 2.0, 3.0], "c": [4.0, 5.0, 6.0]}
    )
    assert_frame_equal(result, expected)


@pytest.mark.parametrize("null", [pd.NA, np.nan, None, float("nan")])
def test_from_pandas_string_with_natype_17355(null: Any) -> None:
    """Any of the missing-value markers in a string column becomes null."""
    # https://github.com/pola-rs/polars/issues/17355

    pd_df = pd.DataFrame({"col": ["a", null]})
    result = pl.from_pandas(pd_df)
    expected = pl.DataFrame({"col": ["a", None]})
    assert_frame_equal(result, expected)