Path: blob/main/py-polars/tests/unit/interop/test_to_pandas.py
8458 views
from __future__ import annotations

from datetime import date, datetime
from typing import TYPE_CHECKING, Literal

import hypothesis.strategies as st
import numpy as np
import pandas as pd
import pyarrow as pa
import pytest
from hypothesis import given

import polars as pl
from polars._utils.various import parse_version

if TYPE_CHECKING:
    from polars._typing import PolarsDataType


def test_df_to_pandas_empty() -> None:
    """An empty polars DataFrame converts to an empty pandas DataFrame."""
    df = pl.DataFrame()
    result = df.to_pandas()
    expected = pd.DataFrame()
    pd.testing.assert_frame_equal(result, expected)


def test_to_pandas() -> None:
    """
    Check the pandas dtypes produced by ``DataFrame.to_pandas``.

    Covers the default conversion, ``date_as_object=True`` and
    ``use_pyarrow_extension_array=True``.
    """
    df = pl.DataFrame(
        {
            "a": [1, 2, 3],
            "b": [6, None, 8],
            "c": [10.0, 25.0, 50.5],
            "d": [date(2023, 7, 5), None, date(1999, 12, 13)],
            "e": ["a", "b", "c"],
            "f": [None, "e", "f"],
            "g": [datetime.now(), datetime.now(), None],
        },
        schema_overrides={"a": pl.UInt8},
    ).with_columns(
        pl.col("e").cast(pl.Categorical).alias("h"),
        pl.col("f").cast(pl.Categorical).alias("i"),
    )

    pd_out = df.to_pandas()

    # The expected dtype of string columns depends on the installed pandas
    # major version: a NaN-backed StringDtype from 3.x on, object before that.
    pd_version = parse_version(pd.__version__)
    string_dtype = (
        pd.StringDtype(na_value=float("nan")) if pd_version >= (3,) else np.object_
    )
    pd_out_dtypes_expected = [
        np.dtype(np.uint8),
        np.dtype(np.float64),
        np.dtype(np.float64),
        np.dtype("datetime64[ms]"),
        string_dtype,
        string_dtype,
        np.dtype("datetime64[us]"),
    ]
    # The last two columns ("h", "i") are the categorical casts and are
    # checked separately by type rather than by equality.
    assert pd_out_dtypes_expected == pd_out.dtypes.to_list()[:-2]
    assert all(
        isinstance(dt, pd.CategoricalDtype) for dt in pd_out.dtypes.to_list()[-2:]
    )

    # With `date_as_object=True` only the date column ("d", index 3) changes.
    pd_out_dtypes_expected[3] = np.dtype("O")
    pd_out = df.to_pandas(date_as_object=True)
    assert pd_out_dtypes_expected == pd_out.dtypes.to_list()[:-2]

    pd_pa_out = df.to_pandas(use_pyarrow_extension_array=True)
    pd_pa_dtypes_names = [dtype.name for dtype in pd_pa_out.dtypes]
    pd_pa_dtypes_names_expected = [
        "uint8[pyarrow]",
        "int64[pyarrow]",
        "double[pyarrow]",
        "date32[day][pyarrow]",
        "large_string[pyarrow]",
        "large_string[pyarrow]",
        "timestamp[us][pyarrow]",
        "dictionary<values=large_string, indices=int64, ordered=0>[pyarrow]",
        "dictionary<values=large_string, indices=int64, ordered=0>[pyarrow]",
    ]
    assert pd_pa_dtypes_names == pd_pa_dtypes_names_expected


@pytest.mark.parametrize("dtype", [pl.Categorical, pl.Enum(["best", "test"])])
def test_cat_to_pandas(dtype: PolarsDataType) -> None:
    """Categorical and Enum columns map to pandas categorical/dictionary dtypes."""
    df = pl.DataFrame({"a": ["best", "test"]})
    df = df.with_columns(pl.all().cast(dtype))

    pd_out = df.to_pandas()
    assert isinstance(pd_out["a"].dtype, pd.CategoricalDtype)

    pd_pa_out = df.to_pandas(use_pyarrow_extension_array=True)
    assert pd_pa_out["a"].dtype == pd.ArrowDtype(
        pa.dictionary(pa.int64(), pa.large_string())
    )

    # An empty Enum series must still carry its categories over to pandas.
    assert pl.Series(dtype=pl.Enum(["A"])).to_pandas().dtype.categories.tolist() == [  # type: ignore[union-attr]
        "A"
    ]


@given(
    column_type_names=st.lists(
        st.one_of(st.just("Object"), st.just("Int32")), min_size=1, max_size=8
    )
)
def test_object_to_pandas(column_type_names: list[Literal["Object", "Int32"]]) -> None:
    """
    Converting ``pl.Object`` dtype columns to Pandas is handled correctly.

    This edge case is handled with a separate code path than other data types,
    so we test it more thoroughly.
    """
    column_types = [getattr(pl, name) for name in column_type_names]
    data = {
        f"col_{i}": [object()] if dtype.is_object() else [-i]
        for i, dtype in enumerate(column_types)
    }
    df = pl.DataFrame(
        data, schema={f"col_{i}": dtype for i, dtype in enumerate(column_types)}
    )
    for use_pyarrow in [True, False]:
        pandas_df = df.to_pandas(use_pyarrow_extension_array=use_pyarrow)
        assert isinstance(pandas_df, pd.DataFrame)
        assert pandas_df.to_dict(orient="list") == data


def test_from_empty_pandas_with_dtypes() -> None:
    """Column dtypes of empty frames are preserved by ``pl.from_pandas``."""
    df = pd.DataFrame(columns=["a", "b"])
    df["a"] = df["a"].astype(str)
    df["b"] = df["b"].astype(float)
    assert pl.from_pandas(df).dtypes == [pl.String, pl.Float64]

    # Round trip: empty polars frame -> pandas -> polars keeps the schema.
    df = pl.DataFrame(
        data=[],
        schema={
            "a": pl.Int32,
            "b": pl.Datetime,
            "c": pl.Float32,
            "d": pl.Duration,
            "e": pl.String,
        },
    ).to_pandas()

    assert pl.from_pandas(df).dtypes == [
        pl.Int32,
        pl.Datetime,
        pl.Float32,
        pl.Duration,
        pl.String,
    ]


def test_to_pandas_series() -> None:
    """An integer Series converts to a pandas Series with equal values."""
    assert (pl.Series("a", [1, 2, 3]).to_pandas() == pd.Series([1, 2, 3])).all()


def test_to_pandas_date() -> None:
    """Date data converts to pandas ``datetime64[ms]`` for Series and DataFrame."""
    data = [date(1990, 1, 1), date(2024, 12, 31)]
    s = pl.Series("a", data)

    result_series = s.to_pandas()
    expected_series = pd.Series(data, dtype="datetime64[ms]", name="a")
    pd.testing.assert_series_equal(result_series, expected_series)

    result_df = s.to_frame().to_pandas()
    expected_df = expected_series.to_frame()
    pd.testing.assert_frame_equal(result_df, expected_df)


def test_to_pandas_datetime() -> None:
    """Datetime data converts to pandas ``datetime64[us]`` for Series and DataFrame."""
    data = [datetime(1990, 1, 1, 0, 0, 0), datetime(2024, 12, 31, 23, 59, 59)]
    s = pl.Series("a", data)

    result_series = s.to_pandas()
    expected_series = pd.Series(data, dtype="datetime64[us]", name="a")
    pd.testing.assert_series_equal(result_series, expected_series)

    result_df = s.to_frame().to_pandas()
    expected_df = expected_series.to_frame()
    pd.testing.assert_frame_equal(result_df, expected_df)


@pytest.mark.parametrize("use_pyarrow_extension_array", [True, False])
def test_object_to_pandas_series(use_pyarrow_extension_array: bool) -> None:
    """An Object Series converts to an ``object``-dtype pandas Series."""
    values = [object(), [1, 2, 3]]
    pd.testing.assert_series_equal(
        pl.Series("a", values, dtype=pl.Object).to_pandas(
            use_pyarrow_extension_array=use_pyarrow_extension_array
        ),
        pd.Series(values, dtype=object, name="a"),
    )


@pytest.mark.parametrize("polars_dtype", [pl.Categorical, pl.Enum(["a", "b"])])
def test_series_to_pandas_categorical(polars_dtype: PolarsDataType) -> None:
    """Categorical/Enum Series convert to a pandas categorical Series."""
    s = pl.Series("x", ["a", "b", "a"], dtype=polars_dtype)
    result = s.to_pandas()
    expected = pd.Series(["a", "b", "a"], name="x", dtype="category")
    assert isinstance(result.dtype, pd.CategoricalDtype)
    # Only the values are compared here; the categorical dtype itself is
    # covered by the isinstance check above.
    pd.testing.assert_series_equal(result, expected, check_categorical=False)


@pytest.mark.parametrize("polars_dtype", [pl.Categorical, pl.Enum(["a", "b"])])
def test_series_to_pandas_categorical_pyarrow(polars_dtype: PolarsDataType) -> None:
    """Categorical/Enum Series keep their values with pyarrow extension arrays."""
    s = pl.Series("x", ["a", "b", "a"], dtype=polars_dtype)
    result = s.to_pandas(use_pyarrow_extension_array=True)
    assert s.to_list() == result.to_list()