# Path: blob/main/py-polars/tests/unit/interop/test_to_pandas.py
# 6939 views
from __future__ import annotations12from datetime import date, datetime3from typing import TYPE_CHECKING, Literal45import hypothesis.strategies as st6import numpy as np7import pandas as pd8import pyarrow as pa9import pytest10from hypothesis import given1112import polars as pl1314if TYPE_CHECKING:15from polars._typing import PolarsDataType161718def test_df_to_pandas_empty() -> None:19df = pl.DataFrame()20result = df.to_pandas()21expected = pd.DataFrame()22pd.testing.assert_frame_equal(result, expected)232425def test_to_pandas() -> None:26df = pl.DataFrame(27{28"a": [1, 2, 3],29"b": [6, None, 8],30"c": [10.0, 25.0, 50.5],31"d": [date(2023, 7, 5), None, date(1999, 12, 13)],32"e": ["a", "b", "c"],33"f": [None, "e", "f"],34"g": [datetime.now(), datetime.now(), None],35},36schema_overrides={"a": pl.UInt8},37).with_columns(38pl.col("e").cast(pl.Categorical).alias("h"),39pl.col("f").cast(pl.Categorical).alias("i"),40)4142pd_out = df.to_pandas()4344pd_out_dtypes_expected = [45np.dtype(np.uint8),46np.dtype(np.float64),47np.dtype(np.float64),48np.dtype("datetime64[ms]"),49np.dtype(np.object_),50np.dtype(np.object_),51np.dtype("datetime64[us]"),52]53assert pd_out_dtypes_expected == pd_out.dtypes.to_list()[:-2]54assert all(55isinstance(dt, pd.CategoricalDtype) for dt in pd_out.dtypes.to_list()[-2:]56)5758pd_out_dtypes_expected[3] = np.dtype("O")59pd_out = df.to_pandas(date_as_object=True)60assert pd_out_dtypes_expected == pd_out.dtypes.to_list()[:-2]6162pd_pa_out = df.to_pandas(use_pyarrow_extension_array=True)63pd_pa_dtypes_names = [dtype.name for dtype in pd_pa_out.dtypes]64pd_pa_dtypes_names_expected = [65"uint8[pyarrow]",66"int64[pyarrow]",67"double[pyarrow]",68"date32[day][pyarrow]",69"large_string[pyarrow]",70"large_string[pyarrow]",71"timestamp[us][pyarrow]",72"dictionary<values=large_string, indices=int64, ordered=0>[pyarrow]",73"dictionary<values=large_string, indices=int64, ordered=0>[pyarrow]",74]75assert pd_pa_dtypes_names == 
pd_pa_dtypes_names_expected767778@pytest.mark.parametrize("dtype", [pl.Categorical, pl.Enum(["best", "test"])])79def test_cat_to_pandas(dtype: pl.DataType) -> None:80df = pl.DataFrame({"a": ["best", "test"]})81df = df.with_columns(pl.all().cast(dtype))8283pd_out = df.to_pandas()84assert isinstance(pd_out["a"].dtype, pd.CategoricalDtype)8586pd_pa_out = df.to_pandas(use_pyarrow_extension_array=True)87assert pd_pa_out["a"].dtype == pd.ArrowDtype(88pa.dictionary(pa.int64(), pa.large_string())89)909192@given(93column_type_names=st.lists(94st.one_of(st.just("Object"), st.just("Int32")), min_size=1, max_size=895)96)97def test_object_to_pandas(column_type_names: list[Literal["Object", "Int32"]]) -> None:98"""99Converting ``pl.Object`` dtype columns to Pandas is handled correctly.100101This edge case is handled with a separate code path than other data types,102so we test it more thoroughly.103"""104column_types = [getattr(pl, name) for name in column_type_names]105data = {106f"col_{i}": [object()] if dtype.is_object() else [-i]107for i, dtype in enumerate(column_types)108}109df = pl.DataFrame(110data, schema={f"col_{i}": column_types[i] for i in range(len(column_types))}111)112for pyarrow in [True, False]:113pandas_df = df.to_pandas(use_pyarrow_extension_array=pyarrow)114assert isinstance(pandas_df, pd.DataFrame)115assert pandas_df.to_dict(orient="list") == data116117118def test_from_empty_pandas_with_dtypes() -> None:119df = pd.DataFrame(columns=["a", "b"])120df["a"] = df["a"].astype(str)121df["b"] = df["b"].astype(float)122assert pl.from_pandas(df).dtypes == [pl.String, pl.Float64]123124df = pl.DataFrame(125data=[],126schema={127"a": pl.Int32,128"b": pl.Datetime,129"c": pl.Float32,130"d": pl.Duration,131"e": pl.String,132},133).to_pandas()134135assert pl.from_pandas(df).dtypes == [136pl.Int32,137pl.Datetime,138pl.Float32,139pl.Duration,140pl.String,141]142143144def test_to_pandas_series() -> None:145assert (pl.Series("a", [1, 2, 3]).to_pandas() == pd.Series([1, 2, 
3])).all()146147148def test_to_pandas_date() -> None:149data = [date(1990, 1, 1), date(2024, 12, 31)]150s = pl.Series("a", data)151152result_series = s.to_pandas()153expected_series = pd.Series(data, dtype="datetime64[ms]", name="a")154pd.testing.assert_series_equal(result_series, expected_series)155156result_df = s.to_frame().to_pandas()157expected_df = expected_series.to_frame()158pd.testing.assert_frame_equal(result_df, expected_df)159160161def test_to_pandas_datetime() -> None:162data = [datetime(1990, 1, 1, 0, 0, 0), datetime(2024, 12, 31, 23, 59, 59)]163s = pl.Series("a", data)164165result_series = s.to_pandas()166expected_series = pd.Series(data, dtype="datetime64[us]", name="a")167pd.testing.assert_series_equal(result_series, expected_series)168169result_df = s.to_frame().to_pandas()170expected_df = expected_series.to_frame()171pd.testing.assert_frame_equal(result_df, expected_df)172173174@pytest.mark.parametrize("use_pyarrow_extension_array", [True, False])175def test_object_to_pandas_series(use_pyarrow_extension_array: bool) -> None:176values = [object(), [1, 2, 3]]177pd.testing.assert_series_equal(178pl.Series("a", values, dtype=pl.Object).to_pandas(179use_pyarrow_extension_array=use_pyarrow_extension_array180),181pd.Series(values, dtype=object, name="a"),182)183184185@pytest.mark.parametrize("polars_dtype", [pl.Categorical, pl.Enum(["a", "b"])])186def test_series_to_pandas_categorical(polars_dtype: PolarsDataType) -> None:187s = pl.Series("x", ["a", "b", "a"], dtype=polars_dtype)188result = s.to_pandas()189expected = pd.Series(["a", "b", "a"], name="x", dtype="category")190assert isinstance(result.dtype, pd.CategoricalDtype)191pd.testing.assert_series_equal(result, expected, check_categorical=False)192193194@pytest.mark.parametrize("polars_dtype", [pl.Categorical, pl.Enum(["a", "b"])])195def test_series_to_pandas_categorical_pyarrow(polars_dtype: PolarsDataType) -> None:196s = pl.Series("x", ["a", "b", "a"], dtype=polars_dtype)197result = 
s.to_pandas(use_pyarrow_extension_array=True)198assert s.to_list() == result.to_list()199200201