# Path: blob/main/py-polars/tests/unit/interop/numpy/test_to_numpy_df.py
# 6939 views
from __future__ import annotations

from datetime import datetime
from decimal import Decimal as D
from typing import TYPE_CHECKING, Any

import numpy as np
import pytest
from hypothesis import given
from numpy.testing import assert_array_equal, assert_equal

import polars as pl
from polars.testing import assert_frame_equal
from polars.testing.parametric import series

if TYPE_CHECKING:
    import numpy.typing as npt

    from polars._typing import IndexOrder, PolarsDataType


def assert_zero_copy(s: pl.Series, arr: np.ndarray[Any, Any]) -> None:
    """
    Assert that `arr` begins at the same address as the values buffer of `s`.

    Empty Series are skipped without any check.
    """
    if s.len() == 0:
        return
    # First item of the buffer info is compared against NumPy's data pointer
    # (presumably it is the buffer's start address - confirm against polars).
    s_ptr = s._get_buffers()["values"]._get_buffer_info()[0]
    arr_ptr = arr.__array_interface__["data"][0]
    assert s_ptr == arr_ptr


@pytest.mark.may_fail_cloud
@pytest.mark.may_fail_auto_streaming
@given(
    s=series(
        min_size=6,
        max_size=6,
        allowed_dtypes=[pl.Datetime, pl.Duration],
        allow_null=False,
        allow_chunks=False,
    )
)
def test_df_to_numpy_zero_copy(s: pl.Series) -> None:
    """Two adjacent slices of one temporal Series export without a copy."""
    df = pl.DataFrame({"a": s[:3], "b": s[3:]})

    result = df.to_numpy(allow_copy=False)

    assert_zero_copy(s, result)
    assert result.flags.writeable is False


@pytest.mark.parametrize(
    ("order", "f_contiguous", "c_contiguous"),
    [
        ("fortran", True, False),
        ("c", False, True),
    ],
)
def test_to_numpy(order: IndexOrder, f_contiguous: bool, c_contiguous: bool) -> None:
    """Check values, memory layout and structured export for each `order`."""
    df = pl.DataFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0]})

    out_array = df.to_numpy(order=order)
    expected_array = np.array([[1.0, 1.0], [2.0, 2.0], [3.0, 3.0]], dtype=np.float64)
    assert_array_equal(out_array, expected_array)
    assert out_array.flags["F_CONTIGUOUS"] == f_contiguous
    assert out_array.flags["C_CONTIGUOUS"] == c_contiguous

    structured_array = df.to_numpy(structured=True, order=order)
    expected_array = np.array(
        [(1, 1.0), (2, 2.0), (3, 3.0)], dtype=[("a", "<i8"), ("b", "<f8")]
    )
    assert_array_equal(structured_array, expected_array)
    assert structured_array.flags["F_CONTIGUOUS"]

    # check string conversion; if no nulls can optimise as a fixed-width dtype
    df = pl.DataFrame({"s": ["x", "y", None]})
    assert df["s"].has_nulls()
    assert_array_equal(
        df.to_numpy(structured=True),
        np.array([("x",), ("y",), (None,)], dtype=[("s", "O")]),
    )
    assert not df["s"][:2].has_nulls()
    assert_array_equal(
        df[:2].to_numpy(structured=True),
        np.array([("x",), ("y",)], dtype=[("s", "<U1")]),
    )


def test_to_numpy_structured() -> None:
    """Round-trip a structured array and export null-containing columns."""
    # round-trip structured array: validate init/export
    structured_array = np.array(
        [
            ("Google Pixel 7", 521.90, True),
            ("Apple iPhone 14 Pro", 999.00, True),
            ("OnePlus 11", 699.00, True),
            ("Samsung Galaxy S23 Ultra", 1199.99, False),
        ],
        dtype=np.dtype(
            [
                ("product", "U24"),
                ("price_usd", "float64"),
                ("in_stock", "bool"),
            ]
        ),
    )
    df = pl.from_numpy(structured_array)
    assert df.schema == {
        "product": pl.String,
        "price_usd": pl.Float64,
        "in_stock": pl.Boolean,
    }
    exported_array = df.to_numpy(structured=True)
    assert exported_array["product"].dtype == np.dtype("U24")
    assert_array_equal(exported_array, structured_array)

    # none/nan values
    df = pl.DataFrame({"x": ["a", None, "b"], "y": [5.5, None, -5.5]})
    exported_array = df.to_numpy(structured=True)

    assert exported_array.dtype == np.dtype([("x", object), ("y", float)])
    for name in df.columns:
        # Float columns are compared against a NaN-filled copy of the column;
        # other columns are compared against the column's values unchanged.
        assert_equal(
            list(exported_array[name]),
            (
                df[name].fill_null(float("nan"))
                if df.schema[name].is_float()
                else df[name]
            ).to_list(),
        )


def test_numpy_preserve_uint64_4112() -> None:
    """A hashed column keeps the uint64 dtype in plain and structured export."""
    df = pl.DataFrame({"a": [1, 2, 3]}).with_columns(pl.col("a").hash())
    assert df.to_numpy().dtype == np.dtype("uint64")
    assert df.to_numpy(structured=True).dtype == np.dtype([("a", "uint64")])


def test_df_to_numpy_decimal() -> None:
    """A Decimal column exports as a single-column array of the same values."""
    decimal_data = [D("1.234"), D("2.345"), D("-3.456")]
    df = pl.Series("n", decimal_data).to_frame()

    result = df.to_numpy()

    expected = np.array(decimal_data).reshape((-1, 1))
    assert_array_equal(result, expected)


def test_df_to_numpy_zero_copy_path() -> None:
    """A frame built from an F-ordered array exports with `allow_copy=False`."""
    rows = 10
    cols = 5
    x = np.ones((rows, cols), order="F")
    x[:, 1] = 2.0
    df = pl.DataFrame(x)
    x = df.to_numpy(allow_copy=False)
    assert x.flags.f_contiguous is True
    assert x.flags.writeable is False
    assert str(x[0, :]) == "[1. 2. 1. 1. 1.]"


@pytest.mark.may_fail_cloud
@pytest.mark.may_fail_auto_streaming
def test_df_to_numpy_zero_copy_path_temporal() -> None:
    """Three adjacent slices of one datetime Series export without a copy."""
    values = [datetime(1970 + i, 1, 1) for i in range(12)]
    s = pl.Series(values)
    df = pl.DataFrame({"a": s[:4], "b": s[4:8], "c": s[8:]})

    result: npt.NDArray[np.generic] = df.to_numpy(allow_copy=False)
    assert result.flags.f_contiguous is True
    assert result.flags.writeable is False
    assert result.tolist() == [list(row) for row in df.iter_rows()]


def test_to_numpy_zero_copy_path_writable() -> None:
    """`writable=True` yields a writeable array for an F-ordered source frame."""
    rows = 10
    cols = 5
    x = np.ones((rows, cols), order="F")
    x[:, 1] = 2.0
    df = pl.DataFrame(x)
    x = df.to_numpy(writable=True)
    assert x.flags["WRITEABLE"]


def test_df_to_numpy_structured_not_zero_copy() -> None:
    """`structured=True` together with `allow_copy=False` raises RuntimeError."""
    df = pl.DataFrame({"a": [1, 2]})
    msg = "cannot create structured array without copying data"
    with pytest.raises(RuntimeError, match=msg):
        df.to_numpy(structured=True, allow_copy=False)


def test_df_to_numpy_writable_not_zero_copy() -> None:
    """`writable=True` together with `allow_copy=False` raises RuntimeError."""
    df = pl.DataFrame({"a": [1, 2]})
    msg = "copy not allowed: cannot create a writable array without copying data"
    with pytest.raises(RuntimeError, match=msg):
        df.to_numpy(allow_copy=False, writable=True)


def test_df_to_numpy_not_zero_copy() -> None:
    """A column containing a null raises RuntimeError with `allow_copy=False`."""
    df = pl.DataFrame({"a": [1, 2, None]})
    with pytest.raises(RuntimeError):
        df.to_numpy(allow_copy=False)


@pytest.mark.parametrize(
    ("schema", "expected_dtype"),
    [
        ({"a": pl.Int8, "b": pl.Int8}, np.int8),
        ({"a": pl.Int8, "b": pl.UInt16}, np.int32),
        ({"a": pl.Int8, "b": pl.String}, np.object_),
    ],
)
def test_df_to_numpy_empty_dtype_viewable(
    schema: dict[str, PolarsDataType], expected_dtype: npt.DTypeLike
) -> None:
    """An empty two-column frame exports as a writeable (0, 2) array."""
    df = pl.DataFrame(schema=schema)
    result = df.to_numpy(allow_copy=False)
    assert result.shape == (0, 2)
    assert result.dtype == expected_dtype
    assert result.flags.writeable is True


def test_df_to_numpy_structured_nested() -> None:
    """A struct column becomes a nested field in the structured export."""
    df = pl.DataFrame(
        {
            "a": [1, 2],
            "b": [3.0, 4.0],
            "c": [{"x": "a", "y": 1.0}, {"x": "b", "y": 2.0}],
        }
    )
    result = df.to_numpy(structured=True)

    expected = np.array(
        [
            (1, 3.0, ("a", 1.0)),
            (2, 4.0, ("b", 2.0)),
        ],
        dtype=[
            ("a", "<i8"),
            ("b", "<f8"),
            ("c", [("x", "<U1"), ("y", "<f8")]),
        ],
    )
    assert_array_equal(result, expected)


def test_df_to_numpy_stacking_array() -> None:
    """An Array column stacked with an Int32 column gives a (1, 2) result."""
    df = pl.DataFrame(
        {"a": [[1, 2]], "b": 1},
        schema={"a": pl.Array(pl.Int64, 2), "b": pl.Int32},
    )
    result = df.to_numpy()

    expected = np.array([[np.array([1, 2]), 1]], dtype=np.object_)

    assert result.shape == (1, 2)
    assert result[0].shape == (2,)
    # Only the Array cell is compared element-wise against the expectation.
    assert_array_equal(result[0][0], expected[0][0])


@pytest.mark.parametrize("order", ["c", "fortran"])
def test_df_to_numpy_stacking_string(order: IndexOrder) -> None:
    """Integer and string columns stack into an object array in either order."""
    df = pl.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})
    result = df.to_numpy(order=order)

    expected = np.array([[1, "x"], [2, "y"], [3, "z"]], dtype=np.object_)

    assert_array_equal(result, expected)
    if order == "c":
        assert result.flags.c_contiguous is True
    else:
        assert result.flags.f_contiguous is True


def test_to_numpy_chunked_16375() -> None:
    """A frame concatenated with `rechunk=False` exports all rows in order."""
    chunked = pl.concat(
        [
            pl.DataFrame({"a": [1, 1, 2], "b": [2, 3, 4]}),
            pl.DataFrame({"a": [1, 1, 2], "b": [2, 3, 4]}),
        ],
        rechunk=False,
    )
    expected = np.array([[1, 2], [1, 3], [2, 4], [1, 2], [1, 3], [2, 4]])
    # assert_array_equal reports a shape mismatch instead of broadcasting, and
    # prints the differing elements on failure, unlike `(a == b).all()`.
    assert_array_equal(chunked.to_numpy(), expected)


def test_to_numpy_c_order_1700() -> None:
    """A concatenated frame survives a C-order export and re-import unchanged."""
    # Fixed seed so that a failing run can be reproduced with identical data.
    rng = np.random.default_rng(0)
    df = pl.DataFrame({f"col_{i}": rng.normal(size=20) for i in range(3)})
    # NOTE(review): `df` has 20 rows, so the slice at offset 20 is presumably
    # empty - confirm whether that is intentional.
    df_chunked = pl.concat([df.slice(i * 10, 10) for i in range(3)])
    assert_frame_equal(
        df_chunked,
        pl.from_numpy(df_chunked.to_numpy(order="c"), schema=df_chunked.schema),
    )


def test_to_numpy_array_shape_23426() -> None:
    """Array columns (1-D and 2-D) round-trip through a structured export."""
    df = pl.DataFrame(
        {
            "x": [1, 2],
            "y": [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]],
            "z": [[[-1, -1, -2], [4, 5, 6]], [[-3, -5, -8], [10, 20, 30]]],
        },
        schema={
            "x": pl.UInt8,
            "y": pl.Array(pl.Float32, 3),
            "z": pl.Array(pl.Int16, (2, 3)),
        },
    )

    assert_frame_equal(df, pl.from_numpy(df.to_numpy(structured=True)))