Book a Demo!
CoCalc Logo Icon
Store | Features | Docs | Share | Support | News | About | Policies | Sign Up | Sign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/py-polars/tests/unit/interop/numpy/test_to_numpy_df.py
6939 views
1
from __future__ import annotations
2
3
from datetime import datetime
4
from decimal import Decimal as D
5
from typing import TYPE_CHECKING, Any
6
7
import numpy as np
8
import pytest
9
from hypothesis import given
10
from numpy.testing import assert_array_equal, assert_equal
11
12
import polars as pl
13
from polars.testing import assert_frame_equal
14
from polars.testing.parametric import series
15
16
if TYPE_CHECKING:
17
import numpy.typing as npt
18
19
from polars._typing import IndexOrder, PolarsDataType
20
21
22
def assert_zero_copy(s: pl.Series, arr: np.ndarray[Any, Any]) -> None:
    """Assert that ``s`` and ``arr`` report the same starting data address.

    The first entry of the buffer info of the Series' ``"values"`` buffer is
    compared with the data address published by ``arr.__array_interface__``.
    A Series of length zero is accepted without any comparison.
    """
    if s.len() != 0:
        values_buffer = s._get_buffers()["values"]
        series_address = values_buffer._get_buffer_info()[0]
        array_address = arr.__array_interface__["data"][0]
        assert series_address == array_address
28
29
30
@pytest.mark.may_fail_cloud
@pytest.mark.may_fail_auto_streaming
@given(
    s=series(
        min_size=6,
        max_size=6,
        allowed_dtypes=[pl.Datetime, pl.Duration],
        allow_null=False,
        allow_chunks=False,
    )
)
def test_df_to_numpy_zero_copy(s: pl.Series) -> None:
    """Export a frame built from two slices of one Series with ``allow_copy=False``.

    The generated Series always has six values; the first three become column
    "a" and the remaining three column "b".
    """
    leading, trailing = s[:3], s[3:]
    frame = pl.DataFrame({"a": leading, "b": trailing})

    exported = frame.to_numpy(allow_copy=False)

    # The exported array must start at the Series' own values buffer and must
    # be read-only.
    assert_zero_copy(s, exported)
    assert exported.flags.writeable is False
48
49
50
@pytest.mark.parametrize(
    ("order", "f_contiguous", "c_contiguous"),
    [
        ("fortran", True, False),
        ("c", False, True),
    ],
)
def test_to_numpy(order: IndexOrder, f_contiguous: bool, c_contiguous: bool) -> None:
    """Check plain and structured exports of a small frame for each index order."""
    numeric_frame = pl.DataFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0]})

    # Plain 2D export: compared with an all-float64 array, and the contiguity
    # flags must match the parametrized expectation for this order.
    plain = numeric_frame.to_numpy(order=order)
    plain_expected = np.array(
        [[1.0, 1.0], [2.0, 2.0], [3.0, 3.0]],
        dtype=np.float64,
    )
    assert_array_equal(plain, plain_expected)
    plain_flags = plain.flags
    assert plain_flags["F_CONTIGUOUS"] == f_contiguous
    assert plain_flags["C_CONTIGUOUS"] == c_contiguous

    # Structured export: one record per row with a per-column field dtype.
    structured = numeric_frame.to_numpy(structured=True, order=order)
    record_dtype = [("a", "<i8"), ("b", "<f8")]
    structured_expected = np.array(
        [(1, 1.0), (2, 2.0), (3, 3.0)],
        dtype=record_dtype,
    )
    assert_array_equal(structured, structured_expected)
    assert structured.flags["F_CONTIGUOUS"]

    # check string conversion; if no nulls can optimise as a fixed-width dtype
    string_frame = pl.DataFrame({"s": ["x", "y", None]})
    assert string_frame["s"].has_nulls()
    with_null_actual = string_frame.to_numpy(structured=True)
    with_null_expected = np.array(
        [("x",), ("y",), (None,)],
        dtype=[("s", "O")],
    )
    assert_array_equal(with_null_actual, with_null_expected)

    assert not string_frame["s"][:2].has_nulls()
    without_null_actual = string_frame[:2].to_numpy(structured=True)
    without_null_expected = np.array(
        [("x",), ("y",)],
        dtype=[("s", "<U1")],
    )
    assert_array_equal(without_null_actual, without_null_expected)
85
86
87
def test_to_numpy_structured() -> None:
    """Round-trip a structured array through a DataFrame, then export null data."""
    # round-trip structured array: validate init/export
    record_dtype = np.dtype(
        [
            ("product", "U24"),
            ("price_usd", "float64"),
            ("in_stock", "bool"),
        ]
    )
    records = [
        ("Google Pixel 7", 521.90, True),
        ("Apple iPhone 14 Pro", 999.00, True),
        ("OnePlus 11", 699.00, True),
        ("Samsung Galaxy S23 Ultra", 1199.99, False),
    ]
    source_array = np.array(records, dtype=record_dtype)

    frame = pl.from_numpy(source_array)
    expected_schema = {
        "product": pl.String,
        "price_usd": pl.Float64,
        "in_stock": pl.Boolean,
    }
    assert frame.schema == expected_schema

    round_tripped = frame.to_numpy(structured=True)
    assert round_tripped["product"].dtype == np.dtype("U24")
    assert_array_equal(round_tripped, source_array)

    # none/nan values
    nullable_frame = pl.DataFrame({"x": ["a", None, "b"], "y": [5.5, None, -5.5]})
    nullable_export = nullable_frame.to_numpy(structured=True)

    expected_dtype = np.dtype([("x", object), ("y", float)])
    assert nullable_export.dtype == expected_dtype

    for column_name in nullable_frame.columns:
        exported_values = list(nullable_export[column_name])
        column = nullable_frame[column_name]
        # Float columns are compared with their nulls replaced by NaN; other
        # columns are compared as-is.
        if nullable_frame.schema[column_name].is_float():
            column = column.fill_null(float("nan"))
        assert_equal(exported_values, column.to_list())
128
129
130
def test_numpy_preserve_uint64_4112() -> None:
    """Check that a hashed column exports as ``uint64`` in plain and structured form."""
    hashed_column = pl.col("a").hash()
    frame = pl.DataFrame({"a": [1, 2, 3]}).with_columns(hashed_column)

    plain_dtype = frame.to_numpy().dtype
    assert plain_dtype == np.dtype("uint64")

    structured_dtype = frame.to_numpy(structured=True).dtype
    assert structured_dtype == np.dtype([("a", "uint64")])
134
135
136
def test_df_to_numpy_decimal() -> None:
    """Export a single Decimal column and compare it with a one-column array."""
    decimals = [D(text) for text in ("1.234", "2.345", "-3.456")]
    frame = pl.Series("n", decimals).to_frame()

    exported = frame.to_numpy()

    # The expected array holds the same Decimal objects as a single column.
    as_column = np.array(decimals).reshape((-1, 1))
    assert_array_equal(exported, as_column)
144
145
146
def test_df_to_numpy_zero_copy_path() -> None:
    """Export a frame built from a Fortran-ordered float array with ``allow_copy=False``."""
    n_rows, n_cols = 10, 5
    source = np.ones((n_rows, n_cols), order="F")
    source[:, 1] = 2.0  # make the second column distinguishable
    frame = pl.DataFrame(source)

    exported = frame.to_numpy(allow_copy=False)

    exported_flags = exported.flags
    assert exported_flags.f_contiguous is True
    assert exported_flags.writeable is False
    first_row_text = str(exported[0, :])
    assert first_row_text == "[1. 2. 1. 1. 1.]"
156
157
158
@pytest.mark.may_fail_cloud
@pytest.mark.may_fail_auto_streaming
def test_df_to_numpy_zero_copy_path_temporal() -> None:
    """Export three datetime columns sliced from one Series with ``allow_copy=False``."""
    new_years = [datetime(year, 1, 1) for year in range(1970, 1982)]
    timeline = pl.Series(new_years)
    # Twelve values split into three consecutive four-value columns.
    frame = pl.DataFrame(
        {
            "a": timeline[:4],
            "b": timeline[4:8],
            "c": timeline[8:],
        }
    )

    result: npt.NDArray[np.generic] = frame.to_numpy(allow_copy=False)

    assert result.flags.f_contiguous is True
    assert result.flags.writeable is False
    expected_rows = list(map(list, frame.iter_rows()))
    assert result.tolist() == expected_rows
169
170
171
def test_to_numpy_zero_copy_path_writable() -> None:
    """Check that ``writable=True`` yields an array whose WRITEABLE flag is set."""
    n_rows, n_cols = 10, 5
    source = np.ones((n_rows, n_cols), order="F")
    source[:, 1] = 2.0
    frame = pl.DataFrame(source)

    exported = frame.to_numpy(writable=True)

    assert exported.flags["WRITEABLE"]
179
180
181
def test_df_to_numpy_structured_not_zero_copy() -> None:
    """Check that a structured export with ``allow_copy=False`` raises RuntimeError."""
    frame = pl.DataFrame({"a": [1, 2]})
    with pytest.raises(
        RuntimeError,
        match="cannot create structured array without copying data",
    ):
        frame.to_numpy(structured=True, allow_copy=False)
186
187
188
def test_df_to_numpy_writable_not_zero_copy() -> None:
    """Check that ``writable=True`` with ``allow_copy=False`` raises RuntimeError."""
    frame = pl.DataFrame({"a": [1, 2]})
    with pytest.raises(
        RuntimeError,
        match="copy not allowed: cannot create a writable array without copying data",
    ):
        frame.to_numpy(allow_copy=False, writable=True)
193
194
195
def test_df_to_numpy_not_zero_copy() -> None:
    """Check that a column containing a null raises RuntimeError with ``allow_copy=False``."""
    frame_with_null = pl.DataFrame({"a": [1, 2, None]})
    expect_runtime_error = pytest.raises(RuntimeError)
    with expect_runtime_error:
        frame_with_null.to_numpy(allow_copy=False)
199
200
201
@pytest.mark.parametrize(
    ("schema", "expected_dtype"),
    [
        ({"a": pl.Int8, "b": pl.Int8}, np.int8),
        ({"a": pl.Int8, "b": pl.UInt16}, np.int32),
        ({"a": pl.Int8, "b": pl.String}, np.object_),
    ],
)
def test_df_to_numpy_empty_dtype_viewable(
    schema: dict[str, PolarsDataType], expected_dtype: npt.DTypeLike
) -> None:
    """Export an empty two-column frame with ``allow_copy=False`` and check the result.

    The result must have shape ``(0, 2)``, the parametrized dtype, and be
    writeable.
    """
    empty_frame = pl.DataFrame(schema=schema)

    exported = empty_frame.to_numpy(allow_copy=False)

    assert exported.shape == (0, 2)
    assert exported.dtype == expected_dtype
    assert exported.flags.writeable is True
217
218
219
def test_df_to_numpy_structured_nested() -> None:
    """Export a frame with a struct column as a structured array with a nested dtype."""
    struct_values = [{"x": "a", "y": 1.0}, {"x": "b", "y": 2.0}]
    frame = pl.DataFrame({"a": [1, 2], "b": [3.0, 4.0], "c": struct_values})

    exported = frame.to_numpy(structured=True)

    # Column "c" is expected as a nested record made of fields "x" and "y".
    nested_dtype = [("x", "<U1"), ("y", "<f8")]
    record_dtype = [("a", "<i8"), ("b", "<f8"), ("c", nested_dtype)]
    records = [
        (1, 3.0, ("a", 1.0)),
        (2, 4.0, ("b", 2.0)),
    ]
    assert_array_equal(exported, np.array(records, dtype=record_dtype))
241
242
243
def test_df_to_numpy_stacking_array() -> None:
    """Export a frame that mixes a fixed-size Array column with an Int32 column."""
    column_types = {"a": pl.Array(pl.Int64, 2), "b": pl.Int32}
    frame = pl.DataFrame({"a": [[1, 2]], "b": 1}, schema=column_types)

    exported = frame.to_numpy()

    reference = np.array([[np.array([1, 2]), 1]], dtype=np.object_)

    assert exported.shape == (1, 2)
    only_row = exported[0]
    assert only_row.shape == (2,)
    # Only the Array cell of the row is compared with the reference.
    assert_array_equal(only_row[0], reference[0][0])
255
256
257
@pytest.mark.parametrize("order", ["c", "fortran"])
def test_df_to_numpy_stacking_string(order: IndexOrder) -> None:
    """Export a mixed integer/string frame and check values and memory layout."""
    frame = pl.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})

    stacked = frame.to_numpy(order=order)

    rows = [[1, "x"], [2, "y"], [3, "z"]]
    assert_array_equal(stacked, np.array(rows, dtype=np.object_))

    # The contiguity flag that must be set follows the requested order.
    flag_name = "c_contiguous" if order == "c" else "f_contiguous"
    assert getattr(stacked.flags, flag_name) is True
269
270
271
def test_to_numpy_chunked_16375() -> None:
    """Export a frame concatenated with ``rechunk=False`` and check every row."""
    chunk_data = {"a": [1, 1, 2], "b": [2, 3, 4]}
    concatenated = pl.concat(
        [pl.DataFrame(chunk_data), pl.DataFrame(chunk_data)],
        rechunk=False,
    )

    expected = np.array([[1, 2], [1, 3], [2, 4], [1, 2], [1, 3], [2, 4]])

    # assert_array_equal rejects a shape mismatch instead of broadcasting and
    # reports the differing elements, unlike a bare ``(a == b).all()``.
    assert_array_equal(concatenated.to_numpy(), expected)
282
283
284
def test_to_numpy_c_order_1700() -> None:
    """Round-trip a frame concatenated from slices through a C-ordered export."""
    rng = np.random.default_rng()
    columns = {}
    for index in range(3):
        columns[f"col_{index}"] = rng.normal(size=20)
    frame = pl.DataFrame(columns)

    # Three consecutive ten-row slices starting at offsets 0, 10 and 20.
    pieces = [frame.slice(offset, 10) for offset in range(0, 30, 10)]
    concatenated = pl.concat(pieces)

    c_ordered = concatenated.to_numpy(order="c")
    rebuilt = pl.from_numpy(c_ordered, schema=concatenated.schema)
    assert_frame_equal(concatenated, rebuilt)
292
293
294
def test_to_numpy_array_shape_23426() -> None:
    """Round-trip a frame with 1D and 2D Array columns through a structured export."""
    column_types = {
        "x": pl.UInt8,
        "y": pl.Array(pl.Float32, 3),
        "z": pl.Array(pl.Int16, (2, 3)),
    }
    column_values = {
        "x": [1, 2],
        "y": [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]],
        "z": [[[-1, -1, -2], [4, 5, 6]], [[-3, -5, -8], [10, 20, 30]]],
    }
    frame = pl.DataFrame(column_values, schema=column_types)

    structured = frame.to_numpy(structured=True)
    rebuilt = pl.from_numpy(structured)

    assert_frame_equal(frame, rebuilt)
309
310