Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/py-polars/tests/unit/interop/numpy/test_to_numpy_series.py
6939 views
1
from __future__ import annotations
2
3
from datetime import date, datetime, time, timedelta
4
from decimal import Decimal as D
5
from pathlib import Path
6
from typing import TYPE_CHECKING, Any
7
8
import numpy as np
9
import pytest
10
from hypothesis import given, settings
11
from numpy.testing import assert_array_equal
12
13
import polars as pl
14
from polars.testing import assert_series_equal
15
from polars.testing.parametric import series
16
17
if TYPE_CHECKING:
18
import numpy.typing as npt
19
20
from polars._typing import PolarsDataType
21
22
23
def assert_zero_copy(s: pl.Series, arr: np.ndarray[Any, Any]) -> None:
    """
    Assert that `arr` and the values buffer of `s` report the same address.

    Nothing is checked when the Series is empty.
    """
    if s.len() != 0:
        values_buffer = s._get_buffers()["values"]
        # First entry of the buffer info is compared as the data pointer
        # (presumably the buffer's start address; confirm against polars internals)
        series_address = values_buffer._get_buffer_info()[0]
        array_address = arr.__array_interface__["data"][0]
        assert series_address == array_address
29
30
31
def assert_allow_copy_false_raises(s: pl.Series) -> None:
    """
    Assert that `s.to_numpy(allow_copy=False)` raises.

    The call must fail with a `RuntimeError` matching "copy not allowed".
    """
    expect_copy_error = pytest.raises(RuntimeError, match="copy not allowed")
    with expect_copy_error:
        s.to_numpy(allow_copy=False)
34
35
36
@pytest.mark.parametrize(
    ("dtype", "expected_dtype"),
    [
        (pl.Int8, np.int8),
        (pl.Int16, np.int16),
        (pl.Int32, np.int32),
        (pl.Int64, np.int64),
        (pl.UInt8, np.uint8),
        (pl.UInt16, np.uint16),
        (pl.UInt32, np.uint32),
        (pl.UInt64, np.uint64),
        (pl.Float32, np.float32),
        (pl.Float64, np.float64),
    ],
)
def test_series_to_numpy_numeric_zero_copy(
    dtype: PolarsDataType, expected_dtype: npt.DTypeLike
) -> None:
    """Null-free numeric Series convert with `allow_copy=False`."""
    source = pl.Series([1, 2, 3]).cast(dtype)
    converted: npt.NDArray[np.generic] = source.to_numpy(allow_copy=False)

    assert_zero_copy(source, converted)
    assert converted.tolist() == source.to_list()
    assert converted.dtype == expected_dtype
60
61
62
@pytest.mark.parametrize(
    ("dtype", "expected_dtype"),
    [
        (pl.Int8, np.float32),
        (pl.Int16, np.float32),
        (pl.Int32, np.float64),
        (pl.Int64, np.float64),
        (pl.UInt8, np.float32),
        (pl.UInt16, np.float32),
        (pl.UInt32, np.float64),
        (pl.UInt64, np.float64),
        (pl.Float32, np.float32),
        (pl.Float64, np.float64),
    ],
)
def test_series_to_numpy_numeric_with_nulls(
    dtype: PolarsDataType, expected_dtype: npt.DTypeLike
) -> None:
    """A trailing null in a numeric Series becomes NaN in a float array."""
    source = pl.Series([1, 2, None], dtype=dtype, strict=False)
    converted: npt.NDArray[np.generic] = source.to_numpy()

    # Compare everything but the last slot; the null is checked on its own
    non_null_result = converted.tolist()[:-1]
    non_null_source = source.to_list()[:-1]
    assert non_null_result == non_null_source
    assert np.isnan(converted[-1])
    assert converted.dtype == expected_dtype
    assert_allow_copy_false_raises(source)
87
88
89
@pytest.mark.parametrize(
    ("dtype", "expected_dtype"),
    [
        (pl.Duration, np.dtype("timedelta64[us]")),
        (pl.Duration("ms"), np.dtype("timedelta64[ms]")),
        (pl.Duration("us"), np.dtype("timedelta64[us]")),
        (pl.Duration("ns"), np.dtype("timedelta64[ns]")),
        (pl.Datetime, np.dtype("datetime64[us]")),
        (pl.Datetime("ms"), np.dtype("datetime64[ms]")),
        (pl.Datetime("us"), np.dtype("datetime64[us]")),
        (pl.Datetime("ns"), np.dtype("datetime64[ns]")),
    ],
)
def test_series_to_numpy_temporal_zero_copy(
    dtype: PolarsDataType, expected_dtype: npt.DTypeLike
) -> None:
    """Null-free Duration/Datetime Series convert with `allow_copy=False`."""
    raw_values = [0, 2_000, 1_000_000]
    source = pl.Series(raw_values, dtype=dtype, strict=False)
    converted: npt.NDArray[np.generic] = source.to_numpy(allow_copy=False)

    assert_zero_copy(source, converted)

    # NumPy tolist returns integers for ns precision
    is_nanosecond = source.dtype.time_unit == "ns"  # type: ignore[attr-defined]
    expected_values = raw_values if is_nanosecond else source.to_list()
    assert converted.tolist() == expected_values
    assert converted.dtype == expected_dtype
116
117
118
def test_series_to_numpy_datetime_with_tz_zero_copy() -> None:
    """Rechunked tz-aware datetimes convert with `allow_copy=False`."""
    naive_values = [datetime(1970, 1, 1), datetime(2024, 2, 28)]
    in_amsterdam = pl.Series(naive_values).dt.convert_time_zone("Europe/Amsterdam")
    source = in_amsterdam.rechunk()
    converted: npt.NDArray[np.generic] = source.to_numpy(allow_copy=False)

    assert_zero_copy(source, converted)
    # The array content compares equal to the original naive datetimes
    assert converted.tolist() == naive_values
    assert converted.dtype == np.dtype("datetime64[us]")
126
127
128
def test_series_to_numpy_date() -> None:
    """Date Series convert to a writable `datetime64[D]` array via a copy."""
    dates = [date(1970, 1, 1), date(2024, 2, 28)]
    source = pl.Series(dates)

    converted: npt.NDArray[np.generic] = source.to_numpy()

    assert source.to_list() == converted.tolist()
    assert converted.dtype == np.dtype("datetime64[D]")
    assert converted.flags.writeable is True
    assert_allow_copy_false_raises(source)
138
139
140
def test_series_to_numpy_multi_dimensional_init() -> None:
    """Series can be built from a 3-D and from a 0-D NumPy array."""
    # 3-D input: a single row holding a (3, 1) Array value
    cube = np.atleast_3d(np.array([-10.5, 0.0, 10.5]))
    from_cube = pl.Series(cube)
    expected_from_cube = pl.Series(
        [[[-10.5], [0.0], [10.5]]],
        dtype=pl.Array(pl.Float64, shape=(3, 1)),
    )
    assert_series_equal(from_cube, expected_from_cube)

    # 0-D input: a one-element Series
    from_scalar = pl.Series(np.array(0), dtype=pl.Int32)
    expected_from_scalar = pl.Series([0], dtype=pl.Int32)
    assert_series_equal(from_scalar, expected_from_scalar)
151
152
153
@pytest.mark.parametrize(
    ("dtype", "expected_dtype"),
    [
        (pl.Date, np.dtype("datetime64[D]")),
        (pl.Duration("ms"), np.dtype("timedelta64[ms]")),
        (pl.Duration("us"), np.dtype("timedelta64[us]")),
        (pl.Duration("ns"), np.dtype("timedelta64[ns]")),
        (pl.Datetime, np.dtype("datetime64[us]")),
        (pl.Datetime("ms"), np.dtype("datetime64[ms]")),
        (pl.Datetime("us"), np.dtype("datetime64[us]")),
        (pl.Datetime("ns"), np.dtype("datetime64[ns]")),
    ],
)
def test_series_to_numpy_temporal_with_nulls(
    dtype: PolarsDataType, expected_dtype: npt.DTypeLike
) -> None:
    """Temporal Series containing a null keep their NumPy temporal dtype."""
    raw_values = [0, 2_000, 1_000_000, None]
    source = pl.Series(raw_values, dtype=dtype, strict=False)
    converted: npt.NDArray[np.generic] = source.to_numpy()

    # NumPy tolist returns integers for ns precision
    is_nanosecond = getattr(source.dtype, "time_unit", None) == "ns"
    expected_values = raw_values if is_nanosecond else source.to_list()
    assert converted.tolist() == expected_values
    assert converted.dtype == expected_dtype
    assert_allow_copy_false_raises(source)
180
181
182
def test_series_to_numpy_datetime_with_tz_with_nulls() -> None:
    """Tz-aware datetimes with a null convert to `datetime64[us]` via a copy."""
    naive_values = [datetime(1970, 1, 1), datetime(2024, 2, 28), None]
    source = pl.Series(naive_values).dt.convert_time_zone("Europe/Amsterdam")
    converted: npt.NDArray[np.generic] = source.to_numpy()

    # The array content compares equal to the original naive values
    assert converted.tolist() == naive_values
    assert converted.dtype == np.dtype("datetime64[us]")
    assert_allow_copy_false_raises(source)
190
191
192
@pytest.mark.parametrize(
    ("dtype", "values"),
    [
        (pl.Time, [time(10, 30, 45), time(23, 59, 59)]),
        (pl.Categorical, ["a", "b", "a"]),
        (pl.Enum(["a", "b", "c"]), ["a", "b", "a"]),
        (pl.String, ["a", "bc", "def"]),
        (pl.Binary, [b"a", b"bc", b"def"]),
        (pl.Decimal, [D("1.234"), D("2.345"), D("-3.456")]),
        (pl.Object, [Path(), Path("abc")]),
    ],
)
@pytest.mark.parametrize("with_nulls", [False, True])
def test_to_numpy_object_dtypes(
    dtype: PolarsDataType, values: list[Any], with_nulls: bool
) -> None:
    """The listed dtypes convert to object arrays, with or without a null."""
    if with_nulls:
        # Build a new list instead of appending in place: the parametrized
        # `values` object is shared by every case generated for this dtype, so
        # mutating it would leak the null into other cases (order-dependent).
        values = [*values, None]

    s = pl.Series(values, dtype=dtype)
    result: npt.NDArray[np.generic] = s.to_numpy()

    assert result.tolist() == values
    assert result.dtype == np.object_
    assert_allow_copy_false_raises(s)
217
218
219
def test_series_to_numpy_bool() -> None:
    """Boolean Series convert to a writable `bool_` array via a copy."""
    source = pl.Series([True, False])
    converted: npt.NDArray[np.generic] = source.to_numpy()

    assert source.to_list() == converted.tolist()
    assert converted.dtype == np.bool_
    assert converted.flags.writeable is True
    assert_allow_copy_false_raises(source)
227
228
229
def test_series_to_numpy_bool_with_nulls() -> None:
    """Boolean Series containing a null convert to an object array."""
    source = pl.Series([True, False, None])
    converted: npt.NDArray[np.generic] = source.to_numpy()

    assert source.to_list() == converted.tolist()
    assert converted.dtype == np.object_
    assert_allow_copy_false_raises(source)
236
237
238
def test_series_to_numpy_array_of_int() -> None:
    """Nested Int8 Array Series convert to a 3-D array with `allow_copy=False`."""
    nested = [[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]]
    array_dtype = pl.Array(pl.Array(pl.Int8, 3), 2)
    source = pl.Series(nested, dtype=array_dtype)
    converted = source.to_numpy(allow_copy=False)

    assert_array_equal(converted, np.array(nested))
    assert converted.dtype == np.int8
    assert converted.shape == (2, 2, 3)
247
248
249
def test_series_to_numpy_array_of_str() -> None:
    """Array-of-String Series convert to an object array."""
    rows = [["1", "2", "3"], ["4", "5", "10000"]]
    source = pl.Series(rows, dtype=pl.Array(pl.String, 3))
    converted: npt.NDArray[np.generic] = source.to_numpy()

    assert converted.tolist() == rows
    assert converted.dtype == np.object_
255
256
257
def test_series_to_numpy_array_with_nulls() -> None:
    """A null row in an Int64 Array Series becomes a row of NaN in float64."""
    rows = [[1, 2], [3, 4], None]
    source = pl.Series(rows, dtype=pl.Array(pl.Int64, 2))
    converted = source.to_numpy()

    nan = np.nan
    assert_array_equal(converted, np.array([[1.0, 2.0], [3.0, 4.0], [nan, nan]]))
    assert converted.dtype == np.float64
    assert_allow_copy_false_raises(source)
266
267
268
def test_series_to_numpy_array_with_nested_nulls() -> None:
    """Null elements inside Int64 Array rows become NaN in a float64 array."""
    rows = [[None, 2], [3, 4], [5, None]]
    source = pl.Series(rows, dtype=pl.Array(pl.Int64, 2))
    converted = source.to_numpy()

    nan = np.nan
    assert_array_equal(converted, np.array([[nan, 2.0], [3.0, 4.0], [5.0, nan]]))
    assert converted.dtype == np.float64
    assert_allow_copy_false_raises(source)
277
278
279
def test_series_to_numpy_array_of_arrays() -> None:
    """Nulls at either nesting level of an Array-of-Array Series become NaN."""
    nested = [[[None, 2], [3, 4]], [None, [7, 8]]]
    array_dtype = pl.Array(pl.Array(pl.Int64, 2), 2)
    source = pl.Series(nested, dtype=array_dtype)
    converted = source.to_numpy()

    nan = np.nan
    # The null inner array expands to a full row of NaN
    want = np.array([[[nan, 2], [3, 4]], [[nan, nan], [7, 8]]])
    assert_array_equal(converted, want)
    assert converted.dtype == np.float64
    assert converted.shape == (2, 2, 2)
    assert_allow_copy_false_raises(source)
289
290
291
@pytest.mark.parametrize("chunked", [True, False])
def test_series_to_numpy_list(chunked: bool) -> None:
    """List Series convert to an object array of int64 arrays."""
    rows = [[1, 2], [3, 4, 5], [6], []]
    source = pl.Series(rows)
    if chunked:
        # Re-assemble the Series from two slices
        head, tail = source[:2], source[2:]
        source = pl.concat([head, tail])
    converted = source.to_numpy()

    row_arrays = [np.array(row, dtype=np.int64) for row in rows]
    want = np.array(row_arrays, dtype=np.object_)
    for got_row, want_row in zip(converted, want):
        assert_array_equal(got_row, want_row)
    assert converted.dtype == want.dtype
    assert_allow_copy_false_raises(source)
304
305
306
def test_series_to_numpy_struct_numeric_supertype() -> None:
    """A Struct Series with int and float fields converts to a 2-D float64 array."""
    records = [{"a": 1, "b": 2.0}, {"a": 3, "b": 4.0}, {"a": 5, "b": None}]
    source = pl.Series(records)
    converted = source.to_numpy()

    # One row per struct, one column per field; the null field becomes NaN
    want = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, np.nan]])
    assert_array_equal(converted, want)
    assert converted.dtype == np.float64
    assert_allow_copy_false_raises(source)
315
316
317
def test_to_numpy_null() -> None:
    """A Null-dtype Series converts to a float32 array of NaN."""
    source = pl.Series([None, None], dtype=pl.Null)
    converted = source.to_numpy()

    want = np.array([np.nan, np.nan], dtype=np.float32)
    assert_array_equal(converted, want)
    assert converted.dtype == np.float32
    assert_allow_copy_false_raises(source)
324
325
326
def test_to_numpy_empty() -> None:
    """An empty String Series converts with `allow_copy=False`."""
    empty = pl.Series(dtype=pl.String)
    converted = empty.to_numpy(allow_copy=False)

    assert converted.dtype == np.object_
    assert converted.shape == (0,)
331
332
333
def test_to_numpy_empty_writable() -> None:
    """An empty Int64 Series converts to a writable array without a copy."""
    empty = pl.Series(dtype=pl.Int64)
    converted = empty.to_numpy(allow_copy=False, writable=True)

    assert converted.dtype == np.int64
    assert converted.shape == (0,)
    assert converted.flags.writeable is True
339
340
341
def test_to_numpy_chunked() -> None:
    """A Series concatenated with `rechunk=False` converts via a writable copy."""
    first = pl.Series([1, 2])
    second = pl.Series([3, 4])
    source = pl.concat([first, second], rechunk=False)

    converted: npt.NDArray[np.generic] = source.to_numpy()

    assert converted.tolist() == source.to_list()
    assert converted.dtype == np.int64
    assert converted.flags.writeable is True
    assert_allow_copy_false_raises(source)

    # Check that writing to the array doesn't change the original data
    converted[0] = 10
    assert converted.tolist() == [10, 2, 3, 4]
    assert source.to_list() == [1, 2, 3, 4]
357
358
359
def test_to_numpy_chunked_temporal_nested() -> None:
    """A non-rechunked Array(Datetime) Series converts to a 2-D datetime64 array."""
    array_dtype = pl.Array(pl.Datetime("us"), 1)
    first_rows = [[datetime(2020, 1, 1)], [datetime(2021, 1, 1)]]
    second_rows = [[datetime(2022, 1, 1)], [datetime(2023, 1, 1)]]
    first = pl.Series(first_rows, dtype=array_dtype)
    second = pl.Series(second_rows, dtype=array_dtype)
    source = pl.concat([first, second], rechunk=False)

    converted: npt.NDArray[np.generic] = source.to_numpy()

    assert converted.tolist() == source.to_list()
    assert converted.dtype == np.dtype("datetime64[us]")
    assert converted.shape == (4, 1)
    assert converted.flags.writeable is True
    assert_allow_copy_false_raises(source)
372
373
374
def test_zero_copy_only_deprecated() -> None:
    """Passing `zero_copy_only` triggers a deprecation warning and still converts."""
    values = [1, 2]
    source = pl.Series(values)
    with pytest.deprecated_call():
        converted: npt.NDArray[np.generic] = source.to_numpy(zero_copy_only=True)
    assert converted.tolist() == values
380
381
382
def test_series_to_numpy_temporal() -> None:
    """Check the string form / values of converted Date, Datetime and Duration."""
    # Date: integers cast to Date
    date_series = pl.Series("date", [123543, 283478, 1243]).cast(pl.Date)
    assert str(date_series.to_numpy()) == "['2308-04-02' '2746-02-20' '1973-05-28']"

    # Datetime built from Python datetimes
    datetime_series = pl.Series(
        "datetime", [datetime(2021, 1, 2, 3, 4, 5), datetime(2021, 2, 3, 4, 5, 6)]
    )
    datetime_repr = str(datetime_series.to_numpy()[:2])
    assert datetime_repr == "['2021-01-02T03:04:05.000000' '2021-02-03T04:05:06.000000']"

    # Datetime range with millisecond time unit
    ms_range = pl.datetime_range(
        datetime(2021, 1, 1, 0),
        datetime(2021, 1, 1, 1),
        interval="1h",
        time_unit="ms",
        eager=True,
    )
    ms_repr = str(ms_range.to_numpy()[:2])
    assert ms_repr == "['2021-01-01T00:00:00.000' '2021-01-01T01:00:00.000']"

    # Duration built from Python timedeltas
    durations = pl.Series([timedelta(hours=1), timedelta(hours=-2)])
    want = np.array([3_600_000_000_000, -7_200_000_000_000], dtype="timedelta64[ns]")
    assert (durations.to_numpy() == want).all()
406
407
408
@given(
    s=series(
        min_size=1,
        max_size=10,
        excluded_dtypes=[
            pl.Categorical,
            pl.List,
            pl.Struct,
            pl.Datetime("ms"),
            pl.Duration("ms"),
        ],
        allow_null=False,
        allow_time_zones=False,  # NumPy does not support parsing time zone aware data
    ).filter(
        lambda s: (
            not (s.dtype == pl.String and s.str.contains("\x00").any())
            and not (s.dtype == pl.Binary and s.bin.contains(b"\x00").any())
        )
    ),
)
@settings(max_examples=250)
def test_series_to_numpy(s: pl.Series) -> None:
    """Generated Series convert to the array NumPy builds from their Python values."""
    # Temporal and Null dtypes need an explicit NumPy dtype for the expected
    # array; every other dtype falls back to `None` (NumPy infers it).
    explicit_np_dtypes = {
        pl.Datetime("ns"): "datetime64[ns]",
        pl.Datetime("us"): "datetime64[us]",
        pl.Duration("ns"): "timedelta64[ns]",
        pl.Duration("us"): "timedelta64[us]",
        pl.Null(): "float32",
    }

    converted = s.to_numpy()

    python_values = s.to_list()
    want = np.array(python_values, dtype=explicit_np_dtypes.get(s.dtype))

    assert_array_equal(converted, want)
444
445
446
@pytest.mark.parametrize("writable", [False, True])
@pytest.mark.parametrize("pyarrow_available", [False, True])
def test_to_numpy2(
    writable: bool, pyarrow_available: bool, monkeypatch: pytest.MonkeyPatch
) -> None:
    """Check `writable` handling for UInt8 Series with and without nulls."""
    monkeypatch.setattr(pl.series.series, "_PYARROW_AVAILABLE", pyarrow_available)

    plain = pl.Series("a", [1, 2, 3], pl.UInt8)
    np_array = plain.to_numpy(writable=writable)

    np.testing.assert_array_equal(np_array, np.array([1, 2, 3], dtype=np.uint8))
    # Test if numpy array is readonly or writable.
    assert np_array.flags.writeable == writable

    if writable:
        np_array[1] += 10
        np.testing.assert_array_equal(np_array, np.array([1, 12, 3], dtype=np.uint8))

    with_missing = pl.Series("a", [None, 2, 3], pl.UInt8)
    np_array_with_missing_values = with_missing.to_numpy(writable=writable)

    # The expected float width depends on the patched pyarrow flag
    float_dtype = np.float64 if pyarrow_available else np.float32
    np.testing.assert_array_equal(
        np_array_with_missing_values,
        np.array([np.nan, 2.0, 3.0], dtype=float_dtype),
    )

    if writable:
        # As Null values can't be encoded natively in a numpy array,
        # this array will never be a view.
        assert np_array_with_missing_values.flags.writeable == writable
479
480
481
def test_to_numpy_series_indexed_18986() -> None:
    """Check NumPy conversion of a List column with a null row."""
    df = pl.DataFrame({"a": [[4, 5, 6], [7, 8, 9, 10], None]})

    second_row_list = df[1].to_numpy()[0, 0]
    assert (second_row_list == np.array([7, 8, 9, 10])).all()

    null_row = df.to_numpy()[2]
    # this one is strange, but only option in numpy?
    assert (null_row == np.array([None])).all()
487
488