Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/py-polars/tests/unit/interop/numpy/test_to_numpy_series.py
8422 views
1
from __future__ import annotations
2
3
from datetime import date, datetime, time, timedelta
4
from decimal import Decimal as D
5
from pathlib import Path
6
from typing import TYPE_CHECKING, Any
7
8
import numpy as np
9
import pytest
10
from hypothesis import given, settings
11
from numpy.testing import assert_array_equal
12
13
import polars as pl
14
from polars.testing import assert_series_equal
15
from polars.testing.parametric import series
16
17
if TYPE_CHECKING:
18
import numpy.typing as npt
19
20
from polars._typing import PolarsDataType
21
from tests.conftest import PlMonkeyPatch
22
23
24
def assert_zero_copy(s: pl.Series, arr: np.ndarray[Any, Any]) -> None:
    """Assert that ``arr`` is backed by the values buffer of ``s``.

    Compares the first entry of the Series' values-buffer info with the data
    pointer that NumPy exposes through ``__array_interface__``. Empty Series
    are skipped without performing any check.
    """
    if s.len() != 0:
        buffer_info = s._get_buffers()["values"]._get_buffer_info()
        series_address = buffer_info[0]
        array_address = arr.__array_interface__["data"][0]
        assert series_address == array_address
30
31
32
def assert_allow_copy_false_raises(s: pl.Series) -> None:
    """Assert that ``s.to_numpy(allow_copy=False)`` raises a RuntimeError.

    The error message must match ``"copy not allowed"``.
    """
    expect_copy_error = pytest.raises(RuntimeError, match="copy not allowed")
    with expect_copy_error:
        s.to_numpy(allow_copy=False)
35
36
37
@pytest.mark.parametrize(
    ("dtype", "expected_dtype"),
    [
        (pl.Int8, np.int8),
        (pl.Int16, np.int16),
        (pl.Int32, np.int32),
        (pl.Int64, np.int64),
        (pl.UInt8, np.uint8),
        (pl.UInt16, np.uint16),
        (pl.UInt32, np.uint32),
        (pl.UInt64, np.uint64),
        (pl.Float32, np.float32),
        (pl.Float64, np.float64),
    ],
)
def test_series_to_numpy_numeric_zero_copy(
    dtype: PolarsDataType, expected_dtype: npt.DTypeLike
) -> None:
    """Null-free numeric Series convert with ``allow_copy=False``.

    Checks the resulting dtype, the values, and that the array shares the
    Series' values buffer.
    """
    numeric = pl.Series([1, 2, 3]).cast(dtype)
    arr: npt.NDArray[np.generic] = numeric.to_numpy(allow_copy=False)

    assert arr.dtype == expected_dtype
    assert arr.tolist() == numeric.to_list()
    assert_zero_copy(numeric, arr)
61
62
63
@pytest.mark.parametrize(
    ("dtype", "expected_dtype"),
    [
        (pl.Int8, np.float32),
        (pl.Int16, np.float32),
        (pl.Int32, np.float64),
        (pl.Int64, np.float64),
        (pl.UInt8, np.float32),
        (pl.UInt16, np.float32),
        (pl.UInt32, np.float64),
        (pl.UInt64, np.float64),
        (pl.Float32, np.float32),
        (pl.Float64, np.float64),
    ],
)
def test_series_to_numpy_numeric_with_nulls(
    dtype: PolarsDataType, expected_dtype: npt.DTypeLike
) -> None:
    """Numeric Series with a trailing null convert to a float array.

    The null must come out as NaN, the other values must be preserved, and
    the conversion must be rejected when copying is disallowed.
    """
    nullable = pl.Series([1, 2, None], dtype=dtype, strict=False)
    arr: npt.NDArray[np.generic] = nullable.to_numpy()

    *leading, trailing = arr.tolist()
    assert leading == nullable.to_list()[:-1]
    assert np.isnan(arr[-1])
    assert arr.dtype == expected_dtype
    assert_allow_copy_false_raises(nullable)
88
89
90
@pytest.mark.parametrize(
    ("dtype", "expected_dtype"),
    [
        (pl.Duration, np.dtype("timedelta64[us]")),
        (pl.Duration("ms"), np.dtype("timedelta64[ms]")),
        (pl.Duration("us"), np.dtype("timedelta64[us]")),
        (pl.Duration("ns"), np.dtype("timedelta64[ns]")),
        (pl.Datetime, np.dtype("datetime64[us]")),
        (pl.Datetime("ms"), np.dtype("datetime64[ms]")),
        (pl.Datetime("us"), np.dtype("datetime64[us]")),
        (pl.Datetime("ns"), np.dtype("datetime64[ns]")),
    ],
)
def test_series_to_numpy_temporal_zero_copy(
    dtype: PolarsDataType, expected_dtype: npt.DTypeLike
) -> None:
    """Null-free Datetime/Duration Series convert with ``allow_copy=False``."""
    raw = [0, 2_000, 1_000_000]
    temporal = pl.Series(raw, dtype=dtype, strict=False)
    arr: npt.NDArray[np.generic] = temporal.to_numpy(allow_copy=False)

    assert_zero_copy(temporal, arr)

    # NumPy tolist returns integers for ns precision, so compare against the
    # raw integer inputs in that case and against the polars values otherwise.
    is_ns = temporal.dtype.time_unit == "ns"  # type: ignore[attr-defined]
    expected_values = raw if is_ns else temporal.to_list()
    assert arr.tolist() == expected_values
    assert arr.dtype == expected_dtype
117
118
119
def test_series_to_numpy_datetime_with_tz_zero_copy() -> None:
    """A rechunked, time-zone-converted Datetime Series converts without a copy.

    The resulting array must hold the original naive datetimes as
    ``datetime64[us]``.
    """
    naive = [datetime(1970, 1, 1), datetime(2024, 2, 28)]
    localized = pl.Series(naive).dt.convert_time_zone("Europe/Amsterdam")
    localized = localized.rechunk()
    arr: npt.NDArray[np.generic] = localized.to_numpy(allow_copy=False)

    assert_zero_copy(localized, arr)
    assert arr.tolist() == naive
    assert arr.dtype == np.dtype("datetime64[us]")
127
128
129
def test_series_to_numpy_date() -> None:
    """Date Series convert to a writable ``datetime64[D]`` array via a copy."""
    dates = pl.Series([date(1970, 1, 1), date(2024, 2, 28)])

    arr: npt.NDArray[np.generic] = dates.to_numpy()

    assert dates.to_list() == arr.tolist()
    assert arr.dtype == np.dtype("datetime64[D]")
    assert arr.flags.writeable is True
    # The conversion is expected to be rejected when copying is disallowed.
    assert_allow_copy_false_raises(dates)
139
140
141
def test_series_to_numpy_multi_dimensional_init() -> None:
    """Series can be constructed from 3-D and 0-D NumPy arrays."""
    # A 3-D input becomes a single-row Series of a nested Array dtype.
    cube = np.atleast_3d(np.array([-10.5, 0.0, 10.5]))
    expected_nested = pl.Series(
        [[[-10.5], [0.0], [10.5]]],
        dtype=pl.Array(pl.Float64, shape=(3, 1)),
    )
    assert_series_equal(pl.Series(cube), expected_nested)

    # A 0-D input becomes a one-element Series.
    scalar_series = pl.Series(np.array(0), dtype=pl.Int32)
    assert_series_equal(scalar_series, pl.Series([0], dtype=pl.Int32))
152
153
154
@pytest.mark.parametrize(
    ("dtype", "expected_dtype"),
    [
        (pl.Date, np.dtype("datetime64[D]")),
        (pl.Duration("ms"), np.dtype("timedelta64[ms]")),
        (pl.Duration("us"), np.dtype("timedelta64[us]")),
        (pl.Duration("ns"), np.dtype("timedelta64[ns]")),
        (pl.Datetime, np.dtype("datetime64[us]")),
        (pl.Datetime("ms"), np.dtype("datetime64[ms]")),
        (pl.Datetime("us"), np.dtype("datetime64[us]")),
        (pl.Datetime("ns"), np.dtype("datetime64[ns]")),
    ],
)
def test_series_to_numpy_temporal_with_nulls(
    dtype: PolarsDataType, expected_dtype: npt.DTypeLike
) -> None:
    """Temporal Series containing a null keep their NumPy temporal dtype.

    The conversion must be rejected when copying is disallowed.
    """
    raw = [0, 2_000, 1_000_000, None]
    temporal = pl.Series(raw, dtype=dtype, strict=False)
    arr: npt.NDArray[np.generic] = temporal.to_numpy()

    # NumPy tolist returns integers for ns precision; Date has no time unit,
    # hence the getattr with a default.
    is_ns = getattr(temporal.dtype, "time_unit", None) == "ns"
    expected_values = raw if is_ns else temporal.to_list()
    assert arr.tolist() == expected_values
    assert arr.dtype == expected_dtype
    assert_allow_copy_false_raises(temporal)
181
182
183
def test_series_to_numpy_datetime_with_tz_with_nulls() -> None:
    """Time-zone-converted Datetime Series with a null convert via a copy.

    The array must hold the original naive values (including the null) as
    ``datetime64[us]``.
    """
    naive = [datetime(1970, 1, 1), datetime(2024, 2, 28), None]
    localized = pl.Series(naive).dt.convert_time_zone("Europe/Amsterdam")
    arr: npt.NDArray[np.generic] = localized.to_numpy()

    assert arr.tolist() == naive
    assert arr.dtype == np.dtype("datetime64[us]")
    assert_allow_copy_false_raises(localized)
191
192
193
@pytest.mark.parametrize(
    ("dtype", "values"),
    [
        (pl.Time, [time(10, 30, 45), time(23, 59, 59)]),
        (pl.Categorical, ["a", "b", "a"]),
        (pl.Enum(["a", "b", "c"]), ["a", "b", "a"]),
        (pl.String, ["a", "bc", "def"]),
        (pl.Binary, [b"a", b"bc", b"def"]),
        (pl.Decimal, [D("1.234"), D("2.345"), D("-3.456")]),
        (pl.Object, [Path(), Path("abc")]),
    ],
)
@pytest.mark.parametrize("with_nulls", [False, True])
def test_to_numpy_object_dtypes(
    dtype: PolarsDataType, values: list[Any], with_nulls: bool
) -> None:
    """Dtypes without a native NumPy equivalent convert to object arrays.

    Each dtype is exercised with and without a trailing null. The conversion
    must be rejected when copying is disallowed.
    """
    # pytest passes the very same list object to every parametrized case, so
    # build a new list instead of appending in place. Otherwise a null added
    # for one case would still be present when the list is reused.
    expected_values = [*values, None] if with_nulls else values

    s = pl.Series(expected_values, dtype=dtype)
    result: npt.NDArray[np.generic] = s.to_numpy()

    assert result.tolist() == expected_values
    assert result.dtype == np.object_
    assert_allow_copy_false_raises(s)
218
219
220
def test_series_to_numpy_bool() -> None:
    """Boolean Series convert to a writable ``np.bool_`` array via a copy."""
    flags = pl.Series([True, False])
    arr: npt.NDArray[np.generic] = flags.to_numpy()

    assert flags.to_list() == arr.tolist()
    assert arr.dtype == np.bool_
    assert arr.flags.writeable is True
    # The conversion is expected to be rejected when copying is disallowed.
    assert_allow_copy_false_raises(flags)
228
229
230
def test_series_to_numpy_bool_with_nulls() -> None:
    """Boolean Series containing a null convert to an object array."""
    flags = pl.Series([True, False, None])
    arr: npt.NDArray[np.generic] = flags.to_numpy()

    assert flags.to_list() == arr.tolist()
    assert arr.dtype == np.object_
    assert_allow_copy_false_raises(flags)
237
238
239
def test_series_to_numpy_array_of_int() -> None:
    """Nested integer Array Series convert to a 3-D array without copying."""
    nested = [[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]]
    array_dtype = pl.Array(pl.Array(pl.Int8, 3), 2)
    s = pl.Series(nested, dtype=array_dtype)

    arr = s.to_numpy(allow_copy=False)

    assert_array_equal(arr, np.array(nested))
    assert arr.dtype == np.int8
    assert arr.shape == (2, 2, 3)
248
249
250
def test_series_to_numpy_array_of_str() -> None:
    """String Array Series convert to an object array with the same values."""
    rows = [["1", "2", "3"], ["4", "5", "10000"]]
    s = pl.Series(rows, dtype=pl.Array(pl.String, 3))

    arr: npt.NDArray[np.generic] = s.to_numpy()

    assert arr.tolist() == rows
    assert arr.dtype == np.object_
256
257
258
def test_series_to_numpy_array_with_nulls() -> None:
    """A null row in an integer Array Series becomes a row of NaN floats."""
    rows = [[1, 2], [3, 4], None]
    s = pl.Series(rows, dtype=pl.Array(pl.Int64, 2))

    arr = s.to_numpy()

    nan = np.nan
    assert_array_equal(arr, np.array([[1.0, 2.0], [3.0, 4.0], [nan, nan]]))
    assert arr.dtype == np.float64
    assert_allow_copy_false_raises(s)
267
268
269
def test_series_to_numpy_array_with_nested_nulls() -> None:
    """Null elements inside Array rows become NaN in a float64 array."""
    rows = [[None, 2], [3, 4], [5, None]]
    s = pl.Series(rows, dtype=pl.Array(pl.Int64, 2))

    arr = s.to_numpy()

    nan = np.nan
    assert_array_equal(arr, np.array([[nan, 2.0], [3.0, 4.0], [5.0, nan]]))
    assert arr.dtype == np.float64
    assert_allow_copy_false_raises(s)
278
279
280
def test_series_to_numpy_array_of_arrays() -> None:
    """Nulls at either nesting level of an Array-of-Array become NaN."""
    nested = [[[None, 2], [3, 4]], [None, [7, 8]]]
    array_dtype = pl.Array(pl.Array(pl.Int64, 2), 2)
    s = pl.Series(nested, dtype=array_dtype)

    arr = s.to_numpy()

    nan = np.nan
    # A null inner array expands to a full row of NaN values.
    expected = np.array([[[nan, 2], [3, 4]], [[nan, nan], [7, 8]]])
    assert_array_equal(arr, expected)
    assert arr.dtype == np.float64
    assert arr.shape == (2, 2, 2)
    assert_allow_copy_false_raises(s)
290
291
292
@pytest.mark.parametrize("chunked", [True, False])
def test_series_to_numpy_list(chunked: bool) -> None:
    """List Series convert to an object array of per-row int64 arrays.

    Exercised both on a plain Series and on one produced by concatenating
    two slices.
    """
    values = [[1, 2], [3, 4, 5], [6], []]
    s = pl.Series(values)
    if chunked:
        head, tail = s[:2], s[2:]
        s = pl.concat([head, tail])

    arr = s.to_numpy()

    rows = [np.array(v, dtype=np.int64) for v in values]
    expected = np.array(rows, dtype=np.object_)
    # Compare row by row: the outer array holds variable-length arrays.
    for actual_row, expected_row in zip(arr, expected, strict=True):
        assert_array_equal(actual_row, expected_row)
    assert arr.dtype == expected.dtype
    assert_allow_copy_false_raises(s)
305
306
307
def test_series_to_numpy_struct_numeric_supertype() -> None:
    """A Struct with int and float fields converts to a 2-D float64 array.

    A null field value must come out as NaN.
    """
    records = [{"a": 1, "b": 2.0}, {"a": 3, "b": 4.0}, {"a": 5, "b": None}]
    s = pl.Series(records)

    arr = s.to_numpy()

    assert_array_equal(arr, np.array([[1.0, 2.0], [3.0, 4.0], [5.0, np.nan]]))
    assert arr.dtype == np.float64
    assert_allow_copy_false_raises(s)
316
317
318
def test_to_numpy_null() -> None:
    """A Null-dtype Series converts to a float32 array of NaN values."""
    nulls = pl.Series([None, None], dtype=pl.Null)

    arr = nulls.to_numpy()

    assert_array_equal(arr, np.array([np.nan, np.nan], dtype=np.float32))
    assert arr.dtype == np.float32
    assert_allow_copy_false_raises(nulls)
325
326
327
def test_to_numpy_empty() -> None:
    """An empty String Series converts with ``allow_copy=False``.

    The result must be a zero-length object array.
    """
    empty = pl.Series(dtype=pl.String)

    arr = empty.to_numpy(allow_copy=False)

    assert arr.dtype == np.object_
    assert arr.shape == (0,)
332
333
334
def test_to_numpy_empty_writable() -> None:
    """An empty Int64 Series honours ``allow_copy=False`` with ``writable=True``.

    The result must be a zero-length, writable int64 array.
    """
    empty = pl.Series(dtype=pl.Int64)

    arr = empty.to_numpy(allow_copy=False, writable=True)

    assert arr.dtype == np.int64
    assert arr.shape == (0,)
    assert arr.flags.writeable is True
340
341
342
def test_to_numpy_chunked() -> None:
    """A Series concatenated without rechunking converts via a copy.

    The resulting array must be writable, and writing to it must leave the
    Series untouched.
    """
    parts = [pl.Series([1, 2]), pl.Series([3, 4])]
    s = pl.concat(parts, rechunk=False)

    arr: npt.NDArray[np.generic] = s.to_numpy()

    assert arr.tolist() == s.to_list()
    assert arr.dtype == np.int64
    assert arr.flags.writeable is True
    assert_allow_copy_false_raises(s)

    # Check that writing to the array doesn't change the original data
    arr[0] = 10
    assert arr.tolist() == [10, 2, 3, 4]
    assert s.to_list() == [1, 2, 3, 4]
358
359
360
def test_to_numpy_chunked_temporal_nested() -> None:
    """A concatenated Array-of-Datetime Series converts to a 2-D datetime64 array.

    The Series is concatenated without rechunking; the result must be
    writable and the conversion rejected when copying is disallowed.
    """
    dtype = pl.Array(pl.Datetime("us"), 1)
    first_half = pl.Series(
        [[datetime(2020, 1, 1)], [datetime(2021, 1, 1)]], dtype=dtype
    )
    second_half = pl.Series(
        [[datetime(2022, 1, 1)], [datetime(2023, 1, 1)]], dtype=dtype
    )
    s = pl.concat([first_half, second_half], rechunk=False)

    arr: npt.NDArray[np.generic] = s.to_numpy()

    assert arr.tolist() == s.to_list()
    assert arr.dtype == np.dtype("datetime64[us]")
    assert arr.shape == (4, 1)
    assert arr.flags.writeable is True
    assert_allow_copy_false_raises(s)
373
374
375
def test_zero_copy_only_deprecated() -> None:
    """Passing ``zero_copy_only`` emits a deprecation warning but still converts."""
    values = [1, 2]
    s = pl.Series(values)

    with pytest.deprecated_call():
        arr: npt.NDArray[np.generic] = s.to_numpy(zero_copy_only=True)

    assert arr.tolist() == values
381
382
383
def test_series_to_numpy_temporal() -> None:
    """Check the NumPy rendering of Date, Datetime and Duration Series."""
    # Date: integers cast to Date.
    date_series = pl.Series("date", [123543, 283478, 1243]).cast(pl.Date)
    assert (
        str(date_series.to_numpy()) == "['2308-04-02' '2746-02-20' '1973-05-28']"
    )

    # Datetime built from Python datetimes.
    datetime_series = pl.Series(
        "datetime", [datetime(2021, 1, 2, 3, 4, 5), datetime(2021, 2, 3, 4, 5, 6)]
    )
    assert (
        str(datetime_series.to_numpy()[:2])
        == "['2021-01-02T03:04:05.000000' '2021-02-03T04:05:06.000000']"
    )

    # Datetime range generated with millisecond time unit.
    ms_range = pl.datetime_range(
        datetime(2021, 1, 1, 0),
        datetime(2021, 1, 1, 1),
        interval="1h",
        time_unit="ms",
        eager=True,
    )
    assert (
        str(ms_range.to_numpy()[:2])
        == "['2021-01-01T00:00:00.000' '2021-01-01T01:00:00.000']"
    )

    # Duration: compared against the equivalent nanosecond timedelta64 values.
    durations = pl.Series([timedelta(hours=1), timedelta(hours=-2)])
    expected_ns = np.array(
        [3_600_000_000_000, -7_200_000_000_000], dtype="timedelta64[ns]"
    )
    assert (durations.to_numpy() == expected_ns).all()
407
408
409
@given(
    s=series(
        min_size=1,
        max_size=10,
        excluded_dtypes=[
            pl.Float16,
            pl.Int128,
            pl.UInt128,
            pl.Categorical,
            pl.List,
            pl.Struct,
            pl.Datetime("ms"),
            pl.Duration("ms"),
        ],
        allow_null=False,
        allow_time_zones=False,  # NumPy does not support parsing time zone aware data
    ).filter(
        lambda s: (
            not (s.dtype == pl.String and s.str.contains("\x00").any())
            and not (s.dtype == pl.Binary and s.bin.contains(b"\x00").any())
        )
    ),
)
@settings(max_examples=250)
def test_series_to_numpy(s: pl.Series) -> None:
    """Property test: ``to_numpy`` matches ``np.array`` built from ``to_list``.

    For the dtypes listed in the mapping below an explicit NumPy dtype is
    passed to ``np.array``; for every other dtype NumPy infers it.
    """
    explicit_np_dtypes = {
        pl.Datetime("ns"): "datetime64[ns]",
        pl.Datetime("us"): "datetime64[us]",
        pl.Duration("ns"): "timedelta64[ns]",
        pl.Duration("us"): "timedelta64[us]",
        pl.Null(): "float32",
    }

    actual = s.to_numpy()

    # ``None`` (dtype not in the mapping) lets NumPy infer the dtype.
    expected = np.array(s.to_list(), dtype=explicit_np_dtypes.get(s.dtype))

    assert_array_equal(actual, expected)
448
449
450
@pytest.mark.parametrize("writable", [False, True])
@pytest.mark.parametrize("pyarrow_available", [False, True])
def test_to_numpy2(
    writable: bool, pyarrow_available: bool, plmonkeypatch: PlMonkeyPatch
) -> None:
    """Check ``to_numpy(writable=...)`` with pyarrow flagged present or absent.

    The ``_PYARROW_AVAILABLE`` flag on ``pl.series.series`` is patched for
    the duration of the test.
    """
    plmonkeypatch.setattr(pl.series.series, "_PYARROW_AVAILABLE", pyarrow_available)

    # Without nulls: dtype is preserved and the writeable flag follows the request.
    plain = pl.Series("a", [1, 2, 3], pl.UInt8).to_numpy(writable=writable)

    np.testing.assert_array_equal(plain, np.array([1, 2, 3], dtype=np.uint8))
    # Test if numpy array is readonly or writable.
    assert plain.flags.writeable == writable

    if writable:
        plain[1] += 10
        np.testing.assert_array_equal(plain, np.array([1, 12, 3], dtype=np.uint8))

    # With a null: the result is a float array holding NaN for the null.
    with_missing = pl.Series("a", [None, 2, 3], pl.UInt8).to_numpy(writable=writable)

    expected_float = np.float64 if pyarrow_available else np.float32
    np.testing.assert_array_equal(
        with_missing,
        np.array([np.nan, 2.0, 3.0], dtype=expected_float),
    )

    if writable:
        # As Null values can't be encoded natively in a numpy array,
        # this array will never be a view.
        assert with_missing.flags.writeable == writable
483
484
485
def test_to_numpy_series_indexed_18986() -> None:
    """Converting an indexed row or a null row of a List column works."""
    df = pl.DataFrame({"a": [[4, 5, 6], [7, 8, 9, 10], None]})

    second_row = df[1].to_numpy()
    assert (second_row[0, 0] == np.array([7, 8, 9, 10])).all()

    # this one is strange, but only option in numpy?
    null_row = df.to_numpy()[2]
    assert (null_row == np.array([None])).all()
491
492