Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/py-polars/tests/unit/interop/numpy/test_from_numpy_df.py
6939 views
1
from __future__ import annotations
2
3
from typing import TYPE_CHECKING
4
5
import numpy as np
6
import pytest
7
from numpy.testing import assert_array_equal
8
9
import polars as pl
10
from polars.testing import assert_frame_equal
11
12
if TYPE_CHECKING:
13
import numpy.typing as npt
14
15
from polars._typing import PolarsDataType, PolarsTemporalType
16
17
18
def test_from_numpy() -> None:
    # 2D input with orient="col": each inner array is read as one column,
    # and the schema overrides replace the inferred integer dtype.
    matrix = np.array([[1, 2, 3], [4, 5, 6]])
    overrides = {"a": pl.UInt32, "b": pl.UInt32}
    frame = pl.from_numpy(
        matrix,
        schema=["a", "b"],
        orient="col",
        schema_overrides=overrides,
    )
    assert frame.shape == (3, 2)
    assert frame.rows() == [(1, 4), (2, 5), (3, 6)]
    assert frame.schema == {"a": pl.UInt32, "b": pl.UInt32}

    # 1D object array of strings: a single String column with a default name.
    words = np.array(["foo", "bar"], dtype=object)
    words_frame = pl.from_numpy(words)
    assert words_frame.shape == (2, 1)
    assert words_frame.rows() == [("foo",), ("bar",)]
    assert words_frame.schema == {"column_0": pl.String}

    # Arrays that are neither 1D nor 2D are rejected with a ValueError.
    rejected_inputs = (
        (
            np.array([[[1]]]),
            "cannot create DataFrame from array with more than two dimensions",
        ),
        (
            np.array(1),
            "cannot create DataFrame from zero-dimensional array",
        ),
    )
    for bad_array, message in rejected_inputs:
        with pytest.raises(ValueError, match=message):
            _ = pl.from_numpy(bad_array)
43
44
45
def test_from_numpy_array_value() -> None:
    # A nested Python list used as a cell value is kept as a list cell
    # with an Int64 inner dtype.
    expected_rows = [([2, 3],)]
    expected_schema = {"A": pl.List(pl.Int64)}

    frame = pl.DataFrame({"A": [[2, 3]]})

    assert frame.rows() == expected_rows
    assert frame.schema == expected_schema
49
50
51
def test_construct_from_ndarray_value() -> None:
    # An object-dtype 2D array whose first cell is itself an ndarray should
    # load through the DataFrame constructor as two Object columns and come
    # back unchanged from `to_numpy`.
    inner = np.array([2, 3])
    source = np.array([[inner, 4]], dtype=object)

    frame = pl.DataFrame(source)
    assert frame.dtypes == [pl.Object, pl.Object]

    round_tripped = frame.to_numpy()
    assert round_tripped.shape == (1, 2)

    first_cell, second_cell = round_tripped[0]
    assert_array_equal(first_cell, inner)
    assert second_cell == 4
59
60
61
def test_from_numpy_nparray_value() -> None:
    # Same scenario as constructing via `pl.DataFrame`, but going through
    # `pl.from_numpy`: an ndarray stored as a cell of an object-dtype array
    # yields Object columns and survives the trip back to numpy.
    nested = np.array([2, 3])
    object_matrix = np.array([[nested, 4]], dtype=object)

    loaded = pl.from_numpy(object_matrix)
    assert loaded.dtypes == [pl.Object, pl.Object]

    exported = loaded.to_numpy()
    assert exported.shape == (1, 2)

    only_row = exported[0]
    assert_array_equal(only_row[0], nested)
    assert only_row[1] == 4
69
70
71
def test_from_numpy_structured() -> None:
    phones = [
        ("Google Pixel 7", 521.90, True),
        ("Apple iPhone 14 Pro", 999.00, True),
        ("Samsung Galaxy S23 Ultra", 1199.99, False),
        ("OnePlus 11", 699.00, True),
    ]

    # The same data as a numpy structured array and as a record-array view.
    record_dtype = np.dtype(
        [
            ("product", "U32"),
            ("price_usd", "float64"),
            ("in_stock", "bool"),
        ]
    )
    structured = np.array(phones, dtype=record_dtype)
    records = structured.view(np.recarray)

    # Expectations shared by both array flavours.
    native_schema = {
        "product": pl.String,
        "price_usd": pl.Float64,
        "in_stock": pl.Boolean,
    }
    renamed_schema = {
        "phone": pl.String,
        "price_usd": pl.Float32,
        "available": pl.Boolean,
    }
    rows_by_price_desc = sorted(phones, key=lambda row: -row[1])

    for source in (structured, records):
        # Field names and native dtypes carry over without any schema given.
        unsorted_frame = pl.DataFrame(data=source)
        by_price = unsorted_frame.sort(by="price_usd", descending=True)
        assert by_price.schema == native_schema
        assert by_price.rows() == rows_by_price_desc

        # Renaming columns and narrowing the price dtype can be expressed
        # either inline in the schema or through schema_overrides.
        inline_dtype = pl.DataFrame(
            data=source, schema=["phone", ("price_usd", pl.Float32), "available"]
        )
        via_overrides = pl.DataFrame(
            data=source,
            schema=["phone", "price_usd", "available"],
            schema_overrides={"price_usd": pl.Float32},
        )
        for renamed in (inline_dtype, via_overrides):
            assert renamed.schema == renamed_schema
119
120
121
def test_from_numpy2() -> None:
    # polars only accepts the numpy time units it supports itself;
    # datetime64[s] is not among them, so building a Series must raise.
    epoch_seconds = range(100_000, 200_000, 10_000)
    seconds_array = np.asarray(epoch_seconds, dtype="datetime64[s]")

    expected_message = "Please cast to the closest supported unit"
    with pytest.raises(ValueError, match=expected_message):
        pl.Series(seconds_array)
127
128
129
@pytest.mark.parametrize(
    ("numpy_time_unit", "expected_values", "expected_dtype"),
    [
        ("ns", ["1970-01-02T01:12:34.123456789"], pl.Datetime("ns")),
        ("us", ["1970-01-02T01:12:34.123456"], pl.Datetime("us")),
        ("ms", ["1970-01-02T01:12:34.123"], pl.Datetime("ms")),
        ("D", ["1970-01-02"], pl.Date),
    ],
)
def test_from_numpy_supported_units(
    numpy_time_unit: str,
    expected_values: list[str],
    expected_dtype: PolarsTemporalType,
) -> None:
    # The input literal carries more fractional digits than any unit keeps;
    # numpy stores it at the requested unit's resolution, and the resulting
    # frame must match the expected string parsed into the polars dtype.
    np_dtype = f"datetime64[{numpy_time_unit}]"
    timestamps = np.array(
        ["1970-01-02T01:12:34.123456789123456789"],
        dtype=np_dtype,
    )
    actual = pl.from_numpy(timestamps)

    parsed = pl.Series("column_0", expected_values).str.strptime(expected_dtype)
    assert_frame_equal(actual, parsed.to_frame())
152
153
154
@pytest.mark.parametrize(
    ("np_dtype", "dtype"),
    [
        (np.float64, pl.Float64),
        (np.int32, pl.Int32),
    ],
)
def test_from_numpy_empty(np_dtype: npt.DTypeLike, dtype: PolarsDataType) -> None:
    # An empty 1D array still yields a one-column frame whose dtype mirrors
    # the numpy dtype of the input.
    empty = np.array([], dtype=np_dtype)
    actual = pl.from_numpy(empty, schema=["a"])

    expected_column = pl.Series("a", [], dtype=dtype)
    assert_frame_equal(actual, expected_column.to_frame())
166
167