Book a Demo!
CoCalc Logo Icon
Store | Features | Docs | Share | Support | News | About | Policies | Sign Up | Sign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/py-polars/tests/unit/ml/test_torch.py
6940 views
1
from __future__ import annotations
2
3
import sys
4
from typing import Any
5
6
import pytest
7
8
import polars as pl
9
import polars.selectors as cs
10
from polars.dependencies import _lazy_import
11
from polars.testing import assert_frame_equal, assert_series_equal
12
13
# Resolve torch lazily, so that it is not imported before a test actually
# needs it. The `ci_only` mark below already keeps these tests from running
# locally; the lazy import just avoids a premature local import.
torch, _ = _lazy_import("torch")

# Module-level marks (pytest applies `pytestmark` to every test in the module).
pytestmark = [
    pytest.mark.ci_only,
    pytest.mark.skipif(
        sys.version_info >= (3, 13) and sys.platform == "win32",
        reason="Torch does not ship wheels for Python 3.13 on Windows",
    ),
]
24
25
26
@pytest.fixture
def df() -> pl.DataFrame:
    """Four-row frame with an Int8, a boolean and a Float32 column."""
    columns = {
        "x": [1, 2, 2, 3],
        "y": [True, False, True, False],
        "z": [1.5, -0.5, 0.0, -2.0],
    }
    dtype_overrides = {"x": pl.Int8, "z": pl.Float32}
    return pl.DataFrame(columns, schema_overrides=dtype_overrides)
36
37
38
def assert_tensor_equal(actual: Any, expected: Any) -> None:
    """Check `actual` against `expected` using `torch.testing.assert_close`."""
    torch.testing.assert_close(actual=actual, expected=expected)
40
41
42
def test_to_torch_from_series() -> None:
    """Export a Series to a 1D tensor, covering signed and unsigned ints."""
    values = [1, 2, 3, 4]
    s = pl.Series("x", values, dtype=pl.Int8)
    exported = s.to_torch()

    assert list(exported.shape) == [4]
    assert_tensor_equal(exported, torch.tensor(values, dtype=torch.int8))

    # note: torch doesn't natively support uint16/32/64, so each unsigned
    # dtype is expected to come back as a suitable signed integer type
    s = s.cast(pl.UInt16)
    exported = s.to_torch()
    assert_tensor_equal(exported, torch.tensor(values, dtype=torch.int32))

    for unsigned_dtype in (pl.UInt32, pl.UInt64):
        exported = s.cast(unsigned_dtype).to_torch()
        assert_tensor_equal(exported, torch.tensor(values, dtype=torch.int64))
58
59
60
def test_to_torch_tensor(df: pl.DataFrame) -> None:
    """The default export and the explicit "tensor" export should agree."""
    default_export = df.to_torch()
    explicit_export = df.to_torch("tensor")

    assert list(default_export.shape) == [4, 3]

    all_elements_match = (default_export == explicit_export).all().item()
    assert all_elements_match is True
66
67
68
def test_to_torch_dict(df: pl.DataFrame) -> None:
    """The "dict" export yields one tensor per column, in column order."""
    td = df.to_torch("dict")

    assert list(td.keys()) == ["x", "y", "z"]

    expected_by_column = {
        "x": torch.tensor([1, 2, 2, 3], dtype=torch.int8),
        "y": torch.tensor([True, False, True, False], dtype=torch.bool),
        "z": torch.tensor([1.5, -0.5, 0.0, -2.0], dtype=torch.float32),
    }
    for column, expected in expected_by_column.items():
        assert_tensor_equal(td[column], expected)
80
81
82
def test_to_torch_feature_label_dict(df: pl.DataFrame) -> None:
    """With a label, the "dict" export has "label" and "features" entries."""
    # note: the `df` fixture is requested but the frame under test is built here
    raw = pl.DataFrame(
        {
            "age": [25, 32, 45, 22, 34],
            "income": [50000, 75000, 60000, 58000, 120000],
            "education": ["bachelor", "master", "phd", "bachelor", "phd"],
            "purchased": [False, True, True, False, True],
        },
        schema_overrides={"age": pl.Int32, "income": pl.Int32},
    )
    encoded = raw.to_dummies("education", separator=":")

    tensors = encoded.to_torch(return_type="dict", label="purchased")
    assert list(tensors.keys()) == ["label", "features"]

    expected_label = torch.tensor(
        [[False], [True], [True], [False], [True]],
        dtype=torch.bool,
    )
    assert_tensor_equal(tensors["label"], expected_label)

    # columns: age, income, then the three dummy columns from "education"
    expected_features = torch.tensor(
        [
            [25, 50000, 1, 0, 0],
            [32, 75000, 0, 1, 0],
            [45, 60000, 0, 0, 1],
            [22, 58000, 1, 0, 0],
            [34, 120000, 0, 0, 1],
        ],
        dtype=torch.int32,
    )
    assert_tensor_equal(tensors["features"], expected_features)
113
114
115
def test_2D_array_cols_to_torch() -> None:
    """Array-typed columns export to tensors; a List-typed column raises."""
    flat_rows = [[1, 1], [1, 2], [2, 2]]
    nested_rows = [[[1, 1], [1, 2]], [[2, 2], [2, 3]]]

    # 2D array
    df1 = pl.DataFrame(
        {"data": flat_rows},
        schema_overrides={"data": pl.Array(pl.Int32, shape=(2,))},
    )
    arr1 = df1.to_torch()
    assert_tensor_equal(arr1, torch.tensor(flat_rows, dtype=torch.int32))

    # nested 2D array
    nested_dtype = pl.Array(pl.Array(pl.Int32, shape=(2,)), shape=(2,))
    df2 = pl.DataFrame(
        {"data": nested_rows},
        schema_overrides={"data": nested_dtype},
    )
    arr2 = df2.to_torch()
    assert_tensor_equal(arr2, torch.tensor(nested_rows, dtype=torch.int32))

    # dict with 2D array
    df3 = df2.insert_column(0, pl.Series("lbl", [0, 1], dtype=pl.Int32))
    tensors = df3.to_torch("dict")
    assert_tensor_equal(tensors["lbl"], torch.tensor([0, 1], dtype=torch.int32))
    assert_tensor_equal(
        tensors["data"],
        torch.tensor(nested_rows, dtype=torch.int32),
    )

    # no support for list (yet? could add if ragged arrays are valid)
    list_error = (
        r"cannot convert List column 'data' to Tensor \(use Array dtype instead\)"
    )
    with pytest.raises(TypeError, match=list_error):
        pl.DataFrame({"data": flat_rows}).to_torch()
156
157
158
def test_to_torch_dataset(df: pl.DataFrame) -> None:
    """The "dataset" export is a torch Dataset that yields 1-tuples of features."""
    ds = df.to_torch("dataset", dtype=pl.Float64)

    assert len(ds) == 4
    assert isinstance(ds, torch.utils.data.Dataset)

    repr_prefix = "<PolarsDataset [len:4, features:3, labels:0] at 0x"
    assert repr(ds).startswith(repr_prefix)

    # no label was given, so an item holds only the feature tensor
    first_item = ds[0]
    assert isinstance(first_item, tuple)
    assert len(first_item) == 1

    expected_row = torch.tensor([1.0, 1.0, 1.5], dtype=torch.float64)
    assert_tensor_equal(first_item[0], expected_row)
169
170
171
def test_to_torch_dataset_with_2D_arrays() -> None:
    """A labelled dataset over a nested Array column yields (features, label)."""
    labels = [0, 1]
    matrices = [[[1, 1], [1, 2]], [[2, 2], [2, 3]]]
    nested_dtype = pl.Array(pl.Array(pl.Int32, shape=(2,)), shape=(2,))

    frame = pl.DataFrame(
        {"lbl": labels, "data": matrices},
        schema_overrides={"data": nested_dtype},
    )
    ds = frame.to_torch("dataset", label="lbl")

    assert len(ds) == 2

    # labels sit at position 1 of each item ...
    for row, label in enumerate(labels):
        assert_tensor_equal(ds[row][1], torch.tensor(label, dtype=torch.int64))

    # ... and the feature matrix at position 0
    for row, matrix in enumerate(matrices):
        assert_tensor_equal(ds[row][0], torch.tensor(matrix, dtype=torch.int32))
183
184
185
def test_to_torch_dataset_feature_reorder(df: pl.DataFrame) -> None:
    """Features follow the order given in `features` ("z" before "y")."""
    ds = df.to_torch("dataset", label="x", features=["z", "y"])

    expected_features = torch.tensor(
        [
            [1.5, 1.0],
            [-0.5, 0.0],
            [0.0, 1.0],
            [-2.0, 0.0],
        ]
    )
    expected_labels = torch.tensor([1, 2, 2, 3], dtype=torch.int8)

    assert_tensor_equal(expected_features, ds.features)
    assert_tensor_equal(expected_labels, ds.labels)
199
200
201
def test_to_torch_dataset_feature_subset(df: pl.DataFrame) -> None:
    """Only the columns named in `features` end up in the feature tensor."""
    ds = df.to_torch("dataset", label="x", features=["z"])

    expected_features = torch.tensor([[1.5], [-0.5], [0.0], [-2.0]])
    expected_labels = torch.tensor([1, 2, 2, 3], dtype=torch.int8)

    assert_tensor_equal(expected_features, ds.features)
    assert_tensor_equal(expected_labels, ds.labels)
208
209
210
def test_to_torch_dataset_index_slice(df: pl.DataFrame) -> None:
    """Slicing a dataset returns a tuple holding the selected feature rows."""
    ds = df.to_torch("dataset")

    # contiguous slice: rows 1 and 2
    middle_rows = ds[1:3]
    expected_middle = (torch.tensor([[2.0, 0.0, -0.5], [2.0, 1.0, 0.0]]),)
    assert_tensor_equal(expected_middle, middle_rows)

    # stepped slice: rows 0 and 2
    alternate_rows = ds[::2]
    expected_alternate = (torch.tensor([[1.0, 1.0, 1.5], [2.0, 1.0, 0.0]]),)
    assert_tensor_equal(expected_alternate, alternate_rows)
220
221
222
@pytest.mark.parametrize(
    "index",
    [
        [0, 3],
        range(0, 4, 3),
        slice(0, 4, 3),
    ],
)
def test_to_torch_dataset_index_multi(index: Any, df: pl.DataFrame) -> None:
    """A list, range or slice selecting rows 0 and 3 gives the same result."""
    ds = df.to_torch("dataset")
    selected = ds[index]

    first_and_last = torch.tensor([[1.0, 1.0, 1.5], [3.0, 0.0, -2.0]])
    assert_tensor_equal((first_and_last,), selected)

    expected_schema = {"features": torch.float32, "labels": None}
    assert ds.schema == expected_schema
237
238
239
def test_to_torch_dataset_index_range(df: pl.DataFrame) -> None:
    """Indexing with a descending range returns the rows in that order."""
    ds = df.to_torch("dataset")
    descending = range(3, 0, -1)  # rows 3, 2, 1
    selected = ds[descending]

    expected_rows = torch.tensor(
        [[3.0, 0.0, -2.0], [2.0, 1.0, 0.0], [2.0, 0.0, -0.5]],
    )
    assert_tensor_equal((expected_rows,), selected)
245
246
247
def test_to_dataset_half_precision(df: pl.DataFrame) -> None:
    """Check the schema and sliced values of each `half()` variant."""
    f16, f32, i8 = torch.float16, torch.float32, torch.int8
    labelled_features = [[1.0, 1.5], [1.0, 0.0]]  # rows 0 and 2 of "y", "z"

    ds = df.to_torch("dataset", label="x")
    assert ds.schema == {"features": f32, "labels": i8}

    # half precision across all data
    all_half = ds.half()
    assert all_half.schema == {"features": f16, "labels": f16}

    all_half_rows = all_half[:3:2]
    expected_all_half = (
        torch.tensor(labelled_features, dtype=f16),
        torch.tensor([1.0, 2.0], dtype=f16),
    )
    assert_tensor_equal(expected_all_half, all_half_rows)

    # only apply half precision to the feature data
    features_half = ds.half(labels=False)
    assert features_half.schema == {"features": f16, "labels": i8}

    features_half_rows = features_half[:3:2]
    expected_features_half = (
        torch.tensor(labelled_features, dtype=f16),
        torch.tensor([1, 2], dtype=i8),
    )
    assert_tensor_equal(expected_features_half, features_half_rows)

    # only apply half precision to the label data
    labels_half = ds.half(features=False)
    assert labels_half.schema == {"features": f32, "labels": f16}

    labels_half_rows = labels_half[:3:2]
    expected_labels_half = (
        torch.tensor(labelled_features, dtype=f32),
        torch.tensor([1.0, 2.0], dtype=f16),
    )
    assert_tensor_equal(expected_labels_half, labels_half_rows)

    # no labels
    unlabelled_half = df.to_torch("dataset").half()
    assert unlabelled_half.schema == {"features": f16, "labels": None}

    unlabelled_rows = unlabelled_half[:3:2]
    expected_unlabelled = (
        torch.tensor(
            data=[[1.0, 1.0, 1.5], [2.0, 1.0, 0.0]],
            dtype=f16,
        ),
    )
    assert_tensor_equal(expected_unlabelled, unlabelled_rows)
296
297
298
@pytest.mark.parametrize(
    ("label", "features"),
    [
        ("x", None),
        ("x", ["y", "z"]),
        (cs.integer(), ~cs.integer()),
    ],
)
def test_to_torch_labelled_dataset(label: Any, features: Any, df: pl.DataFrame) -> None:
    """Each label/features spelling gives the same first DataLoader batch."""
    ds = df.to_torch("dataset", label=label, features=features)
    loader = torch.utils.data.DataLoader(ds, batch_size=2, shuffle=False)
    first_batch = next(iter(loader))

    expected_batch = [
        torch.tensor([[1.0, 1.5], [0.0, -0.5]]),
        torch.tensor([1, 2], dtype=torch.int8),
    ]
    assert len(first_batch) == len(expected_batch)
    for got, want in zip(first_batch, expected_batch):
        assert_tensor_equal(want, got)
317
318
319
def test_to_torch_labelled_dataset_expr(df: pl.DataFrame) -> None:
    """An expression can be used as the label; it keeps its own dtype."""
    label_expr = (pl.col("x") * 8).cast(pl.Int16)
    ds = df.to_torch("dataset", dtype=pl.Float64, label=label_expr)
    dl = torch.utils.data.DataLoader(ds, batch_size=2, shuffle=False)

    # the first two rows, fetched by slicing and via the loader's first batch
    by_slice = tuple(ds[:2])
    by_loader = tuple(next(iter(dl)))

    for data in (by_slice, by_loader):
        expected = (
            torch.tensor([[1.0, 1.5], [0.0, -0.5]], dtype=torch.float64),
            torch.tensor([8, 16], dtype=torch.int16),
        )
        assert len(data) == len(expected)
        for got, want in zip(data, expected):
            assert_tensor_equal(want, got)
334
335
336
def test_to_torch_labelled_dataset_multi(df: pl.DataFrame) -> None:
    """Two label columns give a 2D label tensor in every batch."""
    ds = df.to_torch("dataset", label=["x", "y"])
    loader = torch.utils.data.DataLoader(ds, batch_size=3, shuffle=False)
    batches = list(loader)

    # four rows with batch_size=3: a batch of three, then a batch of one
    full_batch = [
        torch.tensor([[1.5], [-0.5], [0.0]]),
        torch.tensor([[1, 1], [2, 0], [2, 1]], dtype=torch.int8),
    ]
    remainder_batch = [
        torch.tensor([[-2.0]]),
        torch.tensor([[3, 0]], dtype=torch.int8),
    ]
    expected_batches = [full_batch, remainder_batch]
    assert len(batches) == len(expected_batches)

    for got_batch, want_batch in zip(batches, expected_batches):
        assert len(got_batch) == len(want_batch)
        for got, want in zip(got_batch, want_batch):
            assert_tensor_equal(want, got)
357
358
359
def test_misc_errors(df: pl.DataFrame) -> None:
    """Invalid arguments and indices raise with the expected messages."""
    ds = df.to_torch("dataset")

    # unrecognised return type
    with pytest.raises(ValueError, match="invalid `return_type`: 'stroopwafel'"):
        df.to_torch("stroopwafel")  # type: ignore[call-overload]

    # unsigned target dtype
    with pytest.raises(ValueError, match="does not support u16, u32, or u64 dtypes"):
        df.to_torch(dtype=pl.UInt16)

    # indexing the dataset with a complex-valued tensor
    with pytest.raises(IndexError, match="tensors used as indices must be long, int"):
        ds[torch.tensor([0, 3], dtype=torch.complex64)]

    # `label` given while using the default return type
    label_features_msg = (
        "`label` and `features` only apply when `return_type` is 'dataset' or 'dict'"
    )
    with pytest.raises(ValueError, match=label_features_msg):
        df.to_torch(label="stroopwafel")

    # `features` given without `label` for the "dict" return type
    label_required_msg = (
        "`label` is required if setting `features` when `return_type='dict'"
    )
    with pytest.raises(ValueError, match=label_required_msg):
        df.to_torch("dict", features=cs.float())
391
392
393
def test_misc_lit_compatibility() -> None:
    """`pl.lit` accepts a torch tensor and returns an expression."""
    tensor = torch.tensor([[3, 0]], dtype=torch.int8)
    literal = pl.lit(tensor)
    assert isinstance(literal, pl.Expr)
396
397
398
def test_from_torch() -> None:
    """Build a DataFrame and a Series from a 2D float tensor."""
    rows = [[1234.5, 200.0, -3000.5], [8000.0, 500.5, 6000.0]]
    t = torch.tensor(rows)

    # dataframe
    expected_frame = pl.DataFrame(
        data={"colx": rows[0], "coly": rows[1]},
        schema={"colx": pl.Float32, "coly": pl.Float64},
    )
    schema_params = {
        "schema": ["colx", "coly"],
        "schema_overrides": {"coly": pl.Float64},
    }
    via_constructor = pl.DataFrame(t, **schema_params)  # type: ignore[arg-type]
    via_from_torch = pl.from_torch(t, **schema_params)  # type: ignore[arg-type]
    assert_frame_equal(expected_frame, via_constructor)
    assert_frame_equal(expected_frame, via_from_torch)

    # series
    expected_series = pl.Series(
        name="tensor",
        dtype=pl.Array(pl.Float32, shape=(3,)),
        values=rows,
    )
    actual_series = pl.Series(name="tensor", values=t)
    assert_series_equal(expected_series, actual_series)
424
425