Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/py-polars/tests/unit/interchange/test_column.py
6939 views
1
from __future__ import annotations
2
3
from datetime import datetime
4
from typing import TYPE_CHECKING
5
6
import pytest
7
8
import polars as pl
9
from polars.interchange.column import PolarsColumn
10
from polars.interchange.protocol import ColumnNullType, CopyNotAllowedError, DtypeKind
11
from polars.testing import assert_series_equal
12
13
if TYPE_CHECKING:
14
from polars.interchange.protocol import Dtype
15
16
17
def test_size() -> None:
    """A three-element Series wrapped in PolarsColumn reports a size of 3."""
    column = PolarsColumn(pl.Series([1, 2, 3]))
    n_elements = column.size()
    assert n_elements == 3
21
22
23
def test_offset() -> None:
    """A column built directly from a fresh Series has an offset of zero."""
    column = PolarsColumn(pl.Series([1, 2, 3]))
    assert column.offset == 0
27
28
29
def test_dtype_int() -> None:
    """An Int32 Series is described by the expected interchange dtype tuple."""
    expected_dtype = (DtypeKind.INT, 32, "i", "=")
    column = PolarsColumn(pl.Series([1, 2, 3], dtype=pl.Int32))
    assert column.dtype == expected_dtype
33
34
35
def test_dtype_categorical() -> None:
    """A Categorical Series is described by the expected interchange dtype tuple."""
    expected_dtype = (DtypeKind.CATEGORICAL, 32, "I", "=")
    column = PolarsColumn(pl.Series(["a", "b", "a"], dtype=pl.Categorical))
    assert column.dtype == expected_dtype
39
40
41
def test_describe_categorical() -> None:
    """Check the categorical description of a Categorical column holding a null."""
    values = ["b", "a", "a", "c", None, "b"]
    column = PolarsColumn(pl.Series(values, dtype=pl.Categorical))

    description = column.describe_categorical

    assert description["is_ordered"] is False
    assert description["is_dictionary"] is True
    # Superset check only: the reported categories must include every value
    # used in this column, but are allowed to contain more.
    reported = set(description["categories"]._col)
    assert reported >= {"b", "a", "c"}
50
51
52
def test_describe_categorical_enum() -> None:
    """Check the categorical description of an Enum column holding a null."""
    enum_dtype = pl.Enum(["a", "b", "c"])
    column = PolarsColumn(pl.Series(["b", "a", "a", "c", None, "b"], dtype=enum_dtype))

    description = column.describe_categorical

    assert description["is_ordered"] is True
    assert description["is_dictionary"] is True

    # The categories column must match this Series exactly.
    assert_series_equal(
        description["categories"]._col,
        pl.Series("category", ["a", "b", "c"]),
    )
63
64
65
def test_describe_categorical_other_dtype() -> None:
    """Accessing describe_categorical on a String column raises TypeError."""
    column = PolarsColumn(pl.Series(["a", "b", "a"], dtype=pl.String))
    with pytest.raises(TypeError):
        # Attribute access alone is what must raise.
        _ = column.describe_categorical
70
71
72
def test_describe_null() -> None:
    """A column containing a null reports (USE_BITMASK, 0) as its null description."""
    column = PolarsColumn(pl.Series([1, 2, None]))
    null_description = column.describe_null
    assert null_description == (ColumnNullType.USE_BITMASK, 0)
76
77
78
def test_describe_null_no_null_values() -> None:
    """A column without nulls reports (NON_NULLABLE, None) as its null description."""
    column = PolarsColumn(pl.Series([1, 2, 3]))
    null_description = column.describe_null
    assert null_description == (ColumnNullType.NON_NULLABLE, None)
82
83
84
def test_null_count() -> None:
    """null_count equals the number of None entries in the wrapped Series."""
    column = PolarsColumn(pl.Series([None, 2, None]))
    assert column.null_count == 2
88
89
90
def test_metadata() -> None:
    """The metadata of a column built from a plain Series is an empty dict."""
    column = PolarsColumn(pl.Series([1, 2]))
    assert column.metadata == {}
94
95
96
def test_num_chunks() -> None:
    """num_chunks() is 1 for a plain Series and 2 after concat without rechunk."""
    base = pl.Series([1, 2])
    assert PolarsColumn(base).num_chunks() == 1

    # The two-chunk case goes through the DataFrame interchange entry point.
    doubled = pl.concat([base, base], rechunk=False)
    interchange_frame = doubled.to_frame().__dataframe__()
    chunked_column = interchange_frame.get_column(0)
    assert chunked_column.num_chunks() == 2
104
105
106
@pytest.mark.parametrize("n_chunks", [None, 2])
def test_get_chunks(n_chunks: int | None) -> None:
    """
    get_chunks() yields the two underlying chunks of a non-rechunked column.

    Exercised both with the default (``None``) and with an explicit ``n_chunks``
    of 2; in either case the output must be exactly the two source Series.
    """
    s1 = pl.Series([1, 2, 3])
    s2 = pl.Series([4, 5])
    s = pl.concat([s1, s2], rechunk=False)
    col = PolarsColumn(s)

    # Materialize the chunks so their count can be verified: a bare zip() stops
    # at the shorter input and would let a missing chunk pass unnoticed.
    out = list(col.get_chunks(n_chunks))

    expected = [s1, s2]
    assert len(out) == len(expected)
    for o, e in zip(out, expected):
        assert_series_equal(o._col, e)
118
119
120
def test_get_chunks_invalid_input() -> None:
    """Asking a two-chunk column for 0 or 3 chunks raises ValueError on iteration."""
    first = pl.Series([1, 2, 3])
    second = pl.Series([4, 5])
    column = PolarsColumn(pl.concat([first, second], rechunk=False))

    for bad_n_chunks in (0, 3):
        with pytest.raises(ValueError):
            # next() drives the returned iterator far enough to hit the error.
            next(column.get_chunks(bad_n_chunks))
131
132
133
def test_get_chunks_subdivided_chunks() -> None:
    """Requesting 4 chunks from a two-chunk column splits each chunk in two."""
    first = pl.Series([1, 2, 3])
    second = pl.Series([4, 5])
    column = PolarsColumn(pl.concat([first, second], rechunk=False))

    chunk_iter = column.get_chunks(4)

    # Expected contents of the four sub-chunks, in the order they are yielded.
    for expected_values in ([1, 2], [3], [4], [5]):
        sub_chunk = next(chunk_iter)
        assert_series_equal(sub_chunk._col, pl.Series(expected_values))

    # The iterator must be exhausted after the fourth sub-chunk.
    with pytest.raises(StopIteration):
        next(chunk_iter)
159
160
161
@pytest.mark.parametrize(
    ("series", "expected_data", "expected_dtype"),
    [
        (
            pl.Series([1, None, 3], dtype=pl.Int16),
            pl.Series([1, 0, 3], dtype=pl.Int16),
            (DtypeKind.INT, 16, "s", "="),
        ),
        (
            pl.Series([-1.5, 3.0, None], dtype=pl.Float64),
            pl.Series([-1.5, 3.0, 0.0], dtype=pl.Float64),
            (DtypeKind.FLOAT, 64, "g", "="),
        ),
        (
            pl.Series(["a", "bc", None, "éâç"], dtype=pl.String),
            pl.Series([97, 98, 99, 195, 169, 195, 162, 195, 167], dtype=pl.UInt8),
            (DtypeKind.UINT, 8, "C", "="),
        ),
        (
            pl.Series(
                [datetime(1988, 1, 2), None, datetime(2022, 12, 3)], dtype=pl.Datetime
            ),
            pl.Series([568080000000000, 0, 1670025600000000], dtype=pl.Int64),
            (DtypeKind.INT, 64, "l", "="),
        ),
        # TODO: cat-rework: re-enable this with a unique named categorical.
        # (
        #     pl.Series(["a", "b", None, "a"], dtype=pl.Categorical),
        #     pl.Series([0, 1, 0, 0], dtype=pl.UInt32),
        #     (DtypeKind.UINT, 32, "I", "="),
        # ),
    ],
)
def test_get_buffers_data(
    series: pl.Series,
    expected_data: pl.Series,
    expected_dtype: Dtype,
) -> None:
    """
    The "data" entry of get_buffers() matches the expected buffer and dtype.

    Each case pairs an input Series (including a null) with the Series expected
    in the data buffer and the dtype tuple expected alongside it.
    """
    buffers = PolarsColumn(series).get_buffers()

    actual_buffer, actual_dtype = buffers["data"]

    assert_series_equal(actual_buffer._data, expected_data)
    assert actual_dtype == expected_dtype
206
207
208
def test_get_buffers_int() -> None:
    """A null-free Int8 column exposes only a data buffer equal to the Series."""
    int8_series = pl.Series([1, 2, 3], dtype=pl.Int8)
    buffers = PolarsColumn(int8_series).get_buffers()

    buffer, buffer_dtype = buffers["data"]
    assert_series_equal(buffer._data, int8_series)
    assert buffer_dtype == (DtypeKind.INT, 8, "c", "=")

    # No validity and no offsets buffers are reported for this column.
    assert buffers["validity"] is None
    assert buffers["offsets"] is None
220
221
222
def test_get_buffers_with_validity_and_offsets() -> None:
    """A String column with a null exposes data, validity and offsets buffers."""
    column = PolarsColumn(pl.Series(["a", "bc", None, "éâç"]))

    buffers = column.get_buffers()

    # Data: the UTF-8 bytes of the non-null strings, back to back.
    data_buf, data_dt = buffers["data"]
    expected_bytes = pl.Series(
        [97, 98, 99, 195, 169, 195, 162, 195, 167], dtype=pl.UInt8
    )
    assert_series_equal(data_buf._data, expected_bytes)
    assert data_dt == (DtypeKind.UINT, 8, "C", "=")

    # Validity: one boolean per element, False at the null position.
    validity_entry = buffers["validity"]
    assert validity_entry is not None
    validity_buf, validity_dt = validity_entry
    expected_validity = pl.Series([True, True, False, True])
    assert_series_equal(validity_buf._data, expected_validity)
    assert validity_dt == (DtypeKind.BOOL, 1, "b", "=")

    # Offsets: byte boundaries into the data buffer; the null spans zero bytes.
    offsets_entry = buffers["offsets"]
    assert offsets_entry is not None
    offsets_buf, offsets_dt = offsets_entry
    expected_offsets = pl.Series([0, 1, 3, 3, 9], dtype=pl.Int64)
    assert_series_equal(offsets_buf._data, expected_offsets)
    assert offsets_dt == (DtypeKind.INT, 64, "l", "=")
246
247
248
def test_get_buffers_chunked_bitmask() -> None:
    """Each chunk of a chunked Boolean column exposes its own single value."""
    flags = pl.Series([True, False], dtype=pl.Boolean)
    # Concatenate two one-element slices without rechunking to keep two chunks.
    chunked = pl.concat([flags[:1], flags[1:]], rechunk=False)
    column = PolarsColumn(chunked)

    pieces = list(column.get_chunks())

    first_buffer = pieces[0].get_buffers()["data"][0]
    assert first_buffer._data.item() is True
    second_buffer = pieces[1].get_buffers()["data"][0]
    assert second_buffer._data.item() is False
256
257
258
def test_get_buffers_string_zero_copy_fails() -> None:
    """get_buffers() on a String column raises when copying is not allowed."""
    strings = pl.Series("a", ["a", "bc"], dtype=pl.String)
    column = PolarsColumn(strings, allow_copy=False)

    with pytest.raises(
        CopyNotAllowedError, match="string buffers must be converted"
    ):
        column.get_buffers()
266
267
268
@pytest.mark.parametrize("allow_copy", [False, True])
def test_get_buffers_categorical(allow_copy: bool) -> None:
    """A Categorical column's data buffer holds two distinct UInt32 values."""
    categorical = pl.Series("a", ["c", "b"], dtype=pl.Categorical)
    buffers = PolarsColumn(categorical, allow_copy=allow_copy).get_buffers()

    codes_buffer, _ = buffers["data"]
    codes = codes_buffer._data

    assert len(codes) == 2
    # Only distinctness is checked, not the concrete values.
    assert codes[0] != codes[1]
    assert codes.dtype == pl.UInt32
278
279
280
def test_get_buffers_chunked_zero_copy_fails() -> None:
    """get_buffers() on a multi-chunk column raises when copying is not allowed."""
    part = pl.Series([1, 2, 3])
    chunked = pl.concat([part, part], rechunk=False)
    column = PolarsColumn(chunked, allow_copy=False)

    expected_message = "non-contiguous buffer must be made contiguous"
    with pytest.raises(CopyNotAllowedError, match=expected_message):
        column.get_buffers()
289
290
291
def test_wrap_data_buffer() -> None:
    """_wrap_data_buffer() returns the given Series wrapped with its Int64 dtype."""
    payload = pl.Series([1, 2, 3])
    # The helper is given its input explicitly, so an empty column is enough.
    column = PolarsColumn(pl.Series())

    wrapped, wrapped_dtype = column._wrap_data_buffer(payload)

    assert_series_equal(wrapped._data, payload)
    assert wrapped_dtype == (DtypeKind.INT, 64, "l", "=")
299
300
301
def test_wrap_validity_buffer() -> None:
    """_wrap_validity_buffer() wraps a boolean Series with the BOOL dtype tuple."""
    mask = pl.Series([True, False, True])
    column = PolarsColumn(pl.Series())

    wrapped = column._wrap_validity_buffer(mask)
    assert wrapped is not None

    mask_buffer, mask_dtype = wrapped
    assert_series_equal(mask_buffer._data, mask)
    assert mask_dtype == (DtypeKind.BOOL, 1, "b", "=")
312
313
314
def test_wrap_validity_buffer_no_nulls() -> None:
    """_wrap_validity_buffer() passes a missing (None) buffer through as None."""
    column = PolarsColumn(pl.Series())
    wrapped = column._wrap_validity_buffer(None)
    assert wrapped is None
317
318
319
def test_wrap_offsets_buffer() -> None:
    """_wrap_offsets_buffer() wraps an Int64 Series with the INT 64 dtype tuple."""
    raw_offsets = pl.Series([0, 1, 3, 3, 9], dtype=pl.Int64)
    column = PolarsColumn(pl.Series())

    wrapped = column._wrap_offsets_buffer(raw_offsets)
    assert wrapped is not None

    offsets_buffer, offsets_dtype = wrapped
    assert_series_equal(offsets_buffer._data, raw_offsets)
    assert offsets_dtype == (DtypeKind.INT, 64, "l", "=")
330
331
332
def test_wrap_offsets_buffer_none() -> None:
    """_wrap_offsets_buffer() passes a missing (None) buffer through as None."""
    col = PolarsColumn(pl.Series())
    # Exercise the offsets helper itself; calling the validity helper here would
    # merely duplicate test_wrap_validity_buffer_no_nulls.
    assert col._wrap_offsets_buffer(None) is None
335
336
337
def test_column_unsupported_type() -> None:
    """A List column can be wrapped, but asking for its dtype raises ValueError."""
    nested = pl.Series("a", [[4], [5, 6]])
    column = PolarsColumn(nested)

    # These accessors still work for this column.
    assert column.num_chunks() == 1
    assert column.null_count == 0

    # Accessing the dtype is where the unsupported type is rejected.
    with pytest.raises(ValueError, match="not supported"):
        _ = column.dtype
348
349