Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/py-polars/tests/unit/test_row_encoding.py
6939 views
1
from __future__ import annotations
2
3
from decimal import Decimal as D
4
from typing import TYPE_CHECKING
5
6
import pytest
7
from hypothesis import given
8
9
import polars as pl
10
import polars.selectors as cs
11
from polars.testing import assert_frame_equal, assert_series_equal
12
from polars.testing.parametric import dataframes, series
13
from polars.testing.parametric.strategies.dtype import dtypes
14
from tests.unit.conftest import FLOAT_DTYPES, INTEGER_DTYPES
15
16
if TYPE_CHECKING:
17
from typing import Any
18
19
from polars._typing import PolarsDataType
20
21
# Field configurations as (descending, nulls_last, unordered) tuples: every
# combination of the two ordering flags with `unordered=False`, followed by a
# single unordered configuration in which both ordering flags are left unset.
FIELD_COMBS = [
    *(
        (descending, nulls_last, False)
        for descending in (False, True)
        for nulls_last in (False, True)
    ),
    (None, None, True),
]

# The same configurations, in the same order, as keyword-argument dictionaries
# that can be splatted into the round-trip helpers.
FIELD_COMBS_ARGS = [
    {"unordered": comb[2], "descending": comb[0], "nulls_last": comb[1]}
    for comb in FIELD_COMBS
]
35
36
37
def roundtrip_re(
    df: pl.DataFrame,
    *,
    unordered: bool = False,
    descending: list[bool] | None = None,
    nulls_last: list[bool] | None = None,
) -> None:
    """
    Row-encode `df` and, for ordered encodings, check that decoding restores it.

    The frame is always encoded with the given field options. When `unordered`
    is true only the encoding step is exercised and nothing is asserted;
    otherwise the encoded series is decoded with the frame's own column names
    and dtypes, unnested, and compared to `df` with `assert_frame_equal`.
    """
    encoded = df._row_encode(
        unordered=unordered,
        descending=descending,
        nulls_last=nulls_last,
    )

    if not unordered:
        decoded = encoded._row_decode(
            df.columns,
            df.dtypes,
            unordered=unordered,
            descending=descending,
            nulls_last=nulls_last,
        )
        assert_frame_equal(df, decoded.struct.unnest())
60
61
62
def roundtrip_series_re(
    values: pl.series.series.ArrayLike,
    dtype: PolarsDataType,
    *,
    unordered: bool = False,
    descending: bool | None = None,
    nulls_last: bool | None = False,
) -> None:
    """
    Round-trip a single column built from `values` and `dtype`.

    The values are wrapped in a one-column frame named "series" and passed to
    `roundtrip_re`. The scalar `descending` / `nulls_last` flags are turned
    into one-element lists, with `None` forwarded unchanged.
    """
    frame = pl.Series("series", values, dtype).to_frame()

    roundtrip_re(
        frame,
        unordered=unordered,
        descending=[descending] if descending is not None else None,
        nulls_last=[nulls_last] if nulls_last is not None else None,
    )
79
80
81
@given(
    df=dataframes(
        excluded_dtypes=[
            pl.Categorical,
            pl.Decimal,  # Bug: see https://github.com/pola-rs/polars/issues/20308
        ]
    )
)
@pytest.mark.parametrize(("descending", "nulls_last", "unordered"), FIELD_COMBS)
def test_row_encoding_parametric(
    df: pl.DataFrame,
    unordered: bool,
    descending: bool | None,
    nulls_last: bool | None,
) -> None:
    """Round-trip generated frames, applying the same field flags to every column."""

    def per_column(flag: bool | None) -> list[bool] | None:
        # One entry per column of the frame; `None` stays `None`.
        if flag is None:
            return None
        return [flag] * df.width

    roundtrip_re(
        df,
        unordered=unordered,
        descending=per_column(descending),
        nulls_last=per_column(nulls_last),
    )
102
103
104
@pytest.mark.parametrize("field", FIELD_COMBS_ARGS)
def test_nulls(field: Any) -> None:
    """Round-trip `Null` series of several lengths, including the empty one."""
    for length in (0, 1, 2, 13, 42):
        roundtrip_series_re([None] * length, pl.Null, **field)
111
112
113
@pytest.mark.parametrize("field", FIELD_COMBS_ARGS)
def test_bool(field: Any) -> None:
    """Round-trip boolean series: empty, single values and both two-value orders."""
    cases = (
        [],
        [False],
        [True],
        [False, True],
        [True, False],
    )
    for values in cases:
        roundtrip_series_re(values, pl.Boolean, **field)
120
121
122
@pytest.mark.parametrize("dtype", INTEGER_DTYPES)
@pytest.mark.parametrize("field", FIELD_COMBS_ARGS)
def test_int(dtype: pl.DataType, field: Any) -> None:
    """
    Round-trip integer series for every integer dtype.

    Covers the empty series, zero, the dtype's minimum and maximum values, and
    a few short multi-value series.
    """
    # Named `lower` / `upper` rather than `min` / `max` so that the builtins
    # are not shadowed inside the test.
    lower = pl.select(x=dtype.min()).item()  # type: ignore[attr-defined]
    upper = pl.select(x=dtype.max()).item()  # type: ignore[attr-defined]

    roundtrip_series_re([], dtype, **field)
    roundtrip_series_re([0], dtype, **field)
    roundtrip_series_re([lower], dtype, **field)
    roundtrip_series_re([upper], dtype, **field)

    roundtrip_series_re([1, 2, 3], dtype, **field)
    roundtrip_series_re([0, 1, 2, 3], dtype, **field)
    roundtrip_series_re([lower, 0, upper], dtype, **field)
136
137
138
@pytest.mark.parametrize("dtype", FLOAT_DTYPES)
@pytest.mark.parametrize("field", FIELD_COMBS_ARGS)
def test_float(dtype: pl.DataType, field: Any) -> None:
    """
    Round-trip float series for every float dtype.

    Covers the empty series, zero, positive infinity, negative infinity, and a
    few short multi-value series.
    """
    pos_inf = float("inf")
    neg_inf = float("-inf")

    roundtrip_series_re([], dtype, **field)
    roundtrip_series_re([0.0], dtype, **field)
    roundtrip_series_re([pos_inf], dtype, **field)
    # Pass negative infinity as-is: negating it would yield +inf again and
    # leave the negative case untested.
    roundtrip_series_re([neg_inf], dtype, **field)

    roundtrip_series_re([1.0, 2.0, 3.0], dtype, **field)
    roundtrip_series_re([0.0, 1.0, 2.0, 3.0], dtype, **field)
    roundtrip_series_re([pos_inf, 0.0, neg_inf], dtype, **field)
152
153
154
@pytest.mark.parametrize("field", FIELD_COMBS_ARGS)
def test_str(field: Any) -> None:
    """Round-trip string series, including empty strings and mixed lengths."""
    cases = [
        [],
        [""],
        ["a", "b", "c"],
        ["", "a", "b", "c"],
        ["different", "length", "strings"],
        ["different", "", "length", "", "strings"],
    ]
    for values in cases:
        roundtrip_series_re(values, pl.String, **field)
173
174
175
@pytest.mark.parametrize("field", FIELD_COMBS_ARGS)
def test_struct(field: Any) -> None:
    """Round-trip struct series with zero, one and two fields, including nulls."""
    suites = [
        (
            pl.Struct({}),
            [
                [],
                [None],
                [{}],
                [{}, {}, {}],
                [{}, None, {}],
            ],
        ),
        (
            pl.Struct({"x": pl.Int32}),
            [
                [{"x": 1}],
                [None],
                [{"x": 1}] * 3,
                [{"x": 1}, {"x": None}, None],
            ],
        ),
        (
            pl.Struct({"x": pl.Int32, "y": pl.Int32}),
            [
                # Each row supplies only one of the two declared fields.
                [{"x": 1}, {"y": 2}],
                [None],
            ],
        ),
    ]
    for dtype, cases in suites:
        for values in cases:
            roundtrip_series_re(values, dtype, **field)
197
198
199
@pytest.mark.parametrize("field", FIELD_COMBS_ARGS)
def test_list(field: Any) -> None:
    """Round-trip `List(Int32)` and `List(String)` series with empty and null lists."""
    suites = [
        (
            pl.List(pl.Int32),
            [
                [],
                [[]],
                [[1], [2]],
                [[1, 2], [3]],
                [[1, 2], [], [3]],
                [None, [1, 2], None, [], [3]],
            ],
        ),
        (
            pl.List(pl.String),
            [
                [],
                [[]],
                [[""], [""]],
                [["abc"], ["xyzw"]],
                [["x", "yx"], ["abc"]],
                [["wow", "this is"], [], ["cool"]],
                [None, ["very", "very"], None, [], ["cool"]],
            ],
        ),
    ]
    for dtype, cases in suites:
        for values in cases:
            roundtrip_series_re(values, dtype, **field)
221
222
223
@pytest.mark.parametrize("field", FIELD_COMBS_ARGS)
def test_array(field: Any) -> None:
    """Round-trip fixed-size array series: width 0, and width 2 of ints and strings."""
    suites = [
        (
            pl.Array(pl.Int32, 0),
            [
                [],
                [[]],
                [None, [], None],
                [None],
            ],
        ),
        (
            pl.Array(pl.Int32, 2),
            [
                [],
                [[5, 6]],
                [[1, 2], [2, 3]],
                [[1, 2], [3, 7]],
                [[1, 2], [13, 11], [3, 7]],
                [None, [1, 2], None, [13, 11], [5, 7]],
            ],
        ),
        (
            pl.Array(pl.String, 2),
            [
                [],
                [["a", "b"]],
                [["", ""], ["", "a"]],
                [["abc", "def"], ["ghi", "xyzw"]],
                [["x", "yx"], ["abc", "xxx"]],
                [["wow", "this is"], ["soo", "so"], ["veryyy", "cool"]],
                [None, ["very", "very"], None, [None, None], ["verryy", "cool"]],
            ],
        ),
    ]
    for dtype, cases in suites:
        for values in cases:
            roundtrip_series_re(values, dtype, **field)
259
260
261
@pytest.mark.parametrize("field", FIELD_COMBS_ARGS)
@pytest.mark.parametrize("precision", range(1, 38))
def test_decimal(field: Any, precision: int) -> None:
    """Round-trip scale-0 decimals for each precision, up to the all-nines extremes."""
    dtype = pl.Decimal(precision=precision, scale=0)

    # Largest and smallest values written with `precision` digits.
    nines = "9" * precision
    largest = D(nines)
    smallest = D("-" + nines)

    cases = [
        [],
        [None],
        [D("1")],
        [D("-1")],
        [largest],
        [smallest],
        [None, D("-1"), None],
        [D("-1"), D("0"), D("1")],
    ]
    for values in cases:
        roundtrip_series_re(values, dtype, **field)
273
274
275
@pytest.mark.parametrize("field", FIELD_COMBS_ARGS)
def test_enum(field: Any) -> None:
    """Round-trip enum series for an empty category set and a three-category set."""
    suites = [
        (
            pl.Enum([]),
            [
                [],
                [None],
                [None, None],
            ],
        ),
        (
            pl.Enum(["a", "x", "b"]),
            [
                [],
                [None],
                ["a"],
                ["x"],
                ["b"],
                ["b", "x", "a"],
                [None, "b", None],
                [None, "a", None],
            ],
        ),
    ]
    for dtype, cases in suites:
        for values in cases:
            roundtrip_series_re(values, dtype, **field)
293
294
295
@pytest.mark.parametrize("size", [127, 128, 255, 256, 2**15, 2**15 + 1])
@pytest.mark.parametrize("field", FIELD_COMBS_ARGS)
@pytest.mark.slow
def test_large_enum(size: int, field: Any) -> None:
    """Round-trip enums whose categories are the decimal strings "0" .. str(size - 1)."""
    categories = [str(i) for i in range(size)]
    dtype = pl.Enum(categories)

    roundtrip_series_re([None, "1"], dtype, **field)
    roundtrip_series_re(["1", None], dtype, **field)

    # Evenly spaced sample of the categories, starting at "3".
    step = int(7 * size / (2**8))
    roundtrip_series_re(categories[3::step], dtype, **field)
306
307
308
@pytest.mark.parametrize("field", FIELD_COMBS_ARGS)
def test_list_arr(field: Any) -> None:
    """Round-trip lists of width-2 string arrays with nulls at every nesting level."""
    dtype = pl.List(pl.Array(pl.String, 2))
    cases = [
        [],
        [None],
        [[None]],
        [[[None, None]]],
        [[["a", "b"]]],
        [[["a", "b"], ["xyz", "wowie"]]],
        [[["a", "b"]], None, [None, None]],
    ]
    for values in cases:
        roundtrip_series_re(values, dtype, **field)
318
319
320
@pytest.mark.parametrize("field", FIELD_COMBS_ARGS)
def test_list_struct_arr(field: Any) -> None:
    """Round-trip lists of structs whose two fields are fixed-size arrays."""
    struct_dtype = pl.Struct({"x": pl.Array(pl.String, 2), "y": pl.Array(pl.Int64, 3)})
    dtype = pl.List(struct_dtype)

    cases = [
        [],
        [None],
        [[None]],
        [[{"x": None, "y": None}]],
        [[{"x": ["a", None], "y": [1, None, 3]}]],
        [[{"x": ["a", "xyz"], "y": [1, 7, 3]}]],
        [[{"x": ["a", "xyz"], "y": [1, 7, 3]}], []],
    ]
    for values in cases:
        roundtrip_series_re(values, dtype, **field)
332
333
334
@pytest.mark.parametrize("field", FIELD_COMBS_ARGS)
def test_list_nulls(field: Any) -> None:
    """Round-trip `List(Null)` series: empty, null, and lists of one to three nulls."""
    dtype = pl.List(pl.Null)
    cases = [
        [],
        [[]],
        [None],
        [[None]],
        [[None, None, None]],
        [[None], [None, None], [None, None, None]],
    ]
    for values in cases:
        roundtrip_series_re(values, dtype, **field)
343
344
345
@pytest.mark.parametrize("field", FIELD_COMBS_ARGS)
def test_masked_out_list_20151(field: Any) -> None:
    """
    Round-trip a list series produced by casting a width-2 array series.

    The source array series contains a null row and a row with a null element.
    Presumably a regression test for issue 20151, going by the test name.
    """
    list_dtype = pl.List(pl.Int64())
    array_dtype = pl.Array(pl.Int64(), 2)

    as_array = pl.Series([[1, 2], None, [4, 5], [None, 3]], dtype=array_dtype)
    as_list = as_array.cast(list_dtype)

    roundtrip_series_re(as_list, list_dtype, **field)
355
356
357
def test_int_after_null() -> None:
    """Round-trip a `Null` column followed by a null `Int8` value, with nulls last."""
    null_column = pl.Series("a", [None], pl.Null)
    int_column = pl.Series("b", [None], pl.Int8)

    frame = pl.DataFrame([null_column, int_column])
    roundtrip_re(frame, nulls_last=[True, True])
367
368
369
@pytest.mark.parametrize("field", FIELD_COMBS_ARGS)
@given(s=series(allow_null=False, allow_chunks=False, excluded_dtypes=[pl.Categorical]))
def test_optional_eq_non_optional_20320(field: Any, s: pl.Series) -> None:
    """
    Check that appending a null does not change how the other rows are encoded.

    A null-free generated series is row-encoded on its own and again after a
    trailing null has been appended. The encodings of the original rows, cast
    to Binary, must be equal.
    """
    # `Series.extend` mutates its receiver and returns it, so extend a clone.
    # Extending `s` directly would add the null to `s` as well, and the test
    # would then compare one series with itself.
    with_null = s.clone().extend(pl.Series([None], dtype=s.dtype))

    re_without_null = s._row_encode(**field).cast(pl.Binary)
    re_with_null = with_null._row_encode(**field).cast(pl.Binary)

    # Only the leading rows, i.e. the values shared with `s`, are compared.
    assert_series_equal(re_with_null.head(s.len()), re_without_null)
381
382
383
@pytest.mark.parametrize("field", FIELD_COMBS_ARGS)
@given(dtype=dtypes(excluded_dtypes=[pl.Categorical]))
def test_null(
    field: Any,
    dtype: pl.DataType,
) -> None:
    """Encode and decode a single null of a generated dtype and expect it back."""
    s = pl.Series("a", [None], dtype)

    descending = field["descending"]
    nulls_last = field["nulls_last"]

    decoded = s._row_encode(**field)._row_decode(
        ["a"],
        [dtype],
        descending=[descending] if descending is not None else None,
        nulls_last=[nulls_last] if nulls_last is not None else None,
        unordered=field["unordered"],
    )
    result = decoded.struct.unnest().to_series()

    assert_series_equal(result, s)
404
405
406
@pytest.mark.parametrize(
    ("dtype", "vs"),
    [
        (pl.List(pl.String), [[None], ["A"], ["B"]]),
        (pl.Array(pl.String, 1), [[None], ["A"], ["B"]]),
        (pl.Struct({"x": pl.String}), [{"x": None}, {"x": "A"}, {"x": "B"}]),
        (pl.Array(pl.String, 2), [[None, "Z"], ["A", "C"], ["B", "B"]]),
    ],
)
def test_nested_sorting_22557(dtype: pl.DataType, vs: list[Any]) -> None:
    """
    Check sorting of nested values and the ordering of their row encodings.

    `vs` lists three values in the ascending order the test expects. The series
    under test holds them shuffled with one null row as `[vs[1], None, vs[0],
    vs[2]]`. For each descending / nulls_last combination the test checks
    `sort`, round-trips `vs`, and checks `arg_sort` of the row encoding.
    """
    s = pl.Series("a", [vs[1], None, vs[0], vs[2]], dtype)
    orderings = [(False, False), (False, True), (True, False), (True, True)]

    for descending, nulls_last in orderings:
        non_null = vs[::-1] if descending else vs
        expected = (non_null + [None]) if nulls_last else ([None] + non_null)
        assert_series_equal(
            s.sort(descending=descending, nulls_last=nulls_last),
            pl.Series("a", expected, dtype),
        )

    for descending, nulls_last in orderings:
        roundtrip_series_re(vs, dtype, descending=descending, nulls_last=nulls_last)

    # Expected `arg_sort` of the encoded rows, keyed by (descending, nulls_last).
    expected_arg_sort = {
        (False, False): [1, 2, 0, 3],
        (False, True): [2, 0, 3, 1],
        (True, False): [1, 3, 0, 2],
        (True, True): [3, 0, 2, 1],
    }
    for (descending, nulls_last), indices in expected_arg_sort.items():
        assert_series_equal(
            s._row_encode(descending=descending, nulls_last=nulls_last).arg_sort(),
            pl.Series("a", indices, pl.get_index_type()),
            check_names=False,
        )
458
459
460
def test_row_encoding_null_chunks() -> None:
    """
    Encode and decode a concatenation of an Int64 frame and a null-literal frame.

    The lazy pipeline is collected with the streaming engine and must equal the
    plain concatenation of the two inputs.
    """
    int_part = pl.select(a=pl.lit(1, pl.Int64)).lazy()
    null_part = pl.select(a=None).lazy()
    parts = [int_part, null_part]

    encoded = pl.concat(parts).select(pl.col.a._row_encode())
    decoded = encoded.select(cs.all()._row_decode(["a"], [pl.Int64]))
    out = decoded.unnest(cs.all()).collect(engine="streaming")

    expected = pl.concat(parts).collect()
    assert_frame_equal(expected, out)
476
477