Book a Demo!
CoCalc Logo Icon
Store | Features | Docs | Share | Support | News | About | Policies | Sign Up | Sign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/py-polars/tests/unit/operations/namespaces/test_binary.py
6940 views
1
from __future__ import annotations
2
3
import random
4
import struct
5
from datetime import date, datetime, time, timedelta
6
from typing import TYPE_CHECKING
7
8
import numpy as np
9
import pytest
10
from hypothesis import given
11
from hypothesis import strategies as st
12
13
import polars as pl
14
from polars.exceptions import InvalidOperationError
15
from polars.testing import assert_frame_equal, assert_series_equal
16
17
if TYPE_CHECKING:
18
from polars._typing import PolarsDataType, SizeUnit, TransferEncoding
19
20
21
def test_binary_conversions() -> None:
    """Cast a Binary column to String and check values, nulls and dtypes."""
    decoded = pl.col("blob").cast(pl.String).alias("decoded_blob")
    frame = pl.DataFrame({"blob": [b"abc", None, b"cde"]}).with_columns(decoded)

    # The source column stays Binary; the cast column is String.
    assert frame.dtypes == [pl.Binary, pl.String]

    # Item access returns the raw bytes, and None for the null slot.
    assert frame[0, 0] == b"abc"
    assert frame[1, 0] is None

    assert frame.to_dict(as_series=False) == {
        "blob": [b"abc", None, b"cde"],
        "decoded_blob": ["abc", None, "cde"],
    }
33
34
35
def test_contains() -> None:
    """Exercise `bin.contains` with literal patterns on Series, select and filter."""
    rows = [
        (1, b"some * * text"),
        (2, b"(with) special\n * chars"),
        (3, b"**etc...?$"),
        (4, None),
    ]
    frame = pl.DataFrame(data=rows, schema=["idx", "bin"], orient="row")

    # pattern -> expected per-row result (the null row always stays null)
    cases = {
        b"e * ": [True, False, False, None],
        b"text": [True, False, False, None],
        b"special": [False, True, False, None],
        b"": [True, True, True, None],
        b"qwe": [False, False, False, None],
    }

    for needle, want in cases.items():
        matches = pl.col("bin").bin.contains(needle)

        # series
        assert frame["bin"].bin.contains(needle).to_list() == want

        # frame select
        assert frame.select(matches)["bin"].to_list() == want

        # frame filter: only rows that evaluate to True survive
        assert len(frame.filter(matches)) == want.count(True)
63
64
65
def test_contains_with_expr() -> None:
    """Check `bin.contains` when the pattern is a column or a null literal."""
    frame = pl.DataFrame(
        {
            "bin": [b"some * * text", b"(with) special\n * chars", b"**etc...?$", None],
            "lit1": [b"e * ", b"", b"qwe", b"None"],
            "lit2": [None, b"special\n", b"?!", None],
        }
    )
    haystack = pl.col("bin")

    result = frame.select(
        haystack.bin.contains(pl.col("lit1")).alias("contains_1"),
        haystack.bin.contains(pl.col("lit2")).alias("contains_2"),
        haystack.bin.contains(pl.lit(None)).alias("contains_3"),
    )

    # A null in either the value or the pattern produces a null result.
    expected = {
        "contains_1": [True, True, False, None],
        "contains_2": [None, True, False, None],
        "contains_3": [None, None, None, None],
    }
    assert result.to_dict(as_series=False) == expected
83
84
85
def test_starts_ends_with() -> None:
    """Check `bin.starts_with` / `bin.ends_with` with literal, null and column patterns.

    Each method is exercised three ways: with a bytes literal, with a null
    literal (expected to give an all-null column) and with a per-row pattern
    taken from another column.
    """
    df = pl.DataFrame(
        {
            "a": [b"hamburger", b"nuts", b"lollypop", None],
            "end": [b"ger", b"tg", None, b"anything"],
            "start": [b"ha", b"nga", None, b"anything"],
        }
    )

    result = df.select(
        pl.col("a").bin.ends_with(b"pop").alias("end_lit"),
        pl.col("a").bin.ends_with(pl.lit(None)).alias("end_none"),
        pl.col("a").bin.ends_with(pl.col("end")).alias("end_expr"),
        pl.col("a").bin.starts_with(b"ham").alias("start_lit"),
        # The null-literal case must go through `starts_with` so that the
        # "start_none" column actually covers the prefix code path.
        pl.col("a").bin.starts_with(pl.lit(None)).alias("start_none"),
        pl.col("a").bin.starts_with(pl.col("start")).alias("start_expr"),
    )

    assert result.to_dict(as_series=False) == {
        "end_lit": [False, False, True, None],
        "end_none": [None, None, None, None],
        "end_expr": [True, False, None, None],
        "start_lit": [True, False, False, None],
        "start_none": [None, None, None, None],
        "start_expr": [True, False, None, None],
    }
107
108
109
def test_base64_encode() -> None:
    """Base64-encode a binary column and compare against known strings."""
    frame = pl.DataFrame({"data": [b"asd", b"qwe"]})

    encoded = frame.get_column("data").bin.encode("base64")

    assert encoded.to_list() == ["YXNk", "cXdl"]
113
114
115
def test_base64_decode() -> None:
    """Base64-decode a binary column and compare against the known payloads."""
    frame = pl.DataFrame({"data": [b"YXNk", b"cXdl"]})

    decoded = frame.get_column("data").bin.decode("base64")

    assert decoded.to_list() == [b"asd", b"qwe"]
119
120
121
def test_hex_encode() -> None:
    """Hex-encode a binary column and compare against known strings."""
    frame = pl.DataFrame({"data": [b"asd", b"qwe"]})

    encoded = frame.get_column("data").bin.encode("hex")

    assert encoded.to_list() == ["617364", "717765"]
125
126
127
def test_hex_decode() -> None:
    """Hex-decode a binary column and compare against the known payloads."""
    frame = pl.DataFrame({"data": [b"617364", b"717765"]})

    decoded = frame.get_column("data").bin.decode("hex")

    assert decoded.to_list() == [b"asd", b"qwe"]
131
132
133
@pytest.mark.parametrize("encoding", ["hex", "base64"])
def test_compare_encode_between_lazy_and_eager_6814(encoding: TransferEncoding) -> None:
    """Encoding must give the same frame through the eager and lazy paths."""
    frame = pl.DataFrame({"x": [b"aa", b"bb", b"cc"]})
    encode_x = pl.col("x").bin.encode(encoding)

    eager = frame.select(encode_x)

    # Re-select by the eager output dtype inside the lazy plan, so the lazy
    # query only keeps the column if it carries that same dtype.
    out_dtype = eager["x"].dtype
    lazy = frame.lazy().select(encode_x).select(pl.col(out_dtype)).collect()

    assert_frame_equal(eager, lazy)
146
147
148
@pytest.mark.parametrize("encoding", ["hex", "base64"])
def test_compare_decode_between_lazy_and_eager_6814(encoding: TransferEncoding) -> None:
    """Decoding must give the same frame through the eager and lazy paths."""
    frame = pl.DataFrame({"x": [b"d3d3", b"abcd", b"1234"]})
    decode_x = pl.col("x").bin.decode(encoding)

    eager = frame.select(decode_x)

    # Re-select by the eager output dtype inside the lazy plan, so the lazy
    # query only keeps the column if it carries that same dtype.
    out_dtype = eager["x"].dtype
    lazy = frame.lazy().select(decode_x).select(pl.col(out_dtype)).collect()

    assert_frame_equal(eager, lazy)
161
162
163
@pytest.mark.parametrize(
    ("sz", "unit", "expected"),
    [(128, "b", 128), (512, "kb", 0.5), (131072, "mb", 0.125)],
)
def test_binary_size(sz: int, unit: SizeUnit, expected: int | float) -> None:
    """`bin.size` reports a zero-filled payload of `sz` bytes in the given unit."""
    frame = pl.DataFrame({"data": [b"\x00" * sz]}, schema={"data": pl.Binary})

    # Compute through both entry points before asserting on either.
    via_expr = frame.select(sz=pl.col("data").bin.size(unit)).item()  # expr
    via_series = frame["data"].bin.size(unit).item()  # series

    assert via_expr == expected
    assert via_series == expected
174
175
176
@pytest.mark.parametrize(
    ("dtype", "type_size", "struct_type"),
    [
        (pl.Int8, 1, "b"),
        (pl.UInt8, 1, "B"),
        (pl.Int16, 2, "h"),
        (pl.UInt16, 2, "H"),
        (pl.Int32, 4, "i"),
        (pl.UInt32, 4, "I"),
        (pl.Int64, 8, "q"),
        (pl.UInt64, 8, "Q"),
        (pl.Float32, 4, "f"),
        (pl.Float64, 8, "d"),
    ],
)
def test_reinterpret(
    dtype: pl.DataType,
    type_size: int,
    struct_type: str,
) -> None:
    """Compare `bin.reinterpret` against `struct` for both byte orders.

    Random buffers of `type_size` bytes are decoded with the matching
    `struct` format character, and the result is used as the reference.
    """
    # Fixed seed so the generated buffers are the same on every run
    random.seed(42)

    raw_values = [random.randbytes(type_size) for _ in range(3)]
    frame = pl.DataFrame({"x": raw_values})

    # polars endianness name -> `struct` byte-order prefix
    byte_order_prefix = {"little": "<", "big": ">"}

    for endianness, prefix in byte_order_prefix.items():
        unpacker = struct.Struct(f"{prefix}{struct_type}")
        expected_df = pl.DataFrame(
            {"x": [unpacker.unpack_from(raw)[0] for raw in raw_values]},
            schema={"x": dtype},
        )

        result = frame.select(
            pl.col("x").bin.reinterpret(dtype=dtype, endianness=endianness)  # type: ignore[arg-type]
        )

        assert_frame_equal(result, expected_df)
216
217
218
@pytest.mark.parametrize(
    ("dtype", "inner_type_size", "struct_type"),
    [
        (pl.Array(pl.Int8, 3), 1, "b"),
        (pl.Array(pl.UInt8, 3), 1, "B"),
        (pl.Array(pl.Int16, 3), 2, "h"),
        (pl.Array(pl.UInt16, 3), 2, "H"),
        (pl.Array(pl.Int32, 3), 4, "i"),
        (pl.Array(pl.UInt32, 3), 4, "I"),
        (pl.Array(pl.Int64, 3), 8, "q"),
        (pl.Array(pl.UInt64, 3), 8, "Q"),
        (pl.Array(pl.Float32, 3), 4, "f"),
        (pl.Array(pl.Float64, 3), 8, "d"),
    ],
)
def test_reinterpret_to_array_numeric_types(
    dtype: pl.Array,
    inner_type_size: int,
    struct_type: str,
) -> None:
    """Compare `bin.reinterpret` into a numeric Array against `struct`.

    Each random buffer holds exactly enough bytes for one full array. The
    reference values are decoded element by element with `struct`, for both
    byte orders.
    """
    # Make test reproducible
    random.seed(42)

    # Normalise the shape to a tuple, then use that normalised value for
    # every later computation (total byte size and optional reshape).
    shape = dtype.shape
    if isinstance(shape, int):
        shape = (shape,)

    # Total bytes per row = element size * product of all dimensions.
    type_size = inner_type_size
    for dim_size in shape:
        type_size *= dim_size

    byte_arr = [random.randbytes(type_size) for _ in range(3)]
    df = pl.DataFrame({"x": byte_arr}, orient="row")

    for endianness in ["little", "big"]:
        result = df.select(
            pl.col("x").bin.reinterpret(dtype=dtype, endianness=endianness)  # type: ignore[arg-type]
        )

        struct_endianness = "<" if endianness == "little" else ">"
        expected = []
        for elem_bytes in byte_arr:
            # Decode one element per `inner_type_size`-byte slice.
            vals = [
                struct.unpack_from(
                    f"{struct_endianness}{struct_type}",
                    elem_bytes[idx : idx + inner_type_size],
                )[0]
                for idx in range(0, type_size, inner_type_size)
            ]
            if len(shape) > 1:
                vals = np.reshape(vals, shape).tolist()
            expected.append(vals)
        expected_df = pl.DataFrame({"x": expected}, schema={"x": dtype})

        assert_frame_equal(result, expected_df)
273
274
275
@pytest.mark.parametrize(
    ("dtype", "binary_value", "expected_values"),
    [
        (pl.Date(), b"\x06\x00\x00\x00", [date(1970, 1, 7)]),
        (
            pl.Datetime(),
            b"\x40\xb6\xfd\xe3\x7c\x00\x00\x00",
            [datetime(1970, 1, 7, 5, 0, 1)],
        ),
        (
            pl.Duration(),
            b"\x03\x00\x00\x00\x00\x00\x00\x00",
            [timedelta(microseconds=3)],
        ),
        (
            pl.Time(),
            b"\x58\x1b\x00\x00\x00\x00\x00\x00",
            [time(microsecond=7)],
        ),
        (
            pl.Int128(),
            b"\x06\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
            [6],
        ),
    ],
)
def test_reinterpret_to_additional_types(
    dtype: PolarsDataType, binary_value: bytes, expected_values: list[object]
) -> None:
    """Reinterpret little-endian bytes as temporal / Int128 values.

    Each case is checked twice: as the scalar dtype, and wrapped in a
    single-element Array of that dtype.
    """
    series = pl.Series([binary_value])

    # Direct conversion:
    direct = series.bin.reinterpret(dtype=dtype, endianness="little")
    assert_series_equal(direct, pl.Series(expected_values, dtype=dtype))

    # Array conversion:
    array_dtype = pl.Array(dtype, 1)
    wrapped = series.bin.reinterpret(dtype=array_dtype, endianness="little")
    assert_series_equal(wrapped, pl.Series([expected_values], dtype=array_dtype))
314
315
316
def test_reinterpret_to_array_resulting_in_nulls() -> None:
    """Only the 8-byte value maps to a 2 x UInt32 array; every other slot is null."""
    series = pl.Series([None, b"short", b"justrite", None, b"waytoolong"])
    arr_dtype = pl.Array(pl.UInt32(), 2)

    little = series.bin.reinterpret(dtype=arr_dtype, endianness="little")
    assert little.to_list() == [None, None, [0x7473756A, 0x65746972], None, None]

    big = series.bin.reinterpret(dtype=arr_dtype, endianness="big")
    assert big.to_list() == [None, None, [0x6A757374, 0x72697465], None, None]
322
323
324
def test_reinterpret_to_n_dimensional_array() -> None:
    """Reinterpreting straight into a multi-dimensional Array must raise."""
    series = pl.Series([b"abcd"])
    nested_dtype = pl.Array(pl.UInt32(), (2, 2))
    hint = "reinterpret to a linear Array, and then use reshape"

    for endianness in ("big", "little"):
        with pytest.raises(InvalidOperationError, match=hint):
            series.bin.reinterpret(
                dtype=nested_dtype,
                endianness=endianness,  # type: ignore[arg-type]
            )
335
336
337
def test_reinterpret_to_zero_length_array() -> None:
    """Empty byte strings reinterpret to empty zero-width arrays."""
    arr_dtype = pl.Array(pl.UInt8, 0)
    empty_blobs = pl.Series([b"", b""])

    result = empty_blobs.bin.reinterpret(dtype=arr_dtype)
    expected = pl.Series([[], []], dtype=arr_dtype)

    assert_series_equal(result, expected)
341
342
343
@given(
    value1=st.integers(0, 2**63),
    value2=st.binary(min_size=0, max_size=7),
    value3=st.integers(0, 2**63),
)
def test_reinterpret_to_array_different_alignment(
    value1: int, value2: bytes, value3: int
) -> None:
    """A short (0-7 byte) value between two 8-byte values must not disturb them.

    The middle value is too short for a UInt64 and is expected to become
    null, while both neighbours still decode to their original integers.
    """
    pack_u64 = struct.Struct("<Q").pack
    series = pl.Series([pack_u64(value1), value2, pack_u64(value3)])

    arr_dtype = pl.Array(pl.UInt64, 1)
    as_uint64 = series.bin.reinterpret(dtype=arr_dtype, endianness="little")

    expected = pl.Series([[value1], None, [value3]], dtype=arr_dtype)
    assert_series_equal(expected, as_uint64)
357
358
359
@pytest.mark.parametrize(
    "bad_dtype",
    [
        pl.Array(pl.Array(pl.UInt8, 1), 1),
        pl.String(),
        pl.Array(pl.List(pl.UInt8()), 1),
        pl.Array(pl.Null(), 1),
        pl.Array(pl.Boolean(), 1),
    ],
)
def test_reinterpret_unsupported(bad_dtype: pl.DataType) -> None:
    """Unsupported target dtypes raise both eagerly and at lazy schema resolution."""
    series = pl.Series([b"12345678"])
    lazy_df = pl.DataFrame({"s": series}).lazy()
    expected = "cannot reinterpret binary to dtype.*Only numeric or temporal dtype.*"

    for endianness in ("little", "big"):
        # Eager path: the Series call itself must fail.
        with pytest.raises(InvalidOperationError, match=expected):
            series.bin.reinterpret(dtype=bad_dtype, endianness=endianness)  # type: ignore[arg-type]

        # Lazy path: resolving the schema is enough to surface the error.
        with pytest.raises(InvalidOperationError, match=expected):
            lazy_df.select(
                pl.col("s").bin.reinterpret(dtype=bad_dtype, endianness=endianness)  # type: ignore[arg-type]
            ).collect_schema()
380
381
382
@pytest.mark.parametrize(
    ("dtype", "type_size"),
    [
        (pl.Int128, 16),
    ],
)
def test_reinterpret_int(
    dtype: pl.DataType,
    type_size: int,
) -> None:
    """Round-trip integers through bytes for dtypes `struct`/`numpy` cannot parse.

    Integers are generated first and then serialised with `int.to_bytes`, so
    the expected values are known without needing a byte parser.
    """
    is_signed = dtype.is_signed_integer()

    # NOTE(review): `type_size` is a byte count (it is passed to `to_bytes`
    # below) but is used here as a bit exponent, so for Int128 only the
    # 16-bit value range is sampled - confirm whether that is intended.
    if is_signed:
        half_range = 2 ** (type_size - 1)
        min_val, max_val = -half_range, half_range - 1
    else:
        min_val, max_val = 0, 2**type_size - 1

    # Fixed seed so the sampled integers are the same on every run
    random.seed(42)

    expected = [random.randint(min_val, max_val) for _ in range(3)]
    expected_df = pl.DataFrame({"x": expected}, schema={"x": dtype})

    for endianness in ("little", "big"):
        serialised = [
            val.to_bytes(type_size, byteorder=endianness, signed=is_signed)  # type: ignore[arg-type]
            for val in expected
        ]
        frame = pl.DataFrame({"x": serialised})

        result = frame.select(
            pl.col("x").bin.reinterpret(dtype=dtype, endianness=endianness)  # type: ignore[arg-type]
        )

        assert_frame_equal(result, expected_df)
422
423
424
def test_reinterpret_invalid() -> None:
    """Check `bin.reinterpret` on wrongly sized buffers and on an invalid dtype.

    A buffer whose length does not match the 4 bytes of an Int32 is expected
    to produce a null rather than raise; a non-numeric target dtype is
    expected to raise `InvalidOperationError`.
    """
    null_int32 = pl.DataFrame({"x": [None]}, schema={"x": pl.Int32})

    # Yields null because the buffer has more than 4 bytes
    df = pl.DataFrame({"x": [b"d3d3a"]})
    assert_frame_equal(
        df.select(pl.col("x").bin.reinterpret(dtype=pl.Int32)),
        null_int32,
    )

    # Yields null because the buffer has less than 4 bytes
    df = pl.DataFrame({"x": [b"d3"]})
    assert_frame_equal(
        df.select(pl.col("x").bin.reinterpret(dtype=pl.Int32)),
        null_int32,
    )

    # Raises because the dtype is invalid
    with pytest.raises(InvalidOperationError):
        df.select(pl.col("x").bin.reinterpret(dtype=pl.String))
444
445
446
@pytest.mark.parametrize("func", ["contains", "starts_with", "ends_with"])
def test_bin_contains_unequal_lengths_22018(func: str) -> None:
    """A pattern Series of a different length must raise `ShapeError`."""
    namespace = pl.Series("a", [b"a", b"xyz"], pl.Binary).bin
    method = getattr(namespace, func)

    # Three patterns against a two-element Series
    mismatched = pl.Series([b"x", b"y", b"z"])

    with pytest.raises(pl.exceptions.ShapeError):
        method(mismatched)
452
453