Path: blob/main/py-polars/tests/unit/operations/namespaces/test_binary.py
6940 views
from __future__ import annotations

import random
import struct
from datetime import date, datetime, time, timedelta
from typing import TYPE_CHECKING

import numpy as np
import pytest
from hypothesis import given
from hypothesis import strategies as st

import polars as pl
from polars.exceptions import InvalidOperationError
from polars.testing import assert_frame_equal, assert_series_equal

if TYPE_CHECKING:
    from polars._typing import PolarsDataType, SizeUnit, TransferEncoding


def test_binary_conversions() -> None:
    """Casting a Binary column to String keeps values and nulls aligned."""
    df = pl.DataFrame({"blob": [b"abc", None, b"cde"]}).with_columns(
        pl.col("blob").cast(pl.String).alias("decoded_blob")
    )

    assert df.to_dict(as_series=False) == {
        "blob": [b"abc", None, b"cde"],
        "decoded_blob": ["abc", None, "cde"],
    }
    assert df[0, 0] == b"abc"
    assert df[1, 0] is None
    assert df.dtypes == [pl.Binary, pl.String]


def test_contains() -> None:
    """Check `bin.contains` with literal patterns on Series, select and filter."""
    df = pl.DataFrame(
        data=[
            (1, b"some * * text"),
            (2, b"(with) special\n * chars"),
            (3, b"**etc...?$"),
            (4, None),
        ],
        schema=["idx", "bin"],
        orient="row",
    )
    for pattern, expected in (
        (b"e * ", [True, False, False, None]),
        (b"text", [True, False, False, None]),
        (b"special", [False, True, False, None]),
        (b"", [True, True, True, None]),
        (b"qwe", [False, False, False, None]),
    ):
        # series
        assert expected == df["bin"].bin.contains(pattern).to_list()
        # frame select
        assert (
            expected == df.select(pl.col("bin").bin.contains(pattern))["bin"].to_list()
        )
        # frame filter: only rows whose result is exactly True survive the filter
        assert sum(e for e in expected if e is True) == len(
            df.filter(pl.col("bin").bin.contains(pattern))
        )


def test_contains_with_expr() -> None:
    """Check `bin.contains` when the pattern is a column or a null literal."""
    df = pl.DataFrame(
        {
            "bin": [b"some * * text", b"(with) special\n * chars", b"**etc...?$", None],
            "lit1": [b"e * ", b"", b"qwe", b"None"],
            "lit2": [None, b"special\n", b"?!", None],
        }
    )

    assert df.select(
        pl.col("bin").bin.contains(pl.col("lit1")).alias("contains_1"),
        pl.col("bin").bin.contains(pl.col("lit2")).alias("contains_2"),
        pl.col("bin").bin.contains(pl.lit(None)).alias("contains_3"),
    ).to_dict(as_series=False) == {
        "contains_1": [True, True, False, None],
        "contains_2": [None, True, False, None],
        "contains_3": [None, None, None, None],
    }


def test_starts_ends_with() -> None:
    """Check `bin.starts_with` / `bin.ends_with` with literal, null and column args."""
    assert pl.DataFrame(
        {
            "a": [b"hamburger", b"nuts", b"lollypop", None],
            "end": [b"ger", b"tg", None, b"anything"],
            "start": [b"ha", b"nga", None, b"anything"],
        }
    ).select(
        pl.col("a").bin.ends_with(b"pop").alias("end_lit"),
        pl.col("a").bin.ends_with(pl.lit(None)).alias("end_none"),
        pl.col("a").bin.ends_with(pl.col("end")).alias("end_expr"),
        pl.col("a").bin.starts_with(b"ham").alias("start_lit"),
        # Exercise starts_with (not ends_with) for the null-literal case.
        pl.col("a").bin.starts_with(pl.lit(None)).alias("start_none"),
        pl.col("a").bin.starts_with(pl.col("start")).alias("start_expr"),
    ).to_dict(as_series=False) == {
        "end_lit": [False, False, True, None],
        "end_none": [None, None, None, None],
        "end_expr": [True, False, None, None],
        "start_lit": [True, False, False, None],
        "start_none": [None, None, None, None],
        "start_expr": [True, False, None, None],
    }


def test_base64_encode() -> None:
    """Binary values encode to their base64 string form."""
    df = pl.DataFrame({"data": [b"asd", b"qwe"]})

    assert df["data"].bin.encode("base64").to_list() == ["YXNk", "cXdl"]


def test_base64_decode() -> None:
    """Base64-encoded binary values decode back to the raw bytes."""
    df = pl.DataFrame({"data": [b"YXNk", b"cXdl"]})

    assert df["data"].bin.decode("base64").to_list() == [b"asd", b"qwe"]


def test_hex_encode() -> None:
    """Binary values encode to their hexadecimal string form."""
    df = pl.DataFrame({"data": [b"asd", b"qwe"]})

    assert df["data"].bin.encode("hex").to_list() == ["617364", "717765"]


def test_hex_decode() -> None:
    """Hex-encoded binary values decode back to the raw bytes."""
    df = pl.DataFrame({"data": [b"617364", b"717765"]})

    assert df["data"].bin.decode("hex").to_list() == [b"asd", b"qwe"]


@pytest.mark.parametrize(
    "encoding",
    ["hex", "base64"],
)
def test_compare_encode_between_lazy_and_eager_6814(encoding: TransferEncoding) -> None:
    """Eager and lazy `bin.encode` produce the same frame (and the same dtype)."""
    df = pl.DataFrame({"x": [b"aa", b"bb", b"cc"]})
    expr = pl.col("x").bin.encode(encoding)

    result_eager = df.select(expr)
    dtype = result_eager["x"].dtype

    # Selecting by the eager dtype fails to match if the lazy schema disagrees.
    result_lazy = df.lazy().select(expr).select(pl.col(dtype)).collect()
    assert_frame_equal(result_eager, result_lazy)


@pytest.mark.parametrize(
    "encoding",
    ["hex", "base64"],
)
def test_compare_decode_between_lazy_and_eager_6814(encoding: TransferEncoding) -> None:
    """Eager and lazy `bin.decode` produce the same frame (and the same dtype)."""
    df = pl.DataFrame({"x": [b"d3d3", b"abcd", b"1234"]})
    expr = pl.col("x").bin.decode(encoding)

    result_eager = df.select(expr)
    dtype = result_eager["x"].dtype

    # Selecting by the eager dtype fails to match if the lazy schema disagrees.
    result_lazy = df.lazy().select(expr).select(pl.col(dtype)).collect()
    assert_frame_equal(result_eager, result_lazy)


@pytest.mark.parametrize(
    ("sz", "unit", "expected"),
    [(128, "b", 128), (512, "kb", 0.5), (131072, "mb", 0.125)],
)
def test_binary_size(sz: int, unit: SizeUnit, expected: int | float) -> None:
    """`bin.size` reports the byte length in the requested unit (expr and series)."""
    df = pl.DataFrame({"data": [b"\x00" * sz]}, schema={"data": pl.Binary})
    for actual in (
        df.select(sz=pl.col("data").bin.size(unit)).item(),  # expr
        df["data"].bin.size(unit).item(),  # series
    ):
        assert actual == expected


@pytest.mark.parametrize(
    ("dtype", "type_size", "struct_type"),
    [
        (pl.Int8, 1, "b"),
        (pl.UInt8, 1, "B"),
        (pl.Int16, 2, "h"),
        (pl.UInt16, 2, "H"),
        (pl.Int32, 4, "i"),
        (pl.UInt32, 4, "I"),
        (pl.Int64, 8, "q"),
        (pl.UInt64, 8, "Q"),
        (pl.Float32, 4, "f"),
        (pl.Float64, 8, "d"),
    ],
)
def test_reinterpret(
    dtype: pl.DataType,
    type_size: int,
    struct_type: str,
) -> None:
    """`bin.reinterpret` matches `struct.unpack_from` for both endiannesses."""
    # Make test reproducible
    random.seed(42)

    byte_arr = [random.randbytes(type_size) for _ in range(3)]
    df = pl.DataFrame({"x": byte_arr})

    for endianness in ["little", "big"]:
        # So that mypy doesn't complain
        struct_endianness = "<" if endianness == "little" else ">"
        expected = [
            struct.unpack_from(f"{struct_endianness}{struct_type}", elem_bytes)[0]
            for elem_bytes in byte_arr
        ]
        expected_df = pl.DataFrame({"x": expected}, schema={"x": dtype})

        result = df.select(
            pl.col("x").bin.reinterpret(dtype=dtype, endianness=endianness)  # type: ignore[arg-type]
        )

        assert_frame_equal(result, expected_df)


@pytest.mark.parametrize(
    ("dtype", "inner_type_size", "struct_type"),
    [
        (pl.Array(pl.Int8, 3), 1, "b"),
        (pl.Array(pl.UInt8, 3), 1, "B"),
        (pl.Array(pl.Int16, 3), 2, "h"),
        (pl.Array(pl.UInt16, 3), 2, "H"),
        (pl.Array(pl.Int32, 3), 4, "i"),
        (pl.Array(pl.UInt32, 3), 4, "I"),
        (pl.Array(pl.Int64, 3), 8, "q"),
        (pl.Array(pl.UInt64, 3), 8, "Q"),
        (pl.Array(pl.Float32, 3), 4, "f"),
        (pl.Array(pl.Float64, 3), 8, "d"),
    ],
)
def test_reinterpret_to_array_numeric_types(
    dtype: pl.Array,
    inner_type_size: int,
    struct_type: str,
) -> None:
    """Reinterpreting to a numeric Array matches element-wise `struct` decoding."""
    # Make test reproducible
    random.seed(42)

    # Total byte width of one Array value: inner width times every dimension.
    type_size = inner_type_size
    shape = dtype.shape
    if isinstance(shape, int):
        shape = (shape,)
    for dim_size in shape:
        type_size *= dim_size

    byte_arr = [random.randbytes(type_size) for _ in range(3)]
    df = pl.DataFrame({"x": byte_arr}, orient="row")

    for endianness in ["little", "big"]:
        result = df.select(
            pl.col("x").bin.reinterpret(dtype=dtype, endianness=endianness)  # type: ignore[arg-type]
        )

        # So that mypy doesn't complain
        struct_endianness = "<" if endianness == "little" else ">"
        expected = []
        for elem_bytes in byte_arr:
            vals = [
                struct.unpack_from(
                    f"{struct_endianness}{struct_type}",
                    elem_bytes[idx : idx + inner_type_size],
                )[0]
                for idx in range(0, type_size, inner_type_size)
            ]
            if len(shape) > 1:
                vals = np.reshape(vals, shape).tolist()
            expected.append(vals)
        expected_df = pl.DataFrame({"x": expected}, schema={"x": dtype})

        assert_frame_equal(result, expected_df)


@pytest.mark.parametrize(
    ("dtype", "binary_value", "expected_values"),
    [
        (pl.Date(), b"\x06\x00\x00\x00", [date(1970, 1, 7)]),
        (
            pl.Datetime(),
            b"\x40\xb6\xfd\xe3\x7c\x00\x00\x00",
            [datetime(1970, 1, 7, 5, 0, 1)],
        ),
        (
            pl.Duration(),
            b"\x03\x00\x00\x00\x00\x00\x00\x00",
            [timedelta(microseconds=3)],
        ),
        (
            pl.Time(),
            b"\x58\x1b\x00\x00\x00\x00\x00\x00",
            [time(microsecond=7)],
        ),
        (
            pl.Int128(),
            b"\x06\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
            [6],
        ),
    ],
)
def test_reinterpret_to_additional_types(
    dtype: PolarsDataType, binary_value: bytes, expected_values: list[object]
) -> None:
    """Temporal and Int128 dtypes reinterpret both directly and as a 1-wide Array."""
    series = pl.Series([binary_value])

    # Direct conversion:
    result = series.bin.reinterpret(dtype=dtype, endianness="little")
    assert_series_equal(result, pl.Series(expected_values, dtype=dtype))

    # Array conversion:
    array_dtype = pl.Array(dtype, 1)
    result = series.bin.reinterpret(dtype=array_dtype, endianness="little")
    assert_series_equal(result, pl.Series([expected_values], dtype=array_dtype))


def test_reinterpret_to_array_resulting_in_nulls() -> None:
    """Inputs that are null or not exactly 8 bytes yield null Array entries."""
    series = pl.Series([None, b"short", b"justrite", None, b"waytoolong"])
    as_bin = series.bin.reinterpret(dtype=pl.Array(pl.UInt32(), 2), endianness="little")
    assert as_bin.to_list() == [None, None, [0x7473756A, 0x65746972], None, None]
    as_bin = series.bin.reinterpret(dtype=pl.Array(pl.UInt32(), 2), endianness="big")
    assert as_bin.to_list() == [None, None, [0x6A757374, 0x72697465], None, None]


def test_reinterpret_to_n_dimensional_array() -> None:
    """Reinterpreting straight to a multi-dimensional Array raises an error."""
    series = pl.Series([b"abcd"])
    for endianness in ["big", "little"]:
        with pytest.raises(
            InvalidOperationError,
            match="reinterpret to a linear Array, and then use reshape",
        ):
            series.bin.reinterpret(
                dtype=pl.Array(pl.UInt32(), (2, 2)),
                endianness=endianness,  # type: ignore[arg-type]
            )


def test_reinterpret_to_zero_length_array() -> None:
    """Empty binaries reinterpret to empty zero-width Arrays."""
    arr_dtype = pl.Array(pl.UInt8, 0)
    result = pl.Series([b"", b""]).bin.reinterpret(dtype=arr_dtype)
    assert_series_equal(result, pl.Series([[], []], dtype=arr_dtype))


@given(
    value1=st.integers(0, 2**63),
    value2=st.binary(min_size=0, max_size=7),
    value3=st.integers(0, 2**63),
)
def test_reinterpret_to_array_different_alignment(
    value1: int, value2: bytes, value3: int
) -> None:
    """A short (< 8 byte) value between two 8-byte values becomes null."""
    series = pl.Series([struct.pack("<Q", value1), value2, struct.pack("<Q", value3)])
    arr_dtype = pl.Array(pl.UInt64, 1)
    as_uint64 = series.bin.reinterpret(dtype=arr_dtype, endianness="little")
    assert_series_equal(
        pl.Series([[value1], None, [value3]], dtype=arr_dtype), as_uint64
    )


@pytest.mark.parametrize(
    "bad_dtype",
    [
        pl.Array(pl.Array(pl.UInt8, 1), 1),
        pl.String(),
        pl.Array(pl.List(pl.UInt8()), 1),
        pl.Array(pl.Null(), 1),
        pl.Array(pl.Boolean(), 1),
    ],
)
def test_reinterpret_unsupported(bad_dtype: pl.DataType) -> None:
    """Unsupported target dtypes raise both eagerly and at lazy schema resolution."""
    series = pl.Series([b"12345678"])
    lazy_df = pl.DataFrame({"s": series}).lazy()
    expected = "cannot reinterpret binary to dtype.*Only numeric or temporal dtype.*"
    for endianness in ["little", "big"]:
        with pytest.raises(InvalidOperationError, match=expected):
            series.bin.reinterpret(dtype=bad_dtype, endianness=endianness)  # type: ignore[arg-type]
        with pytest.raises(InvalidOperationError, match=expected):
            lazy_df.select(
                pl.col("s").bin.reinterpret(dtype=bad_dtype, endianness=endianness)  # type: ignore[arg-type]
            ).collect_schema()


@pytest.mark.parametrize(
    ("dtype", "type_size"),
    [
        (pl.Int128, 16),
    ],
)
def test_reinterpret_int(
    dtype: pl.DataType,
    type_size: int,
) -> None:
    """Round-trip integers through `int.to_bytes` and `bin.reinterpret`."""
    # Function used for testing integers that `struct` or `numpy`
    # doesn't support parsing from bytes.
    # Rather than creating bytes directly, create integer and view it as bytes
    is_signed = dtype.is_signed_integer()

    # NOTE(review): `type_size` is a byte count (it is passed to `to_bytes`),
    # yet it is used as a bit count here, so only a narrow value range is
    # exercised. Widening to `type_size * 8` bits looks intended — confirm that
    # DataFrame construction accepts such large Python ints before changing.
    if is_signed:
        min_val = -(2 ** (type_size - 1))
        max_val = 2 ** (type_size - 1) - 1
    else:
        min_val = 0
        max_val = 2**type_size - 1

    # Make test reproducible
    random.seed(42)

    expected = [random.randint(min_val, max_val) for _ in range(3)]
    expected_df = pl.DataFrame({"x": expected}, schema={"x": dtype})

    for endianness in ["little", "big"]:
        byte_arr = [
            val.to_bytes(type_size, byteorder=endianness, signed=is_signed)  # type: ignore[arg-type]
            for val in expected
        ]
        df = pl.DataFrame({"x": byte_arr})

        result = df.select(
            pl.col("x").bin.reinterpret(dtype=dtype, endianness=endianness)  # type: ignore[arg-type]
        )

        assert_frame_equal(result, expected_df)


def test_reinterpret_invalid() -> None:
    """Wrong-sized buffers reinterpret to null; an invalid dtype raises."""
    # Fails because buffer has more than 4 bytes
    df = pl.DataFrame({"x": [b"d3d3a"]})
    assert_frame_equal(
        df.select(pl.col("x").bin.reinterpret(dtype=pl.Int32)),
        pl.DataFrame({"x": [None]}, schema={"x": pl.Int32}),
    )

    # Fails because buffer has less than 4 bytes
    df = pl.DataFrame({"x": [b"d3"]})
    assert_frame_equal(
        df.select(pl.col("x").bin.reinterpret(dtype=pl.Int32)),
        pl.DataFrame({"x": [None]}, schema={"x": pl.Int32}),
    )

    # Fails because dtype is invalid
    with pytest.raises(InvalidOperationError):
        df.select(pl.col("x").bin.reinterpret(dtype=pl.String))


@pytest.mark.parametrize("func", ["contains", "starts_with", "ends_with"])
def test_bin_contains_unequal_lengths_22018(func: str) -> None:
    """Passing a Series of a different length to a `bin` predicate raises ShapeError."""
    s = pl.Series("a", [b"a", b"xyz"], pl.Binary).bin
    f = getattr(s, func)
    with pytest.raises(pl.exceptions.ShapeError):
        f(pl.Series([b"x", b"y", b"z"]))