Path: blob/main/py-polars/tests/unit/interop/test_interop.py
from __future__ import annotations

from datetime import date, datetime, time, timedelta, timezone
from typing import Any, cast

import numpy as np
import pandas as pd
import pyarrow as pa
import pytest

import polars as pl
from polars.exceptions import ComputeError, DuplicateError, UnstableWarning
from polars.interchange.protocol import CompatLevel
from polars.testing import assert_frame_equal, assert_series_equal
from tests.unit.utils.pycapsule_utils import PyCapsuleStreamHolder


def test_arrow_list_roundtrip() -> None:
    # https://github.com/pola-rs/polars/issues/1064
    tbl = pa.table({"a": [1], "b": [[1, 2]]})
    arw = pl.from_arrow(tbl).to_arrow()

    assert arw.shape == tbl.shape
    assert arw.schema.names == tbl.schema.names
    for c1, c2 in zip(arw.columns, tbl.columns):
        assert c1.to_pylist() == c2.to_pylist()


def test_arrow_null_roundtrip() -> None:
    tbl = pa.table({"a": [None, None], "b": [[None, None], [None, None]]})
    df = pl.from_arrow(tbl)

    if isinstance(df, pl.DataFrame):
        assert df.dtypes == [pl.Null, pl.List(pl.Null)]

    arw = df.to_arrow()

    assert arw.shape == tbl.shape
    assert arw.schema.names == tbl.schema.names
    for c1, c2 in zip(arw.columns, tbl.columns):
        assert c1.to_pylist() == c2.to_pylist()


def test_arrow_empty_dataframe() -> None:
    # 0x0 dataframe
    df = pl.DataFrame({})
    tbl = pa.table({})
    assert df.to_arrow() == tbl
    df2 = cast(pl.DataFrame, pl.from_arrow(df.to_arrow()))
    assert_frame_equal(df2, df)

    # 0 row dataframe
    df = pl.DataFrame({}, schema={"a": pl.Int32})
    tbl = pa.Table.from_batches([], pa.schema([pa.field("a", pa.int32())]))
    assert df.to_arrow() == tbl
    df2 = cast(pl.DataFrame, pl.from_arrow(df.to_arrow()))
    assert df2.schema == {"a": pl.Int32}
    assert df2.shape == (0, 1)


def test_arrow_dict_to_polars() -> None:
    pa_dict = pa.DictionaryArray.from_arrays(
        indices=np.array([0, 1, 2, 3, 1, 0, 2, 3, 3, 2]),
        dictionary=np.array(["AAA", "BBB", "CCC", "DDD"]),
    ).cast(pa.large_utf8())

    s = pl.Series(
        name="pa_dict",
        values=["AAA", "BBB", "CCC", "DDD", "BBB", "AAA", "CCC", "DDD", "DDD", "CCC"],
    )
    assert_series_equal(s, pl.Series("pa_dict", pa_dict))


def test_arrow_list_chunked_array() -> None:
    a = pa.array([[1, 2], [3, 4]])
    ca = pa.chunked_array([a, a, a])
    s = cast(pl.Series, pl.from_arrow(ca))
    assert s.dtype == pl.List


# Test that polars converts Arrays of logical types correctly to arrow
def test_arrow_array_logical() -> None:
    # cast to large string and uint8 indices because polars converts to those
    pa_data1 = (
        pa.array(["a", "b", "c", "d"])
        .dictionary_encode()
        .cast(pa.dictionary(pa.uint8(), pa.large_string()))
    )
    pa_array_logical1 = pa.FixedSizeListArray.from_arrays(pa_data1, 2)

    s1 = pl.Series(
        values=[["a", "b"], ["c", "d"]],
        dtype=pl.Array(pl.Enum(["a", "b", "c", "d"]), shape=2),
    )
    assert s1.to_arrow() == pa_array_logical1

    pa_data2 = pa.array([date(2024, 1, 1), date(2024, 1, 2)])
    pa_array_logical2 = pa.FixedSizeListArray.from_arrays(pa_data2, 1)

    s2 = pl.Series(
        values=[[date(2024, 1, 1)], [date(2024, 1, 2)]],
        dtype=pl.Array(pl.Date, shape=1),
    )
    assert s2.to_arrow() == pa_array_logical2


def test_from_dict() -> None:
    data = {"a": [1, 2], "b": [3, 4]}
    df = pl.from_dict(data)
    assert df.shape == (2, 2)
    for s1, s2 in zip(list(df), [pl.Series("a", [1, 2]), pl.Series("b", [3, 4])]):
        assert_series_equal(s1, s2)


def test_from_dict_struct() -> None:
    data: dict[str, dict[str, list[int]] | list[int]] = {
        "a": {"b": [1, 3], "c": [2, 4]},
        "d": [5, 6],
    }
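    # the nested dict under "a" should parse as a single Struct column
    # (one field per key), while "d" stays a plain integer column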
    df = pl.from_dict(data)
    assert df.shape == (2, 2)
    assert df["a"][0] == {"b": 1, "c": 2}
    assert df["a"][1] == {"b": 3, "c": 4}
    assert df.schema == {"a": pl.Struct({"b": pl.Int64, "c": pl.Int64}), "d": pl.Int64}


def test_from_dicts() -> None:
    data = [{"a": 1, "b": 4}, {"a": 2, "b": 5}, {"a": 3, "b": None}]
    df = pl.from_dicts(data)  # type: ignore[arg-type]
    assert df.shape == (3, 2)
    assert df.rows() == [(1, 4), (2, 5), (3, None)]
    assert df.schema == {"a": pl.Int64, "b": pl.Int64}


def test_from_dict_no_inference() -> None:
    schema = {"a": pl.String}
    data = [{"a": "aa"}]
    df = pl.from_dicts(data, schema_overrides=schema, infer_schema_length=0)
    assert df.schema == schema
    assert df.to_dicts() == data


def test_from_dicts_schema_override() -> None:
    schema = {
        "a": pl.String,
        "b": pl.Int64,
        "c": pl.List(pl.Struct({"x": pl.Int64, "y": pl.String, "z": pl.Float64})),
    }

    # initial data matches the expected schema
    data1 = [
        {
            "a": "l",
            "b": i,
            "c": [{"x": (j + 2), "y": "?", "z": (j % 2)} for j in range(2)],
        }
        for i in range(5)
    ]

    # extend with a mix of fields that are/not in the schema
    data2 = [{"b": i + 5, "d": "ABC", "e": "DEF"} for i in range(5)]

    for n_infer in (0, 3, 5, 8, 10, 100):
        df = pl.DataFrame(
            data=(data1 + data2),
            schema=schema,  # type: ignore[arg-type]
            infer_schema_length=n_infer,
        )
        assert df.schema == schema
        assert df.rows() == [
            ("l", 0, [{"x": 2, "y": "?", "z": 0.0}, {"x": 3, "y": "?", "z": 1.0}]),
            ("l", 1, [{"x": 2, "y": "?", "z": 0.0}, {"x": 3, "y": "?", "z": 1.0}]),
            ("l", 2, [{"x": 2, "y": "?", "z": 0.0}, {"x": 3, "y": "?", "z": 1.0}]),
            ("l", 3, [{"x": 2, "y": "?", "z": 0.0}, {"x": 3, "y": "?", "z": 1.0}]),
            ("l", 4, [{"x": 2, "y": "?", "z": 0.0}, {"x": 3, "y": "?", "z": 1.0}]),
            (None, 5, None),
            (None, 6, None),
            (None, 7, None),
            (None, 8, None),
            (None, 9, None),
        ]


def test_from_dicts_struct() -> None:
    data = [{"a": {"b": 1, "c": 2}, "d": 5}, {"a": {"b": 3, "c": 4}, "d": 6}]
    df = pl.from_dicts(data)
    assert df.shape == (2, 2)
    assert df["a"][0] == {"b": 1, "c": 2}
    assert df["a"][1] == {"b": 3, "c": 4}

    # 5649
    assert pl.from_dicts([{"a": [{"x": 1}]}, {"a": [{"y": 1}]}]).to_dict(
        as_series=False
    ) == {"a": [[{"y": None, "x": 1}], [{"y": 1, "x": None}]]}
    assert pl.from_dicts([{"a": [{"x": 1}, {"y": 2}]}, {"a": [{"y": 1}]}]).to_dict(
        as_series=False
    ) == {"a": [[{"y": None, "x": 1}, {"y": 2, "x": None}], [{"y": 1, "x": None}]]}


def test_from_records() -> None:
    data = [[1, 2, 3], [4, 5, 6]]
    df = pl.from_records(data, schema=["a", "b"])
    assert df.shape == (3, 2)
    assert df.rows() == [(1, 4), (2, 5), (3, 6)]


# https://github.com/pola-rs/polars/issues/15195
@pytest.mark.parametrize(
    "input",
    [
        pl.Series([1, 2]),
        pl.Series([{"a": 1, "b": 2}]),
        pl.DataFrame({"a": [1, 2], "b": [3, 4]}),
    ],
)
def test_from_records_non_sequence_input(input: Any) -> None:
    with pytest.raises(TypeError, match="expected data of type Sequence"):
        pl.from_records(input)


def test_from_arrow() -> None:
    data = pa.table({"a": [1, 2, 3], "b": [4, 5, 6]})
    df = pl.from_arrow(data)
    assert df.shape == (3, 2)
    assert df.rows() == [(1, 4), (2, 5), (3, 6)]  # type: ignore[union-attr]

    # if not a PyArrow type, raise a TypeError
    with pytest.raises(TypeError):
        _ = pl.from_arrow([1, 2])

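    # `schema` supplies the output column names; `schema_overrides` retypes
    # the named columns (here to unsigned integers)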
    df = pl.from_arrow(
        data, schema=["a", "b"], schema_overrides={"a": pl.UInt32, "b": pl.UInt64}
    )
    assert df.rows() == [(1, 4), (2, 5), (3, 6)]  # type: ignore[union-attr]
    assert df.schema == {"a": pl.UInt32, "b": pl.UInt64}  # type: ignore[union-attr]


def test_from_arrow_with_bigquery_metadata() -> None:
    arrow_schema = pa.schema(
        [
            pa.field("id", pa.int64()).with_metadata(
                {"ARROW:extension:name": "google:sqlType:integer"}
            ),
            pa.field(
                "misc",
                pa.struct([("num", pa.int32()), ("val", pa.string())]),
            ).with_metadata({"ARROW:extension:name": "google:sqlType:struct"}),
        ]
    )
    arrow_tbl = pa.Table.from_pylist(
        [{"id": 1, "misc": None}, {"id": 2, "misc": None}],
        schema=arrow_schema,
    )

    expected_data = {"id": [1, 2], "num": [None, None], "val": [None, None]}
    expected_schema = {"id": pl.Int64, "num": pl.Int32, "val": pl.String}
    assert_frame_equal(
        pl.DataFrame(expected_data, schema=expected_schema),
        pl.from_arrow(arrow_tbl).unnest("misc"),  # type: ignore[union-attr]
    )


def test_from_optional_not_available() -> None:
    from polars.dependencies import _LazyModule

    # proxy module is created dynamically if the required module is not available
    # (see the polars.dependencies source code for additional detail/comments)

    np = _LazyModule("numpy", module_available=False)
    with pytest.raises(ImportError, match=r"np\.array requires 'numpy'"):
        pl.from_numpy(np.array([[1, 2], [3, 4]]), schema=["a", "b"])

    pa = _LazyModule("pyarrow", module_available=False)
    with pytest.raises(ImportError, match=r"pa\.table requires 'pyarrow'"):
        pl.from_arrow(pa.table({"a": [1, 2, 3], "b": [4, 5, 6]}))

    pd = _LazyModule("pandas", module_available=False)
    with pytest.raises(ImportError, match=r"pd\.Series requires 'pandas'"):
        pl.from_pandas(pd.Series([1, 2, 3]))


def test_upcast_pyarrow_dicts() -> None:
    # https://github.com/pola-rs/polars/issues/1752
    tbls = [
        pa.table(
            {
                "col_name": pa.array(
                    [f"value_{i}"], pa.dictionary(pa.int8(), pa.string())
                )
            }
        )
        for i in range(128)
    ]

    tbl = pa.concat_tables(tbls, promote_options="default")
    out = cast(pl.DataFrame, pl.from_arrow(tbl))
    assert out.shape == (128, 1)
    assert out["col_name"][0] == "value_0"
    assert out["col_name"][127] == "value_127"


def test_no_rechunk() -> None:
    table = pa.Table.from_pydict({"x": pa.chunked_array([list("ab"), list("cd")])})
    # table
    assert pl.from_arrow(table, rechunk=False).n_chunks() == 2
    # chunked array
    assert pl.from_arrow(table["x"], rechunk=False).n_chunks() == 2


def test_from_empty_arrow() -> None:
    df = cast(pl.DataFrame, pl.from_arrow(pa.table(pd.DataFrame({"a": [], "b": []}))))
    assert df.columns == ["a", "b"]
    assert df.dtypes == [pl.Float64, pl.Float64]

    # 2705
    df1 = pd.DataFrame(columns=["b"], dtype=float, index=pd.Index([]))
    tbl = pa.Table.from_pandas(df1)
    out = cast(pl.DataFrame, pl.from_arrow(tbl))
    assert out.columns == ["b", "__index_level_0__"]
    assert out.dtypes == [pl.Float64, pl.Null]
    tbl = pa.Table.from_pandas(df1, preserve_index=False)
    out = cast(pl.DataFrame, pl.from_arrow(tbl))
    assert out.columns == ["b"]
    assert out.dtypes == [pl.Float64]

    # 4568
    tbl = pa.table({"l": []}, schema=pa.schema([("l", pa.large_list(pa.uint8()))]))

    df = cast(pl.DataFrame, pl.from_arrow(tbl))
    assert df.schema["l"] == pl.List(pl.UInt8)


def test_cat_int_types_3500() -> None:
    # Create an enum / categorical / dictionary typed pyarrow array
    # Most simply done by creating a pandas categorical series first
    categorical_s = pd.Series(["a", "a", "b"], dtype="category")
    pyarrow_array = pa.Array.from_pandas(categorical_s)

    # The in-memory representation of each category can either be a signed or
    # unsigned 8-bit integer. Pandas uses Int8...
    int_dict_type = pa.dictionary(index_type=pa.int8(), value_type=pa.utf8())
    # ... while DuckDB uses UInt8
    uint_dict_type = pa.dictionary(index_type=pa.uint8(), value_type=pa.utf8())

    for t in [int_dict_type, uint_dict_type]:
        s = cast(pl.Series, pl.from_arrow(pyarrow_array.cast(t)))
        assert_series_equal(
            s, pl.Series(["a", "a", "b"]).cast(pl.Categorical), check_names=False
        )


def test_from_pyarrow_chunked_array() -> None:
    column = pa.chunked_array([[1], [2]])
    series = pl.Series("column", column)
    assert series.to_list() == [1, 2]


def test_arrow_list_null_5697() -> None:
    # Create a pyarrow table with a list[null] column.
    pa_table = pa.table([[[None]]], names=["mycol"])
    df = pl.from_arrow(pa_table)
    pa_table = df.to_arrow()
    # again to polars to test the schema
    assert pl.from_arrow(pa_table).schema == {"mycol": pl.List(pl.Null)}  # type: ignore[union-attr]


def test_from_pyarrow_map() -> None:
    pa_table = pa.table(
        [[1, 2], [[("a", "something")], [("a", "else"), ("b", "another key")]]],
        schema=pa.schema(
            [("idx", pa.int16()), ("mapping", pa.map_(pa.string(), pa.string()))]
        ),
    )

    # Convert from an empty table to trigger an ArrowSchema -> native schema
    # conversion (checks that ArrowDataType::Map is handled in Rust).
    pl.DataFrame(pa_table.slice(0, 0))

    result = pl.DataFrame(pa_table)
    assert result.to_dict(as_series=False) == {
        "idx": [1, 2],
        "mapping": [
            [{"key": "a", "value": "something"}],
            [{"key": "a", "value": "else"}, {"key": "b", "value": "another key"}],
        ],
    }


def test_from_fixed_size_binary_list() -> None:
    val = [[b"63A0B1C66575DD5708E1EB2B"]]
    arrow_array = pa.array(val, type=pa.list_(pa.binary(24)))
    s = cast(pl.Series, pl.from_arrow(arrow_array))
    assert s.dtype == pl.List(pl.Binary)
    assert s.to_list() == val


def test_dataframe_from_repr() -> None:
    # round-trip various types
    frame = (
        pl.LazyFrame(
            {
                "a": [1, 2, None],
                "b": [4.5, 5.5, 6.5],
                "c": ["x", "y", "z"],
                "d": [True, False, True],
                "e": [None, "", None],
                "f": [date(2022, 7, 5), date(2023, 2, 5), date(2023, 8, 5)],
                "g": [time(0, 0, 0, 1), time(12, 30, 45), time(23, 59, 59, 999000)],
                "h": [
                    datetime(2022, 7, 5, 10, 30, 45, 4560),
                    datetime(2023, 10, 12, 20, 3, 8, 11),
                    None,
                ],
            },
        )
        .with_columns(
            pl.col("c").cast(pl.Categorical),
            pl.col("h").cast(pl.Datetime("ns")),
        )
        .collect()
    )

    assert frame.schema == {
        "a": pl.Int64,
        "b": pl.Float64,
        "c": pl.Categorical(ordering="lexical"),
        "d": pl.Boolean,
        "e": pl.String,
        "f": pl.Date,
        "g": pl.Time,
        "h": pl.Datetime("ns"),
    }
    df = cast(pl.DataFrame, pl.from_repr(repr(frame)))
    assert_frame_equal(frame, df)

    # empty frame; confirm schema is inferred
    df = cast(
        pl.DataFrame,
        pl.from_repr(
            """
            ┌─────┬─────┬─────┬─────┬─────┬───────┐
            │ id  ┆ q1  ┆ q2  ┆ q3  ┆ q4  ┆ total │
            │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ ---   │
            │ str ┆ i8  ┆ i16 ┆ i32 ┆ i64 ┆ f64   │
            ╞═════╪═════╪═════╪═════╪═════╪═══════╡
            └─────┴─────┴─────┴─────┴─────┴───────┘
            """
        ),
    )
    assert df.shape == (0, 6)
    assert df.rows() == []
    assert df.schema == {
        "id": pl.String,
        "q1": pl.Int8,
        "q2": pl.Int16,
        "q3": pl.Int32,
        "q4": pl.Int64,
        "total": pl.Float64,
    }

    # empty frame with no dtypes
    df = cast(
        pl.DataFrame,
        pl.from_repr(
            """
            ┌──────┬───────┐
            │ misc ┆ other │
            ╞══════╪═══════╡
            └──────┴───────┘
            """
        ),
    )
    assert_frame_equal(df, pl.DataFrame(schema={"misc": pl.String, "other": pl.String}))

    # empty frame with a non-standard/blank 'null' in numeric col
    df = cast(
        pl.DataFrame,
        pl.from_repr(
            """
            ┌─────┬──────┐
            │ c1  ┆ c2   │
            │ --- ┆ ---  │
            │ i32 ┆ f64  │
            ╞═════╪══════╡
            │     ┆ NULL │
            └─────┴──────┘
            """
        ),
    )
    assert_frame_equal(
        df,
        pl.DataFrame(
            data=[(None, None)],
            schema={"c1": pl.Int32, "c2": pl.Float64},
            orient="row",
        ),
    )

    df = cast(
        pl.DataFrame,
        pl.from_repr(
            """
            # >>> Missing cols with old-style ellipsis, nulls, commented out
            # ┌────────────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬──────┐
            # │ dt         ┆ c1  ┆ c2  ┆ c3  ┆ ... ┆ c96 ┆ c97 ┆ c98 ┆ c99  │
            # │ ---        ┆ --- ┆ --- ┆ --- ┆     ┆ --- ┆ --- ┆ --- ┆ ---  │
            # │ date       ┆ i32 ┆ i32 ┆ i32 ┆     ┆ i64 ┆ i64 ┆ i64 ┆ i64  │
            # ╞════════════╪═════╪═════╪═════╪═════╪═════╪═════╪═════╪══════╡
            # │ 2023-03-25 ┆ 1   ┆ 2   ┆ 3   ┆ ... ┆ 96  ┆ 97  ┆ 98  ┆ 99   │
            # │ 1999-12-31 ┆ 3   ┆ 6   ┆ 9   ┆ ... ┆ 288 ┆ 291 ┆ 294 ┆ null │
            # │ null       ┆ 9   ┆ 18  ┆ 27  ┆ ... ┆ 864 ┆ 873 ┆ 882 ┆ 891  │
            # └────────────┴─────┴─────┴─────┴─────┴─────┴─────┴─────┴──────┘
            """
        ),
    )
    assert df.schema == {
        "dt": pl.Date,
        "c1": pl.Int32,
        "c2": pl.Int32,
        "c3": pl.Int32,
        "c96": pl.Int64,
        "c97": pl.Int64,
        "c98": pl.Int64,
        "c99": pl.Int64,
    }
    assert df.rows() == [
        (date(2023, 3, 25), 1, 2, 3, 96, 97, 98, 99),
        (date(1999, 12, 31), 3, 6, 9, 288, 291, 294, None),
        (None, 9, 18, 27, 864, 873, 882, 891),
    ]

    df = cast(
        pl.DataFrame,
        pl.from_repr(
            """
            # >>> no dtypes:
            # ┌────────────┬──────┐
            # │ dt         ┆ c99  │
            # ╞════════════╪══════╡
            # │ 2023-03-25 ┆ 99   │
            # │ 1999-12-31 ┆ null │
            # │ null       ┆ 891  │
            # └────────────┴──────┘
            """
        ),
    )
    assert df.schema == {"dt": pl.Date, "c99": pl.Int64}
    assert df.rows() == [
        (date(2023, 3, 25), 99),
        (date(1999, 12, 31), None),
        (None, 891),
    ]

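    # from_repr should also cope with REPL noise: IPython prompts, wrapped
    # column headers, truncated ("…") rows/columns, and trailing output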
    df = cast(
        pl.DataFrame,
        pl.from_repr(
            """
            In [2]: with pl.Config() as cfg:
               ...:     pl.Config.set_tbl_formatting("UTF8_FULL", rounded_corners=True)
               ...:     print(df)
               ...:
            shape: (1, 5)
            ╭───────────┬────────────┬───┬───────┬────────────────────────────────╮
            │ source_ac ┆ source_cha ┆ … ┆ ident ┆ timestamp                      │
            │ tor_id    ┆ nnel_id    ┆   ┆ ---   ┆ ---                            │
            │ ---       ┆ ---        ┆   ┆ str   ┆ datetime[μs, Asia/Tokyo]       │
            │ i32       ┆ i64        ┆   ┆       ┆                                │
            ╞═══════════╪════════════╪═══╪═══════╪════════════════════════════════╡
            │ 123456780 ┆ 9876543210 ┆ … ┆ a:b:c ┆ 2023-03-25 10:56:59.663053 JST │
            ├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
            │ …         ┆ …          ┆ … ┆ …     ┆ …                              │
            ├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
            │ 803065983 ┆ 2055938745 ┆ … ┆ x:y:z ┆ 2023-03-25 12:38:18.050545 JST │
            ╰───────────┴────────────┴───┴───────┴────────────────────────────────╯
            # "Een fluitje van een cent..." :)
            """
        ),
    )
    assert df.shape == (2, 4)
    assert df.schema == {
        "source_actor_id": pl.Int32,
        "source_channel_id": pl.Int64,
        "ident": pl.String,
        "timestamp": pl.Datetime("us", "Asia/Tokyo"),
    }


def test_dataframe_from_repr_24110() -> None:
    df = cast(
        pl.DataFrame,
        pl.from_repr("""
            shape: (7, 1)
            ┌──────────────┐
            │ time_offset  │
            │ ---          │
            │ duration[μs] │
            ╞══════════════╡
            │ -2h          │
            │ 0µs          │
            │ 2h           │
            │ +2h          │
            └──────────────┘
        """),
    )
    expected = pl.DataFrame(
        {
            "time_offset": [
                timedelta(hours=-2),
                timedelta(),
                timedelta(hours=2),
                timedelta(hours=2),
            ]
        },
        schema={"time_offset": pl.Duration("us")},
    )
    assert_frame_equal(df, expected)


def test_dataframe_from_duckdb_repr() -> None:
    df = cast(
        pl.DataFrame,
        pl.from_repr(
            """
            # misc streaming stats
            ┌────────────┬───────┬───────────────────┬───┬────────────────┬───────────────────┐
            │   As Of    │ Rank  │ Year to Date Rank │ … │ Days In Top 10 │ Streaming Seconds │
            │    date    │ int32 │      varchar      │   │     int16      │      int128       │
            ├────────────┼───────┼───────────────────┼───┼────────────────┼───────────────────┤
            │ 2025-05-09 │ 1     │ 1                 │ … │ 29             │ 1864939402857430  │
            │ 2025-05-09 │ 2     │ 2                 │ … │ 15             │ 658937443590045   │
            │ 2025-05-09 │ 3     │ 3                 │ … │ 9              │ 267876522242076   │
            └────────────┴───────┴───────────────────┴───┴────────────────┴───────────────────┘
            """
        ),
    )
    expected = pl.DataFrame(
        {
            "As Of": [date(2025, 5, 9), date(2025, 5, 9), date(2025, 5, 9)],
            "Rank": [1, 2, 3],
            "Year to Date Rank": ["1", "2", "3"],
            "Days In Top 10": [29, 15, 9],
            "Streaming Seconds": [1864939402857430, 658937443590045, 267876522242076],
        },
        schema={
            "As Of": pl.Date,
            "Rank": pl.Int32,
            "Year to Date Rank": pl.String,
            "Days In Top 10": pl.Int16,
            "Streaming Seconds": pl.Int128,
        },
    )
    assert_frame_equal(expected, df)


def test_series_from_repr() -> None:
    frame = (
        pl.LazyFrame(
            {
                "a": [1, 2, None],
                "b": [4.5, 5.5, 6.5],
                "c": ["x", "y", "z"],
                "d": [True, False, True],
                "e": [None, "", None],
                "f": [date(2022, 7, 5), date(2023, 2, 5), date(2023, 8, 5)],
                "g": [time(0, 0, 0, 1), time(12, 30, 45), time(23, 59, 59, 999000)],
                "h": [
                    datetime(2022, 7, 5, 10, 30, 45, 4560),
                    datetime(2023, 10, 12, 20, 3, 8, 11),
                    None,
                ],
            },
        )
        .with_columns(
            pl.col("c").cast(pl.Categorical),
            pl.col("h").cast(pl.Datetime("ns")),
        )
        .collect()
    )

    for col in frame.columns:
        s = cast(pl.Series, pl.from_repr(repr(frame[col])))
        assert_series_equal(s, frame[col])

    s = cast(
        pl.Series,
        pl.from_repr(
            """
            Out[3]:
            shape: (3,)
            Series: 's' [str]
            [
                "a"
                …
                "c"
            ]
            """
        ),
    )
    assert_series_equal(s, pl.Series("s", ["a", "c"]))

    s = cast(
        pl.Series,
        pl.from_repr(
            """
            Series: 'flt' [f32]
            [
            ]
            """
        ),
    )
    assert_series_equal(s, pl.Series("flt", [], dtype=pl.Float32))

    s = cast(
        pl.Series,
        pl.from_repr(
            """
            Series: 'flt' [f64]
            [
                null
                +inf
                -inf
                inf
                0.0
                NaN
            ]
            >>> print("stuff")
            """
        ),
    )
    inf, nan = float("inf"), float("nan")
    assert_series_equal(
        s,
        pl.Series(
            name="flt",
            dtype=pl.Float64,
            values=[None, inf, -inf, inf, 0.0, nan],
        ),
    )


def test_dataframe_from_repr_custom_separators() -> None:
    # repr created with custom digit-grouping
    # and non-default group/decimal separators
    df = cast(
        pl.DataFrame,
        pl.from_repr(
            """
            ┌───────────┬────────────┐
            │ x         ┆ y          │
            │ ---       ┆ ---        │
            │ i32       ┆ f64        │
            ╞═══════════╪════════════╡
            │ 123.456   ┆ -10.000,55 │
            │ -9.876    ┆ 10,0       │
            │ 9.999.999 ┆ 8,5e8      │
            └───────────┴────────────┘
            """
        ),
    )
    assert_frame_equal(
        df,
        pl.DataFrame(
            {
                "x": [123456, -9876, 9999999],
                "y": [-10000.55, 10.0, 850000000.0],
            },
            schema={"x": pl.Int32, "y": pl.Float64},
        ),
    )


def test_sliced_struct_from_arrow() -> None:
    # Create a dataset with 3 rows
    tbl = pa.Table.from_arrays(
        arrays=[
            pa.StructArray.from_arrays(
                arrays=[
                    pa.array([1, 2, 3], pa.int32()),
                    pa.array(["foo", "bar", "baz"], pa.utf8()),
                ],
                names=["a", "b"],
            )
        ],
        names=["struct_col"],
    )

    # slice the table
    # check that the FFI correctly reads the sliced data
    result = cast(pl.DataFrame, pl.from_arrow(tbl.slice(1, 2)))
    assert result.to_dict(as_series=False) == {
        "struct_col": [{"a": 2, "b": "bar"}, {"a": 3, "b": "baz"}]
    }

    result = cast(pl.DataFrame, pl.from_arrow(tbl.slice(1, 1)))
    assert result.to_dict(as_series=False) == {"struct_col": [{"a": 2, "b": "bar"}]}


def test_from_arrow_invalid_time_zone() -> None:
    arr = pa.array(
        [datetime(2021, 1, 1, 0, 0, 0, 0)],
        type=pa.timestamp("ns", tz="this-is-not-a-time-zone"),
    )
    with pytest.raises(
        ComputeError, match=r"unable to parse time zone: 'this-is-not-a-time-zone'"
    ):
        pl.from_arrow(arr)


@pytest.mark.parametrize(
    ("fixed_offset", "etc_tz"),
    [
        ("+10:00", "Etc/GMT-10"),
        ("10:00", "Etc/GMT-10"),
        ("-10:00", "Etc/GMT+10"),
        ("+05:00", "Etc/GMT-5"),
        ("05:00", "Etc/GMT-5"),
        ("-05:00", "Etc/GMT+5"),
    ],
)
def test_from_arrow_fixed_offset(fixed_offset: str, etc_tz: str) -> None:
    arr = pa.array(
        [datetime(2021, 1, 1, 0, 0, 0, 0)],
        type=pa.timestamp("us", tz=fixed_offset),
    )
    result = cast(pl.Series, pl.from_arrow(arr))
    expected = pl.Series(
        [datetime(2021, 1, 1, tzinfo=timezone.utc)]
    ).dt.convert_time_zone(etc_tz)
    assert_series_equal(result, expected)


def test_from_avro_valid_time_zone_13032() -> None:
    arr = pa.array(
        [datetime(2021, 1, 1, 0, 0, 0, 0)], type=pa.timestamp("ns", tz="00:00")
    )
    result = cast(pl.Series, pl.from_arrow(arr))
    expected = pl.Series([datetime(2021, 1, 1)], dtype=pl.Datetime("ns", "UTC"))
    assert_series_equal(result, expected)


def test_from_numpy_different_resolution_15991() -> None:
    result = pl.Series(
        np.array(["2020-01-01"], dtype="datetime64[ns]"), dtype=pl.Datetime("us")
    )
    expected = pl.Series([datetime(2020, 1, 1)], dtype=pl.Datetime("us"))
    assert_series_equal(result, expected)


def test_from_numpy_different_resolution_invalid() -> None:
    with pytest.raises(ValueError, match="Please cast"):
        pl.Series(
            np.array(["2020-01-01"], dtype="datetime64[s]"), dtype=pl.Datetime("us")
        )


def test_compat_level(monkeypatch: pytest.MonkeyPatch) -> None:
    # change these if compat level bumped
    monkeypatch.setenv("POLARS_WARN_UNSTABLE", "1")
    oldest = CompatLevel.oldest()
    assert oldest is CompatLevel.oldest()  # test singleton
    assert oldest._version == 0
    with pytest.warns(UnstableWarning):
        newest = CompatLevel.newest()
        assert newest is CompatLevel.newest()
        assert newest._version == 1

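    # at the newest compat level, strings and binary export as Arrow view
    # arrays; the oldest level keeps the large-offset encodings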
    str_col = pl.Series(["awd"])
    bin_col = pl.Series([b"dwa"])
    assert str_col._newest_compat_level() == newest._version
    assert isinstance(str_col.to_arrow(), pa.LargeStringArray)
    assert isinstance(str_col.to_arrow(compat_level=oldest), pa.LargeStringArray)
    assert isinstance(str_col.to_arrow(compat_level=newest), pa.StringViewArray)
    assert isinstance(bin_col.to_arrow(), pa.LargeBinaryArray)
    assert isinstance(bin_col.to_arrow(compat_level=oldest), pa.LargeBinaryArray)
    assert isinstance(bin_col.to_arrow(compat_level=newest), pa.BinaryViewArray)

    df = pl.DataFrame({"str_col": str_col, "bin_col": bin_col})
    assert isinstance(df.to_arrow()["str_col"][0], pa.LargeStringScalar)
    assert isinstance(
        df.to_arrow(compat_level=oldest)["str_col"][0], pa.LargeStringScalar
    )
    assert isinstance(
        df.to_arrow(compat_level=newest)["str_col"][0], pa.StringViewScalar
    )
    assert isinstance(df.to_arrow()["bin_col"][0], pa.LargeBinaryScalar)
    assert isinstance(
        df.to_arrow(compat_level=oldest)["bin_col"][0], pa.LargeBinaryScalar
    )
    assert isinstance(
        df.to_arrow(compat_level=newest)["bin_col"][0], pa.BinaryViewScalar
    )

    assert len(df.write_ipc(None).getbuffer()) == 738
    assert len(df.write_ipc(None, compat_level=oldest).getbuffer()) == 866
    assert len(df.write_ipc(None, compat_level=newest).getbuffer()) == 738
    assert len(df.write_ipc_stream(None).getbuffer()) == 520
    assert len(df.write_ipc_stream(None, compat_level=oldest).getbuffer()) == 648
    assert len(df.write_ipc_stream(None, compat_level=newest).getbuffer()) == 520


def test_df_pycapsule_interface() -> None:
    df = pl.DataFrame(
        {
            "a": [1, 2, 3],
            "b": ["a", "b", "c"],
            "c": ["fooooooooooooooooooooo", "bar", "looooooooooooooooong string"],
        }
    )

    capsule_df = PyCapsuleStreamHolder(df)
    out = pa.table(capsule_df)
    assert df.shape == out.shape
    assert df.schema.names() == out.schema.names

    schema_overrides = {"a": pl.Int128}
    expected_schema = pl.Schema([("a", pl.Int128), ("b", pl.String), ("c", pl.String)])

    for arrow_obj in (
        pl.from_arrow(capsule_df),  # capsule
        out,  # table loaded from capsule
    ):
        df_res = pl.from_arrow(arrow_obj, schema_overrides=schema_overrides)
        assert expected_schema == df_res.schema  # type: ignore[union-attr]
        assert isinstance(df_res, pl.DataFrame)
        assert df.equals(df_res)


def test_misaligned_nested_arrow_19097() -> None:
    a = pl.Series("a", [1, 2, 3])
    a = a.slice(1, 2)  # slicing gives the values offset=1
    a = a.replace(2, None)  # then add a validity mask with offset=0
    a = a.reshape((2, 1))  # then make it nested
    assert_series_equal(pl.Series("a", a.to_arrow()), a)


def test_arrow_roundtrip_lex_cat_20288() -> None:
    tb = (
        pl.Series("a", ["A", "B"], pl.Categorical(ordering="lexical"))
        .to_frame()
        .to_arrow()
    )
    df = pl.from_arrow(tb)
    assert isinstance(df, pl.DataFrame)
    dt = df.schema["a"]
    assert isinstance(dt, pl.Categorical)
    assert dt.ordering == "lexical"


def test_from_arrow_20271() -> None:
    df = pl.from_arrow(
        pa.table({"b": pa.DictionaryArray.from_arrays([0, 1], ["D", "E"])})
    )
    assert isinstance(df, pl.DataFrame)
    assert_series_equal(df.to_series(), pl.Series("b", ["D", "E"], pl.Categorical))


def test_to_arrow_empty_chunks_20627() -> None:
    df = pl.concat(2 * [pl.Series([1])]).filter(pl.Series([False, True])).to_frame()
    assert df.to_arrow().shape == (1, 1)


def test_from_arrow_recordbatch() -> None:
    n_legs = pa.array([2, 2, 4, 4, 5, 100])
    animals = pa.array(
        ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"]
    )
    names = ["n_legs", "animals"]
    record_batch = pa.RecordBatch.from_arrays([n_legs, animals], names=names)
    assert_frame_equal(
        pl.DataFrame(record_batch),
        pl.DataFrame(
            {
                "n_legs": n_legs,
                "animals": animals,
            }
        ),
    )


def test_from_arrow_map_containing_timestamp_23658() -> None:
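    # an Arrow map should surface as a List of {"key", "value"} structs,
    # here with a timestamp value type (see `expect` below)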
    arrow_tbl = pa.Table.from_pydict(
        {
            "column_1": [
                [
                    {
                        "field_1": [
                            {"key": 1, "value": datetime(2025, 1, 1)},
                            {"key": 2, "value": datetime(2025, 1, 2)},
                            {"key": 2, "value": None},
                        ]
                    },
                    {"field_1": []},
                    None,
                ]
            ],
        },
        schema=pa.schema(
            [
                (
                    "column_1",
                    pa.list_(
                        pa.struct(
                            [
                                ("field_1", pa.map_(pa.int32(), pa.timestamp("ms"))),
                            ]
                        )
                    ),
                )
            ]
        ),
    )

    expect = pl.DataFrame(
        {
            "column_1": [
                [
                    {
                        "field_1": [
                            {"key": 1, "value": datetime(2025, 1, 1)},
                            {"key": 2, "value": datetime(2025, 1, 2)},
                            {"key": 2, "value": None},
                        ]
                    },
                    {"field_1": []},
                    None,
                ]
            ],
        },
        schema={
            "column_1": pl.List(
                pl.Struct(
                    {
                        "field_1": pl.List(
                            pl.Struct({"key": pl.Int32, "value": pl.Datetime("ms")})
                        )
                    }
                )
            )
        },
    )

    out = pl.DataFrame(arrow_tbl)

    assert_frame_equal(out, expect)


def test_schema_constructor_from_schema_capsule() -> None:
    arrow_schema = pa.schema(
        [pa.field("test", pa.map_(pa.int32(), pa.timestamp("ms")))]
    )

    assert pl.Schema(arrow_schema) == {
        "test": pl.List(pl.Struct({"key": pl.Int32, "value": pl.Datetime("ms")}))
    }

    arrow_schema = pa.schema([pa.field("a", pa.int32()), pa.field("a", pa.int32())])

    with pytest.raises(
        DuplicateError,
        match="arrow schema contained duplicate name: a",
    ):
        pl.Schema(arrow_schema)

    with pytest.raises(
        ValueError,
        match="object passed to pl.Schema did not return struct dtype: object: pyarrow.Field<a: int32>, dtype: Int32",
    ):
        pl.Schema(pa.field("a", pa.int32()))

    assert pl.Schema([pa.field("a", pa.int32()), pa.field("b", pa.string())]) == {
        "a": pl.Int32,
        "b": pl.String,
    }

    with pytest.raises(
        DuplicateError,
        match="iterable passed to pl.Schema contained duplicate name 'a'",
    ):
        pl.Schema([pa.field("a", pa.int32()), pa.field("a", pa.int64())])


def test_to_arrow_24142() -> None:
    df = pl.DataFrame({"a": object(), "b": "any string or bytes"})
    df.to_arrow(compat_level=CompatLevel.oldest())


def test_comprehensive_pycapsule_interface() -> None:
    """Test all data types via Arrow C Stream PyCapsule interface."""
    from datetime import date, datetime, time, timedelta
    from decimal import Decimal

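    # minimal wrapper that exposes only `__arrow_c_stream__`, forcing the
    # conversion to go through the PyCapsule interface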
    class PyCapsuleStreamWrap:
        def __init__(self, v: Any) -> None:
            self.capsule = v.__arrow_c_stream__()

        def __arrow_c_stream__(self, requested_schema: object | None = None) -> object:
            return self.capsule

    def roundtrip_series_pycapsule(s: pl.Series) -> pl.Series:
        return pl.Series(PyCapsuleStreamWrap(s))

    df = pl.DataFrame(
        {
            "bool": [True, False, None],
            "int8": pl.Series([1, 2, None], dtype=pl.Int8),
            "int16": pl.Series([1, 2, None], dtype=pl.Int16),
            "int32": pl.Series([1, 2, None], dtype=pl.Int32),
            "int64": pl.Series([1, 2, None], dtype=pl.Int64),
            "uint8": pl.Series([1, 2, None], dtype=pl.UInt8),
            "uint16": pl.Series([1, 2, None], dtype=pl.UInt16),
            "uint32": pl.Series([1, 2, None], dtype=pl.UInt32),
            "uint64": pl.Series([1, 2, None], dtype=pl.UInt64),
            "float32": pl.Series([1.1, 2.2, None], dtype=pl.Float32),
            "float64": pl.Series([1.1, 2.2, None], dtype=pl.Float64),
            "string": ["hello", "world", None],
            "binary": [b"hello", b"world", None],
            "decimal": pl.Series(
                [Decimal("1.23"), Decimal("4.56"), None], dtype=pl.Decimal(10, 2)
            ),
            "date": [date(2023, 1, 1), date(2023, 1, 2), None],
            "datetime": [
                datetime(2023, 1, 1, 12, 0),
                datetime(2023, 1, 2, 13, 30),
                None,
            ],
            "time": [time(12, 0, 0), time(13, 30, 0), None],
            "duration_us": pl.Series(
                [timedelta(days=1), timedelta(hours=2), None], dtype=pl.Duration("us")
            ),
            "duration_ms": pl.Series(
                [timedelta(milliseconds=100), timedelta(microseconds=500), None],
                dtype=pl.Duration("ms"),
            ),
            "duration_ns": pl.Series(
                [timedelta(seconds=1), timedelta(microseconds=1000), None],
                dtype=pl.Duration("ns"),
            ),
            "categorical": pl.Series(
                ["apple", "banana", "apple"], dtype=pl.Categorical
            ),
            "list_duration": [
                [timedelta(days=1), timedelta(hours=2)],
                [timedelta(minutes=30)],
                None,
            ],
            "struct_with_duration": [
                {"x": timedelta(days=1), "y": 1},
                {"x": timedelta(hours=2), "y": 2},
                None,
            ],
        }
    ).cast(
        {
            "list_duration": pl.List(pl.Duration("us")),
            "struct_with_duration": pl.Struct({"x": pl.Duration("ns"), "y": pl.Int32}),
        }
    )

    df_roundtrip = df.map_columns(pl.selectors.all(), roundtrip_series_pycapsule)

    assert_frame_equal(df_roundtrip, df)

    df_roundtrip_direct = pl.DataFrame(PyCapsuleStreamWrap(df))

    assert_frame_equal(df_roundtrip_direct, df)