Path: blob/main/py-polars/tests/unit/constructors/test_constructors.py
6939 views
from __future__ import annotations12from collections import OrderedDict, namedtuple3from datetime import date, datetime, time, timedelta, timezone4from decimal import Decimal5from random import shuffle6from typing import TYPE_CHECKING, Any, Literal, NamedTuple7from zoneinfo import ZoneInfo89import numpy as np10import pandas as pd11import pyarrow as pa12import pytest13from packaging.version import parse as parse_version14from pydantic import BaseModel, Field, TypeAdapter1516import polars as pl17import polars.selectors as cs18from polars._utils.construction.utils import try_get_type_hints19from polars.datatypes import numpy_char_code_to_dtype20from polars.dependencies import dataclasses, pydantic21from polars.exceptions import DuplicateError, ShapeError22from polars.testing import assert_frame_equal, assert_series_equal23from tests.unit.utils.pycapsule_utils import PyCapsuleArrayHolder, PyCapsuleStreamHolder2425if TYPE_CHECKING:26import sys27from collections.abc import Callable2829from polars._typing import PolarsDataType3031if sys.version_info >= (3, 11):32from typing import Self33else:34from typing_extensions import Self3536from typing_extensions import assert_type373839# -----------------------------------------------------------------------------------40# nested dataclasses, models, namedtuple classes (can't be defined inside test func)41# -----------------------------------------------------------------------------------42@dataclasses.dataclass43class _TestBazDC:44d: datetime45e: float46f: str474849@dataclasses.dataclass50class _TestBarDC:51a: str52b: int53c: _TestBazDC545556@dataclasses.dataclass57class _TestFooDC:58x: int59y: _TestBarDC606162class _TestBazPD(pydantic.BaseModel):63d: datetime64e: float65f: str666768class _TestBarPD(pydantic.BaseModel):69a: str70b: int71c: _TestBazPD727374class _TestFooPD(pydantic.BaseModel):75x: int76y: _TestBarPD777879class _TestBazNT(NamedTuple):80d: datetime81e: float82f: str838485class _TestBarNT(NamedTuple):86a: str87b: 
class _TestFooNT(NamedTuple):
    x: int
    y: _TestBarNT


# --------------------------------------------------------------------------------


def test_init_dict() -> None:
    # Empty dictionary
    df = pl.DataFrame({})
    assert df.shape == (0, 0)

    # Empty dictionary/values
    df = pl.DataFrame({"a": [], "b": []})
    assert df.shape == (0, 2)
    assert df.schema == {"a": pl.Null, "b": pl.Null}

    for df in (
        pl.DataFrame({}, schema={"a": pl.Date, "b": pl.String}),
        pl.DataFrame({"a": [], "b": []}, schema={"a": pl.Date, "b": pl.String}),
    ):
        assert df.shape == (0, 2)
        assert df.schema == {"a": pl.Date, "b": pl.String}

    # List of empty list
    df = pl.DataFrame({"a": [[]], "b": [[]]})
    expected = {"a": pl.List(pl.Null), "b": pl.List(pl.Null)}
    assert df.schema == expected
    assert df.rows() == [([], [])]

    # Mixed dtypes
    df = pl.DataFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0]})
    assert df.shape == (3, 2)
    assert df.columns == ["a", "b"]
    assert df.dtypes == [pl.Int64, pl.Float64]

    df = pl.DataFrame(
        data={"a": [1, 2, 3], "b": [1.0, 2.0, 3.0]},
        schema=[("a", pl.Int8), ("b", pl.Float32)],
    )
    assert df.schema == {"a": pl.Int8, "b": pl.Float32}

    # Values contained in tuples
    df = pl.DataFrame({"a": (1, 2, 3), "b": [1.0, 2.0, 3.0]})
    assert df.shape == (3, 2)

    # Datetime/Date types (from both python and integer values)
    py_datetimes = (
        datetime(2022, 12, 31, 23, 59, 59),
        datetime(2022, 12, 31, 23, 59, 59),
    )
    py_dates = (date(2022, 12, 31), date(2022, 12, 31))
    int_datetimes = [1672531199000000, 1672531199000000]
    int_dates = [19357, 19357]

    for dates, datetimes, coldefs in (
        # test inferred and explicit (given both py/polars dtypes)
        (py_dates, py_datetimes, None),
        (py_dates, py_datetimes, [("dt", date), ("dtm", datetime)]),
        (py_dates, py_datetimes, [("dt", pl.Date), ("dtm", pl.Datetime)]),
        (int_dates, int_datetimes, [("dt", date), ("dtm", datetime)]),
        (int_dates, int_datetimes, [("dt", pl.Date), ("dtm", pl.Datetime)]),
    ):
        df = pl.DataFrame(
            data={"dt": dates, "dtm": datetimes},
            schema=coldefs,
        )
        assert df.schema == {"dt": pl.Date, "dtm": pl.Datetime("us")}
        assert df.rows() == list(zip(py_dates, py_datetimes))

    # Overriding dict column names/types
    df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, schema=["c", "d"])
    assert df.columns == ["c", "d"]

    df = pl.DataFrame(
        {"a": [1, 2, 3], "b": [4, 5, 6]},
        schema=["c", ("d", pl.Int8)],
    )  # partial type info (allowed, but mypy doesn't like it ;p)
    assert df.schema == {"c": pl.Int64, "d": pl.Int8}

    df = pl.DataFrame(
        {"a": [1, 2, 3], "b": [4, 5, 6]}, schema=[("c", pl.Int8), ("d", pl.Int16)]
    )
    assert df.schema == {"c": pl.Int8, "d": pl.Int16}

    # empty nested objects
    for empty_val in [None, "", {}, []]:  # type: ignore[var-annotated]
        test = [{"field": {"sub_field": empty_val, "sub_field_2": 2}}]
        df = pl.DataFrame(test, schema={"field": pl.Object})
        assert df["field"][0] == test[0]["field"]
def test_error_string_dtypes() -> None:
    # string dtype shorthands are not accepted anywhere a real dtype is expected
    with pytest.raises(TypeError, match="cannot parse input"):
        pl.DataFrame(
            data={"x": [1, 2], "y": [3, 4], "z": [5, 6]},
            schema={"x": "i16", "y": "i32", "z": "f32"},  # type: ignore[dict-item]
        )

    with pytest.raises(TypeError, match="cannot parse input"):
        pl.Series("n", [1, 2, 3], dtype="f32")  # type: ignore[arg-type]


def test_init_structured_objects() -> None:
    # validate init from dataclass, namedtuple, and pydantic model objects
    @dataclasses.dataclass
    class TradeDC:
        timestamp: datetime
        ticker: str
        price: Decimal
        size: int | None = None

    class TradePD(pydantic.BaseModel):
        timestamp: datetime
        ticker: str
        price: Decimal
        size: int

    class TradeNT(NamedTuple):
        timestamp: datetime
        ticker: str
        price: Decimal
        size: int | None = None

    raw_data = [
        (datetime(2022, 9, 8, 14, 30, 45), "AAPL", Decimal("157.5"), 125),
        (datetime(2022, 9, 9, 10, 15, 12), "FLSY", Decimal("10.0"), 1500),
        (datetime(2022, 9, 7, 15, 30), "MU", Decimal("55.5"), 400),
    ]
    columns = ["timestamp", "ticker", "price", "size"]

    for TradeClass in (TradeDC, TradeNT, TradePD):
        trades = [TradeClass(**dict(zip(columns, values))) for values in raw_data]  # type: ignore[arg-type]

        for DF in (pl.DataFrame, pl.from_records):
            df = DF(data=trades)
            assert df.schema == {
                "timestamp": pl.Datetime("us"),
                "ticker": pl.String,
                "price": pl.Decimal(scale=1),
                "size": pl.Int64,
            }
            assert df.rows() == raw_data

            # partial dtypes override
            df = DF(
                data=trades,
                schema_overrides={"timestamp": pl.Datetime("ms"), "size": pl.Int32},
            )
            assert df.schema == {
                "timestamp": pl.Datetime("ms"),
                "ticker": pl.String,
                "price": pl.Decimal(scale=1),
                "size": pl.Int32,
            }

        # in conjunction with full 'columns' override (rename/downcast)
        df = pl.DataFrame(
            data=trades,
            schema=[
                ("ts", pl.Datetime("ms")),
                ("tk", pl.Categorical),
                ("pc", pl.Decimal(scale=1)),
                ("sz", pl.UInt16),
            ],
        )
        assert df.schema == {
            "ts": pl.Datetime("ms"),
            "tk": pl.Categorical(ordering="lexical"),
            "pc": pl.Decimal(scale=1),
            "sz": pl.UInt16,
        }
        assert df.rows() == raw_data

    # cover a miscellaneous edge-case when detecting the annotations
    assert try_get_type_hints(obj=type(None)) == {}
def test_init_pydantic_2x() -> None:
    # pydantic v2-style model with field aliases; construction goes through
    # the model instances, not the raw json
    class PageView(BaseModel):
        user_id: str
        ts: datetime = Field(alias=["ts", "$date"])  # type: ignore[literal-required, call-overload]
        path: str = Field("?", alias=["url", "path"])  # type: ignore[literal-required, call-overload]
        referer: str = Field("?", alias="referer")
        event: Literal["leave", "enter"] = Field("enter")
        time_on_page: int = Field(0, serialization_alias="top")

    # NOTE(review): internal whitespace of this literal was lost in the source
    # mangling; JSON content/token order is preserved exactly
    data_json = """
    [{
        "user_id": "x",
        "ts": {"$date": "2021-01-01T00:00:00.000Z"},
        "url": "/latest/foobar",
        "referer": "https://google.com",
        "event": "enter",
        "top": 123
    }]
    """
    adapter: TypeAdapter[Any] = TypeAdapter(list[PageView])
    models = adapter.validate_json(data_json)

    result = pl.DataFrame(models)
    expected = pl.DataFrame(
        {
            "user_id": ["x"],
            "ts": [datetime(2021, 1, 1, 0, 0)],
            "path": ["?"],
            "referer": ["https://google.com"],
            "event": ["enter"],
            "time_on_page": [0],
        }
    )
    assert_frame_equal(result, expected)
def test_init_structured_objects_unhashable() -> None:
    # cover an edge-case with namedtuple fields that aren't hashable

    class Test(NamedTuple):
        dt: datetime
        info: dict[str, int]

    test_data = [
        Test(datetime(2017, 1, 1), {"a": 1, "b": 2}),
        Test(datetime(2017, 1, 2), {"a": 2, "b": 2}),
    ]
    df = pl.DataFrame(test_data)
    # shape: (2, 2)
    # ┌─────────────────────┬───────────┐
    # │ dt                  ┆ info      │
    # │ ---                 ┆ ---       │
    # │ datetime[μs]        ┆ struct[2] │
    # ╞═════════════════════╪═══════════╡
    # │ 2017-01-01 00:00:00 ┆ {1,2}     │
    # │ 2017-01-02 00:00:00 ┆ {2,2}     │
    # └─────────────────────┴───────────┘
    assert df.schema == {
        "dt": pl.Datetime(time_unit="us", time_zone=None),
        "info": pl.Struct([pl.Field("a", pl.Int64), pl.Field("b", pl.Int64)]),
    }
    assert df.rows() == test_data


@pytest.mark.parametrize(
    ("foo", "bar", "baz"),
    [
        (_TestFooDC, _TestBarDC, _TestBazDC),
        (_TestFooPD, _TestBarPD, _TestBazPD),
        (_TestFooNT, _TestBarNT, _TestBazNT),
    ],
)
def test_init_structured_objects_nested(foo: Any, bar: Any, baz: Any) -> None:
    data = [
        foo(
            x=100,
            y=bar(
                a="hello",
                b=800,
                c=baz(d=datetime(2023, 4, 12, 10, 30), e=-10.5, f="world"),
            ),
        )
    ]
    df = pl.DataFrame(data)
    # shape: (1, 2)
    # ┌─────┬───────────────────────────────────┐
    # │ x   ┆ y                                 │
    # │ --- ┆ ---                               │
    # │ i64 ┆ struct[3]                         │
    # ╞═════╪═══════════════════════════════════╡
    # │ 100 ┆ {"hello",800,{2023-04-12 10:30:0… │
    # └─────┴───────────────────────────────────┘

    assert df.schema == {
        "x": pl.Int64,
        "y": pl.Struct(
            [
                pl.Field("a", pl.String),
                pl.Field("b", pl.Int64),
                pl.Field(
                    "c",
                    pl.Struct(
                        [
                            pl.Field("d", pl.Datetime("us")),
                            pl.Field("e", pl.Float64),
                            pl.Field("f", pl.String),
                        ]
                    ),
                ),
            ]
        ),
    }
    assert df.row(0) == (
        100,
        {
            "a": "hello",
            "b": 800,
            "c": {
                "d": datetime(2023, 4, 12, 10, 30),
                "e": -10.5,
                "f": "world",
            },
        },
    )

    # validate nested schema override
    override_struct_schema: dict[str, PolarsDataType] = {
        "x": pl.Int16,
        "y": pl.Struct(
            [
                pl.Field("a", pl.String),
                pl.Field("b", pl.Int32),
                pl.Field(
                    name="c",
                    dtype=pl.Struct(
                        [
                            pl.Field("d", pl.Datetime("ms")),
                            pl.Field("e", pl.Float32),
                            pl.Field("f", pl.String),
                        ]
                    ),
                ),
            ]
        ),
    }
    for schema, schema_overrides in (
        (None, override_struct_schema),
        (override_struct_schema, None),
    ):
        df = (
            pl.DataFrame(data, schema=schema, schema_overrides=schema_overrides)
            .unnest("y")
            .unnest("c")
        )
        # shape: (1, 6)
        # ┌─────┬───────┬─────┬─────────────────────┬───────┬───────┐
        # │ x   ┆ a     ┆ b   ┆ d                   ┆ e     ┆ f     │
        # │ --- ┆ ---   ┆ --- ┆ ---                 ┆ ---   ┆ ---   │
        # │ i16 ┆ str   ┆ i32 ┆ datetime[ms]        ┆ f32   ┆ str   │
        # ╞═════╪═══════╪═════╪═════════════════════╪═══════╪═══════╡
        # │ 100 ┆ hello ┆ 800 ┆ 2023-04-12 10:30:00 ┆ -10.5 ┆ world │
        # └─────┴───────┴─────┴─────────────────────┴───────┴───────┘
        assert df.schema == {
            "x": pl.Int16,
            "a": pl.String,
            "b": pl.Int32,
            "d": pl.Datetime("ms"),
            "e": pl.Float32,
            "f": pl.String,
        }
        assert df.row(0) == (
            100,
            "hello",
            800,
            datetime(2023, 4, 12, 10, 30),
            -10.5,
            "world",
        )
def test_dataclasses_initvar_typing() -> None:
    @dataclasses.dataclass
    class ABC:
        x: date
        y: float
        z: dataclasses.InitVar[list[str]] = None

    # should be able to parse the initvar typing...
    abc = ABC(x=date(1999, 12, 31), y=100.0)
    df = pl.DataFrame([abc])

    # ...but should not load the initvar field into the DataFrame
    assert dataclasses.asdict(abc) == df.rows(named=True)[0]
@pytest.mark.parametrize(
    "nt",
    [
        namedtuple("TestData", ["id", "info"]),  # noqa: PYI024
        NamedTuple("TestData", [("id", int), ("info", str)]),
    ],
)
def test_collections_namedtuple(nt: type) -> None:
    nt_data = [nt(1, "a"), nt(2, "b"), nt(3, "c")]

    result = pl.DataFrame(nt_data)
    expected = pl.DataFrame({"id": [1, 2, 3], "info": ["a", "b", "c"]})
    assert_frame_equal(result, expected)

    result = pl.DataFrame({"data": nt_data, "misc": ["x", "y", "z"]})
    expected = pl.DataFrame(
        {
            "data": [
                {"id": 1, "info": "a"},
                {"id": 2, "info": "b"},
                {"id": 3, "info": "c"},
            ],
            "misc": ["x", "y", "z"],
        }
    )
    assert_frame_equal(result, expected)


def test_init_ndarray() -> None:
    # Empty array
    df = pl.DataFrame(np.array([]))
    assert_frame_equal(df, pl.DataFrame())

    # 1D array
    df = pl.DataFrame(np.array([1, 2, 3], dtype=np.int64), schema=["a"])
    expected = pl.DataFrame({"a": [1, 2, 3]})
    assert_frame_equal(df, expected)

    df = pl.DataFrame(np.array([1, 2, 3]), schema=[("a", pl.Int32)])
    expected = pl.DataFrame({"a": [1, 2, 3]}).with_columns(pl.col("a").cast(pl.Int32))
    assert_frame_equal(df, expected)

    # 2D array (or 2x 1D array) - should default to column orientation (if C-contiguous)
    for data in (
        np.array([[1, 2], [3, 4]], dtype=np.int64),
        [np.array([1, 2], dtype=np.int64), np.array([3, 4], dtype=np.int64)],
    ):
        df = pl.DataFrame(data, orient="col")
        expected = pl.DataFrame({"column_0": [1, 2], "column_1": [3, 4]})
        assert_frame_equal(df, expected)

    df = pl.DataFrame([[1, 2.0, "a"], [None, None, None]], orient="row")
    expected = pl.DataFrame(
        {"column_0": [1, None], "column_1": [2.0, None], "column_2": ["a", None]}
    )
    assert_frame_equal(df, expected)

    df = pl.DataFrame(
        data=[[1, 2.0, "a"], [None, None, None]],
        schema=[("x", pl.Boolean), ("y", pl.Int32), "z"],
        orient="row",
    )
    assert df.rows() == [(True, 2, "a"), (None, None, None)]
    assert df.schema == {"x": pl.Boolean, "y": pl.Int32, "z": pl.String}

    # 2D array - default to column orientation
    df = pl.DataFrame(np.array([[1, 2], [3, 4]], dtype=np.int64))
    expected = pl.DataFrame({"column_0": [1, 3], "column_1": [2, 4]})
    assert_frame_equal(df, expected)

    # no orientation, numpy convention
    df = pl.DataFrame(np.ones((3, 1), dtype=np.int64))
    assert df.shape == (3, 1)

    # 2D array - row orientation inferred
    df = pl.DataFrame(
        np.array([[1, 2, 3], [4, 5, 6]], dtype=np.int64), schema=["a", "b", "c"]
    )
    expected = pl.DataFrame({"a": [1, 4], "b": [2, 5], "c": [3, 6]})
    assert_frame_equal(df, expected)

    # 2D array - column orientation inferred
    df = pl.DataFrame(
        np.array([[1, 2, 3], [4, 5, 6]], dtype=np.int64), schema=["a", "b"]
    )
    expected = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
    assert_frame_equal(df, expected)

    # List column from 2D array with single-column schema
    df = pl.DataFrame(np.arange(4).reshape(-1, 1).astype(np.int64), schema=["a"])
    assert_frame_equal(df, pl.DataFrame({"a": [0, 1, 2, 3]}))
    assert np.array_equal(df.to_numpy(), np.arange(4).reshape(-1, 1).astype(np.int64))

    df = pl.DataFrame(np.arange(4).reshape(-1, 2).astype(np.int64), schema=["a"])
    assert_frame_equal(
        df,
        pl.DataFrame(
            {"a": [[0, 1], [2, 3]]}, schema={"a": pl.Array(pl.Int64, shape=2)}
        ),
    )

    # 2D numpy arrays
    df = pl.DataFrame({"a": np.arange(5, dtype=np.int64).reshape(1, -1)})
    assert df.dtypes == [pl.Array(pl.Int64, shape=5)]
    assert df.shape == (1, 1)

    df = pl.DataFrame({"a": np.arange(10, dtype=np.int64).reshape(2, -1)})
    assert df.dtypes == [pl.Array(pl.Int64, shape=5)]
    assert df.shape == (2, 1)
    assert df.rows() == [([0, 1, 2, 3, 4],), ([5, 6, 7, 8, 9],)]

    test_rows = [(1, 2), (3, 4)]
    df = pl.DataFrame([np.array(test_rows[0]), np.array(test_rows[1])], orient="row")
    expected = pl.DataFrame(test_rows, orient="row")
    assert_frame_equal(df, expected)

    # round trip export/init
    for shape in ((4, 4), (4, 8), (8, 4)):
        np_ones = np.ones(shape=shape, dtype=np.float64)
        names = [f"c{i}" for i in range(shape[1])]

        df = pl.DataFrame(np_ones, schema=names)
        assert_frame_equal(df, pl.DataFrame(np.asarray(df), schema=names))
def test_init_ndarray_errors() -> None:
    # 2D array: orientation conflicts with columns
    with pytest.raises(ValueError):
        pl.DataFrame(np.array([[1, 2, 3], [4, 5, 6]]), schema=["a", "b"], orient="row")

    with pytest.raises(ValueError):
        pl.DataFrame(
            np.array([[1, 2, 3], [4, 5, 6]]),
            schema=[("a", pl.UInt32), ("b", pl.UInt32)],
            orient="row",
        )

    # Invalid orient value
    with pytest.raises(ValueError):
        pl.DataFrame(
            np.array([[1, 2, 3], [4, 5, 6]]),
            orient="wrong",  # type: ignore[arg-type]
        )

    # Dimensions mismatch
    with pytest.raises(ValueError):
        _ = pl.DataFrame(np.array([1, 2, 3]), schema=[])

    # Cannot init with 3D array
    with pytest.raises(ValueError):
        _ = pl.DataFrame(np.random.randn(2, 2, 2))


def test_init_ndarray_nan() -> None:
    # numpy arrays containing NaN
    df0 = pl.DataFrame(
        data={"x": [1.0, 2.5, float("nan")], "y": [4.0, float("nan"), 6.5]},
    )
    df1 = pl.DataFrame(
        data={"x": np.array([1.0, 2.5, np.nan]), "y": np.array([4.0, np.nan, 6.5])},
    )
    df2 = pl.DataFrame(
        data={"x": np.array([1.0, 2.5, np.nan]), "y": np.array([4.0, np.nan, 6.5])},
        nan_to_null=True,
    )
    assert_frame_equal(df0, df1)
    assert df2.rows() == [(1.0, 4.0), (2.5, None), (None, 6.5)]

    s0 = pl.Series("n", [1.0, 2.5, float("nan")])
    s1 = pl.Series("n", np.array([1.0, 2.5, float("nan")]))
    s2 = pl.Series("n", np.array([1.0, 2.5, float("nan")]), nan_to_null=True)

    assert_series_equal(s0, s1)
    assert s2.to_list() == [1.0, 2.5, None]


def test_init_ndarray_square() -> None:
    # 2D square array; ensure that we maintain convention
    # (first axis = rows) with/without an explicit schema
    arr = np.arange(4).reshape(2, 2)
    assert (
        [(0, 1), (2, 3)]
        == pl.DataFrame(arr).rows()
        == pl.DataFrame(arr, schema=["a", "b"]).rows()
    )
    # check that we tie-break square arrays using fortran vs c-contiguous row/col major
    df_c = pl.DataFrame(
        data=np.array([[1, 2], [3, 4]], dtype=np.int64, order="C"),
        schema=["x", "y"],
    )
    assert_frame_equal(df_c, pl.DataFrame({"x": [1, 3], "y": [2, 4]}))

    df_f = pl.DataFrame(
        data=np.array([[1, 2], [3, 4]], dtype=np.int64, order="F"),
        schema=["x", "y"],
    )
    assert_frame_equal(df_f, pl.DataFrame({"x": [1, 2], "y": [3, 4]}))
def test_init_numpy_unavailable(monkeypatch: Any) -> None:
    # simulate numpy being uninstalled; ndarray input must then be rejected
    monkeypatch.setattr(pl.dataframe.frame, "_check_for_numpy", lambda x: False)
    with pytest.raises(TypeError):
        pl.DataFrame(np.array([1, 2, 3]), schema=["a"])


def test_init_numpy_scalars() -> None:
    df = pl.DataFrame(
        {
            "bool": [np.bool_(True), np.bool_(False)],
            "i8": [np.int8(16), np.int8(64)],
            "u32": [np.uint32(1234), np.uint32(9876)],
        }
    )
    df_expected = pl.from_records(
        data=[(True, 16, 1234), (False, 64, 9876)],
        schema=OrderedDict([("bool", pl.Boolean), ("i8", pl.Int8), ("u32", pl.UInt32)]),
        orient="row",
    )
    assert_frame_equal(df, df_expected)


def test_null_array_print_format() -> None:
    pa_tbl_null = pa.table({"a": [None, None]})
    df_null = pl.from_arrow(pa_tbl_null)
    assert df_null.shape == (2, 1)
    assert df_null.dtypes == [pl.Null]  # type: ignore[union-attr]
    assert df_null.rows() == [(None,), (None,)]  # type: ignore[union-attr]

    # NOTE(review): column padding inside the expected repr was lost in the
    # source mangling; reconstructed to the standard 6-char cell width — verify
    assert (
        str(df_null) == "shape: (2, 1)\n"
        "┌──────┐\n"
        "│ a    │\n"
        "│ ---  │\n"
        "│ null │\n"
        "╞══════╡\n"
        "│ null │\n"
        "│ null │\n"
        "└──────┘"
    )


def test_init_arrow() -> None:
    # Handle unnamed column
    df = pl.DataFrame(pa.table({"a": [1, 2], None: [3, 4]}))
    expected = pl.DataFrame({"a": [1, 2], "None": [3, 4]})
    assert_frame_equal(df, expected)

    # Rename columns
    df = pl.DataFrame(pa.table({"a": [1, 2], "b": [3, 4]}), schema=["c", "d"])
    expected = pl.DataFrame({"c": [1, 2], "d": [3, 4]})
    assert_frame_equal(df, expected)

    df = pl.DataFrame(
        pa.table({"a": [1, 2], None: [3, 4]}),
        schema=[("c", pl.Int32), ("d", pl.Float32)],
    )
    assert df.schema == {"c": pl.Int32, "d": pl.Float32}
    assert df.rows() == [(1, 3.0), (2, 4.0)]

    # Bad columns argument
    with pytest.raises(ValueError):
        pl.DataFrame(pa.table({"a": [1, 2, 3], "b": [4, 5, 6]}), schema=["c", "d", "e"])
def test_init_arrow_dupes() -> None:
    tbl = pa.Table.from_arrays(
        arrays=[
            pa.array([1, 2, 3], type=pa.int32()),
            pa.array([4, 5, 6], type=pa.int32()),
            pa.array(
                [7, 8, 9], type=pa.decimal128(38, 10)
            ),  # included as this triggers a panic during construction alongside duplicate fields
        ],
        schema=pa.schema(
            [("col", pa.int32()), ("col", pa.int32()), ("col3", pa.decimal128(38, 10))]
        ),
    )
    with pytest.raises(
        DuplicateError,
        match=r"""column appears more than once; names must be unique: \["col"\]""",
    ):
        pl.DataFrame(tbl)


def test_init_from_frame() -> None:
    df1 = pl.DataFrame({"id": [0, 1], "misc": ["a", "b"], "val": [-10, 10]})
    assert_frame_equal(df1, pl.DataFrame(df1))

    df2 = pl.DataFrame(df1, schema=["a", "b", "c"])
    assert_frame_equal(df2, pl.DataFrame(df2))

    df3 = pl.DataFrame(df1, schema=["a", "b", "c"], schema_overrides={"val": pl.Int8})
    assert_frame_equal(df3, pl.DataFrame(df3))

    assert df1.schema == {"id": pl.Int64, "misc": pl.String, "val": pl.Int64}
    assert df2.schema == {"a": pl.Int64, "b": pl.String, "c": pl.Int64}
    assert df3.schema == {"a": pl.Int64, "b": pl.String, "c": pl.Int8}
    assert df1.rows() == df2.rows() == df3.rows()

    s1 = pl.Series("s", df3)
    s2 = pl.Series(df3)

    assert s1.name == "s"
    assert s2.name == ""


def test_init_series() -> None:
    # List of Series
    df = pl.DataFrame([pl.Series("a", [1, 2, 3]), pl.Series("b", [4, 5, 6])])
    expected = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
    assert_frame_equal(df, expected)

    # Tuple of Series
    df = pl.DataFrame((pl.Series("a", (1, 2, 3)), pl.Series("b", (4, 5, 6))))
    assert_frame_equal(df, expected)

    df = pl.DataFrame(
        (pl.Series("a", (1, 2, 3)), pl.Series("b", (4, 5, 6))),
        schema=[("x", pl.Float64), ("y", pl.Float64)],
    )
    assert df.schema == {"x": pl.Float64, "y": pl.Float64}
    assert df.rows() == [(1.0, 4.0), (2.0, 5.0), (3.0, 6.0)]

    # List of unnamed Series
    df = pl.DataFrame([pl.Series([1, 2, 3]), pl.Series([4, 5, 6])])
    col0 = pl.Series("column_0", [1, 2, 3])
    col1 = pl.Series("column_1", [4, 5, 6])
    expected = pl.DataFrame([col0, col1])
    assert_frame_equal(df, expected)

    df = pl.DataFrame([pl.Series([0.0]), pl.Series([1.0])])
    assert df.schema == {"column_0": pl.Float64, "column_1": pl.Float64}
    assert df.rows() == [(0.0, 1.0)]

    df = pl.DataFrame(
        [pl.Series([None]), pl.Series([1.0])],
        schema=[("x", pl.Date), ("y", pl.Boolean)],
    )
    assert df.schema == {"x": pl.Date, "y": pl.Boolean}
    assert df.rows() == [(None, True)]

    # Single Series
    df = pl.DataFrame(pl.Series("a", [1, 2, 3]))
    expected = pl.DataFrame({"a": [1, 2, 3]})
    assert df.schema == {"a": pl.Int64}
    assert_frame_equal(df, expected)

    df = pl.DataFrame(pl.Series("a", [1, 2, 3]), schema=[("a", pl.UInt32)])
    assert df.rows() == [(1,), (2,), (3,)]
    assert df.schema == {"a": pl.UInt32}

    # nested list, with/without explicit dtype
    s1 = pl.Series([[[2, 2]]])
    assert s1.dtype == pl.List(pl.List(pl.Int64))

    s2 = pl.Series([[[2, 2]]], dtype=pl.List(pl.List(pl.UInt8)))
    assert s2.dtype == pl.List(pl.List(pl.UInt8))

    nested_dtype = pl.List(pl.List(pl.UInt8))
    s3 = pl.Series("x", dtype=nested_dtype)
    s4 = pl.Series(s3)
    for s in (s3, s4):
        assert s.dtype == nested_dtype
        assert s.to_list() == []
        assert s.name == "x"

    s5 = pl.Series("", df, dtype=pl.Int8)
    assert_series_equal(s5, pl.Series("", [1, 2, 3], dtype=pl.Int8))
@pytest.mark.parametrize(
    ("dtype", "expected_dtype"),
    [
        (int, pl.Int64),
        (bytes, pl.Binary),
        (float, pl.Float64),
        (str, pl.String),
        (date, pl.Date),
        (time, pl.Time),
        (datetime, pl.Datetime("us")),
        (timedelta, pl.Duration("us")),
        (Decimal, pl.Decimal(precision=None, scale=0)),
    ],
)
def test_init_py_dtype(dtype: Any, expected_dtype: PolarsDataType) -> None:
    # python types passed as dtypes map onto their canonical polars dtype,
    # both for Series and DataFrame schema declarations
    for s in (
        pl.Series("s", [None], dtype=dtype),
        pl.Series("s", [], dtype=dtype),
    ):
        assert s.dtype == expected_dtype

    for df in (
        pl.DataFrame({"col": [None]}, schema={"col": dtype}),
        pl.DataFrame({"col": []}, schema={"col": dtype}),
    ):
        assert df.schema == {"col": expected_dtype}
def test_init_py_dtype_misc_float() -> None:
    assert pl.Series([100], dtype=float).dtype == pl.Float64  # type: ignore[arg-type]

    df = pl.DataFrame(
        {"x": [100.0], "y": [200], "z": [None]},
        schema={"x": float, "y": float, "z": float},
    )
    assert df.schema == {"x": pl.Float64, "y": pl.Float64, "z": pl.Float64}
    assert df.rows() == [(100.0, 200.0, None)]


def test_init_seq_of_seq() -> None:
    # List of lists
    df = pl.DataFrame([[1, 2, 3], [4, 5, 6]], schema=["a", "b", "c"], orient="row")
    expected = pl.DataFrame({"a": [1, 4], "b": [2, 5], "c": [3, 6]})
    assert_frame_equal(df, expected)

    df = pl.DataFrame(
        [[1, 2, 3], [4, 5, 6]],
        schema=[("a", pl.Int8), ("b", pl.Int16), ("c", pl.Int32)],
        orient="row",
    )
    assert df.schema == {"a": pl.Int8, "b": pl.Int16, "c": pl.Int32}
    assert df.rows() == [(1, 2, 3), (4, 5, 6)]

    # Tuple of tuples, default to column orientation
    df = pl.DataFrame(((1, 2, 3), (4, 5, 6)))
    expected = pl.DataFrame({"column_0": [1, 2, 3], "column_1": [4, 5, 6]})
    assert_frame_equal(df, expected)

    # Row orientation
    df = pl.DataFrame(((1, 2), (3, 4)), schema=("a", "b"), orient="row")
    expected = pl.DataFrame({"a": [1, 3], "b": [2, 4]})
    assert_frame_equal(df, expected)

    df = pl.DataFrame(
        ((1, 2), (3, 4)), schema=(("a", pl.Float32), ("b", pl.Float32)), orient="row"
    )
    assert df.schema == {"a": pl.Float32, "b": pl.Float32}
    assert df.rows() == [(1.0, 2.0), (3.0, 4.0)]

    # Wrong orient value
    with pytest.raises(ValueError):
        df = pl.DataFrame(((1, 2), (3, 4)), orient="wrong")  # type: ignore[arg-type]
def test_init_1d_sequence() -> None:
    # Empty list
    df = pl.DataFrame([])
    assert_frame_equal(df, pl.DataFrame())

    # List/array of strings
    data = ["a", "b", "c"]
    for a in (data, np.array(data)):
        df = pl.DataFrame(a, schema=["s"])
        expected = pl.DataFrame({"s": data})
        assert_frame_equal(df, expected)

    df = pl.DataFrame([None, True, False], schema=[("xx", pl.Int8)])
    assert df.schema == {"xx": pl.Int8}
    assert df.rows() == [(None,), (1,), (0,)]

    # String sequence
    result = pl.DataFrame("abc", schema=["s"])
    expected = pl.DataFrame({"s": ["a", "b", "c"]})
    assert_frame_equal(result, expected)

    # datetimes sequence
    df = pl.DataFrame([datetime(2020, 1, 1)], schema={"ts": pl.Datetime("ms")})
    assert df.schema == {"ts": pl.Datetime("ms")}
    df = pl.DataFrame(
        [datetime(2020, 1, 1, tzinfo=timezone.utc)], schema={"ts": pl.Datetime("ms")}
    )
    assert df.schema == {"ts": pl.Datetime("ms", "UTC")}
    df = pl.DataFrame(
        [datetime(2020, 1, 1, tzinfo=timezone(timedelta(hours=1)))],
        schema={"ts": pl.Datetime("ms")},
    )
    assert df.schema == {"ts": pl.Datetime("ms", "UTC")}
    df = pl.DataFrame(
        [datetime(2020, 1, 1, tzinfo=ZoneInfo("Asia/Kathmandu"))],
        schema={"ts": pl.Datetime("ms")},
    )
    assert df.schema == {"ts": pl.Datetime("ms", "Asia/Kathmandu")}


def test_init_pandas(monkeypatch: Any) -> None:
    pandas_df = pd.DataFrame([[1, 2], [3, 4]], columns=[1, 2])

    # integer column names
    df = pl.DataFrame(pandas_df)
    expected = pl.DataFrame({"1": [1, 3], "2": [2, 4]})
    assert_frame_equal(df, expected)
    assert df.schema == {"1": pl.Int64, "2": pl.Int64}

    # override column names, types
    df = pl.DataFrame(pandas_df, schema=[("x", pl.Float64), ("y", pl.Float64)])
    assert df.schema == {"x": pl.Float64, "y": pl.Float64}
    assert df.rows() == [(1.0, 2.0), (3.0, 4.0)]

    # subclassed pandas object, with/without data & overrides
    # type error fixed in pandas-stubs 2.3.0.250703, which doesn't support Python3.9
    class XSeries(pd.Series):  # type: ignore[type-arg, unused-ignore]
        @property
        def _constructor(self) -> type:
            return XSeries

    df = pl.DataFrame(
        data=[
            XSeries(name="x", data=[], dtype=np.dtype("<M8[ns]")),
            XSeries(name="y", data=[], dtype=np.dtype("f8")),
            XSeries(name="z", data=[], dtype=np.dtype("?")),
        ],
    )
    assert df.schema == {"x": pl.Datetime("ns"), "y": pl.Float64, "z": pl.Boolean}
    assert df.rows() == []

    df = pl.DataFrame(
        data=[
            XSeries(
                name="x",
                data=[datetime(2022, 10, 31, 10, 30, 45, 123456)],
                dtype=np.dtype("<M8[ns]"),
            )
        ],
        schema={"colx": pl.Datetime("us")},
    )
    assert df.schema == {"colx": pl.Datetime("us")}
    assert df.rows() == [(datetime(2022, 10, 31, 10, 30, 45, 123456),)]

    # pandas is not available
    monkeypatch.setattr(pl.dataframe.frame, "_check_for_pandas", lambda x: False)

    # pandas 2.2 and higher implement the Arrow PyCapsule Interface, so the constructor
    # will still work even without using pandas APIs
    if parse_version(pd.__version__) >= parse_version("2.2.0"):
        df = pl.DataFrame(pandas_df)
        assert_frame_equal(df, expected)

    else:
        with pytest.raises(TypeError):
            pl.DataFrame(pandas_df)
def test_init_errors() -> None:
    # Length mismatch
    with pytest.raises(ShapeError):
        pl.DataFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0, 4.0]})

    # Columns don't match data dimensions
    with pytest.raises(ShapeError):
        pl.DataFrame([[1, 2], [3, 4]], schema=["a", "b", "c"])

    # Unmatched input
    with pytest.raises(TypeError):
        pl.DataFrame(0)


def test_init_records() -> None:
    dicts = [
        {"a": 1, "b": 2},
        {"b": 1, "a": 2},
        {"a": 1, "b": 2},
    ]
    df = pl.DataFrame(dicts)
    expected = pl.DataFrame({"a": [1, 2, 1], "b": [2, 1, 2]})
    assert_frame_equal(df, expected)
    assert df.to_dicts() == dicts

    df_cd = pl.DataFrame(dicts, schema=["a", "c", "d"])
    expected_values = {
        "a": [1, 2, 1],
        "c": [None, None, None],
        "d": [None, None, None],
    }
    assert df_cd.to_dict(as_series=False) == expected_values

    data = {"a": 1, "b": 2, "c": 3}

    df1 = pl.from_dicts([data])
    assert df1.columns == ["a", "b", "c"]

    df1.columns = ["x", "y", "z"]
    assert df1.columns == ["x", "y", "z"]

    df2 = pl.from_dicts([data], schema=["c", "b", "a"])
    assert df2.columns == ["c", "b", "a"]

    for colname in ("c", "b", "a"):
        result = pl.from_dicts([data], schema=[colname])
        expected_values = {colname: [data[colname]]}
        assert result.to_dict(as_series=False) == expected_values
def test_init_records_schema_order() -> None:
    cols: list[str] = ["a", "b", "c", "d"]
    data: list[dict[str, int]] = [
        {"c": 3, "b": 2, "a": 1},
        {"b": 2, "d": 4},
        {},
        {"a": 1, "b": 2, "c": 3},
        {"d": 4, "b": 2, "a": 1},
        {"c": 3, "b": 2},
    ]
    lookup = {"a": 1, "b": 2, "c": 3, "d": 4, "e": None}

    for constructor in (pl.from_dicts, pl.DataFrame):
        # ensure field values are loaded according to the declared schema order
        for _ in range(8):
            shuffle(data)
            shuffle(cols)

            df = constructor(data, schema=cols)
            for col in df.columns:
                assert all(value in (None, lookup[col]) for value in df[col].to_list())

        # have schema override inferred types, omit some columns, add a new one
        schema = {"a": pl.Int8, "c": pl.Int16, "e": pl.Int32}
        df = constructor(data, schema=schema)

        assert df.schema == schema
        for col in df.columns:
            assert all(value in (None, lookup[col]) for value in df[col].to_list())


def test_init_only_columns() -> None:
    df = pl.DataFrame(schema=["a", "b", "c"])
    expected = pl.DataFrame({"a": [], "b": [], "c": []})
    assert_frame_equal(df, expected)

    # Validate construction with various flavours of no/empty data
    no_data: Any
    for no_data in (None, {}, []):
        df = pl.DataFrame(
            data=no_data,
            schema=[
                ("a", pl.Date),
                ("b", pl.UInt64),
                ("c", pl.Int8),
                ("d", pl.List(pl.UInt8)),
            ],
        )
        expected = pl.DataFrame({"a": [], "b": [], "c": []}).with_columns(
            pl.col("a").cast(pl.Date),
            pl.col("b").cast(pl.UInt64),
            pl.col("c").cast(pl.Int8),
        )
        expected.insert_column(3, pl.Series("d", [], pl.List(pl.UInt8)))

        assert df.shape == (0, 4)
        assert_frame_equal(df, expected)
        assert df.dtypes == [pl.Date, pl.UInt64, pl.Int8, pl.List]
        assert pl.List(pl.UInt8).is_(df.schema["d"])

        if TYPE_CHECKING:
            assert_type(pl.List(pl.UInt8).is_(df.schema["d"]), bool)

        dfe = df.clear()
        assert len(dfe) == 0
        assert df.schema == dfe.schema
        assert dfe.shape == df.shape
def test_from_dicts_list_without_dtype() -> None:
    result = pl.from_dicts(
        [{"id": 1, "hint": ["some_text_here"]}, {"id": 2, "hint": [None]}]
    )
    expected = pl.DataFrame({"id": [1, 2], "hint": [["some_text_here"], [None]]})
    assert_frame_equal(result, expected)


def test_from_dicts_list_struct_without_inner_dtype() -> None:
    df = pl.DataFrame(
        {
            "users": [
                [{"category": "A"}, {"category": "B"}],
                [{"category": None}, {"category": None}],
            ],
            "days_of_week": [1, 2],
        }
    )
    expected = {
        "users": [
            [{"category": "A"}, {"category": "B"}],
            [{"category": None}, {"category": None}],
        ],
        "days_of_week": [1, 2],
    }
    assert df.to_dict(as_series=False) == expected


def test_from_dicts_list_struct_without_inner_dtype_5611() -> None:
    result = pl.from_dicts(
        [
            {"a": []},
            {"a": [{"b": 1}]},
        ]
    )
    expected = pl.DataFrame({"a": [[], [{"b": 1}]]})
    assert_frame_equal(result, expected)


def test_from_dict_upcast_primitive() -> None:
    df = pl.from_dict({"a": [1, 2.1, 3], "b": [4, 5, 6.4]}, strict=False)
    assert df.dtypes == [pl.Float64, pl.Float64]
[pl.Float64, pl.Float64]116811691170def test_u64_lit_5031() -> None:1171df = pl.DataFrame({"foo": [1, 2, 3]}).with_columns(pl.col("foo").cast(pl.UInt64))1172assert df.filter(pl.col("foo") < (1 << 64) - 20).shape == (3, 1)1173assert df["foo"].to_list() == [1, 2, 3]117411751176def test_from_dicts_missing_columns() -> None:1177# missing columns from some of the data dicts1178data = [{"a": 1}, {"b": 2}]1179result = pl.from_dicts(data)1180expected = pl.DataFrame({"a": [1, None], "b": [None, 2]})1181assert_frame_equal(result, expected)11821183# partial schema with some columns missing; only load the declared keys1184data = [{"a": 1, "b": 2}]1185result = pl.from_dicts(data, schema=["a"])1186expected = pl.DataFrame({"a": [1]})1187assert_frame_equal(result, expected)118811891190def test_from_dicts_schema_columns_do_not_match() -> None:1191data = [{"a": 1, "b": 2}]1192result = pl.from_dicts(data, schema=["x"])1193expected = pl.DataFrame({"x": [None]})1194assert_frame_equal(result, expected)119511961197def test_from_dicts_infer_integer_types() -> None:1198data = [1199{1200"a": 2**7 - 1,1201"b": 2**15 - 1,1202"c": 2**31 - 1,1203"d": 2**63 - 1,1204"e": 2**127 - 1,1205}1206]1207result = pl.from_dicts(data).schema1208# all values inferred as i64 except for values too large for i641209expected = {1210"a": pl.Int64,1211"b": pl.Int64,1212"c": pl.Int64,1213"d": pl.Int64,1214"e": pl.Int128,1215}1216assert result == expected12171218with pytest.raises(OverflowError):1219pl.from_dicts([{"too_big": 2**127}])122012211222def test_from_dicts_list_large_int_17006() -> None:1223data = [{"x": [2**64 - 1]}]12241225result = pl.from_dicts(data, schema={"x": pl.List(pl.UInt64)})1226expected = pl.DataFrame({"x": [[2**64 - 1]]}, schema={"x": pl.List(pl.UInt64)})1227assert_frame_equal(result, expected)12281229result = pl.from_dicts(data, schema={"x": pl.Array(pl.UInt64, 1)})1230expected = pl.DataFrame({"x": [[2**64 - 1]]}, schema={"x": pl.Array(pl.UInt64, 1)})1231assert_frame_equal(result, 
expected)123212331234def test_from_rows_dtype() -> None:1235# 50 is the default inference length1236# 51821237df = pl.DataFrame(1238data=[(None, None)] * 50 + [("1.23", None)],1239schema=[("foo", pl.String), ("bar", pl.String)],1240orient="row",1241)1242assert df.dtypes == [pl.String, pl.String]1243assert df.null_count().row(0) == (50, 51)12441245type1 = [{"c1": 206, "c2": "type1", "c3": {"x1": "abcd", "x2": "jkl;"}}]1246type2 = [1247{"c1": 208, "c2": "type2", "c3": {"a1": "abcd", "a2": "jkl;", "a3": "qwerty"}}1248]12491250df = pl.DataFrame(1251data=type1 * 50 + type2,1252schema=[("c1", pl.Int32), ("c2", pl.Object), ("c3", pl.Object)],1253)1254assert df.dtypes == [pl.Int32, pl.Object, pl.Object]12551256# 50 is the default inference length1257# 52661258type1 = [{"c1": 206, "c2": "type1", "c3": {"x1": "abcd", "x2": "jkl;"}}]1259type2 = [1260{"c1": 208, "c2": "type2", "c3": {"a1": "abcd", "a2": "jkl;", "a3": "qwerty"}}1261]12621263df = pl.DataFrame(1264data=type1 * 50 + type2,1265schema=[("c1", pl.Int32), ("c2", pl.Object), ("c3", pl.Object)],1266)1267assert df.dtypes == [pl.Int32, pl.Object, pl.Object]1268assert df.null_count().row(0) == (0, 0, 0)12691270dc = _TestBazDC(d=datetime(2020, 2, 22), e=42.0, f="xyz")1271df = pl.DataFrame([[dc]], schema={"d": pl.Object})1272assert df.schema == {"d": pl.Object}1273assert df.item() == dc127412751276def test_from_dicts_schema() -> None:1277data = [{"a": 1, "b": 4}, {"a": 2, "b": 5}, {"a": 3, "b": 6}]12781279# let polars infer the dtypes, but inform it about a 3rd column.1280for schema, overrides in (1281({"a": pl.Unknown, "b": pl.Unknown, "c": pl.Int32}, None),1282({"a": None, "b": None, "c": None}, {"c": pl.Int32}),1283(["a", "b", ("c", pl.Int32)], None),1284):1285df = pl.from_dicts(1286data,1287schema=schema, # type: ignore[arg-type]1288schema_overrides=overrides,1289)1290assert df.dtypes == [pl.Int64, pl.Int64, pl.Int32]1291assert df.to_dict(as_series=False) == {1292"a": [1, 2, 3],1293"b": [4, 5, 6],1294"c": [None, None, 
None],1295}12961297# provide data that resolves to an empty frame (ref: scalar1298# expansion shortcut), with schema/override hints1299schema = {"colx": pl.String, "coly": pl.Int32}13001301for param in ("schema", "schema_overrides"):1302df = pl.DataFrame({"colx": [], "coly": 0}, **{param: schema}) # type: ignore[arg-type]1303assert df.schema == schema130413051306def test_nested_read_dicts_4143() -> None:1307result = pl.from_dicts(1308[1309{1310"id": 1,1311"hint": [1312{"some_text_here": "text", "list_": [1, 2, 4]},1313{"some_text_here": "text", "list_": [1, 2, 4]},1314],1315},1316{1317"id": 2,1318"hint": [1319{"some_text_here": None, "list_": [1]},1320{"some_text_here": None, "list_": [2]},1321],1322},1323]1324)1325expected = {1326"hint": [1327[1328{"some_text_here": "text", "list_": [1, 2, 4]},1329{"some_text_here": "text", "list_": [1, 2, 4]},1330],1331[1332{"some_text_here": None, "list_": [1]},1333{"some_text_here": None, "list_": [2]},1334],1335],1336"id": [1, 2],1337}1338assert result.to_dict(as_series=False) == expected133913401341def test_nested_read_dicts_4143_2() -> None:1342result = pl.from_dicts(1343[1344{1345"id": 1,1346"hint": [1347{"some_text_here": "text", "list_": [1, 2, 4]},1348{"some_text_here": "text", "list_": [1, 2, 4]},1349],1350},1351{1352"id": 2,1353"hint": [1354{"some_text_here": "text", "list_": []},1355{"some_text_here": "text", "list_": []},1356],1357},1358]1359)13601361assert result.dtypes == [1362pl.Int64,1363pl.List(pl.Struct({"some_text_here": pl.String, "list_": pl.List(pl.Int64)})),1364]1365expected = {1366"id": [1, 2],1367"hint": [1368[1369{"some_text_here": "text", "list_": [1, 2, 4]},1370{"some_text_here": "text", "list_": [1, 2, 4]},1371],1372[1373{"some_text_here": "text", "list_": []},1374{"some_text_here": "text", "list_": []},1375],1376],1377}1378assert result.to_dict(as_series=False) == expected137913801381def test_from_records_nullable_structs() -> None:1382records = [1383{"id": 1, "items": [{"item_id": 100, 
"description": None}]},1384{"id": 1, "items": [{"item_id": 100, "description": "hi"}]},1385]13861387schema: list[tuple[str, PolarsDataType]] = [1388("id", pl.UInt16),1389(1390"items",1391pl.List(1392pl.Struct(1393[pl.Field("item_id", pl.UInt32), pl.Field("description", pl.String)]1394)1395),1396),1397]13981399schema_options: list[list[tuple[str, PolarsDataType]] | None] = [schema, None]1400for s in schema_options:1401result = pl.DataFrame(records, schema=s, orient="row")1402expected = {1403"id": [1, 1],1404"items": [1405[{"item_id": 100, "description": None}],1406[{"item_id": 100, "description": "hi"}],1407],1408}1409assert result.to_dict(as_series=False) == expected14101411# check initialisation without any records1412df = pl.DataFrame(schema=schema)1413dict_schema = dict(schema)1414assert df.to_dict(as_series=False) == {"id": [], "items": []}1415assert df.schema == dict_schema14161417dtype: PolarsDataType = dict_schema["items"]1418series = pl.Series("items", dtype=dtype)1419assert series.to_frame().to_dict(as_series=False) == {"items": []}1420assert series.dtype == dict_schema["items"]1421assert series.to_list() == []142214231424@pytest.mark.parametrize("unnest_column", ["a", pl.col("a"), cs.by_name("a")])1425def test_from_categorical_in_struct_defined_by_schema(unnest_column: Any) -> None:1426df = pl.DataFrame(1427{"a": [{"value": "foo", "counts": 1}, {"value": "bar", "counts": 2}]},1428schema={"a": pl.Struct({"value": pl.Categorical, "counts": pl.UInt32})},1429)14301431expected = pl.DataFrame(1432{"value": ["foo", "bar"], "counts": [1, 2]},1433schema={"value": pl.Categorical, "counts": pl.UInt32},1434)14351436res_eager = df.unnest(unnest_column)1437assert_frame_equal(res_eager, expected, categorical_as_str=True)14381439res_lazy = df.lazy().unnest(unnest_column)1440assert_frame_equal(res_lazy.collect(), expected, categorical_as_str=True)144114421443def test_nested_schema_construction() -> None:1444schema = {1445"node_groups": 
pl.List(1446pl.Struct(1447[1448pl.Field("parent_node_group_id", pl.UInt8),1449pl.Field(1450"nodes",1451pl.List(1452pl.Struct(1453[1454pl.Field("name", pl.String),1455pl.Field(1456"sub_nodes",1457pl.List(1458pl.Struct(1459[1460pl.Field("internal_id", pl.UInt64),1461pl.Field("value", pl.UInt32),1462]1463)1464),1465),1466]1467)1468),1469),1470]1471)1472)1473}1474df = pl.DataFrame(1475{1476"node_groups": [1477[{"nodes": []}, {"nodes": [{"name": "", "sub_nodes": []}]}],1478]1479},1480schema=schema,1481)14821483assert df.schema == schema1484assert df.to_dict(as_series=False) == {1485"node_groups": [1486[1487{"parent_node_group_id": None, "nodes": []},1488{1489"parent_node_group_id": None,1490"nodes": [{"name": "", "sub_nodes": []}],1491},1492]1493]1494}149514961497def test_nested_schema_construction2() -> None:1498schema = {1499"node_groups": pl.List(1500pl.Struct(1501[1502pl.Field(1503"nodes",1504pl.List(1505pl.Struct(1506[1507pl.Field("name", pl.String),1508pl.Field("time", pl.UInt32),1509]1510)1511),1512)1513]1514)1515)1516}1517df = pl.DataFrame(1518[1519{"node_groups": [{"nodes": [{"name": "a", "time": 0}]}]},1520{"node_groups": [{"nodes": []}]},1521],1522schema=schema,1523)1524assert df.schema == schema1525assert df.to_dict(as_series=False) == {1526"node_groups": [[{"nodes": [{"name": "a", "time": 0}]}], [{"nodes": []}]]1527}152815291530def test_arrow_to_pyseries_with_one_chunk_does_not_copy_data() -> None:1531from polars._utils.construction import arrow_to_pyseries15321533original_array = pa.chunked_array([[1, 2, 3]], type=pa.int64())1534pyseries = arrow_to_pyseries("", original_array)1535assert (1536pyseries.get_chunks()[0]._get_buffer_info()[0]1537== original_array.chunks[0].buffers()[1].address1538)153915401541def test_init_with_explicit_binary_schema() -> None:1542df = pl.DataFrame({"a": [b"hello", b"world"]}, schema={"a": pl.Binary})1543assert df.schema == {"a": pl.Binary}1544assert df["a"].to_list() == [b"hello", b"world"]15451546s = pl.Series("a", [b"hello", 
b"world"], dtype=pl.Binary)1547assert s.dtype == pl.Binary1548assert s.to_list() == [b"hello", b"world"]154915501551def test_nested_categorical() -> None:1552s = pl.Series([["a"]], dtype=pl.List(pl.Categorical))1553assert s.to_list() == [["a"]]1554assert s.dtype == pl.List(pl.Categorical)155515561557def test_datetime_date_subclasses() -> None:1558class FakeDate(date): ...15591560class FakeDateChild(FakeDate): ...15611562class FakeDatetime(FakeDate, datetime): ...15631564result = pl.Series([FakeDate(2020, 1, 1)])1565expected = pl.Series([date(2020, 1, 1)])1566assert_series_equal(result, expected)15671568result = pl.Series([FakeDateChild(2020, 1, 1)])1569expected = pl.Series([date(2020, 1, 1)])1570assert_series_equal(result, expected)15711572result = pl.Series([FakeDatetime(2020, 1, 1, 3)])1573expected = pl.Series([datetime(2020, 1, 1, 3)])1574assert_series_equal(result, expected)157515761577def test_list_null_constructor() -> None:1578s = pl.Series("a", [[None], [None]], dtype=pl.List(pl.Null))1579assert s.dtype == pl.List(pl.Null)1580assert s.to_list() == [[None], [None]]15811582# nested1583dtype = pl.List(pl.List(pl.Int8))1584values = [1585[],1586[[], []],1587[[33, 112]],1588]1589s = pl.Series(1590name="colx",1591values=values,1592dtype=dtype,1593)1594assert s.dtype == dtype1595assert s.to_list() == values15961597# nested1598# small order change has influence1599dtype = pl.List(pl.List(pl.Int8))1600values = [1601[[], []],1602[],1603[[33, 112]],1604]1605s = pl.Series(1606name="colx",1607values=values,1608dtype=dtype,1609)1610assert s.dtype == dtype1611assert s.to_list() == values161216131614def test_numpy_float_construction_av() -> None:1615np_dict = {"a": np.float64(1)}1616assert_frame_equal(pl.DataFrame(np_dict), pl.DataFrame({"a": 1.0}))161716181619def test_df_init_dict_raise_on_expression_input() -> None:1620with pytest.raises(1621TypeError,1622match="passing Expr objects to the DataFrame constructor is not supported",1623):1624pl.DataFrame({"a": 
pl.int_range(0, 3)})1625with pytest.raises(TypeError):1626pl.DataFrame({"a": pl.int_range(0, 3), "b": [3, 4, 5]})16271628# Passing a list of expressions is allowed1629df = pl.DataFrame({"a": [pl.int_range(0, 3)]})1630assert df.get_column("a").dtype.is_object()163116321633def test_df_schema_sequences() -> None:1634schema = [1635["address", pl.String],1636["key", pl.Int64],1637["value", pl.Float32],1638]1639df = pl.DataFrame(schema=schema) # type: ignore[arg-type]1640assert df.schema == {"address": pl.String, "key": pl.Int64, "value": pl.Float32}164116421643def test_df_schema_sequences_incorrect_length() -> None:1644schema = [1645["address", pl.String, pl.Int8],1646["key", pl.Int64],1647["value", pl.Float32],1648]1649with pytest.raises(ValueError):1650pl.DataFrame(schema=schema) # type: ignore[arg-type]165116521653@pytest.mark.parametrize(1654("input", "infer_func", "expected_dtype"),1655[1656("f8", numpy_char_code_to_dtype, pl.Float64),1657("f4", numpy_char_code_to_dtype, pl.Float32),1658("i4", numpy_char_code_to_dtype, pl.Int32),1659("u1", numpy_char_code_to_dtype, pl.UInt8),1660("?", numpy_char_code_to_dtype, pl.Boolean),1661("m8", numpy_char_code_to_dtype, pl.Duration("us")),1662("M8", numpy_char_code_to_dtype, pl.Datetime("us")),1663],1664)1665def test_numpy_inference(1666input: Any,1667infer_func: Callable[[Any], PolarsDataType],1668expected_dtype: PolarsDataType,1669) -> None:1670result = infer_func(input)1671assert result == expected_dtype167216731674def test_array_construction() -> None:1675payload = [[1, 2, 3], None, [4, 2, 3]]16761677dtype = pl.Array(pl.Int64, 3)1678s = pl.Series(payload, dtype=dtype)1679assert s.dtype == dtype1680assert s.to_list() == payload16811682# inner type1683dtype = pl.Array(pl.UInt8, 2)1684payload = [[1, 2], None, [3, 4]]1685s = pl.Series(payload, dtype=dtype)1686assert s.dtype == dtype1687assert s.to_list() == payload16881689# create using schema1690df = pl.DataFrame(1691schema={1692"a": pl.Array(pl.Float32, 3),1693"b": 
pl.Array(pl.Datetime("ms"), 5),1694}1695)1696assert df.dtypes == [1697pl.Array(pl.Float32, 3),1698pl.Array(pl.Datetime("ms"), 5),1699]1700assert df.rows() == []17011702# from dicts1703rows = [1704{"row_id": "a", "data": [1, 2, 3]},1705{"row_id": "b", "data": [2, 3, 4]},1706]1707schema = {"row_id": pl.String(), "data": pl.Array(inner=pl.Int64, shape=3)}1708df = pl.from_dicts(rows, schema=schema)1709assert df.schema == schema1710assert df.rows() == [("a", [1, 2, 3]), ("b", [2, 3, 4])]171117121713@pytest.mark.may_fail_auto_streaming1714def test_pycapsule_interface(df: pl.DataFrame) -> None:1715df = df.rechunk()1716pyarrow_table = df.to_arrow()17171718# Array via C data interface1719pyarrow_array = pyarrow_table["bools"].chunk(0)1720round_trip_series = pl.Series(PyCapsuleArrayHolder(pyarrow_array))1721assert df["bools"].equals(round_trip_series, check_dtypes=True, check_names=False)17221723# empty Array via C data interface1724empty_pyarrow_array = pa.array([], type=pyarrow_array.type)1725round_trip_series = pl.Series(PyCapsuleArrayHolder(empty_pyarrow_array))1726assert df["bools"].dtype == round_trip_series.dtype17271728# RecordBatch via C array interface1729pyarrow_record_batch = pyarrow_table.to_batches()[0]1730round_trip_df = pl.DataFrame(PyCapsuleArrayHolder(pyarrow_record_batch))1731assert df.equals(round_trip_df)17321733# ChunkedArray via C stream interface1734pyarrow_chunked_array = pyarrow_table["bools"]1735round_trip_series = pl.Series(PyCapsuleStreamHolder(pyarrow_chunked_array))1736assert df["bools"].equals(round_trip_series, check_dtypes=True, check_names=False)17371738# empty ChunkedArray via C stream interface1739empty_chunked_array = pa.chunked_array([], type=pyarrow_chunked_array.type)1740round_trip_series = pl.Series(PyCapsuleStreamHolder(empty_chunked_array))1741assert df["bools"].dtype == round_trip_series.dtype17421743# Table via C stream interface1744round_trip_df = pl.DataFrame(PyCapsuleStreamHolder(pyarrow_table))1745assert 
df.equals(round_trip_df)17461747# empty Table via C stream interface1748empty_df = df[:0].to_arrow()1749round_trip_df = pl.DataFrame(PyCapsuleStreamHolder(empty_df))1750orig_schema = df.schema1751round_trip_schema = round_trip_df.schema17521753# The "enum" schema is not preserved because categories are lost via C data1754# interface1755orig_schema.pop("enum")1756round_trip_schema.pop("enum")17571758assert orig_schema == round_trip_schema17591760# RecordBatchReader via C stream interface1761pyarrow_reader = pa.RecordBatchReader.from_batches(1762pyarrow_table.schema, pyarrow_table.to_batches()1763)1764round_trip_df = pl.DataFrame(PyCapsuleStreamHolder(pyarrow_reader))1765assert df.equals(round_trip_df)176617671768@pytest.mark.parametrize(1769"tz",1770[1771None,1772ZoneInfo("Asia/Tokyo"),1773ZoneInfo("Europe/Amsterdam"),1774ZoneInfo("UTC"),1775timezone.utc,1776],1777)1778def test_init_list_of_dicts_with_timezone(tz: Any) -> None:1779dt = datetime(2023, 1, 1, 0, 0, 0, 0, tzinfo=tz)17801781df = pl.DataFrame([{"dt": dt}, {"dt": dt}])1782expected = pl.DataFrame({"dt": [dt, dt]})1783assert_frame_equal(df, expected)17841785assert df.schema == {"dt": pl.Datetime("us", time_zone=tz)}178617871788@pytest.mark.parametrize(1789"tz",1790[1791None,1792ZoneInfo("Asia/Tokyo"),1793ZoneInfo("Europe/Amsterdam"),1794ZoneInfo("UTC"),1795timezone.utc,1796],1797)1798def test_init_list_of_nested_dicts_with_timezone(tz: Any) -> None:1799dt = datetime(2021, 1, 1, 0, 0, 0, 0, tzinfo=tz)1800data = [{"timestamp": {"content": datetime(2021, 1, 1, 0, 0, tzinfo=tz)}}]18011802df = pl.DataFrame(data).unnest("timestamp")1803expected = pl.DataFrame({"content": [dt]})1804assert_frame_equal(df, expected)18051806assert df.schema == {"content": pl.Datetime("us", time_zone=tz)}180718081809def test_init_from_subclassed_types() -> None:1810# more detailed test of one custom subclass...1811import codecs18121813class SuperSecretString(str):1814def __new__(cls, value: str) -> Self:1815return super().__new__(cls, 
value)18161817def __repr__(self) -> str:1818return codecs.encode(self, "rot_13")18191820w = "windmolen"1821sstr = SuperSecretString(w)18221823assert sstr == w1824assert isinstance(sstr, str)1825assert repr(sstr) == "jvaqzbyra"1826assert_series_equal(pl.Series([w, w]), pl.Series([sstr, sstr]))18271828# ...then validate across other basic types1829for BaseType, value in (1830(int, 42),1831(float, 5.5),1832(bytes, b"value"),1833(str, "value"),1834):18351836class SubclassedType(BaseType): # type: ignore[misc,valid-type]1837def __new__(cls, value: Any) -> Self:1838return super().__new__(cls, value) # type: ignore[no-any-return]18391840assert (1841pl.Series([value]).to_list() == pl.Series([SubclassedType(value)]).to_list()1842)184318441845def test_series_init_with_python_type_7737() -> None:1846assert pl.Series([], dtype=int).dtype == pl.Int64 # type: ignore[arg-type]1847assert pl.Series([], dtype=float).dtype == pl.Float64 # type: ignore[arg-type]1848assert pl.Series([], dtype=bool).dtype == pl.Boolean # type: ignore[arg-type]1849assert pl.Series([], dtype=str).dtype == pl.Utf8 # type: ignore[arg-type]18501851with pytest.raises(TypeError):1852pl.Series(["a"], dtype=int) # type: ignore[arg-type]18531854with pytest.raises(TypeError):1855pl.Series([True], dtype=str) # type: ignore[arg-type]185618571858def test_init_from_list_shape_6968() -> None:1859df1 = pl.DataFrame([[1, None], [2, None], [3, None]])1860df2 = pl.DataFrame([[None, None], [2, None], [3, None]])1861assert df1.shape == (2, 3)1862assert df2.shape == (2, 3)186318641865