# Source: py-polars/tests/unit/constructors/test_constructors.py
# (scraper metadata: 8326 views)
from __future__ import annotations

from collections import OrderedDict, namedtuple
from datetime import date, datetime, time, timedelta, timezone
from decimal import Decimal
from random import shuffle
from typing import TYPE_CHECKING, Any, Literal, NamedTuple
from zoneinfo import ZoneInfo

import numpy as np
import pandas as pd
import pyarrow as pa
import pytest
from packaging.version import parse as parse_version
from pydantic import BaseModel, Field, TypeAdapter

import polars as pl
import polars.selectors as cs
from polars._dependencies import dataclasses, pydantic
from polars._utils.construction.utils import try_get_type_hints
from polars.datatypes import numpy_char_code_to_dtype
from polars.exceptions import DuplicateError, ShapeError
from polars.testing import assert_frame_equal, assert_series_equal
from tests.unit.utils.pycapsule_utils import PyCapsuleArrayHolder, PyCapsuleStreamHolder

if TYPE_CHECKING:
    import sys
    from collections.abc import Callable

    from polars._typing import PolarsDataType

    if sys.version_info >= (3, 11):
        from typing import Self
    else:
        from typing_extensions import Self

    from typing_extensions import assert_type


# -----------------------------------------------------------------------------------
# nested dataclasses, models, namedtuple classes (can't be defined inside test func)
# -----------------------------------------------------------------------------------
@dataclasses.dataclass
class _TestBazDC:
    # innermost nested dataclass fixture
    d: datetime
    e: float
    f: str


@dataclasses.dataclass
class _TestBarDC:
    # middle-level dataclass fixture; nests _TestBazDC
    a: str
    b: int
    c: _TestBazDC


@dataclasses.dataclass
class _TestFooDC:
    # outermost dataclass fixture; nests _TestBarDC
    x: int
    y: _TestBarDC


class _TestBazPD(pydantic.BaseModel):
    # innermost nested pydantic model fixture
    d: datetime
    e: float
    f: str


class _TestBarPD(pydantic.BaseModel):
    # middle-level pydantic model fixture; nests _TestBazPD
    a: str
    b: int
    c: _TestBazPD


class _TestFooPD(pydantic.BaseModel):
    # outermost pydantic model fixture; nests _TestBarPD
    x: int
    y: _TestBarPD


class _TestBazNT(NamedTuple):
    # innermost nested NamedTuple fixture
    d: datetime
    e: float
    f: str


class _TestBarNT(NamedTuple):
    # middle-level NamedTuple fixture; nests _TestBazNT
    a: str
    b: int
    c: _TestBazNT
class _TestFooNT(NamedTuple):
    # outermost NamedTuple fixture; nests _TestBarNT
    x: int
    y: _TestBarNT


# --------------------------------------------------------------------------------


def test_init_dict() -> None:
    """Validate DataFrame construction from dicts: empty, typed, dates, overrides."""
    # Empty dictionary
    df = pl.DataFrame({})
    assert df.shape == (0, 0)

    # Empty dictionary/values
    df = pl.DataFrame({"a": [], "b": []})
    assert df.shape == (0, 2)
    assert df.schema == {"a": pl.Null, "b": pl.Null}

    for df in (
        pl.DataFrame({}, schema={"a": pl.Date, "b": pl.String}),
        pl.DataFrame({"a": [], "b": []}, schema={"a": pl.Date, "b": pl.String}),
    ):
        assert df.shape == (0, 2)
        assert df.schema == {"a": pl.Date, "b": pl.String}

    # List of empty list
    df = pl.DataFrame({"a": [[]], "b": [[]]})
    expected = {"a": pl.List(pl.Null), "b": pl.List(pl.Null)}
    assert df.schema == expected
    assert df.rows() == [([], [])]

    # Mixed dtypes
    df = pl.DataFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0]})
    assert df.shape == (3, 2)
    assert df.columns == ["a", "b"]
    assert df.dtypes == [pl.Int64, pl.Float64]

    df = pl.DataFrame(
        data={"a": [1, 2, 3], "b": [1.0, 2.0, 3.0]},
        schema=[("a", pl.Int8), ("b", pl.Float32)],
    )
    assert df.schema == {"a": pl.Int8, "b": pl.Float32}

    # Values contained in tuples
    df = pl.DataFrame({"a": (1, 2, 3), "b": [1.0, 2.0, 3.0]})
    assert df.shape == (3, 2)

    # Datetime/Date types (from both python and integer values)
    py_datetimes = (
        datetime(2022, 12, 31, 23, 59, 59),
        datetime(2022, 12, 31, 23, 59, 59),
    )
    py_dates = (date(2022, 12, 31), date(2022, 12, 31))
    int_datetimes = [1672531199000000, 1672531199000000]
    int_dates = [19357, 19357]

    for dates, datetimes, coldefs in (
        # test inferred and explicit (given both py/polars dtypes)
        (py_dates, py_datetimes, None),
        (py_dates, py_datetimes, [("dt", date), ("dtm", datetime)]),
        (py_dates, py_datetimes, [("dt", pl.Date), ("dtm", pl.Datetime)]),
        (int_dates, int_datetimes, [("dt", date), ("dtm", datetime)]),
        (int_dates, int_datetimes, [("dt", pl.Date), ("dtm", pl.Datetime)]),
    ):
        df = pl.DataFrame(
            data={"dt": dates, "dtm": datetimes},
            schema=coldefs,
        )
        assert df.schema == {"dt": pl.Date, "dtm": pl.Datetime("us")}
        assert df.rows() == list(zip(py_dates, py_datetimes, strict=True))

    # Overriding dict column names/types
    df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, schema=["c", "d"])
    assert df.columns == ["c", "d"]

    df = pl.DataFrame(
        {"a": [1, 2, 3], "b": [4, 5, 6]},
        schema=["c", ("d", pl.Int8)],
    )  # partial type info (allowed, but mypy doesn't like it ;p)
    assert df.schema == {"c": pl.Int64, "d": pl.Int8}

    df = pl.DataFrame(
        {"a": [1, 2, 3], "b": [4, 5, 6]}, schema=[("c", pl.Int8), ("d", pl.Int16)]
    )
    assert df.schema == {"c": pl.Int8, "d": pl.Int16}

    # empty nested objects
    for empty_val in [None, "", {}, []]:  # type: ignore[var-annotated]
        test = [{"field": {"sub_field": empty_val, "sub_field_2": 2}}]
        df = pl.DataFrame(test, schema={"field": pl.Object})
        assert df["field"][0] == test[0]["field"]
def test_error_string_dtypes() -> None:
    """String dtype shorthand (e.g. "i16") must be rejected with a TypeError."""
    with pytest.raises(TypeError, match="cannot parse input"):
        pl.DataFrame(
            data={"x": [1, 2], "y": [3, 4], "z": [5, 6]},
            schema={"x": "i16", "y": "i32", "z": "f32"},  # type: ignore[dict-item]
        )

    with pytest.raises(TypeError, match="cannot parse input"):
        pl.Series("n", [1, 2, 3], dtype="f32")  # type: ignore[arg-type]


def test_init_structured_objects() -> None:
    """Validate init from dataclass, namedtuple, and pydantic model objects."""
    @dataclasses.dataclass
    class TradeDC:
        timestamp: datetime
        ticker: str
        price: float
        size: int | None = None

    class TradePD(pydantic.BaseModel):
        timestamp: datetime
        ticker: str
        price: float
        size: int

    class TradeNT(NamedTuple):
        timestamp: datetime
        ticker: str
        price: float
        size: int | None = None

    raw_data = [
        (datetime(2022, 9, 8, 14, 30, 45), "AAPL", 157.5, 125),
        (datetime(2022, 9, 9, 10, 15, 12), "FLSY", 10.0, 1500),
        (datetime(2022, 9, 7, 15, 30), "MU", 55.5, 400),
    ]
    columns = ["timestamp", "ticker", "price", "size"]

    for TradeClass in (TradeDC, TradeNT, TradePD):
        trades = [
            TradeClass(**dict(zip(columns, values, strict=True)))  # type: ignore[arg-type]
            for values in raw_data
        ]

        for DF in (pl.DataFrame, pl.from_records):
            df = DF(data=trades)
            assert df.schema == {
                "timestamp": pl.Datetime("us"),
                "ticker": pl.String,
                "price": pl.Float64,
                "size": pl.Int64,
            }
            assert df.rows() == raw_data

            # partial dtypes override
            df = DF(
                data=trades,
                schema_overrides={"timestamp": pl.Datetime("ms"), "size": pl.Int32},
            )
            assert df.schema == {
                "timestamp": pl.Datetime("ms"),
                "ticker": pl.String,
                "price": pl.Float64,
                "size": pl.Int32,
            }

        # in conjunction with full 'columns' override (rename/downcast)
        df = pl.DataFrame(
            data=trades,
            schema=[
                ("ts", pl.Datetime("ms")),
                ("tk", pl.Categorical),
                ("pc", pl.Float64),
                ("sz", pl.UInt16),
            ],
        )
        assert df.schema == {
            "ts": pl.Datetime("ms"),
            "tk": pl.Categorical(),
            "pc": pl.Float64,
            "sz": pl.UInt16,
        }
        assert df.rows() == raw_data

    # cover a miscellaneous edge-case when detecting the annotations
    assert try_get_type_hints(obj=type(None)) == {}
def test_init_pydantic_2x() -> None:
    """Construct a DataFrame from pydantic v2 models that use field aliases."""
    class PageView(BaseModel):
        user_id: str
        ts: datetime = Field(alias=["ts", "$date"])  # type: ignore[literal-required, call-overload]
        path: str = Field("?", alias=["url", "path"])  # type: ignore[literal-required, call-overload]
        referer: str = Field("?", alias="referer")
        event: Literal["leave", "enter"] = Field("enter")
        time_on_page: int = Field(0, serialization_alias="top")

    # NOTE: internal whitespace of this JSON literal was reconstructed; JSON
    # parsing is whitespace-insensitive, so layout does not affect the test.
    data_json = """
    [{
        "user_id": "x",
        "ts": {"$date": "2021-01-01T00:00:00.000Z"},
        "url": "/latest/foobar",
        "referer": "https://google.com",
        "event": "enter",
        "top": 123
    }]
    """
    adapter: TypeAdapter[Any] = TypeAdapter(list[PageView])
    models = adapter.validate_json(data_json)

    result = pl.DataFrame(models)
    expected = pl.DataFrame(
        {
            "user_id": ["x"],
            "ts": [datetime(2021, 1, 1, 0, 0)],
            "path": ["?"],
            "referer": ["https://google.com"],
            "event": ["enter"],
            "time_on_page": [0],
        }
    )
    assert_frame_equal(result, expected)
def test_init_structured_objects_unhashable() -> None:
    """Cover an edge-case with namedtuple fields that aren't hashable."""

    class Test(NamedTuple):
        dt: datetime
        info: dict[str, int]

    test_data = [
        Test(datetime(2017, 1, 1), {"a": 1, "b": 2}),
        Test(datetime(2017, 1, 2), {"a": 2, "b": 2}),
    ]
    df = pl.DataFrame(test_data)
    # shape: (2, 2)
    # ┌─────────────────────┬───────────┐
    # │ dt                  ┆ info      │
    # │ ---                 ┆ ---       │
    # │ datetime[μs]        ┆ struct[2] │
    # ╞═════════════════════╪═══════════╡
    # │ 2017-01-01 00:00:00 ┆ {1,2}     │
    # │ 2017-01-02 00:00:00 ┆ {2,2}     │
    # └─────────────────────┴───────────┘
    assert df.schema == {
        "dt": pl.Datetime(time_unit="us", time_zone=None),
        "info": pl.Struct([pl.Field("a", pl.Int64), pl.Field("b", pl.Int64)]),
    }
    assert df.rows() == test_data


@pytest.mark.parametrize(
    ("foo", "bar", "baz"),
    [
        (_TestFooDC, _TestBarDC, _TestBazDC),
        (_TestFooPD, _TestBarPD, _TestBazPD),
        (_TestFooNT, _TestBarNT, _TestBazNT),
    ],
)
def test_init_structured_objects_nested(foo: Any, bar: Any, baz: Any) -> None:
    """Nested dataclass/pydantic/namedtuple objects map to nested Struct columns."""
    data = [
        foo(
            x=100,
            y=bar(
                a="hello",
                b=800,
                c=baz(d=datetime(2023, 4, 12, 10, 30), e=-10.5, f="world"),
            ),
        )
    ]
    df = pl.DataFrame(data)
    # shape: (1, 2)
    # ┌─────┬───────────────────────────────────┐
    # │ x   ┆ y                                 │
    # │ --- ┆ ---                               │
    # │ i64 ┆ struct[3]                         │
    # ╞═════╪═══════════════════════════════════╡
    # │ 100 ┆ {"hello",800,{2023-04-12 10:30:0… │
    # └─────┴───────────────────────────────────┘

    assert df.schema == {
        "x": pl.Int64,
        "y": pl.Struct(
            [
                pl.Field("a", pl.String),
                pl.Field("b", pl.Int64),
                pl.Field(
                    "c",
                    pl.Struct(
                        [
                            pl.Field("d", pl.Datetime("us")),
                            pl.Field("e", pl.Float64),
                            pl.Field("f", pl.String),
                        ]
                    ),
                ),
            ]
        ),
    }
    assert df.row(0) == (
        100,
        {
            "a": "hello",
            "b": 800,
            "c": {
                "d": datetime(2023, 4, 12, 10, 30),
                "e": -10.5,
                "f": "world",
            },
        },
    )

    # validate nested schema override
    override_struct_schema: dict[str, PolarsDataType] = {
        "x": pl.Int16,
        "y": pl.Struct(
            [
                pl.Field("a", pl.String),
                pl.Field("b", pl.Int32),
                pl.Field(
                    name="c",
                    dtype=pl.Struct(
                        [
                            pl.Field("d", pl.Datetime("ms")),
                            pl.Field("e", pl.Float32),
                            pl.Field("f", pl.String),
                        ]
                    ),
                ),
            ]
        ),
    }
    for schema, schema_overrides in (
        (None, override_struct_schema),
        (override_struct_schema, None),
    ):
        df = (
            pl.DataFrame(data, schema=schema, schema_overrides=schema_overrides)
            .unnest("y")
            .unnest("c")
        )
        # shape: (1, 6)
        # ┌─────┬───────┬─────┬─────────────────────┬───────┬───────┐
        # │ x   ┆ a     ┆ b   ┆ d                   ┆ e     ┆ f     │
        # │ --- ┆ ---   ┆ --- ┆ ---                 ┆ ---   ┆ ---   │
        # │ i16 ┆ str   ┆ i32 ┆ datetime[ms]        ┆ f32   ┆ str   │
        # ╞═════╪═══════╪═════╪═════════════════════╪═══════╪═══════╡
        # │ 100 ┆ hello ┆ 800 ┆ 2023-04-12 10:30:00 ┆ -10.5 ┆ world │
        # └─────┴───────┴─────┴─────────────────────┴───────┴───────┘
        assert df.schema == {
            "x": pl.Int16,
            "a": pl.String,
            "b": pl.Int32,
            "d": pl.Datetime("ms"),
            "e": pl.Float32,
            "f": pl.String,
        }
        assert df.row(0) == (
            100,
            "hello",
            800,
            datetime(2023, 4, 12, 10, 30),
            -10.5,
            "world",
        )
def test_dataclasses_initvar_typing() -> None:
    """InitVar fields parse correctly but are excluded from the DataFrame."""
    @dataclasses.dataclass
    class ABC:
        x: date
        y: float
        z: dataclasses.InitVar[list[str]] = None

    # should be able to parse the initvar typing...
    abc = ABC(x=date(1999, 12, 31), y=100.0)
    df = pl.DataFrame([abc])

    # ...but should not load the initvar field into the DataFrame
    assert dataclasses.asdict(abc) == df.rows(named=True)[0]


@pytest.mark.parametrize(
    "nt",
    [
        namedtuple("TestData", ["id", "info"]),  # noqa: PYI024
        NamedTuple("TestData", [("id", int), ("info", str)]),
    ],
)
def test_collections_namedtuple(nt: type) -> None:
    """Both collections.namedtuple and typing.NamedTuple rows construct frames."""
    nt_data = [nt(1, "a"), nt(2, "b"), nt(3, "c")]

    result = pl.DataFrame(nt_data)
    expected = pl.DataFrame({"id": [1, 2, 3], "info": ["a", "b", "c"]})
    assert_frame_equal(result, expected)

    # namedtuples nested inside a column become struct values
    result = pl.DataFrame({"data": nt_data, "misc": ["x", "y", "z"]})
    expected = pl.DataFrame(
        {
            "data": [
                {"id": 1, "info": "a"},
                {"id": 2, "info": "b"},
                {"id": 3, "info": "c"},
            ],
            "misc": ["x", "y", "z"],
        }
    )
    assert_frame_equal(result, expected)
def test_init_ndarray() -> None:
    """DataFrame construction from numpy arrays: 1D, 2D, orientation, round-trip."""
    # Empty array
    df = pl.DataFrame(np.array([]))
    assert_frame_equal(df, pl.DataFrame())

    # 1D array
    df = pl.DataFrame(np.array([1, 2, 3], dtype=np.int64), schema=["a"])
    expected = pl.DataFrame({"a": [1, 2, 3]})
    assert_frame_equal(df, expected)

    df = pl.DataFrame(np.array([1, 2, 3]), schema=[("a", pl.Int32)])
    expected = pl.DataFrame({"a": [1, 2, 3]}).with_columns(pl.col("a").cast(pl.Int32))
    assert_frame_equal(df, expected)

    # 2D array (or 2x 1D array) - should default to column orientation (if C-contiguous)
    for data in (
        np.array([[1, 2], [3, 4]], dtype=np.int64),
        [np.array([1, 2], dtype=np.int64), np.array([3, 4], dtype=np.int64)],
    ):
        df = pl.DataFrame(data, orient="col")
        expected = pl.DataFrame({"column_0": [1, 2], "column_1": [3, 4]})
        assert_frame_equal(df, expected)

    df = pl.DataFrame([[1, 2.0, "a"], [None, None, None]], orient="row")
    expected = pl.DataFrame(
        {"column_0": [1, None], "column_1": [2.0, None], "column_2": ["a", None]}
    )
    assert_frame_equal(df, expected)

    df = pl.DataFrame(
        data=[[1, 2.0, "a"], [None, None, None]],
        schema=[("x", pl.Boolean), ("y", pl.Int32), "z"],
        orient="row",
    )
    assert df.rows() == [(True, 2, "a"), (None, None, None)]
    assert df.schema == {"x": pl.Boolean, "y": pl.Int32, "z": pl.String}

    # 2D array - default to column orientation
    df = pl.DataFrame(np.array([[1, 2], [3, 4]], dtype=np.int64))
    expected = pl.DataFrame({"column_0": [1, 3], "column_1": [2, 4]})
    assert_frame_equal(df, expected)

    # no orientation, numpy convention
    df = pl.DataFrame(np.ones((3, 1), dtype=np.int64))
    assert df.shape == (3, 1)

    # 2D array - row orientation inferred
    df = pl.DataFrame(
        np.array([[1, 2, 3], [4, 5, 6]], dtype=np.int64), schema=["a", "b", "c"]
    )
    expected = pl.DataFrame({"a": [1, 4], "b": [2, 5], "c": [3, 6]})
    assert_frame_equal(df, expected)

    # 2D array - column orientation inferred
    df = pl.DataFrame(
        np.array([[1, 2, 3], [4, 5, 6]], dtype=np.int64), schema=["a", "b"]
    )
    expected = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
    assert_frame_equal(df, expected)

    # List column from 2D array with single-column schema
    df = pl.DataFrame(np.arange(4).reshape(-1, 1).astype(np.int64), schema=["a"])
    assert_frame_equal(df, pl.DataFrame({"a": [0, 1, 2, 3]}))
    assert np.array_equal(df.to_numpy(), np.arange(4).reshape(-1, 1).astype(np.int64))

    df = pl.DataFrame(np.arange(4).reshape(-1, 2).astype(np.int64), schema=["a"])
    assert_frame_equal(
        df,
        pl.DataFrame(
            {"a": [[0, 1], [2, 3]]}, schema={"a": pl.Array(pl.Int64, shape=2)}
        ),
    )

    # 2D numpy arrays
    df = pl.DataFrame({"a": np.arange(5, dtype=np.int64).reshape(1, -1)})
    assert df.dtypes == [pl.Array(pl.Int64, shape=5)]
    assert df.shape == (1, 1)

    df = pl.DataFrame({"a": np.arange(10, dtype=np.int64).reshape(2, -1)})
    assert df.dtypes == [pl.Array(pl.Int64, shape=5)]
    assert df.shape == (2, 1)
    assert df.rows() == [([0, 1, 2, 3, 4],), ([5, 6, 7, 8, 9],)]

    test_rows = [(1, 2), (3, 4)]
    df = pl.DataFrame([np.array(test_rows[0]), np.array(test_rows[1])], orient="row")
    expected = pl.DataFrame(test_rows, orient="row")
    assert_frame_equal(df, expected)

    # round trip export/init
    for shape in ((4, 4), (4, 8), (8, 4)):
        np_ones = np.ones(shape=shape, dtype=np.float64)
        names = [f"c{i}" for i in range(shape[1])]

        df = pl.DataFrame(np_ones, schema=names)
        assert_frame_equal(df, pl.DataFrame(np.asarray(df), schema=names))
def test_init_ndarray_errors() -> None:
    """Invalid numpy inputs (bad orient, shape mismatch, 3D) raise ValueError."""
    # 2D array: orientation conflicts with columns
    with pytest.raises(ValueError):
        pl.DataFrame(np.array([[1, 2, 3], [4, 5, 6]]), schema=["a", "b"], orient="row")

    with pytest.raises(ValueError):
        pl.DataFrame(
            np.array([[1, 2, 3], [4, 5, 6]]),
            schema=[("a", pl.UInt32), ("b", pl.UInt32)],
            orient="row",
        )

    # Invalid orient value
    with pytest.raises(ValueError):
        pl.DataFrame(
            np.array([[1, 2, 3], [4, 5, 6]]),
            orient="wrong",  # type: ignore[arg-type]
        )

    # Dimensions mismatch
    with pytest.raises(ValueError):
        _ = pl.DataFrame(np.array([1, 2, 3]), schema=[])

    # Cannot init with 3D array
    with pytest.raises(ValueError):
        _ = pl.DataFrame(np.random.randn(2, 2, 2))


def test_init_ndarray_nan() -> None:
    """NaN values survive by default and become null with nan_to_null=True."""
    # numpy arrays containing NaN
    df0 = pl.DataFrame(
        data={"x": [1.0, 2.5, float("nan")], "y": [4.0, float("nan"), 6.5]},
    )
    df1 = pl.DataFrame(
        data={"x": np.array([1.0, 2.5, np.nan]), "y": np.array([4.0, np.nan, 6.5])},
    )
    df2 = pl.DataFrame(
        data={"x": np.array([1.0, 2.5, np.nan]), "y": np.array([4.0, np.nan, 6.5])},
        nan_to_null=True,
    )
    assert_frame_equal(df0, df1)
    assert df2.rows() == [(1.0, 4.0), (2.5, None), (None, 6.5)]

    s0 = pl.Series("n", [1.0, 2.5, float("nan")])
    s1 = pl.Series("n", np.array([1.0, 2.5, float("nan")]))
    s2 = pl.Series("n", np.array([1.0, 2.5, float("nan")]), nan_to_null=True)

    assert_series_equal(s0, s1)
    assert s2.to_list() == [1.0, 2.5, None]


def test_init_ndarray_square() -> None:
    """2D square array; ensure that we maintain convention (first axis = rows)."""
    arr = np.arange(4).reshape(2, 2)
    assert (
        [(0, 1), (2, 3)]
        == pl.DataFrame(arr).rows()
        == pl.DataFrame(arr, schema=["a", "b"]).rows()
    )
    # check that we tie-break square arrays using fortran vs c-contiguous row/col major
    df_c = pl.DataFrame(
        data=np.array([[1, 2], [3, 4]], dtype=np.int64, order="C"),
        schema=["x", "y"],
    )
    assert_frame_equal(df_c, pl.DataFrame({"x": [1, 3], "y": [2, 4]}))

    df_f = pl.DataFrame(
        data=np.array([[1, 2], [3, 4]], dtype=np.int64, order="F"),
        schema=["x", "y"],
    )
    assert_frame_equal(df_f, pl.DataFrame({"x": [1, 2], "y": [3, 4]}))
def test_init_numpy_unavailable(monkeypatch: Any) -> None:
    """A numpy array input raises TypeError when numpy detection is disabled."""
    monkeypatch.setattr(pl.dataframe.frame, "_check_for_numpy", lambda x: False)
    with pytest.raises(TypeError):
        pl.DataFrame(np.array([1, 2, 3]), schema=["a"])


def test_init_numpy_scalars() -> None:
    """Numpy scalar values (bool_, int8, uint32) map to matching polars dtypes."""
    df = pl.DataFrame(
        {
            "bool": [np.bool_(True), np.bool_(False)],
            "i8": [np.int8(16), np.int8(64)],
            "u32": [np.uint32(1234), np.uint32(9876)],
        }
    )
    df_expected = pl.from_records(
        data=[(True, 16, 1234), (False, 64, 9876)],
        schema=OrderedDict([("bool", pl.Boolean), ("i8", pl.Int8), ("u32", pl.UInt32)]),
        orient="row",
    )
    assert_frame_equal(df, df_expected)


def test_null_array_print_format() -> None:
    """An all-null arrow column keeps the Null dtype and prints as expected."""
    pa_tbl_null = pa.table({"a": [None, None]})
    df_null = pl.from_arrow(pa_tbl_null)
    assert df_null.shape == (2, 1)
    assert df_null.dtypes == [pl.Null]  # type: ignore[union-attr]
    assert df_null.rows() == [(None,), (None,)]  # type: ignore[union-attr]

    # NOTE(review): inner padding of this expected repr was reconstructed from
    # mangled source (column width 6 to fit "null") — confirm against output.
    assert (
        str(df_null) == "shape: (2, 1)\n"
        "┌──────┐\n"
        "│ a    │\n"
        "│ ---  │\n"
        "│ null │\n"
        "╞══════╡\n"
        "│ null │\n"
        "│ null │\n"
        "└──────┘"
    )


def test_init_arrow() -> None:
    """DataFrame construction from pyarrow tables, incl. unnamed columns."""
    # Handle unnamed column
    df = pl.DataFrame(pa.table({"a": [1, 2], None: [3, 4]}))
    expected = pl.DataFrame({"a": [1, 2], "None": [3, 4]})
    assert_frame_equal(df, expected)

    # Rename columns
    df = pl.DataFrame(pa.table({"a": [1, 2], "b": [3, 4]}), schema=["c", "d"])
    expected = pl.DataFrame({"c": [1, 2], "d": [3, 4]})
    assert_frame_equal(df, expected)

    df = pl.DataFrame(
        pa.table({"a": [1, 2], None: [3, 4]}),
        schema=[("c", pl.Int32), ("d", pl.Float32)],
    )
    assert df.schema == {"c": pl.Int32, "d": pl.Float32}
    assert df.rows() == [(1, 3.0), (2, 4.0)]

    # Bad columns argument
    with pytest.raises(ValueError):
        pl.DataFrame(pa.table({"a": [1, 2, 3], "b": [4, 5, 6]}), schema=["c", "d", "e"])
def test_init_arrow_dupes() -> None:
    """Duplicate arrow field names raise DuplicateError instead of panicking."""
    tbl = pa.Table.from_arrays(
        arrays=[
            pa.array([1, 2, 3], type=pa.int32()),
            pa.array([4, 5, 6], type=pa.int32()),
            pa.array(
                [7, 8, 9], type=pa.decimal128(38, 10)
            ),  # included as this triggers a panic during construction alongside duplicate fields
        ],
        schema=pa.schema(
            [("col", pa.int32()), ("col", pa.int32()), ("col3", pa.decimal128(38, 10))]
        ),
    )
    with pytest.raises(
        DuplicateError,
        match=r"""column appears more than once; names must be unique: \["col"\]""",
    ):
        pl.DataFrame(tbl)


def test_init_from_frame() -> None:
    """Init from an existing DataFrame, with schema renames and overrides."""
    df1 = pl.DataFrame({"id": [0, 1], "misc": ["a", "b"], "val": [-10, 10]})
    assert_frame_equal(df1, pl.DataFrame(df1))

    df2 = pl.DataFrame(df1, schema=["a", "b", "c"])
    assert_frame_equal(df2, pl.DataFrame(df2))

    df3 = pl.DataFrame(df1, schema=["a", "b", "c"], schema_overrides={"val": pl.Int8})
    assert_frame_equal(df3, pl.DataFrame(df3))

    assert df1.schema == {"id": pl.Int64, "misc": pl.String, "val": pl.Int64}
    assert df2.schema == {"a": pl.Int64, "b": pl.String, "c": pl.Int64}
    assert df3.schema == {"a": pl.Int64, "b": pl.String, "c": pl.Int8}
    assert df1.rows() == df2.rows() == df3.rows()

    s1 = pl.Series("s", df3)
    s2 = pl.Series(df3)

    assert s1.name == "s"
    assert s2.name == ""


def test_init_series() -> None:
    """DataFrame/Series construction from Series inputs, named and unnamed."""
    # List of Series
    df = pl.DataFrame([pl.Series("a", [1, 2, 3]), pl.Series("b", [4, 5, 6])])
    expected = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
    assert_frame_equal(df, expected)

    # Tuple of Series
    df = pl.DataFrame((pl.Series("a", (1, 2, 3)), pl.Series("b", (4, 5, 6))))
    assert_frame_equal(df, expected)

    df = pl.DataFrame(
        (pl.Series("a", (1, 2, 3)), pl.Series("b", (4, 5, 6))),
        schema=[("x", pl.Float64), ("y", pl.Float64)],
    )
    assert df.schema == {"x": pl.Float64, "y": pl.Float64}
    assert df.rows() == [(1.0, 4.0), (2.0, 5.0), (3.0, 6.0)]

    # List of unnamed Series
    df = pl.DataFrame([pl.Series([1, 2, 3]), pl.Series([4, 5, 6])])
    col0 = pl.Series("column_0", [1, 2, 3])
    col1 = pl.Series("column_1", [4, 5, 6])
    expected = pl.DataFrame([col0, col1])
    assert_frame_equal(df, expected)

    df = pl.DataFrame([pl.Series([0.0]), pl.Series([1.0])])
    assert df.schema == {"column_0": pl.Float64, "column_1": pl.Float64}
    assert df.rows() == [(0.0, 1.0)]

    df = pl.DataFrame(
        [pl.Series([None]), pl.Series([1.0])],
        schema=[("x", pl.Date), ("y", pl.Boolean)],
    )
    assert df.schema == {"x": pl.Date, "y": pl.Boolean}
    assert df.rows() == [(None, True)]

    # Single Series
    df = pl.DataFrame(pl.Series("a", [1, 2, 3]))
    expected = pl.DataFrame({"a": [1, 2, 3]})
    assert df.schema == {"a": pl.Int64}
    assert_frame_equal(df, expected)

    df = pl.DataFrame(pl.Series("a", [1, 2, 3]), schema=[("a", pl.UInt32)])
    assert df.rows() == [(1,), (2,), (3,)]
    assert df.schema == {"a": pl.UInt32}

    # nested list, with/without explicit dtype
    s1 = pl.Series([[[2, 2]]])
    assert s1.dtype == pl.List(pl.List(pl.Int64))

    s2 = pl.Series([[[2, 2]]], dtype=pl.List(pl.List(pl.UInt8)))
    assert s2.dtype == pl.List(pl.List(pl.UInt8))

    nested_dtype = pl.List(pl.List(pl.UInt8))
    s3 = pl.Series("x", dtype=nested_dtype)
    s4 = pl.Series(s3)
    for s in (s3, s4):
        assert s.dtype == nested_dtype
        assert s.to_list() == []
        assert s.name == "x"

    s5 = pl.Series("", df, dtype=pl.Int8)
    assert_series_equal(s5, pl.Series("", [1, 2, 3], dtype=pl.Int8))
@pytest.mark.parametrize(
    ("dtype", "expected_dtype"),
    [
        (int, pl.Int64),
        (bytes, pl.Binary),
        (float, pl.Float64),
        (str, pl.String),
        (date, pl.Date),
        (time, pl.Time),
        (datetime, pl.Datetime("us")),
        (timedelta, pl.Duration("us")),
        (Decimal, pl.Decimal(scale=0)),
    ],
)
def test_init_py_dtype(dtype: Any, expected_dtype: PolarsDataType) -> None:
    """Plain python types given as dtypes map to the expected polars dtypes."""
    for s in (
        pl.Series("s", [None], dtype=dtype),
        pl.Series("s", [], dtype=dtype),
    ):
        assert s.dtype == expected_dtype

    for df in (
        pl.DataFrame({"col": [None]}, schema={"col": dtype}),
        pl.DataFrame({"col": []}, schema={"col": dtype}),
    ):
        assert df.schema == {"col": expected_dtype}
def test_init_py_dtype_misc_float() -> None:
    """The python `float` type coerces int/float/None columns to Float64."""
    assert pl.Series([100], dtype=float).dtype == pl.Float64  # type: ignore[arg-type]

    df = pl.DataFrame(
        {"x": [100.0], "y": [200], "z": [None]},
        schema={"x": float, "y": float, "z": float},
    )
    assert df.schema == {"x": pl.Float64, "y": pl.Float64, "z": pl.Float64}
    assert df.rows() == [(100.0, 200.0, None)]


def test_init_seq_of_seq() -> None:
    """Construction from sequences of sequences with both orientations."""
    # List of lists
    df = pl.DataFrame([[1, 2, 3], [4, 5, 6]], schema=["a", "b", "c"], orient="row")
    expected = pl.DataFrame({"a": [1, 4], "b": [2, 5], "c": [3, 6]})
    assert_frame_equal(df, expected)

    df = pl.DataFrame(
        [[1, 2, 3], [4, 5, 6]],
        schema=[("a", pl.Int8), ("b", pl.Int16), ("c", pl.Int32)],
        orient="row",
    )
    assert df.schema == {"a": pl.Int8, "b": pl.Int16, "c": pl.Int32}
    assert df.rows() == [(1, 2, 3), (4, 5, 6)]

    # Tuple of tuples, default to column orientation
    df = pl.DataFrame(((1, 2, 3), (4, 5, 6)))
    expected = pl.DataFrame({"column_0": [1, 2, 3], "column_1": [4, 5, 6]})
    assert_frame_equal(df, expected)

    # Row orientation
    df = pl.DataFrame(((1, 2), (3, 4)), schema=("a", "b"), orient="row")
    expected = pl.DataFrame({"a": [1, 3], "b": [2, 4]})
    assert_frame_equal(df, expected)

    df = pl.DataFrame(
        ((1, 2), (3, 4)), schema=(("a", pl.Float32), ("b", pl.Float32)), orient="row"
    )
    assert df.schema == {"a": pl.Float32, "b": pl.Float32}
    assert df.rows() == [(1.0, 2.0), (3.0, 4.0)]

    # Wrong orient value
    with pytest.raises(ValueError):
        df = pl.DataFrame(((1, 2), (3, 4)), orient="wrong")  # type: ignore[arg-type]
def test_init_1d_sequence() -> None:
    """1D sequences: strings, arrays, schema coercion, tz-aware datetimes."""
    # Empty list
    df = pl.DataFrame([])
    assert_frame_equal(df, pl.DataFrame())

    # List/array of strings
    data = ["a", "b", "c"]
    for a in (data, np.array(data)):
        df = pl.DataFrame(a, schema=["s"])
        expected = pl.DataFrame({"s": data})
        assert_frame_equal(df, expected)

    df = pl.DataFrame([None, True, False], schema=[("xx", pl.Int8)])
    assert df.schema == {"xx": pl.Int8}
    assert df.rows() == [(None,), (1,), (0,)]

    # String sequence
    result = pl.DataFrame("abc", schema=["s"])
    expected = pl.DataFrame({"s": ["a", "b", "c"]})
    assert_frame_equal(result, expected)

    # datetimes sequence
    df = pl.DataFrame([datetime(2020, 1, 1)], schema={"ts": pl.Datetime("ms")})
    assert df.schema == {"ts": pl.Datetime("ms")}
    df = pl.DataFrame(
        [datetime(2020, 1, 1, tzinfo=timezone.utc)], schema={"ts": pl.Datetime("ms")}
    )
    assert df.schema == {"ts": pl.Datetime("ms", "UTC")}
    df = pl.DataFrame(
        [datetime(2020, 1, 1, tzinfo=timezone(timedelta(hours=1)))],
        schema={"ts": pl.Datetime("ms")},
    )
    assert df.schema == {"ts": pl.Datetime("ms", "UTC")}
    df = pl.DataFrame(
        [datetime(2020, 1, 1, tzinfo=ZoneInfo("Asia/Kathmandu"))],
        schema={"ts": pl.Datetime("ms")},
    )
    assert df.schema == {"ts": pl.Datetime("ms", "Asia/Kathmandu")}


def test_init_pandas(monkeypatch: Any) -> None:
    """Init from pandas frames/series, incl. subclasses and pandas-unavailable path."""
    pandas_df = pd.DataFrame([[1, 2], [3, 4]], columns=[1, 2])

    # integer column names
    df = pl.DataFrame(pandas_df)
    expected = pl.DataFrame({"1": [1, 3], "2": [2, 4]})
    assert_frame_equal(df, expected)
    assert df.schema == {"1": pl.Int64, "2": pl.Int64}

    # override column names, types
    df = pl.DataFrame(pandas_df, schema=[("x", pl.Float64), ("y", pl.Float64)])
    assert df.schema == {"x": pl.Float64, "y": pl.Float64}
    assert df.rows() == [(1.0, 2.0), (3.0, 4.0)]

    # subclassed pandas object, with/without data & overrides
    # type error fixed in pandas-stubs 2.3.0.250703, which doesn't support Python3.9
    class XSeries(pd.Series):  # type: ignore[type-arg, unused-ignore]
        @property
        def _constructor(self) -> type:
            return XSeries

    df = pl.DataFrame(
        data=[
            XSeries(name="x", data=[], dtype=np.dtype("<M8[ns]")),
            XSeries(name="y", data=[], dtype=np.dtype("f8")),
            XSeries(name="z", data=[], dtype=np.dtype("?")),
        ],
    )
    assert df.schema == {"x": pl.Datetime("ns"), "y": pl.Float64, "z": pl.Boolean}
    assert df.rows() == []

    df = pl.DataFrame(
        data=[
            XSeries(
                name="x",
                data=[datetime(2022, 10, 31, 10, 30, 45, 123456)],
                dtype=np.dtype("<M8[ns]"),
            )
        ],
        schema={"colx": pl.Datetime("us")},
    )
    assert df.schema == {"colx": pl.Datetime("us")}
    assert df.rows() == [(datetime(2022, 10, 31, 10, 30, 45, 123456),)]

    # pandas is not available
    monkeypatch.setattr(pl.dataframe.frame, "_check_for_pandas", lambda x: False)

    # pandas 2.2 and higher implement the Arrow PyCapsule Interface, so the constructor
    # will still work even without using pandas APIs
    if parse_version(pd.__version__) >= parse_version("2.2.0"):
        df = pl.DataFrame(pandas_df)
        assert_frame_equal(df, expected)

    else:
        with pytest.raises(TypeError):
            pl.DataFrame(pandas_df)
def test_init_errors() -> None:
    """Mismatched lengths/dimensions raise ShapeError; unknown input TypeError."""
    # Length mismatch
    with pytest.raises(ShapeError):
        pl.DataFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0, 4.0]})

    # Columns don't match data dimensions
    with pytest.raises(ShapeError):
        pl.DataFrame([[1, 2], [3, 4]], schema=["a", "b", "c"])

    # Unmatched input
    with pytest.raises(TypeError):
        pl.DataFrame(0)


def test_init_records() -> None:
    """Init from a list of dict records, with schema renames and subsets."""
    dicts = [
        {"a": 1, "b": 2},
        {"b": 1, "a": 2},
        {"a": 1, "b": 2},
    ]
    df = pl.DataFrame(dicts)
    expected = pl.DataFrame({"a": [1, 2, 1], "b": [2, 1, 2]})
    assert_frame_equal(df, expected)
    assert df.to_dicts() == dicts

    # schema names not present in the data yield null columns
    df_cd = pl.DataFrame(dicts, schema=["a", "c", "d"])
    expected_values = {
        "a": [1, 2, 1],
        "c": [None, None, None],
        "d": [None, None, None],
    }
    assert df_cd.to_dict(as_series=False) == expected_values

    data = {"a": 1, "b": 2, "c": 3}

    df1 = pl.from_dicts([data])
    assert df1.columns == ["a", "b", "c"]

    df1.columns = ["x", "y", "z"]
    assert df1.columns == ["x", "y", "z"]

    df2 = pl.from_dicts([data], schema=["c", "b", "a"])
    assert df2.columns == ["c", "b", "a"]

    for colname in ("c", "b", "a"):
        result = pl.from_dicts([data], schema=[colname])
        expected_values = {colname: [data[colname]]}
        assert result.to_dict(as_series=False) == expected_values
def test_init_records_schema_order() -> None:
    """Field values are loaded according to the declared schema order."""
    cols: list[str] = ["a", "b", "c", "d"]
    data: list[dict[str, int]] = [
        {"c": 3, "b": 2, "a": 1},
        {"b": 2, "d": 4},
        {},
        {"a": 1, "b": 2, "c": 3},
        {"d": 4, "b": 2, "a": 1},
        {"c": 3, "b": 2},
    ]
    lookup = {"a": 1, "b": 2, "c": 3, "d": 4, "e": None}

    for constructor in (pl.from_dicts, pl.DataFrame):
        # ensure field values are loaded according to the declared schema order
        for _ in range(8):
            shuffle(data)
            shuffle(cols)

            df = constructor(data, schema=cols)
            for col in df.columns:
                assert all(value in (None, lookup[col]) for value in df[col].to_list())

        # have schema override inferred types, omit some columns, add a new one
        schema = {"a": pl.Int8, "c": pl.Int16, "e": pl.Int32}
        df = constructor(data, schema=schema)

        assert df.schema == schema
        for col in df.columns:
            assert all(value in (None, lookup[col]) for value in df[col].to_list())


def test_init_only_columns() -> None:
    """Construction with a schema but no data yields an empty, typed frame."""
    df = pl.DataFrame(schema=["a", "b", "c"])
    expected = pl.DataFrame({"a": [], "b": [], "c": []})
    assert_frame_equal(df, expected)

    # Validate construction with various flavours of no/empty data
    no_data: Any
    for no_data in (None, {}, []):
        df = pl.DataFrame(
            data=no_data,
            schema=[
                ("a", pl.Date),
                ("b", pl.UInt64),
                ("c", pl.Int8),
                ("d", pl.List(pl.UInt8)),
            ],
        )
        expected = pl.DataFrame({"a": [], "b": [], "c": []}).with_columns(
            pl.col("a").cast(pl.Date),
            pl.col("b").cast(pl.UInt64),
            pl.col("c").cast(pl.Int8),
        )
        expected.insert_column(3, pl.Series("d", [], pl.List(pl.UInt8)))

        assert df.shape == (0, 4)
        assert_frame_equal(df, expected)
        assert df.dtypes == [pl.Date, pl.UInt64, pl.Int8, pl.List]
        assert pl.List(pl.UInt8).is_(df.schema["d"])

        if TYPE_CHECKING:
            assert_type(pl.List(pl.UInt8).is_(df.schema["d"]), bool)

        dfe = df.clear()
        assert len(dfe) == 0
        assert df.schema == dfe.schema
        assert dfe.shape == df.shape
def test_from_dicts_list_without_dtype() -> None:
    """List-valued dict fields are inferred without an explicit dtype."""
    result = pl.from_dicts(
        [{"id": 1, "hint": ["some_text_here"]}, {"id": 2, "hint": [None]}]
    )
    expected = pl.DataFrame({"id": [1, 2], "hint": [["some_text_here"], [None]]})
    assert_frame_equal(result, expected)


def test_from_dicts_list_struct_without_inner_dtype() -> None:
    """Lists of structs with all-null fields round-trip through to_dict."""
    df = pl.DataFrame(
        {
            "users": [
                [{"category": "A"}, {"category": "B"}],
                [{"category": None}, {"category": None}],
            ],
            "days_of_week": [1, 2],
        }
    )
    expected = {
        "users": [
            [{"category": "A"}, {"category": "B"}],
            [{"category": None}, {"category": None}],
        ],
        "days_of_week": [1, 2],
    }
    assert df.to_dict(as_series=False) == expected


def test_from_dicts_list_struct_without_inner_dtype_5611() -> None:
    """Regression test (#5611): empty list before a list of structs infers fine."""
    result = pl.from_dicts(
        [
            {"a": []},
            {"a": [{"b": 1}]},
        ]
    )
    expected = pl.DataFrame({"a": [[], [{"b": 1}]]})
    assert_frame_equal(result, expected)


def test_from_dict_upcast_primitive() -> None:
    """With strict=False, mixed int/float columns upcast to Float64."""
    df = pl.from_dict({"a": [1, 2.1, 3], "b": [4, 5, 6.4]}, strict=False)
    assert df.dtypes == [pl.Float64, pl.Float64]


def test_u64_lit_5031() -> None:
    """Regression test (#5031): comparison against a large u64 literal."""
    df = pl.DataFrame({"foo": [1, 2, 3]}).with_columns(pl.col("foo").cast(pl.UInt64))
    assert df.filter(pl.col("foo") < (1 << 64) - 20).shape == (3, 1)
    assert df["foo"].to_list() == [1, 2, 3]
3]}).with_columns(pl.col("foo").cast(pl.UInt64))1175assert df.filter(pl.col("foo") < (1 << 64) - 20).shape == (3, 1)1176assert df["foo"].to_list() == [1, 2, 3]117711781179def test_from_dicts_missing_columns() -> None:1180# missing columns from some of the data dicts1181data = [{"a": 1}, {"b": 2}]1182result = pl.from_dicts(data)1183expected = pl.DataFrame({"a": [1, None], "b": [None, 2]})1184assert_frame_equal(result, expected)11851186# partial schema with some columns missing; only load the declared keys1187data = [{"a": 1, "b": 2}]1188result = pl.from_dicts(data, schema=["a"])1189expected = pl.DataFrame({"a": [1]})1190assert_frame_equal(result, expected)119111921193def test_from_dicts_schema_columns_do_not_match() -> None:1194data = [{"a": 1, "b": 2}]1195result = pl.from_dicts(data, schema=["x"])1196expected = pl.DataFrame({"x": [None]})1197assert_frame_equal(result, expected)119811991200def test_from_dicts_infer_integer_types() -> None:1201data = [1202{1203"a": 2**7 - 1,1204"b": 2**15 - 1,1205"c": 2**31 - 1,1206"d": 2**63 - 1,1207"e": 2**127 - 1,1208"f": 2**128 - 1,1209}1210]1211result = pl.from_dicts(data).schema1212# all values inferred as i64 except for values too large for i641213expected = {1214"a": pl.Int64,1215"b": pl.Int64,1216"c": pl.Int64,1217"d": pl.Int64,1218"e": pl.Int128,1219"f": pl.UInt128,1220}1221assert result == expected12221223with pytest.raises(OverflowError):1224pl.from_dicts([{"too_big": 2**128}])122512261227def test_from_dicts_list_large_int_17006() -> None:1228data = [{"x": [2**64 - 1]}]12291230result = pl.from_dicts(data, schema={"x": pl.List(pl.UInt64)})1231expected = pl.DataFrame({"x": [[2**64 - 1]]}, schema={"x": pl.List(pl.UInt64)})1232assert_frame_equal(result, expected)12331234result = pl.from_dicts(data, schema={"x": pl.Array(pl.UInt64, 1)})1235expected = pl.DataFrame({"x": [[2**64 - 1]]}, schema={"x": pl.Array(pl.UInt64, 1)})1236assert_frame_equal(result, expected)123712381239def test_from_rows_dtype() -> None:1240# 50 is the 
default inference length1241# 51821242df = pl.DataFrame(1243data=[(None, None)] * 50 + [("1.23", None)],1244schema=[("foo", pl.String), ("bar", pl.String)],1245orient="row",1246)1247assert df.dtypes == [pl.String, pl.String]1248assert df.null_count().row(0) == (50, 51)12491250type1 = [{"c1": 206, "c2": "type1", "c3": {"x1": "abcd", "x2": "jkl;"}}]1251type2 = [1252{"c1": 208, "c2": "type2", "c3": {"a1": "abcd", "a2": "jkl;", "a3": "qwerty"}}1253]12541255df = pl.DataFrame(1256data=type1 * 50 + type2,1257schema=[("c1", pl.Int32), ("c2", pl.Object), ("c3", pl.Object)],1258)1259assert df.dtypes == [pl.Int32, pl.Object, pl.Object]12601261# 50 is the default inference length1262# 52661263type1 = [{"c1": 206, "c2": "type1", "c3": {"x1": "abcd", "x2": "jkl;"}}]1264type2 = [1265{"c1": 208, "c2": "type2", "c3": {"a1": "abcd", "a2": "jkl;", "a3": "qwerty"}}1266]12671268df = pl.DataFrame(1269data=type1 * 50 + type2,1270schema=[("c1", pl.Int32), ("c2", pl.Object), ("c3", pl.Object)],1271)1272assert df.dtypes == [pl.Int32, pl.Object, pl.Object]1273assert df.null_count().row(0) == (0, 0, 0)12741275dc = _TestBazDC(d=datetime(2020, 2, 22), e=42.0, f="xyz")1276df = pl.DataFrame([[dc]], schema={"d": pl.Object})1277assert df.schema == {"d": pl.Object}1278assert df.item() == dc127912801281def test_from_dicts_schema() -> None:1282data = [{"a": 1, "b": 4}, {"a": 2, "b": 5}, {"a": 3, "b": 6}]12831284# let polars infer the dtypes, but inform it about a 3rd column.1285for schema, overrides in (1286({"a": pl.Unknown, "b": pl.Unknown, "c": pl.Int32}, None),1287({"a": None, "b": None, "c": None}, {"c": pl.Int32}),1288(["a", "b", ("c", pl.Int32)], None),1289):1290df = pl.from_dicts(1291data,1292schema=schema, # type: ignore[arg-type]1293schema_overrides=overrides,1294)1295assert df.dtypes == [pl.Int64, pl.Int64, pl.Int32]1296assert df.to_dict(as_series=False) == {1297"a": [1, 2, 3],1298"b": [4, 5, 6],1299"c": [None, None, None],1300}13011302# provide data that resolves to an empty frame (ref: 
scalar1303# expansion shortcut), with schema/override hints1304schema = {"colx": pl.String, "coly": pl.Int32}13051306for param in ("schema", "schema_overrides"):1307df = pl.DataFrame({"colx": [], "coly": 0}, **{param: schema}) # type: ignore[arg-type]1308assert df.schema == schema130913101311def test_nested_read_dicts_4143() -> None:1312result = pl.from_dicts(1313[1314{1315"id": 1,1316"hint": [1317{"some_text_here": "text", "list_": [1, 2, 4]},1318{"some_text_here": "text", "list_": [1, 2, 4]},1319],1320},1321{1322"id": 2,1323"hint": [1324{"some_text_here": None, "list_": [1]},1325{"some_text_here": None, "list_": [2]},1326],1327},1328]1329)1330expected = {1331"hint": [1332[1333{"some_text_here": "text", "list_": [1, 2, 4]},1334{"some_text_here": "text", "list_": [1, 2, 4]},1335],1336[1337{"some_text_here": None, "list_": [1]},1338{"some_text_here": None, "list_": [2]},1339],1340],1341"id": [1, 2],1342}1343assert result.to_dict(as_series=False) == expected134413451346def test_nested_read_dicts_4143_2() -> None:1347result = pl.from_dicts(1348[1349{1350"id": 1,1351"hint": [1352{"some_text_here": "text", "list_": [1, 2, 4]},1353{"some_text_here": "text", "list_": [1, 2, 4]},1354],1355},1356{1357"id": 2,1358"hint": [1359{"some_text_here": "text", "list_": []},1360{"some_text_here": "text", "list_": []},1361],1362},1363]1364)13651366assert result.dtypes == [1367pl.Int64,1368pl.List(pl.Struct({"some_text_here": pl.String, "list_": pl.List(pl.Int64)})),1369]1370expected = {1371"id": [1, 2],1372"hint": [1373[1374{"some_text_here": "text", "list_": [1, 2, 4]},1375{"some_text_here": "text", "list_": [1, 2, 4]},1376],1377[1378{"some_text_here": "text", "list_": []},1379{"some_text_here": "text", "list_": []},1380],1381],1382}1383assert result.to_dict(as_series=False) == expected138413851386def test_from_records_nullable_structs() -> None:1387records = [1388{"id": 1, "items": [{"item_id": 100, "description": None}]},1389{"id": 1, "items": [{"item_id": 100, "description": 
"hi"}]},1390]13911392schema: list[tuple[str, PolarsDataType]] = [1393("id", pl.UInt16),1394(1395"items",1396pl.List(1397pl.Struct(1398[pl.Field("item_id", pl.UInt32), pl.Field("description", pl.String)]1399)1400),1401),1402]14031404schema_options: list[list[tuple[str, PolarsDataType]] | None] = [schema, None]1405for s in schema_options:1406result = pl.DataFrame(records, schema=s, orient="row")1407expected = {1408"id": [1, 1],1409"items": [1410[{"item_id": 100, "description": None}],1411[{"item_id": 100, "description": "hi"}],1412],1413}1414assert result.to_dict(as_series=False) == expected14151416# check initialisation without any records1417df = pl.DataFrame(schema=schema)1418dict_schema = dict(schema)1419assert df.to_dict(as_series=False) == {"id": [], "items": []}1420assert df.schema == dict_schema14211422dtype: PolarsDataType = dict_schema["items"]1423series = pl.Series("items", dtype=dtype)1424assert series.to_frame().to_dict(as_series=False) == {"items": []}1425assert series.dtype == dict_schema["items"]1426assert series.to_list() == []142714281429@pytest.mark.parametrize("unnest_column", ["a", pl.col("a"), cs.by_name("a")])1430def test_from_categorical_in_struct_defined_by_schema(unnest_column: Any) -> None:1431df = pl.DataFrame(1432{"a": [{"value": "foo", "counts": 1}, {"value": "bar", "counts": 2}]},1433schema={"a": pl.Struct({"value": pl.Categorical, "counts": pl.UInt32})},1434)14351436expected = pl.DataFrame(1437{"value": ["foo", "bar"], "counts": [1, 2]},1438schema={"value": pl.Categorical, "counts": pl.UInt32},1439)14401441res_eager = df.unnest(unnest_column)1442assert_frame_equal(res_eager, expected, categorical_as_str=True)14431444res_lazy = df.lazy().unnest(unnest_column)1445assert_frame_equal(res_lazy.collect(), expected, categorical_as_str=True)144614471448def test_nested_schema_construction() -> None:1449schema = {1450"node_groups": pl.List(1451pl.Struct(1452[1453pl.Field("parent_node_group_id", 
pl.UInt8),1454pl.Field(1455"nodes",1456pl.List(1457pl.Struct(1458[1459pl.Field("name", pl.String),1460pl.Field(1461"sub_nodes",1462pl.List(1463pl.Struct(1464[1465pl.Field("internal_id", pl.UInt64),1466pl.Field("value", pl.UInt32),1467]1468)1469),1470),1471]1472)1473),1474),1475]1476)1477)1478}1479df = pl.DataFrame(1480{1481"node_groups": [1482[{"nodes": []}, {"nodes": [{"name": "", "sub_nodes": []}]}],1483]1484},1485schema=schema,1486)14871488assert df.schema == schema1489assert df.to_dict(as_series=False) == {1490"node_groups": [1491[1492{"parent_node_group_id": None, "nodes": []},1493{1494"parent_node_group_id": None,1495"nodes": [{"name": "", "sub_nodes": []}],1496},1497]1498]1499}150015011502def test_nested_schema_construction2() -> None:1503schema = {1504"node_groups": pl.List(1505pl.Struct(1506[1507pl.Field(1508"nodes",1509pl.List(1510pl.Struct(1511[1512pl.Field("name", pl.String),1513pl.Field("time", pl.UInt32),1514]1515)1516),1517)1518]1519)1520)1521}1522df = pl.DataFrame(1523[1524{"node_groups": [{"nodes": [{"name": "a", "time": 0}]}]},1525{"node_groups": [{"nodes": []}]},1526],1527schema=schema,1528)1529assert df.schema == schema1530assert df.to_dict(as_series=False) == {1531"node_groups": [[{"nodes": [{"name": "a", "time": 0}]}], [{"nodes": []}]]1532}153315341535def test_arrow_to_pyseries_with_one_chunk_does_not_copy_data() -> None:1536from polars._utils.construction import arrow_to_pyseries15371538original_array = pa.chunked_array([[1, 2, 3]], type=pa.int64())1539pyseries = arrow_to_pyseries("", original_array)1540assert (1541pyseries.get_chunks()[0]._get_buffer_info()[0]1542== original_array.chunks[0].buffers()[1].address1543)154415451546def test_init_with_explicit_binary_schema() -> None:1547df = pl.DataFrame({"a": [b"hello", b"world"]}, schema={"a": pl.Binary})1548assert df.schema == {"a": pl.Binary}1549assert df["a"].to_list() == [b"hello", b"world"]15501551s = pl.Series("a", [b"hello", b"world"], dtype=pl.Binary)1552assert s.dtype == 
pl.Binary1553assert s.to_list() == [b"hello", b"world"]155415551556def test_nested_categorical() -> None:1557s = pl.Series([["a"]], dtype=pl.List(pl.Categorical))1558assert s.to_list() == [["a"]]1559assert s.dtype == pl.List(pl.Categorical)156015611562def test_datetime_date_subclasses() -> None:1563class FakeDate(date): ...15641565class FakeDateChild(FakeDate): ...15661567class FakeDatetime(FakeDate, datetime): ...15681569result = pl.Series([FakeDate(2020, 1, 1)])1570expected = pl.Series([date(2020, 1, 1)])1571assert_series_equal(result, expected)15721573result = pl.Series([FakeDateChild(2020, 1, 1)])1574expected = pl.Series([date(2020, 1, 1)])1575assert_series_equal(result, expected)15761577result = pl.Series([FakeDatetime(2020, 1, 1, 3)])1578expected = pl.Series([datetime(2020, 1, 1, 3)])1579assert_series_equal(result, expected)158015811582def test_list_null_constructor() -> None:1583s = pl.Series("a", [[None], [None]], dtype=pl.List(pl.Null))1584assert s.dtype == pl.List(pl.Null)1585assert s.to_list() == [[None], [None]]15861587# nested1588dtype = pl.List(pl.List(pl.Int8))1589values = [1590[],1591[[], []],1592[[33, 112]],1593]1594s = pl.Series(1595name="colx",1596values=values,1597dtype=dtype,1598)1599assert s.dtype == dtype1600assert s.to_list() == values16011602# nested1603# small order change has influence1604dtype = pl.List(pl.List(pl.Int8))1605values = [1606[[], []],1607[],1608[[33, 112]],1609]1610s = pl.Series(1611name="colx",1612values=values,1613dtype=dtype,1614)1615assert s.dtype == dtype1616assert s.to_list() == values161716181619def test_numpy_float_construction_av() -> None:1620np_dict = {"a": np.float64(1)}1621assert_frame_equal(pl.DataFrame(np_dict), pl.DataFrame({"a": 1.0}))162216231624def test_df_init_dict_raise_on_expression_input() -> None:1625with pytest.raises(1626TypeError,1627match="passing Expr objects to the DataFrame constructor is not supported",1628):1629pl.DataFrame({"a": pl.int_range(0, 3)})1630with 
pytest.raises(TypeError):1631pl.DataFrame({"a": pl.int_range(0, 3), "b": [3, 4, 5]})16321633# Passing a list of expressions is allowed1634df = pl.DataFrame({"a": [pl.int_range(0, 3)]})1635assert df.get_column("a").dtype.is_object()163616371638def test_df_schema_sequences() -> None:1639schema = [1640["address", pl.String],1641["key", pl.Int64],1642["value", pl.Float32],1643]1644df = pl.DataFrame(schema=schema) # type: ignore[arg-type]1645assert df.schema == {"address": pl.String, "key": pl.Int64, "value": pl.Float32}164616471648def test_df_schema_sequences_incorrect_length() -> None:1649schema = [1650["address", pl.String, pl.Int8],1651["key", pl.Int64],1652["value", pl.Float32],1653]1654with pytest.raises(ValueError):1655pl.DataFrame(schema=schema) # type: ignore[arg-type]165616571658@pytest.mark.parametrize(1659("input", "infer_func", "expected_dtype"),1660[1661("f8", numpy_char_code_to_dtype, pl.Float64),1662("f4", numpy_char_code_to_dtype, pl.Float32),1663("f2", numpy_char_code_to_dtype, pl.Float16),1664("i4", numpy_char_code_to_dtype, pl.Int32),1665("u1", numpy_char_code_to_dtype, pl.UInt8),1666("?", numpy_char_code_to_dtype, pl.Boolean),1667("m8", numpy_char_code_to_dtype, pl.Duration("us")),1668("M8", numpy_char_code_to_dtype, pl.Datetime("us")),1669],1670)1671def test_numpy_inference(1672input: Any,1673infer_func: Callable[[Any], PolarsDataType],1674expected_dtype: PolarsDataType,1675) -> None:1676result = infer_func(input)1677assert result == expected_dtype167816791680def test_array_construction() -> None:1681payload = [[1, 2, 3], None, [4, 2, 3]]16821683dtype = pl.Array(pl.Int64, 3)1684s = pl.Series(payload, dtype=dtype)1685assert s.dtype == dtype1686assert s.to_list() == payload16871688# inner type1689dtype = pl.Array(pl.UInt8, 2)1690payload = [[1, 2], None, [3, 4]]1691s = pl.Series(payload, dtype=dtype)1692assert s.dtype == dtype1693assert s.to_list() == payload16941695# create using schema1696df = pl.DataFrame(1697schema={1698"a": pl.Array(pl.Float32, 
3),1699"b": pl.Array(pl.Datetime("ms"), 5),1700}1701)1702assert df.dtypes == [1703pl.Array(pl.Float32, 3),1704pl.Array(pl.Datetime("ms"), 5),1705]1706assert df.rows() == []17071708# from dicts1709rows = [1710{"row_id": "a", "data": [1, 2, 3]},1711{"row_id": "b", "data": [2, 3, 4]},1712]1713schema = {"row_id": pl.String(), "data": pl.Array(inner=pl.Int64, shape=3)}1714df = pl.from_dicts(rows, schema=schema)1715assert df.schema == schema1716assert df.rows() == [("a", [1, 2, 3]), ("b", [2, 3, 4])]171717181719@pytest.mark.may_fail_auto_streaming1720def test_pycapsule_interface(df: pl.DataFrame) -> None:1721df = df.rechunk()1722pyarrow_table = df.to_arrow()17231724# Array via C data interface1725pyarrow_array = pyarrow_table["bools"].chunk(0)1726round_trip_series = pl.Series(PyCapsuleArrayHolder(pyarrow_array))1727assert df["bools"].equals(round_trip_series, check_dtypes=True, check_names=False)17281729# empty Array via C data interface1730empty_pyarrow_array = pa.array([], type=pyarrow_array.type)1731round_trip_series = pl.Series(PyCapsuleArrayHolder(empty_pyarrow_array))1732assert df["bools"].dtype == round_trip_series.dtype17331734# RecordBatch via C array interface1735pyarrow_record_batch = pyarrow_table.to_batches()[0]1736round_trip_df = pl.DataFrame(PyCapsuleArrayHolder(pyarrow_record_batch))1737assert df.equals(round_trip_df)17381739# ChunkedArray via C stream interface1740pyarrow_chunked_array = pyarrow_table["bools"]1741round_trip_series = pl.Series(PyCapsuleStreamHolder(pyarrow_chunked_array))1742assert df["bools"].equals(round_trip_series, check_dtypes=True, check_names=False)17431744# empty ChunkedArray via C stream interface1745empty_chunked_array = pa.chunked_array([], type=pyarrow_chunked_array.type)1746round_trip_series = pl.Series(PyCapsuleStreamHolder(empty_chunked_array))1747assert df["bools"].dtype == round_trip_series.dtype17481749# Table via C stream interface1750round_trip_df = pl.DataFrame(PyCapsuleStreamHolder(pyarrow_table))1751assert 
df.equals(round_trip_df)17521753# empty Table via C stream interface1754empty_df = df[:0].to_arrow()1755round_trip_df = pl.DataFrame(PyCapsuleStreamHolder(empty_df))1756orig_schema = df.schema1757round_trip_schema = round_trip_df.schema17581759# The "enum" schema is not preserved because categories are lost via C data1760# interface1761orig_schema.pop("enum")1762round_trip_schema.pop("enum")17631764assert orig_schema == round_trip_schema17651766# RecordBatchReader via C stream interface1767pyarrow_reader = pa.RecordBatchReader.from_batches(1768pyarrow_table.schema, pyarrow_table.to_batches()1769)1770round_trip_df = pl.DataFrame(PyCapsuleStreamHolder(pyarrow_reader))1771assert df.equals(round_trip_df)177217731774@pytest.mark.parametrize(1775"tz",1776[1777None,1778ZoneInfo("Asia/Tokyo"),1779ZoneInfo("Europe/Amsterdam"),1780ZoneInfo("UTC"),1781timezone.utc,1782],1783)1784def test_init_list_of_dicts_with_timezone(tz: Any) -> None:1785dt = datetime(2023, 1, 1, 0, 0, 0, 0, tzinfo=tz)17861787df = pl.DataFrame([{"dt": dt}, {"dt": dt}])1788expected = pl.DataFrame({"dt": [dt, dt]})1789assert_frame_equal(df, expected)17901791assert df.schema == {"dt": pl.Datetime("us", time_zone=tz)}179217931794@pytest.mark.parametrize(1795"tz",1796[1797None,1798ZoneInfo("Asia/Tokyo"),1799ZoneInfo("Europe/Amsterdam"),1800ZoneInfo("UTC"),1801timezone.utc,1802],1803)1804def test_init_list_of_nested_dicts_with_timezone(tz: Any) -> None:1805dt = datetime(2021, 1, 1, 0, 0, 0, 0, tzinfo=tz)1806data = [{"timestamp": {"content": datetime(2021, 1, 1, 0, 0, tzinfo=tz)}}]18071808df = pl.DataFrame(data).unnest("timestamp")1809expected = pl.DataFrame({"content": [dt]})1810assert_frame_equal(df, expected)18111812assert df.schema == {"content": pl.Datetime("us", time_zone=tz)}181318141815def test_init_from_subclassed_types() -> None:1816# more detailed test of one custom subclass...1817import codecs18181819class SuperSecretString(str):1820def __new__(cls, value: str) -> Self:1821return super().__new__(cls, 
value)18221823def __repr__(self) -> str:1824return codecs.encode(self, "rot_13")18251826w = "windmolen"1827sstr = SuperSecretString(w)18281829assert sstr == w1830assert isinstance(sstr, str)1831assert repr(sstr) == "jvaqzbyra"1832assert_series_equal(pl.Series([w, w]), pl.Series([sstr, sstr]))18331834# ...then validate across other basic types1835for BaseType, value in (1836(int, 42),1837(float, 5.5),1838(bytes, b"value"),1839(str, "value"),1840):18411842class SubclassedType(BaseType): # type: ignore[misc,valid-type]1843def __new__(cls, value: Any) -> Self:1844return super().__new__(cls, value) # type: ignore[no-any-return]18451846assert (1847pl.Series([value]).to_list() == pl.Series([SubclassedType(value)]).to_list()1848)184918501851def test_series_init_with_python_type_7737() -> None:1852assert pl.Series([], dtype=int).dtype == pl.Int64 # type: ignore[arg-type]1853assert pl.Series([], dtype=float).dtype == pl.Float64 # type: ignore[arg-type]1854assert pl.Series([], dtype=bool).dtype == pl.Boolean # type: ignore[arg-type]1855assert pl.Series([], dtype=str).dtype == pl.Utf8 # type: ignore[arg-type]18561857with pytest.raises(TypeError):1858pl.Series(["a"], dtype=int) # type: ignore[arg-type]18591860with pytest.raises(TypeError):1861pl.Series([True], dtype=str) # type: ignore[arg-type]186218631864def test_init_from_list_shape_6968() -> None:1865df1 = pl.DataFrame([[1, None], [2, None], [3, None]])1866df2 = pl.DataFrame([[None, None], [2, None], [3, None]])1867assert df1.shape == (2, 3)1868assert df2.shape == (2, 3)186918701871def test_dataframe_height() -> None:1872assert pl.DataFrame(height=10).shape == (10, 0)1873assert pl.DataFrame(pl.DataFrame(height=10)).shape == (10, 0)18741875assert_frame_equal(1876pl.DataFrame({"a": [0, 1, 2]}, height=3), pl.DataFrame({"a": [0, 1, 2]})1877)18781879with pytest.raises(1880pl.exceptions.ShapeError,1881match=r"height of data \(3\) does not match specified height \(99\)",1882):1883pl.DataFrame({"a": [0, 1, 2]}, 
height=99)18841885with pytest.raises(1886pl.exceptions.ShapeError,1887match=r"height of data \(3\) does not match specified height \(0\)",1888):1889pl.DataFrame({"a": [0, 1, 2]}, height=0)18901891with pytest.raises(1892pl.exceptions.ShapeError,1893match=r"height of data \(10\) does not match specified height \(5\)",1894):1895pl.DataFrame(pl.DataFrame(height=10), height=5)18961897assert_frame_equal(pl.DataFrame(height=10), pl.DataFrame(height=10))18981899with pytest.raises(AssertionError):1900assert_frame_equal(pl.DataFrame(height=5), pl.DataFrame(height=10))19011902with pytest.raises(AssertionError):1903assert_frame_equal(pl.DataFrame(), pl.DataFrame(height=10))190419051906