# Path: blob/main/py-polars/tests/unit/operations/test_group_by_dynamic.py
# 6939 views
"""Unit tests for ``group_by_dynamic`` on ``DataFrame`` and ``LazyFrame``."""

from __future__ import annotations

from datetime import date, datetime, timedelta, timezone
from typing import TYPE_CHECKING, Any
from zoneinfo import ZoneInfo

import numpy as np
import pytest

import polars as pl
from polars.exceptions import ComputeError, InvalidOperationError
from polars.testing import assert_frame_equal

if TYPE_CHECKING:
    from polars._typing import Label, StartBy


@pytest.mark.parametrize(
    ("input_df", "expected_grouped_df"),
    [
        (
            (
                pl.DataFrame(
                    {
                        "dt": [
                            datetime(2021, 12, 31, 0, 0, 0),
                            datetime(2022, 1, 1, 0, 0, 1),
                            datetime(2022, 3, 31, 0, 0, 1),
                            datetime(2022, 4, 1, 0, 0, 1),
                        ]
                    }
                )
            ),
            pl.DataFrame(
                {
                    "dt": [
                        datetime(2021, 10, 1),
                        datetime(2022, 1, 1),
                        datetime(2022, 4, 1),
                    ],
                    "num_points": [1, 2, 1],
                },
                schema={"dt": pl.Datetime, "num_points": pl.UInt32},
            ).sort("dt"),
        )
    ],
)
def test_group_by_dynamic(
    input_df: pl.DataFrame, expected_grouped_df: pl.DataFrame
) -> None:
    """Quarterly windows (`every="1q"`) count the datapoints in each quarter."""
    result = (
        input_df.sort("dt")
        .group_by_dynamic("dt", every="1q")
        .agg(pl.col("dt").count().alias("num_points"))
        .sort("dt")
    )
    assert_frame_equal(result, expected_grouped_df)


@pytest.mark.parametrize(
    ("every", "offset"),
    [
        ("3d", "-1d"),
        (timedelta(days=3), timedelta(days=-1)),
    ],
)
def test_dynamic_group_by_timezone_awareness(
    every: str | timedelta, offset: str | timedelta
) -> None:
    """Boundary and label columns keep the `ns`/`UTC` dtype of the index column."""
    df = pl.DataFrame(
        (
            pl.datetime_range(
                datetime(2020, 1, 1),
                datetime(2020, 1, 10),
                timedelta(days=1),
                time_unit="ns",
                eager=True,
            )
            .alias("datetime")
            .dt.replace_time_zone("UTC"),
            pl.arange(1, 11, eager=True).alias("value"),
        )
    )

    # Two boundary columns + the index column, followed by the aggregated value.
    assert (
        df.group_by_dynamic(
            "datetime",
            every=every,
            offset=offset,
            closed="right",
            include_boundaries=True,
            label="datapoint",
        ).agg(pl.col("value").last())
    ).dtypes == [pl.Datetime("ns", "UTC")] * 3 + [pl.Int64]


@pytest.mark.parametrize("tzinfo", [None, ZoneInfo("UTC"), ZoneInfo("Asia/Kathmandu")])
def test_group_by_dynamic_startby_5599(tzinfo: ZoneInfo | None) -> None:
    """Check `start_by` set to "datapoint", "monday" and "saturday" per time zone."""
    # start by datapoint
    start = datetime(2022, 12, 16, tzinfo=tzinfo)
    stop = datetime(2022, 12, 16, hour=3, tzinfo=tzinfo)
    df = pl.DataFrame({"date": pl.datetime_range(start, stop, "30m", eager=True)})

    assert df.group_by_dynamic(
        "date",
        every="31m",
        include_boundaries=True,
        label="datapoint",
        start_by="datapoint",
    ).agg(pl.len()).to_dict(as_series=False) == {
        "_lower_boundary": [
            datetime(2022, 12, 16, 0, 0, tzinfo=tzinfo),
            datetime(2022, 12, 16, 0, 31, tzinfo=tzinfo),
            datetime(2022, 12, 16, 1, 2, tzinfo=tzinfo),
            datetime(2022, 12, 16, 1, 33, tzinfo=tzinfo),
            datetime(2022, 12, 16, 2, 4, tzinfo=tzinfo),
            datetime(2022, 12, 16, 2, 35, tzinfo=tzinfo),
        ],
        "_upper_boundary": [
            datetime(2022, 12, 16, 0, 31, tzinfo=tzinfo),
            datetime(2022, 12, 16, 1, 2, tzinfo=tzinfo),
            datetime(2022, 12, 16, 1, 33, tzinfo=tzinfo),
            datetime(2022, 12, 16, 2, 4, tzinfo=tzinfo),
            datetime(2022, 12, 16, 2, 35, tzinfo=tzinfo),
            datetime(2022, 12, 16, 3, 6, tzinfo=tzinfo),
        ],
        "date": [
            datetime(2022, 12, 16, 0, 0, tzinfo=tzinfo),
            datetime(2022, 12, 16, 1, 0, tzinfo=tzinfo),
            datetime(2022, 12, 16, 1, 30, tzinfo=tzinfo),
            datetime(2022, 12, 16, 2, 0, tzinfo=tzinfo),
            datetime(2022, 12, 16, 2, 30, tzinfo=tzinfo),
            datetime(2022, 12, 16, 3, 0, tzinfo=tzinfo),
        ],
        "len": [2, 1, 1, 1, 1, 1],
    }

    # start by monday
    start = datetime(2022, 1, 1, tzinfo=tzinfo)
    stop = datetime(2022, 1, 12, 7, tzinfo=tzinfo)

    df = pl.DataFrame(
        {"date": pl.datetime_range(start, stop, "12h", eager=True)}
    ).with_columns(pl.col("date").dt.weekday().alias("day"))

    result = df.group_by_dynamic(
        "date",
        every="1w",
        period="3d",
        include_boundaries=True,
        start_by="monday",
        label="datapoint",
    ).agg([pl.len(), pl.col("day").first().alias("data_day")])
    assert result.to_dict(as_series=False) == {
        "_lower_boundary": [
            datetime(2022, 1, 3, 0, 0, tzinfo=tzinfo),
            datetime(2022, 1, 10, 0, 0, tzinfo=tzinfo),
        ],
        "_upper_boundary": [
            datetime(2022, 1, 6, 0, 0, tzinfo=tzinfo),
            datetime(2022, 1, 13, 0, 0, tzinfo=tzinfo),
        ],
        "date": [
            datetime(2022, 1, 3, 0, 0, tzinfo=tzinfo),
            datetime(2022, 1, 10, 0, 0, tzinfo=tzinfo),
        ],
        "len": [6, 5],
        "data_day": [1, 1],
    }
    # start by saturday
    result = df.group_by_dynamic(
        "date",
        every="1w",
        period="3d",
        include_boundaries=True,
        start_by="saturday",
        label="datapoint",
    ).agg([pl.len(), pl.col("day").first().alias("data_day")])
    assert result.to_dict(as_series=False) == {
        "_lower_boundary": [
            datetime(2022, 1, 1, 0, 0, tzinfo=tzinfo),
            datetime(2022, 1, 8, 0, 0, tzinfo=tzinfo),
        ],
        "_upper_boundary": [
            datetime(2022, 1, 4, 0, 0, tzinfo=tzinfo),
            datetime(2022, 1, 11, 0, 0, tzinfo=tzinfo),
        ],
        "date": [
            datetime(2022, 1, 1, 0, 0, tzinfo=tzinfo),
            datetime(2022, 1, 8, 0, 0, tzinfo=tzinfo),
        ],
        "len": [6, 6],
        "data_day": [6, 6],
    }


def test_group_by_dynamic_by_monday_and_offset_5444() -> None:
    """Weekly windows with `offset="1d"`, `start_by="monday"` and `group_by`.

    Also checks that an empty input produces the same schema as a non-empty one.
    """
    df = pl.DataFrame(
        {
            "date": [
                "2022-11-01",
                "2022-11-02",
                "2022-11-05",
                "2022-11-08",
                "2022-11-08",
                "2022-11-09",
                "2022-11-10",
            ],
            "label": ["a", "b", "a", "a", "b", "a", "b"],
            "value": [1, 2, 3, 4, 5, 6, 7],
        }
    ).with_columns(pl.col("date").str.strptime(pl.Date, "%Y-%m-%d").set_sorted())

    result = df.group_by_dynamic(
        "date", every="1w", offset="1d", group_by="label", start_by="monday"
    ).agg(pl.col("value").sum())

    assert result.to_dict(as_series=False) == {
        "label": ["a", "a", "b", "b"],
        "date": [
            date(2022, 11, 1),
            date(2022, 11, 8),
            date(2022, 11, 1),
            date(2022, 11, 8),
        ],
        "value": [4, 10, 2, 12],
    }

    # test empty
    result_empty = (
        df.filter(pl.col("date") == date(1, 1, 1))
        .group_by_dynamic(
            "date", every="1w", offset="1d", group_by="label", start_by="monday"
        )
        .agg(pl.col("value").sum())
    )
    assert result_empty.schema == result.schema


@pytest.mark.parametrize(
    ("label", "expected"),
    [
        ("left", [datetime(2020, 1, 1), datetime(2020, 1, 2)]),
        ("right", [datetime(2020, 1, 2), datetime(2020, 1, 3)]),
        ("datapoint", [datetime(2020, 1, 1, 1), datetime(2020, 1, 2, 3)]),
    ],
)
def test_group_by_dynamic_label(label: Label, expected: list[datetime]) -> None:
    """Check each `label` option on daily windows when `group_by` is passed.

    NOTE(review): this test passes `group_by` while
    `test_group_by_dynamic_label_with_by` does not; the two names look swapped.
    """
    df = pl.DataFrame(
        {
            "ts": [
                datetime(2020, 1, 1, 1),
                datetime(2020, 1, 1, 2),
                datetime(2020, 1, 2, 3),
                datetime(2020, 1, 2, 4),
            ],
            "n": [1, 2, 3, 4],
            "group": ["a", "a", "b", "b"],
        }
    ).sort("ts")
    result = (
        df.group_by_dynamic("ts", every="1d", label=label, group_by="group")
        .agg(pl.col("n"))["ts"]
        .to_list()
    )
    assert result == expected


@pytest.mark.parametrize(
    ("label", "expected"),
    [
        ("left", [datetime(2020, 1, 1), datetime(2020, 1, 2), datetime(2020, 1, 3)]),
        ("right", [datetime(2020, 1, 2), datetime(2020, 1, 3), datetime(2020, 1, 4)]),
        (
            "datapoint",
            [datetime(2020, 1, 1, 1), datetime(2020, 1, 2, 2), datetime(2020, 1, 3, 3)],
        ),
    ],
)
def test_group_by_dynamic_label_with_by(label: Label, expected: list[datetime]) -> None:
    """Check each `label` option on daily windows without `group_by`.

    NOTE(review): despite the `_with_by` suffix no `group_by` is passed here;
    `test_group_by_dynamic_label` is the variant that uses it.
    """
    df = pl.DataFrame(
        {
            "ts": [
                datetime(2020, 1, 1, 1),
                datetime(2020, 1, 2, 2),
                datetime(2020, 1, 3, 3),
            ],
            "n": [1, 2, 3],
        }
    ).sort("ts")
    result = (
        df.group_by_dynamic("ts", every="1d", label=label)
        .agg(pl.col("n"))["ts"]
        .to_list()
    )
    assert result == expected


def test_group_by_dynamic_slice_pushdown() -> None:
    """`head(2)` on a lazy dynamic aggregation returns the first two result rows."""
    df = pl.DataFrame({"a": [1, 2, 3], "b": ["a", "a", "b"], "c": [1, 3, 5]}).lazy()
    df = (
        df.sort("a")
        .group_by_dynamic("a", group_by="b", every="2i")
        .agg((pl.col("c") - pl.col("c").shift(fill_value=0)).sum().alias("c"))
    )
    assert df.head(2).collect().to_dict(as_series=False) == {
        "b": ["a", "a"],
        "a": [0, 2],
        "c": [1, 3],
    }


def test_rolling_kernels_group_by_dynamic_7548() -> None:
    """Overlapping integer windows (`every="1i"`, `period="3i"`): min, max and sum."""
    assert pl.DataFrame(
        {"time": pl.arange(0, 4, eager=True), "value": pl.arange(0, 4, eager=True)}
    ).group_by_dynamic("time", every="1i", period="3i").agg(
        pl.col("value"),
        pl.col("value").min().alias("min_value"),
        pl.col("value").max().alias("max_value"),
        pl.col("value").sum().alias("sum_value"),
    ).to_dict(as_series=False) == {
        "time": [0, 1, 2, 3],
        "value": [[0, 1, 2], [1, 2, 3], [2, 3], [3]],
        "min_value": [0, 1, 2, 3],
        "max_value": [2, 3, 3, 3],
        "sum_value": [3, 6, 5, 3],
    }


def test_rolling_dynamic_sortedness_check() -> None:
    """An unsorted index column raises, both with and without `group_by`."""
    # when the by argument is passed, the sortedness flag
    # will be unset as the take shuffles data, so we must explicitly
    # check the sortedness
    df = pl.DataFrame(
        {
            "idx": [1, 2, -1, 2, 1, 1],
            "group": [1, 1, 1, 2, 2, 1],
        }
    )

    with pytest.raises(ComputeError, match=r"input data is not sorted"):
        df.group_by_dynamic("idx", every="2i", group_by="group").agg(
            pl.col("idx").alias("idx1")
        )

    # no `by` argument
    with pytest.raises(
        InvalidOperationError,
        match=r"argument in operation 'group_by_dynamic' is not sorted",
    ):
        df.group_by_dynamic("idx", every="2i").agg(pl.col("idx").alias("idx1"))


@pytest.mark.parametrize("time_zone", [None, "UTC", "Asia/Kathmandu"])
def test_group_by_dynamic_elementwise_following_mean_agg_6904(
    time_zone: str | None,
) -> None:
    """An elementwise `sin` applied after a `mean` aggregation is evaluated per window."""
    df = (
        pl.DataFrame(
            {
                "a": [datetime(2021, 1, 1) + timedelta(seconds=2**i) for i in range(5)],
                "b": [float(i) for i in range(5)],
            }
        )
        .with_columns(pl.col("a").dt.replace_time_zone(time_zone))
        .lazy()
        .set_sorted("a")
        .group_by_dynamic("a", every="10s", period="100s")
        .agg([pl.col("b").mean().sin().alias("c")])
        .collect()
    )
    assert_frame_equal(
        df,
        pl.DataFrame(
            {
                "a": [
                    datetime(2021, 1, 1, 0, 0),
                    datetime(2021, 1, 1, 0, 0, 10),
                ],
                "c": [0.9092974268256817, -0.7568024953079282],
            }
        ).with_columns(pl.col("a").dt.replace_time_zone(time_zone)),
    )


@pytest.mark.parametrize("every", ["1h", timedelta(hours=1)])
@pytest.mark.parametrize("tzinfo", [None, ZoneInfo("UTC"), ZoneInfo("Asia/Kathmandu")])
def test_group_by_dynamic_lazy(every: str | timedelta, tzinfo: ZoneInfo | None) -> None:
    """Lazy hourly windows with `closed="right"` yield the expected min/max times."""
    ldf = pl.LazyFrame(
        {
            "time": pl.datetime_range(
                start=datetime(2021, 12, 16, tzinfo=tzinfo),
                end=datetime(2021, 12, 16, 2, tzinfo=tzinfo),
                interval="30m",
                eager=True,
            ),
            "n": range(5),
        }
    )
    df = (
        ldf.group_by_dynamic("time", every=every, closed="right")
        .agg(
            [
                pl.col("time").min().alias("time_min"),
                pl.col("time").max().alias("time_max"),
            ]
        )
        .collect()
    )
    assert sorted(df.rows()) == [
        (
            datetime(2021, 12, 15, 23, 0, tzinfo=tzinfo),
            datetime(2021, 12, 16, 0, 0, tzinfo=tzinfo),
            datetime(2021, 12, 16, 0, 0, tzinfo=tzinfo),
        ),
        (
            datetime(2021, 12, 16, 0, 0, tzinfo=tzinfo),
            datetime(2021, 12, 16, 0, 30, tzinfo=tzinfo),
            datetime(2021, 12, 16, 1, 0, tzinfo=tzinfo),
        ),
        (
            datetime(2021, 12, 16, 1, 0, tzinfo=tzinfo),
            datetime(2021, 12, 16, 1, 30, tzinfo=tzinfo),
            datetime(2021, 12, 16, 2, 0, tzinfo=tzinfo),
        ),
    ]


def test_group_by_dynamic_validation() -> None:
    """A negative `every` raises `ComputeError`."""
    df = pl.DataFrame(
        {
            "index": [0, 0, 1, 1],
            "group": ["banana", "pear", "banana", "pear"],
            "weight": [2, 3, 5, 7],
        }
    )

    with pytest.raises(ComputeError, match="'every' argument must be positive"):
        df.group_by_dynamic("index", group_by="group", every="-1i", period="2i").agg(
            pl.col("weight")
        )


def test_no_sorted_no_error() -> None:
    """Ordered data without an explicit sorted flag is grouped without error."""
    df = pl.DataFrame(
        {
            "dt": [datetime(2001, 1, 1), datetime(2001, 1, 2)],
        }
    )
    result = df.group_by_dynamic("dt", every="1h").agg(pl.len().alias("count"))
    expected = pl.DataFrame(
        {
            "dt": [datetime(2001, 1, 1), datetime(2001, 1, 2)],
            "count": [1, 1],
        },
        schema_overrides={"count": pl.get_index_type()},
    )
    assert_frame_equal(result, expected)


@pytest.mark.parametrize("tzinfo", [None, ZoneInfo("UTC"), ZoneInfo("Asia/Kathmandu")])
def test_truncate_negative_offset(tzinfo: ZoneInfo | None) -> None:
    """Window starts with a negative monthly offset, with `group_by`, and on ints."""
    time_zone = tzinfo.key if tzinfo is not None else None
    df = pl.DataFrame(
        {
            "event_date": [
                datetime(2021, 4, 11),
                datetime(2021, 4, 29),
                datetime(2021, 5, 29),
            ],
            "adm1_code": [1, 2, 1],
        }
    ).set_sorted("event_date")
    df = df.with_columns(pl.col("event_date").dt.replace_time_zone(time_zone))
    out = df.group_by_dynamic(
        index_column="event_date",
        every="1mo",
        period="2mo",
        offset="-1mo",
        include_boundaries=True,
    ).agg(
        [
            pl.col("adm1_code"),
        ]
    )

    assert out["event_date"].to_list() == [
        datetime(2021, 3, 1, tzinfo=tzinfo),
        datetime(2021, 4, 1, tzinfo=tzinfo),
        datetime(2021, 5, 1, tzinfo=tzinfo),
    ]
    df = pl.DataFrame(
        {
            "event_date": [
                datetime(2021, 4, 11),
                datetime(2021, 4, 29),
                datetime(2021, 5, 29),
            ],
            "adm1_code": [1, 2, 1],
            "five_type": ["a", "b", "a"],
            "actor": ["a", "a", "a"],
            "admin": ["a", "a", "a"],
            "fatalities": [10, 20, 30],
        }
    ).set_sorted("event_date")
    df = df.with_columns(pl.col("event_date").dt.replace_time_zone(time_zone))

    out = df.group_by_dynamic(
        index_column="event_date",
        every="1mo",
        group_by=["admin", "five_type", "actor"],
    ).agg([pl.col("adm1_code").unique(), (pl.col("fatalities") > 0).sum()])

    assert out["event_date"].to_list() == [
        datetime(2021, 4, 1, tzinfo=tzinfo),
        datetime(2021, 5, 1, tzinfo=tzinfo),
        datetime(2021, 4, 1, tzinfo=tzinfo),
    ]

    # The same integer-index scenario is checked for both integer widths.
    for dt in [pl.Int32, pl.Int64]:
        df = (
            pl.DataFrame(
                {
                    "idx": np.arange(6),
                    "A": ["A", "A", "B", "B", "B", "C"],
                }
            )
            .with_columns(pl.col("idx").cast(dt))
            .set_sorted("idx")
        )

        out = df.group_by_dynamic(
            "idx", every="2i", period="3i", include_boundaries=True
        ).agg(pl.col("A"))

        assert out.shape == (3, 4)
        assert out["A"].to_list() == [
            ["A", "A", "B"],
            ["B", "B", "B"],
            ["B", "C"],
        ]


# NOTE(review): "groupy" in the name below looks like a typo for "group"; the name
# is left unchanged so existing test IDs keep working.
def test_groupy_by_dynamic_median_10695() -> None:
    """Median over overlapping 3-minute windows evaluated every 60 seconds."""
    df = pl.DataFrame(
        {
            "timestamp": pl.datetime_range(
                datetime(2023, 8, 22, 15, 44, 30),
                datetime(2023, 8, 22, 15, 48, 50),
                "20s",
                eager=True,
            ),
            "foo": [2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        }
    )

    assert df.group_by_dynamic(
        index_column="timestamp",
        every="60s",
        period="3m",
    ).agg(pl.col("foo").median()).to_dict(as_series=False) == {
        "timestamp": [
            datetime(2023, 8, 22, 15, 44),
            datetime(2023, 8, 22, 15, 45),
            datetime(2023, 8, 22, 15, 46),
            datetime(2023, 8, 22, 15, 47),
            datetime(2023, 8, 22, 15, 48),
        ],
        "foo": [1.0, 1.0, 1.0, 1.0, 1.0],
    }


def test_group_by_dynamic_when_conversion_crosses_dates_7274() -> None:
    """Daily windows follow the column's zone: one group in Lagos, two in UTC."""
    df = (
        pl.DataFrame(
            data={
                "timestamp": ["1970-01-01 00:00:00+01:00", "1970-01-01 01:00:00+01:00"],
                "value": [1, 1],
            }
        )
        .with_columns(
            pl.col("timestamp")
            .str.strptime(pl.Datetime, format="%Y-%m-%d %H:%M:%S%:z")
            .dt.convert_time_zone("Africa/Lagos")
            .set_sorted()
        )
        .with_columns(
            pl.col("timestamp")
            .dt.convert_time_zone("UTC")
            .alias("timestamp_utc")
            .set_sorted()
        )
    )
    result = df.group_by_dynamic(
        index_column="timestamp", every="1d", closed="left"
    ).agg(pl.col("value").count())
    expected = pl.DataFrame({"timestamp": [datetime(1970, 1, 1)], "value": [2]})
    expected = expected.with_columns(
        pl.col("timestamp").dt.replace_time_zone("Africa/Lagos"),
        pl.col("value").cast(pl.UInt32),
    )
    assert_frame_equal(result, expected)
    result = df.group_by_dynamic(
        index_column="timestamp_utc", every="1d", closed="left"
    ).agg(pl.col("value").count())
    expected = pl.DataFrame(
        {
            "timestamp_utc": [datetime(1969, 12, 31), datetime(1970, 1, 1)],
            "value": [1, 1],
        }
    )
    expected = expected.with_columns(
        pl.col("timestamp_utc").dt.replace_time_zone("UTC"),
        pl.col("value").cast(pl.UInt32),
    )
    assert_frame_equal(result, expected)


@pytest.mark.parametrize("time_zone", [None, "UTC", "Asia/Kathmandu"])
def test_default_negative_every_offset_dynamic_group_by(time_zone: str | None) -> None:
    """Monthly `closed="right"` windows put a boundary point in the earlier window."""
    # 2791
    dts = [
        datetime(2020, 1, 1),
        datetime(2020, 1, 2),
        datetime(2020, 2, 1),
        datetime(2020, 3, 1),
    ]
    df = pl.DataFrame({"dt": dts, "idx": range(len(dts))}).set_sorted("dt")
    df = df.with_columns(pl.col("dt").dt.replace_time_zone(time_zone))
    out = df.group_by_dynamic(index_column="dt", every="1mo", closed="right").agg(
        pl.col("idx")
    )

    expected = pl.DataFrame(
        {
            "dt": [
                datetime(2019, 12, 1, 0, 0),
                datetime(2020, 1, 1, 0, 0),
                datetime(2020, 2, 1, 0, 0),
            ],
            "idx": [[0], [1, 2], [3]],
        }
    )
    expected = expected.with_columns(pl.col("dt").dt.replace_time_zone(time_zone))
    assert_frame_equal(out, expected)


@pytest.mark.parametrize(
    ("rule", "offset"),
    [
        ("1h", timedelta(hours=2)),
        ("1d", timedelta(days=2)),
        ("1w", timedelta(weeks=2)),
    ],
)
def test_group_by_dynamic_crossing_dst(rule: str, offset: timedelta) -> None:
    """With `start_by="datapoint"` every US/Central datapoint gets its own window."""
    start_dt = datetime(2021, 11, 7)
    end_dt = start_dt + offset
    date_range = pl.datetime_range(
        start_dt, end_dt, rule, time_zone="US/Central", eager=True
    )
    df = pl.DataFrame({"time": date_range, "value": range(len(date_range))})
    result = df.group_by_dynamic("time", every=rule, start_by="datapoint").agg(
        pl.col("value").mean()
    )
    expected = pl.DataFrame(
        {"time": date_range, "value": range(len(date_range))},
        schema_overrides={"value": pl.Float64},
    )
    assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    ("start_by", "expected_time", "expected_value"),
    [
        (
            "monday",
            [
                datetime(2021, 11, 1),
                datetime(2021, 11, 8),
            ],
            [0.0, 4.0],
        ),
        (
            "tuesday",
            [
                datetime(2021, 11, 2),
                datetime(2021, 11, 9),
            ],
            [0.5, 4.5],
        ),
        (
            "wednesday",
            [
                datetime(2021, 11, 3),
                datetime(2021, 11, 10),
            ],
            [1.0, 5.0],
        ),
        (
            "thursday",
            [
                datetime(2021, 11, 4),
                datetime(2021, 11, 11),
            ],
            [1.5, 5.5],
        ),
        (
            "friday",
            [
                datetime(2021, 11, 5),
                datetime(2021, 11, 12),
            ],
            [2.0, 6.0],
        ),
        (
            "saturday",
            [
                datetime(2021, 11, 6),
                datetime(2021, 11, 13),
            ],
            [2.5, 6.5],
        ),
        (
            "sunday",
            [
                datetime(2021, 11, 7),
                datetime(2021, 11, 14),
            ],
            [3.0, 7.0],
        ),
    ],
)
def test_group_by_dynamic_startby_monday_crossing_dst(
    start_by: StartBy, expected_time: list[datetime], expected_value: list[float]
) -> None:
    """Weekly windows started on each weekday over a US/Central daily range."""
    start_dt = datetime(2021, 11, 7)
    end_dt = datetime(2021, 11, 14)
    date_range = pl.datetime_range(
        start_dt, end_dt, "1d", time_zone="US/Central", eager=True
    )
    df = pl.DataFrame({"time": date_range, "value": range(len(date_range))})
    result = df.group_by_dynamic("time", every="1w", start_by=start_by).agg(
        pl.col("value").mean()
    )
    expected = pl.DataFrame(
        {"time": expected_time, "value": expected_value},
    )
    expected = expected.with_columns(pl.col("time").dt.replace_time_zone("US/Central"))
    assert_frame_equal(result, expected)


def test_group_by_dynamic_startby_monday_dst_8737() -> None:
    """Two US/Central datapoints one day apart share one Monday-started week."""
    start_dt = datetime(2021, 11, 6, 20)
    stop_dt = datetime(2021, 11, 7, 20)
    date_range = pl.datetime_range(
        start_dt, stop_dt, "1d", time_zone="US/Central", eager=True
    )
    df = pl.DataFrame({"time": date_range, "value": range(len(date_range))})
    result = df.group_by_dynamic("time", every="1w", start_by="monday").agg(
        pl.col("value").mean()
    )
    expected = pl.DataFrame(
        {
            "time": [
                datetime(2021, 11, 1),
            ],
            "value": [0.5],
        },
    )
    expected = expected.with_columns(pl.col("time").dt.replace_time_zone("US/Central"))
    assert_frame_equal(result, expected)


def test_group_by_dynamic_monthly_crossing_dst() -> None:
    """Monthly windows over a US/Central range give one window per datapoint."""
    start_dt = datetime(2021, 11, 1)
    end_dt = datetime(2021, 12, 1)
    date_range = pl.datetime_range(
        start_dt, end_dt, "1mo", time_zone="US/Central", eager=True
    )
    df = pl.DataFrame({"time": date_range, "value": range(len(date_range))})
    result = df.group_by_dynamic("time", every="1mo").agg(pl.col("value").mean())
    expected = pl.DataFrame(
        {"time": date_range, "value": range(len(date_range))},
        schema_overrides={"value": pl.Float64},
    )
    assert_frame_equal(result, expected)


def test_group_by_dynamic_2d_9333() -> None:
    """A single datapoint with `every="2d"` is labelled with window start 1999-12-31."""
    df = pl.DataFrame({"ts": [datetime(2000, 1, 1, 3)], "values": [10.0]})
    df = df.with_columns(pl.col("ts").set_sorted())
    result = df.group_by_dynamic("ts", every="2d").agg(pl.col("values"))
    expected = pl.DataFrame({"ts": [datetime(1999, 12, 31, 0)], "values": [[10.0]]})
    assert_frame_equal(result, expected)


@pytest.mark.parametrize("every", ["1h", timedelta(hours=1)])
@pytest.mark.parametrize("tzinfo", [None, ZoneInfo("UTC"), ZoneInfo("Asia/Kathmandu")])
def test_group_by_dynamic_iter(every: str | timedelta, tzinfo: ZoneInfo | None) -> None:
    """Iterating a dynamic group-by yields `(key tuple, frame)` pairs."""
    time_zone = tzinfo.key if tzinfo is not None else None
    df = pl.DataFrame(
        {
            "datetime": [
                datetime(2020, 1, 1, 10, 0),
                datetime(2020, 1, 1, 10, 50),
                datetime(2020, 1, 1, 11, 10),
            ],
            "a": [1, 2, 2],
            "b": [4, 5, 6],
        }
    ).set_sorted("datetime")
    df = df.with_columns(pl.col("datetime").dt.replace_time_zone(time_zone))

    # Without 'by' argument
    result1 = [
        (name, data.shape)
        for name, data in df.group_by_dynamic("datetime", every=every, closed="left")
    ]
    expected1 = [
        ((datetime(2020, 1, 1, 10, tzinfo=tzinfo),), (2, 3)),
        ((datetime(2020, 1, 1, 11, tzinfo=tzinfo),), (1, 3)),
    ]
    assert result1 == expected1

    # With 'by' argument
    result2 = [
        (name, data.shape)
        for name, data in df.group_by_dynamic(
            "datetime", every=every, closed="left", group_by="a"
        )
    ]
    expected2 = [
        ((1, datetime(2020, 1, 1, 10, tzinfo=tzinfo)), (1, 3)),
        ((2, datetime(2020, 1, 1, 10, tzinfo=tzinfo)), (1, 3)),
        ((2, datetime(2020, 1, 1, 11, tzinfo=tzinfo)), (1, 3)),
    ]
    assert result2 == expected2


# https://github.com/pola-rs/polars/issues/11339
@pytest.mark.parametrize("include_boundaries", [True, False])
def test_group_by_dynamic_lazy_schema(include_boundaries: bool) -> None:
    """The lazily resolved schema equals the schema of the collected result."""
    lf = pl.LazyFrame(
        {
            "dt": pl.datetime_range(
                start=datetime(2022, 2, 10),
                end=datetime(2022, 2, 12),
                eager=True,
            ),
            "n": range(3),
        }
    )

    result = lf.group_by_dynamic(
        "dt", every="2d", closed="right", include_boundaries=include_boundaries
    ).agg(pl.col("dt").min().alias("dt_min"))

    assert result.collect_schema() == result.collect().schema


def test_group_by_dynamic_12414() -> None:
    """`every="6mo"`, `period="3d"`, `start_by="datapoint"` on a Date index."""
    df = pl.DataFrame(
        {
            "today": [
                date(2023, 3, 3),
                date(2023, 8, 31),
                date(2023, 9, 1),
                date(2023, 9, 4),
            ],
            "b": [1, 2, 3, 4],
        }
    ).sort("today")
    assert df.group_by_dynamic(
        "today",
        every="6mo",
        period="3d",
        closed="left",
        start_by="datapoint",
        include_boundaries=True,
    ).agg(
        gt_min_count=(pl.col.b >= (pl.col.b.min())).sum(),
    ).to_dict(as_series=False) == {
        "_lower_boundary": [datetime(2023, 3, 3, 0, 0), datetime(2023, 9, 3, 0, 0)],
        "_upper_boundary": [datetime(2023, 3, 6, 0, 0), datetime(2023, 9, 6, 0, 0)],
        "today": [date(2023, 3, 3), date(2023, 9, 3)],
        "gt_min_count": [1, 1],
    }


# NOTE(review): the `input` parameter shadows the builtin; it is kept because the
# parametrize argument name feeds into the generated test IDs.
@pytest.mark.parametrize("input", [[pl.col("b").sum()], pl.col("b").sum()])
def test_group_by_dynamic_agg_input_types(input: Any) -> None:
    """`agg` accepts either a single expression or a list of expressions."""
    df = pl.LazyFrame({"index_column": [0, 1, 2, 3], "b": [1, 3, 1, 2]}).set_sorted(
        "index_column"
    )
    result = df.group_by_dynamic(
        index_column="index_column", every="2i", closed="right"
    ).agg(input)

    expected = pl.LazyFrame({"index_column": [-2, 0, 2], "b": [1, 4, 2]})
    assert_frame_equal(result, expected)


@pytest.mark.parametrize("input", [str, "b".join])
def test_group_by_dynamic_agg_bad_input_types(input: Any) -> None:
    """`agg` raises `TypeError` for callables that are not expressions."""
    df = pl.LazyFrame({"index_column": [0, 1, 2, 3], "b": [1, 3, 1, 2]}).set_sorted(
        "index_column"
    )
    with pytest.raises(TypeError):
        df.group_by_dynamic(
            index_column="index_column", every="2i", closed="right"
        ).agg(input)


def test_group_by_dynamic_15225() -> None:
    """Two-day windows on a Date index, with and without `group_by`."""
    df = pl.DataFrame(
        {
            "a": [1, 2, 3],
            "b": [date(2020, 1, 1), date(2020, 1, 2), date(2020, 1, 3)],
            "c": [1, 1, 2],
        }
    )
    result = df.group_by_dynamic("b", every="2d").agg(pl.sum("a"))
    expected = pl.DataFrame({"b": [date(2020, 1, 1), date(2020, 1, 3)], "a": [3, 3]})
    assert_frame_equal(result, expected)
    result = df.group_by_dynamic("b", every="2d", group_by="c").agg(pl.sum("a"))
    expected = pl.DataFrame(
        {"c": [1, 2], "b": [date(2020, 1, 1), date(2020, 1, 3)], "a": [3, 3]}
    )
    assert_frame_equal(result, expected)


@pytest.mark.parametrize("start_by", ["window", "friday"])
def test_earliest_point_included_when_offset_is_set_15241(start_by: StartBy) -> None:
    """With a 5-hour offset, points before 05:00 land in the previous day's window."""
    df = pl.DataFrame(
        data={
            "t": pl.Series(
                [
                    datetime(2024, 3, 22, 3, 0, tzinfo=timezone.utc),
                    datetime(2024, 3, 22, 4, 0, tzinfo=timezone.utc),
                    datetime(2024, 3, 22, 5, 0, tzinfo=timezone.utc),
                    datetime(2024, 3, 22, 6, 0, tzinfo=timezone.utc),
                ]
            ),
            "v": [1, 10, 100, 1000],
        }
    ).set_sorted("t")
    result = df.group_by_dynamic(
        index_column="t",
        every="1d",
        offset=timedelta(hours=5),
        start_by=start_by,
    ).agg("v")
    expected = pl.DataFrame(
        {
            "t": [
                datetime(2024, 3, 21, 5, 0, tzinfo=timezone.utc),
                datetime(2024, 3, 22, 5, 0, tzinfo=timezone.utc),
            ],
            "v": [[1, 10], [100, 1000]],
        }
    )
    assert_frame_equal(result, expected)


def test_group_by_dynamic_invalid() -> None:
    """Mismatched duration kinds raise: `i` on datetimes and `d` on an integer index."""
    df = pl.DataFrame(
        {
            "values": [1, 4],
            "times": [datetime(2020, 1, 3), datetime(2020, 1, 1)],
        },
    )
    with pytest.raises(
        InvalidOperationError, match="duration may not be a parsed integer"
    ):
        (
            df.sort("times")
            .group_by_dynamic("times", every="3000i")
            .agg(pl.col("values").sum().alias("sum"))
        )
    with pytest.raises(
        InvalidOperationError, match="duration must be a parsed integer"
    ):
        (
            df.with_row_index()
            .group_by_dynamic("index", every="3000d")
            .agg(pl.col("values").sum().alias("sum"))
        )


def test_group_by_dynamic_get() -> None:
    """`get(1)` returns the second element of each window."""
    df = pl.DataFrame(
        {
            "time": pl.date_range(pl.date(2021, 1, 1), pl.date(2021, 1, 8), eager=True),
            "data": pl.arange(8, eager=True),
        }
    )

    assert df.group_by_dynamic(
        index_column="time",
        every="2d",
        period="3d",
        start_by="datapoint",
    ).agg(
        get=pl.col("data").get(1),
    ).to_dict(as_series=False) == {
        "time": [
            date(2021, 1, 1),
            date(2021, 1, 3),
            date(2021, 1, 5),
            date(2021, 1, 7),
        ],
        "get": [1, 3, 5, 7],
    }


def test_group_by_dynamic_exclude_index_from_expansion_17075() -> None:
    """`.last()` on a lazy dynamic group-by returns the last `n`/`m` per window."""
    lf = pl.LazyFrame(
        {
            "time": pl.datetime_range(
                start=datetime(2021, 12, 16),
                end=datetime(2021, 12, 16, 3),
                interval="30m",
                eager=True,
            ),
            "n": range(7),
            "m": range(7),
        }
    )

    assert lf.group_by_dynamic(
        "time", every="1h", closed="right"
    ).last().collect().to_dict(as_series=False) == {
        "time": [
            datetime(2021, 12, 15, 23, 0),
            datetime(2021, 12, 16, 0, 0),
            datetime(2021, 12, 16, 1, 0),
            datetime(2021, 12, 16, 2, 0),
        ],
        "n": [0, 2, 4, 6],
        "m": [0, 2, 4, 6],
    }


def test_group_by_dynamic_overlapping_19704() -> None:
    """Calendar `every="1mo"` with a fixed `period="45d"` gives overlapping windows."""
    df = pl.DataFrame(
        {
            "a": [datetime(2020, 1, 1), datetime(2020, 2, 1), datetime(2020, 3, 1)],
            "b": [1, 2, 3],
        }
    )
    result = df.group_by_dynamic(
        "a", every="1mo", period="45d", include_boundaries=True
    ).agg(pl.col("b").sum())
    expected = pl.DataFrame(
        {
            "_lower_boundary": [
                datetime(2020, 1, 1, 0, 0),
                datetime(2020, 2, 1, 0, 0),
                datetime(2020, 3, 1, 0, 0),
            ],
            "_upper_boundary": [
                datetime(2020, 2, 15, 0, 0),
                datetime(2020, 3, 17, 0, 0),
                datetime(2020, 4, 15, 0, 0),
            ],
            "a": [
                datetime(2020, 1, 1, 0, 0),
                datetime(2020, 2, 1, 0, 0),
                datetime(2020, 3, 1, 0, 0),
            ],
            "b": [3, 5, 3],
        }
    )
    assert_frame_equal(result, expected)


def test_group_by_dynamic_single_row_22585() -> None:
    """A single-row frame with `every="1y"` and `group_by` yields one group."""
    df = pl.DataFrame({"date": [date(2025, 1, 1)], "group": ["x"]})
    out = df.group_by_dynamic("date", every="1y", group_by=["group"]).agg(pl.len())
    expected = pl.DataFrame(
        {"group": ["x"], "date": [date(2025, 1, 1)], "len": [1]}
    ).with_columns(pl.col("len").cast(pl.UInt32))
    assert_frame_equal(expected, out)


def test_group_by_dynamic_zero_sum_23433() -> None:
    """Summing a window that contains only nulls yields 0."""
    df = pl.DataFrame(
        {
            "g": [0, 0, 1, 1, 2, 2, 2, 3, 3],
            "x": [None, None, None, None, None, None, None, 1, 2],
        }
    )
    out = df.group_by_dynamic("g", every="1i", period="2i").agg(pl.col.x.sum())
    expected = pl.DataFrame({"g": [0, 1, 2, 3], "x": [0, 0, 3, 3]})
    assert_frame_equal(out, expected)


def test_group_by_dynamic_null_mean_22724() -> None:
    """The mean of a window that contains only nulls is null."""
    time = pl.datetime_range(
        start=datetime(2025, 1, 1, 0, 0, 00),
        end=datetime(2025, 1, 1, 0, 0, 10),
        interval="1s",
        eager=True,
    )

    b = pl.DataFrame(
        {
            "time": time,
            "value": [None, None, None, 0, None, None, None, None, -1, None, None],
            "empty": [None] * len(time),
        }
    ).cast({"value": pl.Float32, "empty": pl.Float32})
    gb = b.group_by_dynamic("time", every="2s", period="3s", offset="-3s")
    out = gb.agg([pl.col("value").cast(pl.Float32).mean()])

    expected = pl.DataFrame(
        {
            "time": pl.Series(
                [
                    datetime(2024, 12, 31, 23, 59, 59),
                    datetime(2025, 1, 1, 0, 0, 1),
                    datetime(2025, 1, 1, 0, 0, 3),
                    datetime(2025, 1, 1, 0, 0, 5),
                    datetime(2025, 1, 1, 0, 0, 7),
                    datetime(2025, 1, 1, 0, 0, 9),
                ],
                dtype=pl.Datetime(time_unit="us", time_zone=None),
            ),
            "value": pl.Series([None, 0.0, 0.0, None, -1.0, None], dtype=pl.Float32),
        }
    )
    assert_frame_equal(out, expected)