Path: blob/main/py-polars/tests/unit/dataframe/test_upsample.py
6939 views
from __future__ import annotations

from datetime import date, datetime
from typing import TYPE_CHECKING
from zoneinfo import ZoneInfo

import pytest

import polars as pl
from polars.exceptions import InvalidOperationError
from polars.testing import assert_frame_equal

if TYPE_CHECKING:
    from datetime import timezone

    from polars._typing import FillNullStrategy, PolarsIntegerType


@pytest.mark.parametrize(
    ("time_zone", "tzinfo"),
    [
        (None, None),
        ("Europe/Warsaw", ZoneInfo("Europe/Warsaw")),
    ],
)
def test_upsample(time_zone: str | None, tzinfo: ZoneInfo | timezone | None) -> None:
    """Upsample monthly per ``admin`` group and forward-fill the gaps.

    Runs once with a naive ``time`` column and once with the column
    localized to ``Europe/Warsaw``.
    """
    # NOTE: `tzinfo` is parametrized but never referenced in the body; only
    # `time_zone` drives the test.
    df = pl.DataFrame(
        {
            "time": [
                datetime(2021, 2, 1),
                datetime(2021, 4, 1),
                datetime(2021, 5, 1),
                datetime(2021, 6, 1),
            ],
            "admin": ["Åland", "Netherlands", "Åland", "Netherlands"],
            "test2": [0, 1, 2, 3],
        }
    ).with_columns(pl.col("time").dt.replace_time_zone(time_zone).set_sorted())

    up = df.upsample(
        time_column="time",
        every="1mo",
        group_by="admin",
        maintain_order=True,
    ).select(pl.all().fill_null(strategy="forward"))

    # this print will panic if timezones feature is not activated
    # don't remove
    print(up)

    expected = pl.DataFrame(
        {
            "time": [
                datetime(2021, 2, 1, 0, 0),
                datetime(2021, 3, 1, 0, 0),
                datetime(2021, 4, 1, 0, 0),
                datetime(2021, 5, 1, 0, 0),
                datetime(2021, 4, 1, 0, 0),
                datetime(2021, 5, 1, 0, 0),
                datetime(2021, 6, 1, 0, 0),
            ],
            "admin": [
                "Åland",
                "Åland",
                "Åland",
                "Åland",
                "Netherlands",
                "Netherlands",
                "Netherlands",
            ],
            "test2": [0, 0, 0, 2, 1, 1, 3],
        }
    )
    expected = expected.with_columns(pl.col("time").dt.replace_time_zone(time_zone))

    assert_frame_equal(up, expected)


@pytest.mark.parametrize("time_zone", [None, "US/Central"])
def test_upsample_crossing_dst(time_zone: str | None) -> None:
    """Daily upsample over 2021-11-06..2021-11-08 leaves the frame unchanged.

    The input already holds one row per day, so the result must equal the
    input for both the naive and the ``US/Central`` column.
    """
    df = pl.DataFrame(
        {
            "time": pl.datetime_range(
                datetime(2021, 11, 6),
                datetime(2021, 11, 8),
                time_zone=time_zone,
                eager=True,
            ),
            "values": [1, 2, 3],
        }
    )

    result = df.upsample(time_column="time", every="1d")

    expected = pl.DataFrame(
        {
            "time": [
                datetime(2021, 11, 6),
                datetime(2021, 11, 7),
                datetime(2021, 11, 8),
            ],
            "values": [1, 2, 3],
        }
    ).with_columns(pl.col("time").dt.replace_time_zone(time_zone))

    assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    ("time_zone", "tzinfo"),
    [
        (None, None),
        ("Pacific/Rarotonga", ZoneInfo("Pacific/Rarotonga")),
    ],
)
def test_upsample_time_zones(
    time_zone: str | None, tzinfo: timezone | ZoneInfo | None
) -> None:
    """Upsample a 30-minute series with ``every="60m"`` and forward-fill.

    Runs once with a naive ``time`` column and once with the column
    localized to ``Pacific/Rarotonga``.
    """
    # NOTE: `tzinfo` is parametrized but never referenced in the body; only
    # `time_zone` drives the test.
    df = pl.DataFrame(
        {
            "time": pl.datetime_range(
                start=datetime(2021, 12, 16),
                end=datetime(2021, 12, 16, 3),
                interval="30m",
                eager=True,
            ),
            "groups": ["a", "a", "a", "b", "b", "a", "a"],
            "values": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0],
        }
    )
    expected = pl.DataFrame(
        {
            "time": [
                datetime(2021, 12, 16, 0, 0),
                datetime(2021, 12, 16, 1, 0),
                datetime(2021, 12, 16, 2, 0),
                datetime(2021, 12, 16, 3, 0),
            ],
            "groups": ["a", "a", "b", "a"],
            "values": [1.0, 3.0, 5.0, 7.0],
        }
    )
    df = df.with_columns(pl.col("time").dt.replace_time_zone(time_zone))
    expected = expected.with_columns(pl.col("time").dt.replace_time_zone(time_zone))
    result = df.upsample(time_column="time", every="60m").fill_null(strategy="forward")
    assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    ("every", "fill", "expected_index", "expected_groups"),
    [
        (
            "1i",
            "forward",
            [1, 2, 3, 4] + [5, 6, 7],
            ["a"] * 4 + ["b"] * 3,
        ),
        (
            "1i",
            "backward",
            [1, 2, 3, 4] + [5, 6, 7],
            ["a"] * 4 + ["b"] * 3,
        ),
    ],
)
@pytest.mark.parametrize("dtype", [pl.Int32, pl.Int64, pl.UInt32, pl.UInt64])
def test_upsample_index(
    every: str,
    fill: FillNullStrategy | None,
    expected_index: list[int],
    expected_groups: list[str],
    dtype: PolarsIntegerType,
) -> None:
    """Upsample an integer ``index`` column per group with an ``"1i"`` step.

    The index is cast to each parametrized integer dtype; nulls introduced
    by upsampling are filled with the parametrized strategy, and the result
    is sorted by ``groups`` and ``index`` before comparison.
    """
    df = (
        pl.DataFrame(
            {
                "index": [1, 2, 4] + [5, 7],
                "groups": ["a"] * 3 + ["b"] * 2,
            }
        )
        .with_columns(pl.col("index").cast(dtype))
        .set_sorted("index")
    )
    expected = pl.DataFrame(
        {
            "index": expected_index,
            "groups": expected_groups,
        }
    ).with_columns(pl.col("index").cast(dtype))
    result = (
        df.upsample(time_column="index", group_by="groups", every=every)
        .fill_null(strategy=fill)
        .sort(["groups", "index"])
    )
    assert_frame_equal(result, expected)


@pytest.mark.parametrize("maintain_order", [True, False])
def test_upsample_index_invalid(maintain_order: bool) -> None:
    """A duration-style ``every`` ("1h") on an integer index column must raise.

    The frame is built locally, so no ``df`` fixture is requested.
    """
    df = pl.DataFrame(
        {
            "index": [1, 2, 4, 5, 7],
            "groups": ["a"] * 3 + ["b"] * 2,
        }
    ).set_sorted("index")

    with pytest.raises(InvalidOperationError, match=r"must be a parsed integer"):
        df.upsample(
            time_column="index",
            every="1h",
            maintain_order=maintain_order,
        )


def test_upsample_sorted_only_within_group() -> None:
    """Upsample with ``group_by`` when ``time`` is sorted only inside each group.

    The ``time`` column as a whole is out of order, but each ``admin`` group
    is ascending; with ``maintain_order=True`` the groups come out in
    first-appearance order.
    """
    df = pl.DataFrame(
        {
            "time": [
                datetime(2021, 4, 1),
                datetime(2021, 2, 1),
                datetime(2021, 5, 1),
                datetime(2021, 6, 1),
            ],
            "admin": ["Netherlands", "Åland", "Åland", "Netherlands"],
            "test2": [1, 0, 2, 3],
        }
    )

    up = df.upsample(
        time_column="time",
        every="1mo",
        group_by="admin",
        maintain_order=True,
    ).select(pl.all().fill_null(strategy="forward"))

    expected = pl.DataFrame(
        {
            "time": [
                datetime(2021, 4, 1, 0, 0),
                datetime(2021, 5, 1, 0, 0),
                datetime(2021, 6, 1, 0, 0),
                datetime(2021, 2, 1, 0, 0),
                datetime(2021, 3, 1, 0, 0),
                datetime(2021, 4, 1, 0, 0),
                datetime(2021, 5, 1, 0, 0),
            ],
            "admin": [
                "Netherlands",
                "Netherlands",
                "Netherlands",
                "Åland",
                "Åland",
                "Åland",
                "Åland",
            ],
            "test2": [1, 1, 3, 0, 0, 0, 2],
        }
    )

    assert_frame_equal(up, expected)


def test_upsample_sorted_only_within_group_but_no_group_by_provided() -> None:
    """Without ``group_by``, the same out-of-order ``time`` column must raise."""
    df = pl.DataFrame(
        {
            "time": [
                datetime(2021, 4, 1),
                datetime(2021, 2, 1),
                datetime(2021, 5, 1),
                datetime(2021, 6, 1),
            ],
            "admin": ["Netherlands", "Åland", "Åland", "Netherlands"],
            "test2": [1, 0, 2, 3],
        }
    )
    with pytest.raises(
        InvalidOperationError,
        match=r"argument in operation 'upsample' is not sorted, please sort the 'expr/series/column' first",
    ):
        df.upsample(time_column="time", every="1mo")


def test_upsample_date() -> None:
    """Upsample a ``date`` column every 3 months between two yearly endpoints."""
    df = pl.DataFrame({"date": [date(2025, 1, 1), date(2026, 1, 1)]})
    result = df.upsample(time_column="date", every="3mo")
    expected = pl.DataFrame(
        {
            "date": [
                date(2025, 1, 1),
                date(2025, 4, 1),
                date(2025, 7, 1),
                date(2025, 10, 1),
                date(2026, 1, 1),
            ]
        }
    )
    assert_frame_equal(result, expected)