Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/py-polars/tests/unit/dataframe/test_upsample.py
6939 views
1
from __future__ import annotations
2
3
from datetime import date, datetime
4
from typing import TYPE_CHECKING
5
from zoneinfo import ZoneInfo
6
7
import pytest
8
9
import polars as pl
10
from polars.exceptions import InvalidOperationError
11
from polars.testing import assert_frame_equal
12
13
if TYPE_CHECKING:
14
from datetime import timezone
15
16
from polars._typing import FillNullStrategy, PolarsIntegerType
17
18
19
@pytest.mark.parametrize(
    ("time_zone", "tzinfo"),
    [
        (None, None),
        ("Europe/Warsaw", ZoneInfo("Europe/Warsaw")),
    ],
)
def test_upsample(time_zone: str | None, tzinfo: ZoneInfo | timezone | None) -> None:
    """Upsample monthly within each ``admin`` group, then forward-fill nulls.

    The case runs once with a naive ``time`` column (``time_zone=None``) and
    once with the column set to ``Europe/Warsaw``. ``tzinfo`` is provided by
    the parametrization but is not read in the body.
    """
    # Apply the requested time zone and flag the column as sorted in one go.
    sorted_time = pl.col("time").dt.replace_time_zone(time_zone).set_sorted()
    source = pl.DataFrame(
        {
            "time": [datetime(2021, month, 1) for month in (2, 4, 5, 6)],
            "admin": ["Åland", "Netherlands"] * 2,
            "test2": list(range(4)),
        }
    ).with_columns(sorted_time)

    upsampled = source.upsample(
        time_column="time",
        every="1mo",
        group_by="admin",
        maintain_order=True,
    )
    up = upsampled.select(pl.all().fill_null(strategy="forward"))

    # this print will panic if timezones feature is not activated
    # don't remove
    print(up)

    # Expected rows are grouped: all "Åland" months first, then "Netherlands".
    aland_months = (2, 3, 4, 5)
    netherlands_months = (4, 5, 6)
    expected_naive = pl.DataFrame(
        {
            "time": [
                datetime(2021, month, 1, 0, 0)
                for month in aland_months + netherlands_months
            ],
            "admin": ["Åland"] * 4 + ["Netherlands"] * 3,
            "test2": [0, 0, 0, 2, 1, 1, 3],
        }
    )
    expected = expected_naive.with_columns(
        pl.col("time").dt.replace_time_zone(time_zone)
    )

    assert_frame_equal(up, expected)
77
78
79
@pytest.mark.parametrize("time_zone", [None, "US/Central"])
def test_upsample_crossing_dst(time_zone: str | None) -> None:
    """Daily upsample of an already-daily three-row frame returns it unchanged.

    Covers 2021-11-06 through 2021-11-08, both naive and in ``US/Central``
    (per the test name, the range is meant to span a DST change there).
    """
    first_day = datetime(2021, 11, 6)
    last_day = datetime(2021, 11, 8)
    # No interval is passed, so the range uses the library's default spacing;
    # it must produce exactly as many rows as the three "values" entries.
    times = pl.datetime_range(first_day, last_day, time_zone=time_zone, eager=True)
    df = pl.DataFrame({"time": times, "values": [1, 2, 3]})

    result = df.upsample(time_column="time", every="1d")

    expected_naive = pl.DataFrame(
        {
            "time": [datetime(2021, 11, day) for day in (6, 7, 8)],
            "values": [1, 2, 3],
        }
    )
    localize = pl.col("time").dt.replace_time_zone(time_zone)
    expected = expected_naive.with_columns(localize)

    assert_frame_equal(result, expected)
107
108
109
@pytest.mark.parametrize(
    ("time_zone", "tzinfo"),
    [
        (None, None),
        ("Pacific/Rarotonga", ZoneInfo("Pacific/Rarotonga")),
    ],
)
def test_upsample_time_zones(
    time_zone: str | None, tzinfo: timezone | ZoneInfo | None
) -> None:
    """Upsample half-hourly data with ``every="60m"`` and forward-fill.

    The expected frame contains only the on-the-hour rows of the input. The
    case runs with a naive column and with ``Pacific/Rarotonga``; ``tzinfo``
    is provided by the parametrization but is not read in the body.
    """
    localize = pl.col("time").dt.replace_time_zone(time_zone)

    half_hourly = pl.datetime_range(
        start=datetime(2021, 12, 16),
        end=datetime(2021, 12, 16, 3),
        interval="30m",
        eager=True,
    )
    df = pl.DataFrame(
        {
            "time": half_hourly,
            "groups": ["a", "a", "a", "b", "b", "a", "a"],
            "values": [float(v) for v in range(1, 8)],
        }
    ).with_columns(localize)

    expected = pl.DataFrame(
        {
            "time": [datetime(2021, 12, 16, hour, 0) for hour in range(4)],
            "groups": ["a", "a", "b", "a"],
            "values": [1.0, 3.0, 5.0, 7.0],
        }
    ).with_columns(localize)

    upsampled = df.upsample(time_column="time", every="60m")
    result = upsampled.fill_null(strategy="forward")

    assert_frame_equal(result, expected)
147
148
149
@pytest.mark.parametrize(
    ("every", "fill", "expected_index", "expected_groups"),
    [
        ("1i", "forward", [1, 2, 3, 4, 5, 6, 7], ["a"] * 4 + ["b"] * 3),
        ("1i", "backward", [1, 2, 3, 4, 5, 6, 7], ["a"] * 4 + ["b"] * 3),
    ],
)
@pytest.mark.parametrize("dtype", [pl.Int32, pl.Int64, pl.UInt32, pl.UInt64])
def test_upsample_index(
    every: str,
    fill: FillNullStrategy | None,
    expected_index: list[int],
    expected_groups: list[str],
    dtype: PolarsIntegerType,
) -> None:
    """Upsample an integer ``index`` column per group for several int dtypes.

    The input has gaps in the index (3 is missing in group "a", 6 in group
    "b"). After upsampling, filling nulls with ``fill`` and sorting by group
    and index, the result must equal the parametrized expectation, which is
    the same for both fill strategies.
    """
    cast_index = pl.col("index").cast(dtype)

    raw = pl.DataFrame(
        {
            "index": [1, 2, 4, 5, 7],
            "groups": ["a", "a", "a", "b", "b"],
        }
    )
    df = raw.with_columns(cast_index).set_sorted("index")

    expected = pl.DataFrame(
        {"index": expected_index, "groups": expected_groups}
    ).with_columns(cast_index)

    upsampled = df.upsample(time_column="index", group_by="groups", every=every)
    result = upsampled.fill_null(strategy=fill).sort(["groups", "index"])

    assert_frame_equal(result, expected)
196
197
198
@pytest.mark.parametrize("maintain_order", [True, False])
def test_upsample_index_invalid(
    df: pl.DataFrame,
    maintain_order: bool,
) -> None:
    """Upsampling an integer column with a duration ``every`` must raise.

    ``every="1h"`` on the integer ``index`` column is expected to raise
    ``InvalidOperationError`` for both values of ``maintain_order``.
    """
    # NOTE(review): the injected `df` argument is never read here; the frame
    # under test is built locally. Presumably `df` is a fixture defined outside
    # this file -- confirm whether requesting it is still needed.
    int_indexed = pl.DataFrame(
        {
            "index": [1, 2, 4, 5, 7],
            "groups": ["a", "a", "a", "b", "b"],
        }
    ).set_sorted("index")

    with pytest.raises(InvalidOperationError, match=r"must be a parsed integer"):
        int_indexed.upsample(
            time_column="index",
            every="1h",
            maintain_order=maintain_order,
        )
216
217
218
def test_upsample_sorted_only_within_group() -> None:
    """Upsample succeeds when ``time`` is sorted per group but not globally.

    The ``time`` column is out of order overall (April precedes February), yet
    ascending inside each ``admin`` group. With ``group_by="admin"`` and
    ``maintain_order=True`` the expected output lists the "Netherlands" rows
    first, followed by the "Åland" rows.
    """
    source = pl.DataFrame(
        {
            "time": [datetime(2021, month, 1) for month in (4, 2, 5, 6)],
            "admin": ["Netherlands", "Åland", "Åland", "Netherlands"],
            "test2": [1, 0, 2, 3],
        }
    )

    upsampled = source.upsample(
        time_column="time",
        every="1mo",
        group_by="admin",
        maintain_order=True,
    )
    up = upsampled.select(pl.all().fill_null(strategy="forward"))

    netherlands_months = (4, 5, 6)
    aland_months = (2, 3, 4, 5)
    expected = pl.DataFrame(
        {
            "time": [
                datetime(2021, month, 1, 0, 0)
                for month in netherlands_months + aland_months
            ],
            "admin": ["Netherlands"] * 3 + ["Åland"] * 4,
            "test2": [1, 1, 3, 0, 0, 0, 2],
        }
    )

    assert_frame_equal(up, expected)
264
265
266
def test_upsample_sorted_only_within_group_but_no_group_by_provided() -> None:
    """Upsample without ``group_by`` must reject a globally unsorted column.

    The frame's ``time`` values are ascending only inside each ``admin``
    group; calling ``upsample`` with no ``group_by`` is expected to raise
    ``InvalidOperationError`` with the "not sorted" message.
    """
    unsorted_frame = pl.DataFrame(
        {
            "time": [datetime(2021, month, 1) for month in (4, 2, 5, 6)],
            "admin": ["Netherlands", "Åland", "Åland", "Netherlands"],
            "test2": [1, 0, 2, 3],
        }
    )

    expected_message = r"argument in operation 'upsample' is not sorted, please sort the 'expr/series/column' first"
    with pytest.raises(InvalidOperationError, match=expected_message):
        unsorted_frame.upsample(time_column="time", every="1mo")
284
285
286
def test_upsample_date() -> None:
    """Upsample a ``date`` column from 2025-01-01 to 2026-01-01 every 3 months.

    The expected result contains the two original dates plus the three
    quarter starts between them.
    """
    endpoints = pl.DataFrame({"date": [date(2025, 1, 1), date(2026, 1, 1)]})

    result = endpoints.upsample(time_column="date", every="3mo")

    quarter_starts = [date(2025, month, 1) for month in (1, 4, 7, 10)]
    expected = pl.DataFrame({"date": quarter_starts + [date(2026, 1, 1)]})

    assert_frame_equal(result, expected)
301
302