Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/py-polars/tests/unit/operations/namespaces/test_strptime.py
8427 views
1
"""
2
Module for testing `.str.strptime` of the string namespace.
3
4
This method gets its own module due to its complexity.
5
"""
6
7
from __future__ import annotations
8
9
from contextlib import nullcontext as does_not_raise
10
from datetime import date, datetime, time, timedelta, timezone
11
from typing import TYPE_CHECKING, Any
12
from zoneinfo import ZoneInfo
13
14
import pytest
15
16
import polars as pl
17
from polars.exceptions import ChronoFormatWarning, ComputeError, InvalidOperationError
18
from polars.testing import assert_frame_equal, assert_series_equal
19
20
if TYPE_CHECKING:
21
from contextlib import AbstractContextManager
22
23
from polars._typing import PolarsTemporalType, TimeUnit
24
25
26
def test_str_strptime() -> None:
    """Parse strings into Date, Datetime and Time with explicit formats."""
    date_strings = pl.Series(["2020-01-01", "2020-02-02"])
    parsed_dates = date_strings.str.strptime(pl.Date, "%Y-%m-%d")
    assert_series_equal(parsed_dates, pl.Series([date(2020, 1, 1), date(2020, 2, 2)]))

    datetime_strings = pl.Series(["2020-01-01 00:00:00", "2020-02-02 03:20:10"])
    parsed_datetimes = datetime_strings.str.strptime(pl.Datetime, "%Y-%m-%d %H:%M:%S")
    assert_series_equal(
        parsed_datetimes,
        pl.Series([datetime(2020, 1, 1, 0, 0, 0), datetime(2020, 2, 2, 3, 20, 10)]),
    )

    time_strings = pl.Series(["00:00:00", "03:20:10"])
    parsed_times = time_strings.str.strptime(pl.Time, "%H:%M:%S")
    # The expected values are given as raw integers cast to `pl.Time`.
    assert_series_equal(parsed_times, pl.Series([0, 12010000000000], dtype=pl.Time))
40
41
42
def test_date_parse_omit_day() -> None:
    """A `%Y-%m` format without a day component yields the first of the month."""
    frame = pl.DataFrame({"month": ["2022-01"]})
    month = pl.col("month")

    as_date = frame.select(month.str.to_date(format="%Y-%m")).item()
    assert as_date == date(2022, 1, 1)

    as_datetime = frame.select(month.str.to_datetime(format="%Y-%m")).item()
    assert as_datetime == datetime(2022, 1, 1)
50
51
52
def test_to_datetime_precision() -> None:
    """Check inferred and explicitly requested sub-second precision."""
    s = pl.Series(
        "date", ["2022-09-12 21:54:36.789321456", "2022-09-13 12:34:56.987456321"]
    )
    inferred = s.str.to_datetime()
    assert inferred.cast(pl.Date).is_not_null().all()
    assert getattr(inferred.dtype, "time_unit", None) == "us"

    # One row per time unit: the fixed-width fraction directive, the nanosecond
    # values expected after parsing with `%.f`, and the context in which the
    # fixed-width parse is attempted (raising or not).
    cases: list[tuple[TimeUnit, str, list[int], AbstractContextManager[Any]]] = [
        ("ms", "%.3f", [789000000, 987000000], pytest.raises(InvalidOperationError)),
        ("us", "%.6f", [789321000, 987456000], pytest.raises(InvalidOperationError)),
        ("ns", "%.9f", [789321456, 987456321], does_not_raise()),
    ]
    for time_unit, suffix, expected_nanos, context in cases:
        with context:
            s.str.to_datetime(f"%Y-%m-%d %H:%M:%S{suffix}", time_unit=time_unit)
        parsed = s.str.to_datetime("%Y-%m-%d %H:%M:%S%.f", time_unit=time_unit)
        assert getattr(parsed.dtype, "time_unit", None) == time_unit
        assert parsed.dt.nanosecond().to_list() == expected_nanos
84
85
86
@pytest.mark.parametrize(
    ("time_unit", "expected"),
    [("ms", "123000000"), ("us", "123456000"), ("ns", "123456789")],
)
@pytest.mark.parametrize("format", ["%Y-%m-%d %H:%M:%S%.f", None])
def test_to_datetime_precision_with_time_unit(
    time_unit: TimeUnit, expected: str, format: str | None
) -> None:
    """
    Check the fractional digits kept for each requested time unit.

    `format` is parametrized with both an explicit format string and `None`,
    so the same expectation is checked with and without a format argument.
    """
    s = pl.Series(["2020-01-01 00:00:00.123456789"])
    result = s.str.to_datetime(format, time_unit=time_unit).dt.to_string("%f")[0]
    assert result == expected
97
98
99
@pytest.mark.parametrize(
    ("tz_string", "timedelta"),
    [("+01:00", timedelta(minutes=60)), ("-01:30", timedelta(hours=-1, minutes=-30))],
)
def test_timezone_aware_strptime(tz_string: str, timedelta: timedelta) -> None:
    """Strings with a UTC offset parse to the matching offset-aware datetimes."""
    hours = (6, 7, 8)
    fixed_offset = timezone(timedelta)

    times = pl.DataFrame(
        {"delivery_datetime": [f"2021-12-05 0{h}:00:00{tz_string}" for h in hours]}
    )
    parsed = times.with_columns(
        pl.col("delivery_datetime").str.to_datetime(format="%Y-%m-%d %H:%M:%S%z")
    )

    expected = {
        "delivery_datetime": [
            datetime(2021, 12, 5, h, 0, tzinfo=fixed_offset) for h in hours
        ]
    }
    assert parsed.to_dict(as_series=False) == expected
122
123
124
def test_to_date_non_exact_strptime() -> None:
    """Compare `exact=True` and `exact=False` on strings with surrounding text."""
    fmt = "%Y-%m-%d"
    s = pl.Series("a", ["2022-01-16", "2022-01-17", "foo2022-01-18", "b2022-01-19ar"])

    # Exact matching: the two decorated strings do not parse.
    exact = s.str.to_date(fmt, strict=False, exact=True)
    assert_series_equal(
        exact, pl.Series("a", [date(2022, 1, 16), date(2022, 1, 17), None, None])
    )

    # Non-exact matching: the date is found inside the decorated strings.
    non_exact = s.str.to_date(fmt, strict=False, exact=False)
    assert_series_equal(
        non_exact,
        pl.Series("a", [date(2022, 1, day) for day in (16, 17, 18, 19)]),
    )

    with pytest.raises(InvalidOperationError):
        s.str.to_date(fmt, strict=True, exact=True)
141
142
143
@pytest.mark.parametrize(
    ("time_string", "expected"),
    [
        ("01-02-2024", date(2024, 2, 1)),
        ("01.02.2024", date(2024, 2, 1)),
        ("01/02/2024", date(2024, 2, 1)),
        ("2024-02-01", date(2024, 2, 1)),
        ("2024/02/01", date(2024, 2, 1)),
        ("31-12-2024", date(2024, 12, 31)),
        ("31.12.2024", date(2024, 12, 31)),
        ("31/12/2024", date(2024, 12, 31)),
        ("2024-12-31", date(2024, 12, 31)),
        ("2024/12/31", date(2024, 12, 31)),
    ],
)
def test_to_date_all_inferred_date_patterns(time_string: str, expected: date) -> None:
    """`str.to_date` without a format parses each listed date layout."""
    parsed = pl.Series([time_string]).str.to_date()
    first = parsed[0]
    assert first == expected
161
162
163
@pytest.mark.parametrize(
    ("time_string", "expected"),
    [
        ("2024-12-04 09:08:00", datetime(2024, 12, 4, 9, 8, 0)),
        ("2024-12-4 9:8:0", datetime(2024, 12, 4, 9, 8, 0)),
        ("2024/12/04 9:8", datetime(2024, 12, 4, 9, 8, 0)),
        ("4/12/2024 9:8", datetime(2024, 12, 4, 9, 8, 0)),
    ],
)
def test_to_datetime_infer_missing_digit_in_time_16092(
    time_string: str, expected: datetime
) -> None:
    """Format inference copes with non-zero-padded date and time components."""
    parsed = pl.Series([time_string]).str.to_datetime()
    first = parsed[0]
    assert first == expected
177
178
179
@pytest.mark.parametrize(
    ("value", "attr"),
    [
        ("a", "to_date"),
        ("ab", "to_date"),
        ("a", "to_datetime"),
        ("ab", "to_datetime"),
    ],
)
def test_non_exact_short_elements_10223(value: str, attr: str) -> None:
    """Non-exact parsing of a very short, unparseable element raises an error."""
    s = pl.Series(["2019-01-01", value])
    # `attr` names the string-namespace method under test.
    parse = getattr(s.str, attr)
    with pytest.raises((InvalidOperationError, ComputeError)):
        parse(exact=False)
191
192
193
@pytest.mark.parametrize(
    ("offset", "time_zone", "tzinfo", "format"),
    [
        ("+01:00", "UTC", timezone(timedelta(hours=1)), "%Y-%m-%dT%H:%M%z"),
        ("", None, None, "%Y-%m-%dT%H:%M"),
    ],
)
def test_to_datetime_non_exact_strptime(
    offset: str, time_zone: str | None, tzinfo: timezone | None, format: str
) -> None:
    """Compare exact and non-exact datetime parsing, with and without an offset."""
    s = pl.Series(
        "a",
        [
            f"2022-01-16T00:00{offset}",
            f"2022-01-17T00:00{offset}",
            f"foo2022-01-18T00:00{offset}",
            f"b2022-01-19T00:00{offset}ar",
        ],
    )
    expected_dtype = pl.Datetime("us", time_zone)
    days = [datetime(2022, 1, day, tzinfo=tzinfo) for day in (16, 17, 18, 19)]

    # Exact matching: the two decorated strings come back as null.
    exact = s.str.to_datetime(format, strict=False, exact=True)
    assert_series_equal(exact, pl.Series("a", [days[0], days[1], None, None]))
    assert exact.dtype == expected_dtype

    # Non-exact matching: all four strings parse.
    non_exact = s.str.to_datetime(format, strict=False, exact=False)
    assert_series_equal(non_exact, pl.Series("a", days))
    assert non_exact.dtype == expected_dtype

    with pytest.raises(InvalidOperationError):
        s.str.to_datetime(format, strict=True, exact=True)
241
242
243
def test_to_datetime_dates_datetimes() -> None:
    """A date-only string and a full datetime string parse in the same Series."""
    s = pl.Series("date", ["2021-04-22", "2022-01-04 00:00:00"])
    parsed = s.str.to_datetime().to_list()
    expected = [datetime(2021, 4, 22, 0, 0), datetime(2022, 1, 4, 0, 0)]
    assert parsed == expected
249
250
251
@pytest.mark.parametrize(
    ("time_string", "expected"),
    [
        ("09-05-2019", datetime(2019, 5, 9)),
        ("2018-09-05", datetime(2018, 9, 5)),
        ("2018-09-05T04:05:01", datetime(2018, 9, 5, 4, 5, 1)),
        ("2018-09-05T04:24:01.9", datetime(2018, 9, 5, 4, 24, 1, 900000)),
        ("2018-09-05T04:24:02.11", datetime(2018, 9, 5, 4, 24, 2, 110000)),
        ("2018-09-05T14:24:02.123", datetime(2018, 9, 5, 14, 24, 2, 123000)),
        ("2019-04-18T02:45:55.555000000", datetime(2019, 4, 18, 2, 45, 55, 555000)),
        ("2019-04-18T22:45:55.555123", datetime(2019, 4, 18, 22, 45, 55, 555123)),
        (
            "2018-09-05T04:05:01+01:00",
            datetime(2018, 9, 5, 4, 5, 1, tzinfo=timezone(timedelta(hours=1))),
        ),
        (
            "2018-09-05T04:24:01.9+01:00",
            datetime(2018, 9, 5, 4, 24, 1, 900000, tzinfo=timezone(timedelta(hours=1))),
        ),
        (
            "2018-09-05T04:24:02.11+01:00",
            datetime(2018, 9, 5, 4, 24, 2, 110000, tzinfo=timezone(timedelta(hours=1))),
        ),
        (
            "2018-09-05T14:24:02.123+01:00",
            datetime(
                2018, 9, 5, 14, 24, 2, 123000, tzinfo=timezone(timedelta(hours=1))
            ),
        ),
        (
            "2019-04-18T02:45:55.555000000+01:00",
            datetime(
                2019, 4, 18, 2, 45, 55, 555000, tzinfo=timezone(timedelta(hours=1))
            ),
        ),
        (
            "2019-04-18T22:45:55.555123+01:00",
            datetime(
                2019, 4, 18, 22, 45, 55, 555123, tzinfo=timezone(timedelta(hours=1))
            ),
        ),
        (
            "20190418T224555.555123+01:00",
            datetime(
                2019, 4, 18, 22, 45, 55, 555123, tzinfo=timezone(timedelta(hours=1))
            ),
        ),
        (
            "20190418T224555.555123Z",
            datetime(2019, 4, 18, 22, 45, 55, 555123, tzinfo=timezone.utc),
        ),
    ],
)
def test_to_datetime_patterns_single(time_string: str, expected: datetime) -> None:
    """
    Infer the format of a single datetime string and check the parsed value.

    Each case is parsed on its own, so every pattern is inferred independently.
    `expected` is a `datetime` (naive or offset-aware), not a string.
    """
    result = pl.Series([time_string]).str.to_datetime().item()
    assert result == expected
307
308
309
@pytest.mark.parametrize("time_unit", ["ms", "us", "ns"])
def test_infer_tz_aware_time_unit(time_unit: TimeUnit) -> None:
    """An inferred offset-aware string yields a UTC dtype in the requested unit."""
    s = pl.Series(["2020-01-02T04:00:00+02:00"])
    parsed = s.str.to_datetime(time_unit=time_unit)

    assert parsed.dtype == pl.Datetime(time_unit, "UTC")
    # 04:00 at +02:00 is 02:00 UTC.
    assert parsed.item() == datetime(2020, 1, 2, 2, 0, tzinfo=timezone.utc)
316
317
318
@pytest.mark.parametrize("time_unit", ["ms", "us", "ns"])
def test_infer_tz_aware_with_utc(time_unit: TimeUnit) -> None:
    """
    Infer an offset-aware string while explicitly requesting the UTC time zone.

    Passing `time_zone="UTC"` exercises the explicit-time-zone path; without it
    this test would be identical to `test_infer_tz_aware_time_unit`.
    """
    result = pl.Series(["2020-01-02T04:00:00+02:00"]).str.to_datetime(
        time_unit=time_unit, time_zone="UTC"
    )
    assert result.dtype == pl.Datetime(time_unit, "UTC")
    # 04:00 at +02:00 is 02:00 UTC.
    assert result.item() == datetime(2020, 1, 2, 2, 0, tzinfo=timezone.utc)
325
326
327
def test_str_to_datetime_infer_tz_aware() -> None:
    """An offset-aware string is converted to the requested named time zone."""
    s = pl.Series(["2020-01-02T04:00:00+02:00"])
    parsed = s.str.to_datetime(time_unit="us", time_zone="Europe/Vienna")
    expected = datetime(2020, 1, 2, 3, tzinfo=ZoneInfo("Europe/Vienna"))
    assert parsed.item() == expected
334
335
336
@pytest.mark.parametrize(
    "result",
    [
        pl.Series(["2020-01-01T00:00:00+00:00"]).str.strptime(
            pl.Datetime("us", "UTC"), format="%Y-%m-%dT%H:%M:%S%z"
        ),
        pl.Series(["2020-01-01T00:00:00+00:00"]).str.strptime(
            pl.Datetime("us"), format="%Y-%m-%dT%H:%M:%S%z"
        ),
        pl.Series(["2020-01-01T00:00:00+00:00"]).str.strptime(pl.Datetime("us", "UTC")),
        pl.Series(["2020-01-01T00:00:00+00:00"]).str.strptime(pl.Datetime("us")),
        pl.Series(["2020-01-01T00:00:00+00:00"]).str.to_datetime(
            time_zone="UTC", format="%Y-%m-%dT%H:%M:%S%z"
        ),
        pl.Series(["2020-01-01T00:00:00+00:00"]).str.to_datetime(
            format="%Y-%m-%dT%H:%M:%S%z"
        ),
        pl.Series(["2020-01-01T00:00:00+00:00"]).str.to_datetime(time_zone="UTC"),
        pl.Series(["2020-01-01T00:00:00+00:00"]).str.to_datetime(),
    ],
)
def test_parsing_offset_aware_with_utc_dtype(result: pl.Series) -> None:
    """Every listed way of parsing a `+00:00` string gives the same UTC Series."""
    utc_midnight = datetime(2020, 1, 1, tzinfo=timezone.utc)
    assert_series_equal(result, pl.Series([utc_midnight]))
360
361
362
def test_datetime_strptime_patterns_consistent() -> None:
    """Year-first inputs all parse except the one carrying a trailing `Z`."""
    # note that all should be year first
    raw = [
        "2018-09-05",
        "2018-09-05T04:05:01",
        "2018-09-05T04:24:01.9",
        "2018-09-05T04:24:02.11",
        "2018-09-05T14:24:02.123",
        "2018-09-05T14:24:02.123Z",
        "2019-04-18T02:45:55.555000000",
        "2019-04-18T22:45:55.555123",
    ]
    frame = pl.Series("date", raw).to_frame()
    parse_expr = pl.col("date").str.to_datetime(strict=False).alias("parsed")
    parsed = frame.with_columns(parse_expr)["parsed"]

    assert parsed.null_count() == 1
    # Index 5 is the entry with the trailing `Z`.
    assert parsed[5] is None
382
383
384
def test_datetime_strptime_patterns_inconsistent() -> None:
    """Only the first element parses when the rest use a different layout."""
    # note that the pattern is inferred from the first element to
    # be DatetimeDMY, and so the others (correctly) parse as `null`.
    raw = [
        "09-05-2019",
        "2018-09-05",
        "2018-09-05T04:05:01",
        "2018-09-05T04:24:01.9",
        "2018-09-05T04:24:02.11",
        "2018-09-05T14:24:02.123",
        "2018-09-05T14:24:02.123Z",
        "2019-04-18T02:45:55.555000000",
        "2019-04-18T22:45:55.555123",
    ]
    frame = pl.Series("date", raw).to_frame()
    parse_expr = pl.col("date").str.to_datetime(strict=False).alias("parsed")
    parsed = frame.with_columns(parse_expr)["parsed"]

    assert parsed.null_count() == 8
    assert parsed[0] is not None
406
407
408
@pytest.mark.parametrize(
    (
        "ts",
        "format",
        "exp_year",
        "exp_month",
        "exp_day",
        "exp_hour",
        "exp_minute",
        "exp_second",
    ),
    [
        ("-0031-04-24 22:13:20", "%Y-%m-%d %H:%M:%S", -31, 4, 24, 22, 13, 20),
        ("-0031-04-24", "%Y-%m-%d", -31, 4, 24, 0, 0, 0),
    ],
)
def test_parse_negative_dates(
    ts: str,
    format: str,
    exp_year: int,
    exp_month: int,
    exp_day: int,
    exp_hour: int,
    exp_minute: int,
    exp_second: int,
) -> None:
    """Strings with a negative year parse; each component is checked separately."""
    parsed = pl.Series([ts]).str.to_datetime(format, time_unit="ms")
    # Python datetime.datetime doesn't support negative dates, so comparing
    # with `result.item()` directly won't work.
    components = parsed.dt
    observed = (
        components.year().item(),
        components.month().item(),
        components.day().item(),
        components.hour().item(),
        components.minute().item(),
        components.second().item(),
    )
    assert observed == (
        exp_year,
        exp_month,
        exp_day,
        exp_hour,
        exp_minute,
        exp_second,
    )
444
445
446
def test_short_formats() -> None:
    """Non-strict parsing with very short format strings yields nulls on mismatch."""
    s = pl.Series(["20202020", "2020"])

    year_only = s.str.to_date("%Y", strict=False).to_list()
    assert year_only == [None, date(2020, 1, 1)]

    nonsense = s.str.to_date("%bar", strict=False).to_list()
    assert nonsense == [None, None]
453
454
455
@pytest.mark.parametrize(
    ("time_string", "fmt", "datatype", "expected"),
    [
        ("Jul/2020", "%b/%Y", pl.Date, date(2020, 7, 1)),
        ("Jan/2020", "%b/%Y", pl.Date, date(2020, 1, 1)),
        ("02/Apr/2020", "%d/%b/%Y", pl.Date, date(2020, 4, 2)),
        ("Dec/2020", "%b/%Y", pl.Datetime, datetime(2020, 12, 1, 0, 0)),
        ("Nov/2020", "%b/%Y", pl.Datetime, datetime(2020, 11, 1, 0, 0)),
        ("02/Feb/2020", "%d/%b/%Y", pl.Datetime, datetime(2020, 2, 2, 0, 0)),
    ],
)
def test_strptime_abbrev_month(
    time_string: str, fmt: str, datatype: PolarsTemporalType, expected: date
) -> None:
    """Abbreviated month names (`%b`) parse for both Date and Datetime targets."""
    parsed = pl.Series([time_string]).str.strptime(datatype, fmt)
    assert parsed.item() == expected
472
473
474
def test_full_month_name() -> None:
    """A full month name (`%B`) parses into the matching datetime."""
    raw = pl.Series(["2022-December-01"])
    parsed = raw.str.to_datetime("%Y-%B-%d")
    assert parsed[0] == datetime(2022, 12, 1)
477
478
479
@pytest.mark.parametrize(
    ("datatype", "expected"),
    [
        (pl.Datetime, datetime(2022, 1, 1)),
        (pl.Date, date(2022, 1, 1)),
    ],
)
def test_single_digit_month(
    datatype: PolarsTemporalType, expected: datetime | date
) -> None:
    """Non-zero-padded month and day values parse with `%m` and `%d`."""
    raw = pl.Series(["2022-1-1"])
    parsed = raw.str.strptime(datatype, "%Y-%m-%d")
    assert parsed[0] == expected
491
492
493
def test_invalid_date_parsing_4898() -> None:
    """An impossible day-of-month becomes null under non-strict parsing."""
    s = pl.Series(["2022-09-18", "2022-09-50"])
    parsed = s.str.to_date("%Y-%m-%d", strict=False).to_list()
    assert parsed == [date(2022, 9, 18), None]
497
498
499
def test_strptime_invalid_timezone() -> None:
    """Replacing the time zone with an unknown name raises a ComputeError."""
    raw = pl.Series(["2020-01-01 00:00:00+01:00"])
    parsed = raw.str.to_datetime("%Y-%m-%d %H:%M:%S%z")
    with pytest.raises(ComputeError, match=r"unable to parse time zone: 'foo'"):
        parsed.dt.replace_time_zone("foo")
503
504
505
def test_to_datetime_ambiguous_or_non_existent() -> None:
    """Ambiguous and non-existent local datetimes raise a ComputeError."""
    ambiguous_msg = (
        "datetime '2021-11-07 01:00:00' is ambiguous in time zone 'America/Chicago'"
    )
    non_existent_msg = (
        "datetime '2021-03-28 02:30:00' is non-existent in time zone 'Europe/Warsaw'"
    )
    ambiguous_input = pl.Series(["2021-11-07 01:00"])
    non_existent_input = pl.Series(["2021-03-28 02:30"])

    with pytest.raises(ComputeError, match=ambiguous_msg):
        ambiguous_input.str.to_datetime(time_unit="us", time_zone="America/Chicago")

    with pytest.raises(ComputeError, match=non_existent_msg):
        non_existent_input.str.to_datetime(time_unit="us", time_zone="Europe/Warsaw")

    # `ambiguous="null"` does not suppress the non-existent error.
    with pytest.raises(ComputeError, match=non_existent_msg):
        non_existent_input.str.to_datetime(
            time_unit="us",
            time_zone="Europe/Warsaw",
            ambiguous="null",
        )

    # Same check with `ambiguous` supplied as a Series, one entry per row.
    with pytest.raises(ComputeError, match=non_existent_msg):
        pl.Series(["2021-03-28 02:30"] * 2).str.to_datetime(
            time_unit="us",
            time_zone="Europe/Warsaw",
            ambiguous=pl.Series(["null", "null"]),
        )
538
539
540
@pytest.mark.parametrize(
    ("ts", "fmt", "expected"),
    [
        ("2020-01-01T00:00:00Z", None, datetime(2020, 1, 1, tzinfo=timezone.utc)),
        ("2020-01-01T00:00:00Z", "%+", datetime(2020, 1, 1, tzinfo=timezone.utc)),
        (
            "2020-01-01T00:00:00+01:00",
            "%Y-%m-%dT%H:%M:%S%z",
            datetime(2020, 1, 1, tzinfo=timezone(timedelta(seconds=3600))),
        ),
        (
            "2020-01-01T00:00:00+01:00",
            "%Y-%m-%dT%H:%M:%S%:z",
            datetime(2020, 1, 1, tzinfo=timezone(timedelta(seconds=3600))),
        ),
        (
            "2020-01-01T00:00:00+01:00",
            "%Y-%m-%dT%H:%M:%S%#z",
            datetime(2020, 1, 1, tzinfo=timezone(timedelta(seconds=3600))),
        ),
    ],
)
def test_to_datetime_tz_aware_strptime(
    ts: str, fmt: str | None, expected: datetime
) -> None:
    """
    Parse offset-aware strings with several time zone directives.

    `fmt` is `None` for the first case, so that case is parsed without a
    format argument.
    """
    result = pl.Series([ts]).str.to_datetime(fmt).item()
    assert result == expected
565
566
567
@pytest.mark.parametrize("format", ["%+", "%Y-%m-%dT%H:%M:%S%z"])
def test_crossing_dst(format: str) -> None:
    """Inputs with different UTC offsets are each converted to the right UTC time."""
    utc = ZoneInfo("UTC")
    raw = pl.Series(["2021-03-27T23:59:59+01:00", "2021-03-28T23:59:59+02:00"])
    parsed = raw.str.to_datetime(format)

    first, second = parsed[0], parsed[1]
    assert first == datetime(2021, 3, 27, 22, 59, 59, tzinfo=utc)
    assert second == datetime(2021, 3, 28, 21, 59, 59, tzinfo=utc)
573
574
575
@pytest.mark.parametrize("format", ["%+", "%Y-%m-%dT%H:%M:%S%z"])
def test_crossing_dst_tz_aware(format: str) -> None:
    """Series-level check that inputs with different offsets parse to UTC."""
    raw = pl.Series(["2021-03-27T23:59:59+01:00", "2021-03-28T23:59:59+02:00"])
    parsed = raw.str.to_datetime(format)

    utc_values = [
        datetime(2021, 3, 27, 22, 59, 59, tzinfo=timezone.utc),
        datetime(2021, 3, 28, 21, 59, 59, tzinfo=timezone.utc),
    ]
    assert_series_equal(parsed, pl.Series(utc_values))
586
587
588
@pytest.mark.parametrize(
    ("data", "format", "expected"),
    [
        (
            "2023-02-05T05:10:10.074000",
            "%Y-%m-%dT%H:%M:%S%.f",
            datetime(2023, 2, 5, 5, 10, 10, 74000),
        ),
    ],
)
def test_strptime_subseconds_datetime(
    data: str, format: str, expected: datetime
) -> None:
    """
    Parse a datetime string with a fractional-second component.

    `expected` is a `datetime`, matching the value produced by `to_datetime`.
    """
    s = pl.Series([data])
    result = s.str.to_datetime(format).item()
    assert result == expected
602
603
604
@pytest.mark.parametrize(
    ("string", "fmt"),
    [
        pytest.param("2023-05-04|7", "%Y-%m-%d|%H", id="hour but no minute"),
        pytest.param("2023-05-04|7", "%Y-%m-%d|%k", id="padded hour but no minute"),
        pytest.param("2023-05-04|10", "%Y-%m-%d|%M", id="minute but no hour"),
        pytest.param("2023-05-04|10", "%Y-%m-%d|%S", id="second but no hour"),
        pytest.param(
            "2000-Jan-01 01 00 01", "%Y-%b-%d %I %M %S", id="12-hour clock but no AM/PM"
        ),
        pytest.param(
            "2000-Jan-01 01 00 01",
            "%Y-%b-%d %l %M %S",
            id="padded 12-hour clock but no AM/PM",
        ),
    ],
)
def test_strptime_incomplete_formats(string: str, fmt: str) -> None:
    """Formats that leave the time of day underspecified are rejected."""
    s = pl.Series([string])
    with pytest.raises(ComputeError, match="Invalid format string"):
        s.str.to_datetime(fmt)
627
628
629
@pytest.mark.parametrize(
    ("string", "fmt", "expected"),
    [
        ("2023-05-04|7:3", "%Y-%m-%d|%H:%M", datetime(2023, 5, 4, 7, 3)),
        ("2023-05-04|10:03", "%Y-%m-%d|%H:%M", datetime(2023, 5, 4, 10, 3)),
        (
            "2000-Jan-01 01 00 01 am",
            "%Y-%b-%d %I %M %S %P",
            datetime(2000, 1, 1, 1, 0, 1),
        ),
        (
            "2000-Jan-01 01 00 01 am",
            "%Y-%b-%d %_I %M %S %P",
            datetime(2000, 1, 1, 1, 0, 1),
        ),
        (
            "2000-Jan-01 01 00 01 am",
            "%Y-%b-%d %l %M %S %P",
            datetime(2000, 1, 1, 1, 0, 1),
        ),
        (
            "2000-Jan-01 01 00 01 AM",
            "%Y-%b-%d %I %M %S %p",
            datetime(2000, 1, 1, 1, 0, 1),
        ),
        (
            "2000-Jan-01 01 00 01 AM",
            "%Y-%b-%d %_I %M %S %p",
            datetime(2000, 1, 1, 1, 0, 1),
        ),
        (
            "2000-Jan-01 01 00 01 AM",
            "%Y-%b-%d %l %M %S %p",
            datetime(2000, 1, 1, 1, 0, 1),
        ),
    ],
)
def test_strptime_complete_formats(string: str, fmt: str, expected: datetime) -> None:
    """Formats that fully specify the time of day parse successfully."""
    # These formats carry both hour and minute (or a 12-hour clock together
    # with an AM/PM marker), so parsing is expected to succeed.
    parsed = pl.Series([string]).str.to_datetime(fmt)
    assert parsed.item() == expected
670
671
672
@pytest.mark.parametrize(
    ("data", "format", "expected"),
    [
        ("00:00:00.000005000", "%H:%M:%S%.f", time(0, 0, 0, 5)),
        ("01:23:10.000500", "%H:%M:%S%.6f", time(1, 23, 10, 500)),
        ("08:10:11.000", "%H:%M:%S%.3f", time(8, 10, 11)),
        ("15:50:25", "%T", time(15, 50, 25)),
        ("22:35", "%R", time(22, 35)),
    ],
)
def test_to_time_inferred(data: str, format: str, expected: time) -> None:
    """`str.to_time` gives the same result with an explicit format and with none."""
    source = pl.DataFrame({"tmstr": [data]})
    want = source.with_columns(tm=pl.Series("tm", values=[expected]))

    # First the explicit format, then `None` to trigger inference.
    for candidate in (format, None):
        parse_expr = pl.col("tmstr").str.to_time(candidate)
        got = source.with_columns(tm=parse_expr)
        assert_frame_equal(got, want)
688
689
690
@pytest.mark.parametrize(
    ("data", "format", "expected"),
    [
        ("05:10:11.740000", "%H:%M:%S%.f", time(5, 10, 11, 740000)),
        ("13:20:12.000074", "%T%.6f", time(13, 20, 12, 74)),
        ("21:30:13.007", "%H:%M:%S%.3f", time(21, 30, 13, 7000)),
    ],
)
def test_to_time_subseconds(data: str, format: str, expected: time) -> None:
    """Sub-second time strings parse both with and without an explicit format."""
    s = pl.Series([data])
    # Parse both ways up front, then compare each result with the expectation.
    parsed_values = [s.str.to_time().item(), s.str.to_time(format).item()]
    for value in parsed_values:
        assert value == expected
705
706
707
def test_to_time_format_warning() -> None:
    """
    Using `.%f` in a format emits a ChronoFormatWarning but still parses.

    The `match` pattern escapes the dot so that it matches a literal `.%f` in
    the warning text rather than any character followed by `%f`.
    """
    s = pl.Series(["05:10:10.074000"])
    with pytest.warns(ChronoFormatWarning, match=r"\.%f"):
        result = s.str.to_time("%H:%M:%S.%f").item()
    assert result == time(5, 10, 10, 74)
712
713
714
@pytest.mark.parametrize("exact", [True, False])
def test_to_datetime_ambiguous_earliest(exact: bool) -> None:
    """`ambiguous` picks the fold of a repeated wall-clock time in `to_datetime`."""
    london = ZoneInfo("Europe/London")
    raw = pl.Series(["2020-10-25 01:00"])

    earliest = raw.str.to_datetime(
        time_zone="Europe/London", ambiguous="earliest", exact=exact
    ).item()
    assert earliest == datetime(2020, 10, 25, 1, fold=0, tzinfo=london)

    latest = raw.str.to_datetime(
        time_zone="Europe/London", ambiguous="latest", exact=exact
    ).item()
    assert latest == datetime(2020, 10, 25, 1, fold=1, tzinfo=london)

    # With `ambiguous` left at its default the call raises.
    with pytest.raises(ComputeError):
        pl.Series(["2020-10-25 01:00"]).str.to_datetime(
            time_zone="Europe/London",
            exact=exact,
        ).item()
735
736
737
def test_to_datetime_naive_format_and_time_zone() -> None:
    """A naive string plus `time_zone` is localized to that zone on both paths."""
    zone = "Asia/Kathmandu"
    raw = pl.Series(["2020-01-01"])
    expected = pl.Series([datetime(2020, 1, 1)]).dt.replace_time_zone("Asia/Kathmandu")

    # format-specified path
    with_format = raw.str.to_datetime(format="%Y-%m-%d", time_zone=zone)
    assert_series_equal(with_format, expected)

    # format-inferred path
    inferred = raw.str.to_datetime(time_zone=zone)
    assert_series_equal(inferred, expected)
747
748
749
@pytest.mark.parametrize("exact", [True, False])
def test_strptime_ambiguous_earliest(exact: bool) -> None:
    """`ambiguous` picks the fold of a repeated wall-clock time in `strptime`."""
    london = ZoneInfo("Europe/London")
    target_dtype = pl.Datetime("us", "Europe/London")
    raw = pl.Series(["2020-10-25 01:00"])

    earliest = raw.str.strptime(target_dtype, ambiguous="earliest", exact=exact).item()
    assert earliest == datetime(2020, 10, 25, 1, fold=0, tzinfo=london)

    latest = raw.str.strptime(target_dtype, ambiguous="latest", exact=exact).item()
    assert latest == datetime(2020, 10, 25, 1, fold=1, tzinfo=london)

    # With `ambiguous` left at its default the call raises.
    with pytest.raises(ComputeError):
        pl.Series(["2020-10-25 01:00"]).str.strptime(
            pl.Datetime("us", "Europe/London"),
            exact=exact,
        ).item()
774
775
776
@pytest.mark.parametrize("time_unit", ["ms", "us", "ns"])
def test_to_datetime_out_of_range_13401(time_unit: TimeUnit) -> None:
    """An out-of-range seconds value raises when strict and is null otherwise."""
    fmt = "%Y-%B-%d %H:%M:%S"
    s = pl.Series(["2020-January-01 12:34:66"])

    with pytest.raises(InvalidOperationError, match=r"conversion .* failed"):
        s.str.to_datetime(fmt, time_unit=time_unit)

    lenient = s.str.to_datetime(fmt, strict=False, time_unit=time_unit)
    assert lenient.item() is None
785
786
787
def test_out_of_ns_range_no_tu_specified_13592() -> None:
    """Without a time unit, a year-920 date parses into a microsecond Datetime."""
    frame = pl.DataFrame({"dates": ["2022-08-31 00:00:00.0", "0920-09-18 00:00:00.0"]})
    parse_expr = pl.col("dates").str.to_datetime(format="%Y-%m-%d %H:%M:%S%.f")
    parsed = frame.select(parse_expr)["dates"]

    expected = pl.Series(
        "dates",
        [datetime(2022, 8, 31, 0, 0), datetime(920, 9, 18, 0, 0)],
        dtype=pl.Datetime("us"),
    )
    assert_series_equal(parsed, expected)
798
799
800
def test_wrong_format_percent() -> None:
    """A format string ending in a dangling `%` is rejected."""
    s = pl.Series(["2019-01-01"])
    with pytest.raises(InvalidOperationError):
        s.str.strptime(pl.Date, format="d%")
803
804
805
def test_polars_parser_fooled_by_trailing_nonsense_22167() -> None:
    """Each input/format pair below must be rejected by `to_datetime`."""
    mismatched_cases = [
        (
            "2025-04-06T18:57:42.77756192Z",
            "%Y-%m-%dT%H:%M:%S.%9fcabbagebananapotato",
        ),
        ("2025-04-06T18:57:42.77756192Z", "%Y-%m-%dT%H:%M:%S.%9f#z"),
        ("2025-04-06T18:57:42.77Z", "%Y-%m-%dT%H:%M:%S.%3f#z"),
        ("2025-04-06T18:57:42.77123Z", "%Y-%m-%dT%H:%M:%S.%6f#z"),
    ]
    for value, fmt in mismatched_cases:
        with pytest.raises(InvalidOperationError):
            pl.Series([value]).str.to_datetime(fmt)
822
823
824
def test_strptime_empty_input_22214() -> None:
    """`strptime` on an empty string Series returns an empty result per dtype."""
    empty = pl.Series("x", [], pl.String)

    as_time = empty.str.strptime(pl.Time, "%H:%M:%S%.f")
    assert as_time.is_empty()

    as_date = empty.str.strptime(pl.Date, "%Y-%m-%d")
    assert as_date.is_empty()

    as_datetime = empty.str.strptime(pl.Datetime, "%Y-%m-%d %H:%M%#z")
    assert as_datetime.is_empty()
830
831
832
@pytest.mark.parametrize(
    "value",
    [
        "31/12/2022",
        "banana",
        "12-345-678",
        "12-345-67",
        "12-345-6789",
        "123*45*678",
        "123x45x678",
        "123x45x678x",
    ],
)
def test_matching_strings_but_different_format_22495(value: str) -> None:
    """Strings that do not fit `%Y-%m-%d` become null under non-strict parsing."""
    parsed = pl.Series("my_strings", [value]).str.to_date("%Y-%m-%d", strict=False)
    assert parsed.item() is None
849
850
851
def test_date_parse_omit_day_month() -> None:
    """Year-plus-month-name strings round-trip and parse to the first of the month."""
    fmt_B = "%Y %B"
    fmt_b = "%Y %b"

    # Round trip: format the first of each month of 2022 with both month-name
    # directives, then parse the strings back into dates.
    months = pl.select(
        date=pl.date_range(pl.date(2022, 1, 1), pl.date(2022, 12, 1), "1mo")
    )
    formatted = months.with_columns(
        strdateB=pl.col("date").dt.strftime(fmt_B),
        strdateb=pl.col("date").dt.strftime(fmt_b),
    )
    round_tripped = formatted.with_columns(
        round_tripB=pl.col("strdateB").str.strptime(pl.Date, fmt_B),
        round_tripb=pl.col("strdateb").str.strptime(pl.Date, fmt_b),
    )
    both_match = pl.all_horizontal(
        pl.col("date") == pl.col("round_tripB"),
        pl.col("date") == pl.col("round_tripb"),
    )
    mismatches = round_tripped.filter(~both_match)
    assert mismatches.height == 0

    # Direct parse of literal full month names.
    month_names = (
        "January",
        "February",
        "March",
        "April",
        "May",
        "June",
        "July",
        "August",
        "September",
        "October",
        "November",
        "December",
    )
    s = pl.Series([f"2022 {name}" for name in month_names])
    result = s.str.strptime(pl.Date, "%Y %B")
    expected = pl.Series([date(2022, month, 1) for month in range(1, 13)])
    assert_series_equal(result, expected)
907
908
909
@pytest.mark.parametrize("length", [1, 5])
def test_eager_inference_on_expr(length: int) -> None:
    """Series-level `strptime` infers the time zone; the expression form raises."""
    s = pl.Series("a", ["2025-04-06T18:57:42.77123Z"] * length)

    parsed_value = datetime(
        2025, 4, 6, 18, 57, 42, 771230, tzinfo=timezone(timedelta(hours=0))
    )
    expected = pl.Series("a", [parsed_value] * length)
    assert_series_equal(s.str.strptime(pl.Datetime), expected)

    frame = s.to_frame()
    with pytest.raises(
        ComputeError,
        match="`strptime` / `to_datetime` was called with no format and no time zone, but a time zone is part of the data",
    ):
        frame.select(pl.col("a").str.strptime(pl.Datetime))
931
932
933
@pytest.mark.parametrize("maintain_order", [False, True])
def test_strptime_in_group_by(maintain_order: bool) -> None:
    """Parse dates inside a group-by aggregation after filtering out a bad value."""
    df = pl.DataFrame({"g": [1, 2], "a": ["AAA", "2025-01-01"]})

    # Drop the unparseable "AAA" before parsing, then take the per-group minimum.
    min_parsed = pl.col.a.filter(pl.col.a != "AAA").str.to_date("%Y-%m-%d").min()
    result = df.group_by("g", maintain_order=maintain_order).agg(min_parsed)

    expected = pl.DataFrame({"g": [1, 2], "a": [None, "2025-01-01"]}).with_columns(
        pl.col.a.str.to_date("%Y-%m-%d")
    )
    # Row order is only compared when the group-by was asked to maintain it.
    assert_frame_equal(result, expected, check_row_order=maintain_order)
946
947