Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/py-polars/tests/unit/operations/namespaces/test_strptime.py
6940 views
1
"""
2
Module for testing `.str.strptime` of the string namespace.
3
4
This method gets its own module due to its complexity.
5
"""
6
7
from __future__ import annotations
8
9
from datetime import date, datetime, time, timedelta, timezone
10
from typing import TYPE_CHECKING
11
from zoneinfo import ZoneInfo
12
13
import pytest
14
15
import polars as pl
16
from polars.exceptions import ChronoFormatWarning, ComputeError, InvalidOperationError
17
from polars.testing import assert_frame_equal, assert_series_equal
18
19
if TYPE_CHECKING:
20
from polars._typing import PolarsTemporalType, TimeUnit
21
22
23
def test_str_strptime() -> None:
    """Check `str.strptime` for the `Date`, `Datetime` and `Time` target dtypes."""
    # Date target.
    date_strings = pl.Series(["2020-01-01", "2020-02-02"])
    expected_dates = pl.Series([date(2020, 1, 1), date(2020, 2, 2)])
    parsed_dates = date_strings.str.strptime(pl.Date, "%Y-%m-%d")
    assert_series_equal(parsed_dates, expected_dates)

    # Datetime target.
    datetime_strings = pl.Series(["2020-01-01 00:00:00", "2020-02-02 03:20:10"])
    expected_datetimes = pl.Series(
        [datetime(2020, 1, 1, 0, 0, 0), datetime(2020, 2, 2, 3, 20, 10)]
    )
    parsed_datetimes = datetime_strings.str.strptime(pl.Datetime, "%Y-%m-%d %H:%M:%S")
    assert_series_equal(parsed_datetimes, expected_datetimes)

    # Time target; the expected series is built from integers with dtype `pl.Time`.
    time_strings = pl.Series(["00:00:00", "03:20:10"])
    expected_times = pl.Series([0, 12010000000000], dtype=pl.Time)
    parsed_times = time_strings.str.strptime(pl.Time, "%H:%M:%S")
    assert_series_equal(parsed_times, expected_times)
37
38
39
def test_date_parse_omit_day() -> None:
    """Parse a `%Y-%m` string; the asserted result falls on the first of the month."""
    frame = pl.DataFrame({"month": ["2022-01"]})
    month = pl.col("month")

    as_date = frame.select(month.str.to_date(format="%Y-%m")).item()
    assert as_date == date(2022, 1, 1)

    as_datetime = frame.select(month.str.to_datetime(format="%Y-%m")).item()
    assert as_datetime == datetime(2022, 1, 1)
47
48
49
def test_to_datetime_precision() -> None:
    """Check the time unit and sub-second values of parsed nanosecond strings."""
    s = pl.Series(
        "date", ["2022-09-12 21:54:36.789321456", "2022-09-13 12:34:56.987456321"]
    )

    # Without a format or time unit the test expects a "us" dtype.
    inferred = s.str.to_datetime()
    assert inferred.cast(pl.Date).is_not_null().all()
    assert getattr(inferred.dtype, "time_unit", None) == "us"

    # (time unit, fractional-second directive, expected nanosecond component)
    cases: list[tuple[TimeUnit, str, list[int]]] = [
        ("ms", "%.3f", [789000000, 987000000]),
        ("us", "%.6f", [789321000, 987456000]),
        ("ns", "%.9f", [789321456, 987456321]),
    ]
    for time_unit, suffix, expected_values in cases:
        parsed = s.str.to_datetime(f"%Y-%m-%d %H:%M:%S{suffix}", time_unit=time_unit)
        assert getattr(parsed.dtype, "time_unit", None) == time_unit
        assert parsed.dt.nanosecond().to_list() == expected_values
72
73
74
@pytest.mark.parametrize(
    ("time_unit", "expected"),
    [("ms", "123000000"), ("us", "123456000"), ("ns", "123456789")],
)
@pytest.mark.parametrize("format", ["%Y-%m-%d %H:%M:%S%.f", None])
def test_to_datetime_precision_with_time_unit(
    time_unit: TimeUnit, expected: str, format: str | None
) -> None:
    """
    Check the `%f` rendering of a parsed value for each requested time unit.

    `format` is either an explicit format string or `None`; both variants are
    parametrized, hence the `str | None` annotation.
    """
    s = pl.Series(["2020-01-01 00:00:00.123456789"])
    parsed = s.str.to_datetime(format, time_unit=time_unit)
    fraction = parsed.dt.to_string("%f")[0]
    assert fraction == expected
85
86
87
@pytest.mark.parametrize(
    ("tz_string", "timedelta"),
    [("+01:00", timedelta(minutes=60)), ("-01:30", timedelta(hours=-1, minutes=-30))],
)
def test_timezone_aware_strptime(tz_string: str, timedelta: timedelta) -> None:
    """Parse strings carrying a UTC offset and compare against offset-aware datetimes."""
    naive_stamps = [
        "2021-12-05 06:00:00",
        "2021-12-05 07:00:00",
        "2021-12-05 08:00:00",
    ]
    times = pl.DataFrame(
        {"delivery_datetime": [stamp + tz_string for stamp in naive_stamps]}
    )

    parsed = times.with_columns(
        pl.col("delivery_datetime").str.to_datetime(format="%Y-%m-%d %H:%M:%S%z")
    )

    # NOTE: the `timedelta` parameter shadows the imported class inside this body.
    fixed_offset = timezone(timedelta)
    expected = {
        "delivery_datetime": [
            datetime(2021, 12, 5, hour, 0, tzinfo=fixed_offset) for hour in (6, 7, 8)
        ]
    }
    assert parsed.to_dict(as_series=False) == expected
110
111
112
def test_to_date_non_exact_strptime() -> None:
    """Compare `exact=True` and `exact=False` date parsing of padded strings."""
    s = pl.Series("a", ["2022-01-16", "2022-01-17", "foo2022-01-18", "b2022-01-19ar"])
    fmt = "%Y-%m-%d"

    # exact=True, non-strict: the two padded strings are expected to be null.
    exact_result = s.str.to_date(fmt, strict=False, exact=True)
    exact_expected = pl.Series("a", [date(2022, 1, 16), date(2022, 1, 17), None, None])
    assert_series_equal(exact_result, exact_expected)

    # exact=False: all four strings are expected to yield a date.
    non_exact_result = s.str.to_date(fmt, strict=False, exact=False)
    non_exact_expected = pl.Series("a", [date(2022, 1, day) for day in (16, 17, 18, 19)])
    assert_series_equal(non_exact_result, non_exact_expected)

    # exact=True and strict: expected to raise.
    with pytest.raises(InvalidOperationError):
        s.str.to_date(fmt, strict=True, exact=True)
129
130
131
@pytest.mark.parametrize(
    ("time_string", "expected"),
    [
        ("01-02-2024", date(2024, 2, 1)),
        ("01.02.2024", date(2024, 2, 1)),
        ("01/02/2024", date(2024, 2, 1)),
        ("2024-02-01", date(2024, 2, 1)),
        ("2024/02/01", date(2024, 2, 1)),
        ("31-12-2024", date(2024, 12, 31)),
        ("31.12.2024", date(2024, 12, 31)),
        ("31/12/2024", date(2024, 12, 31)),
        ("2024-12-31", date(2024, 12, 31)),
        ("2024/12/31", date(2024, 12, 31)),
    ],
)
def test_to_date_all_inferred_date_patterns(time_string: str, expected: date) -> None:
    """Parse a single date string with `to_date()` and no explicit format."""
    s = pl.Series([time_string])
    parsed = s.str.to_date()
    assert parsed[0] == expected
149
150
151
@pytest.mark.parametrize(
    ("time_string", "expected"),
    [
        ("2024-12-04 09:08:00", datetime(2024, 12, 4, 9, 8, 0)),
        ("2024-12-4 9:8:0", datetime(2024, 12, 4, 9, 8, 0)),
        ("2024/12/04 9:8", datetime(2024, 12, 4, 9, 8, 0)),
        ("4/12/2024 9:8", datetime(2024, 12, 4, 9, 8, 0)),
    ],
)
def test_to_datetime_infer_missing_digit_in_time_16092(
    time_string: str, expected: datetime
) -> None:
    """Parse datetime strings with non-padded components using `to_datetime()`."""
    s = pl.Series([time_string])
    parsed = s.str.to_datetime()
    assert parsed[0] == expected
165
166
167
@pytest.mark.parametrize(
    ("value", "attr"),
    [
        ("a", "to_date"),
        ("ab", "to_date"),
        ("a", "to_datetime"),
        ("ab", "to_datetime"),
    ],
)
def test_non_exact_short_elements_10223(value: str, attr: str) -> None:
    """Expect an error when a very short element is parsed with `exact=False`."""
    s = pl.Series(["2019-01-01", value])
    # `attr` names the string-namespace method under test.
    parse = getattr(s.str, attr)
    with pytest.raises((InvalidOperationError, ComputeError)):
        parse(exact=False)
179
180
181
@pytest.mark.parametrize(
    ("offset", "time_zone", "tzinfo", "format"),
    [
        ("+01:00", "UTC", timezone(timedelta(hours=1)), "%Y-%m-%dT%H:%M%z"),
        ("", None, None, "%Y-%m-%dT%H:%M"),
    ],
)
def test_to_datetime_non_exact_strptime(
    offset: str, time_zone: str | None, tzinfo: timezone | None, format: str
) -> None:
    """Compare `exact=True` and `exact=False` datetime parsing of padded strings."""
    s = pl.Series(
        "a",
        [
            f"2022-01-16T00:00{offset}",
            f"2022-01-17T00:00{offset}",
            f"foo2022-01-18T00:00{offset}",
            f"b2022-01-19T00:00{offset}ar",
        ],
    )
    expected_dtype = pl.Datetime("us", time_zone)

    def midnight(day: int) -> datetime:
        # Expected value for a given day of January 2022.
        return datetime(2022, 1, day, tzinfo=tzinfo)

    # exact=True, non-strict: the two padded strings are expected to be null.
    exact_result = s.str.to_datetime(format, strict=False, exact=True)
    exact_expected = pl.Series("a", [midnight(16), midnight(17), None, None])
    assert_series_equal(exact_result, exact_expected)
    assert exact_result.dtype == expected_dtype

    # exact=False: all four strings are expected to yield a datetime.
    non_exact_result = s.str.to_datetime(format, strict=False, exact=False)
    non_exact_expected = pl.Series("a", [midnight(day) for day in (16, 17, 18, 19)])
    assert_series_equal(non_exact_result, non_exact_expected)
    assert non_exact_result.dtype == expected_dtype

    # exact=True and strict: expected to raise.
    with pytest.raises(InvalidOperationError):
        s.str.to_datetime(format, strict=True, exact=True)
229
230
231
def test_to_datetime_dates_datetimes() -> None:
    """Parse a series mixing a date-only string and a full datetime string."""
    s = pl.Series("date", ["2021-04-22", "2022-01-04 00:00:00"])
    parsed = s.str.to_datetime().to_list()
    expected = [datetime(2021, 4, 22, 0, 0), datetime(2022, 1, 4, 0, 0)]
    assert parsed == expected
237
238
239
@pytest.mark.parametrize(
    ("time_string", "expected"),
    [
        ("09-05-2019", datetime(2019, 5, 9)),
        ("2018-09-05", datetime(2018, 9, 5)),
        ("2018-09-05T04:05:01", datetime(2018, 9, 5, 4, 5, 1)),
        ("2018-09-05T04:24:01.9", datetime(2018, 9, 5, 4, 24, 1, 900000)),
        ("2018-09-05T04:24:02.11", datetime(2018, 9, 5, 4, 24, 2, 110000)),
        ("2018-09-05T14:24:02.123", datetime(2018, 9, 5, 14, 24, 2, 123000)),
        ("2019-04-18T02:45:55.555000000", datetime(2019, 4, 18, 2, 45, 55, 555000)),
        ("2019-04-18T22:45:55.555123", datetime(2019, 4, 18, 22, 45, 55, 555123)),
        (
            "2018-09-05T04:05:01+01:00",
            datetime(2018, 9, 5, 4, 5, 1, tzinfo=timezone(timedelta(hours=1))),
        ),
        (
            "2018-09-05T04:24:01.9+01:00",
            datetime(2018, 9, 5, 4, 24, 1, 900000, tzinfo=timezone(timedelta(hours=1))),
        ),
        (
            "2018-09-05T04:24:02.11+01:00",
            datetime(2018, 9, 5, 4, 24, 2, 110000, tzinfo=timezone(timedelta(hours=1))),
        ),
        (
            "2018-09-05T14:24:02.123+01:00",
            datetime(
                2018, 9, 5, 14, 24, 2, 123000, tzinfo=timezone(timedelta(hours=1))
            ),
        ),
        (
            "2019-04-18T02:45:55.555000000+01:00",
            datetime(
                2019, 4, 18, 2, 45, 55, 555000, tzinfo=timezone(timedelta(hours=1))
            ),
        ),
        (
            "2019-04-18T22:45:55.555123+01:00",
            datetime(
                2019, 4, 18, 22, 45, 55, 555123, tzinfo=timezone(timedelta(hours=1))
            ),
        ),
    ],
)
def test_to_datetime_patterns_single(time_string: str, expected: datetime) -> None:
    """
    Parse one datetime string with `to_datetime()` and no explicit format.

    Every parametrized `expected` value is a `datetime`, so the parameter is
    annotated accordingly.
    """
    parsed = pl.Series([time_string]).str.to_datetime()
    assert parsed.item() == expected
285
286
287
@pytest.mark.parametrize("time_unit", ["ms", "us", "ns"])
def test_infer_tz_aware_time_unit(time_unit: TimeUnit) -> None:
    """Parse an offset-aware string without a format for each time unit."""
    s = pl.Series(["2020-01-02T04:00:00+02:00"])
    parsed = s.str.to_datetime(time_unit=time_unit)

    # The test expects a UTC dtype and the instant shifted by the +02:00 offset.
    assert parsed.dtype == pl.Datetime(time_unit, "UTC")
    expected_value = datetime(2020, 1, 2, 2, 0, tzinfo=timezone.utc)
    assert parsed.item() == expected_value
294
295
296
@pytest.mark.parametrize("time_unit", ["ms", "us", "ns"])
def test_infer_tz_aware_with_utc(time_unit: TimeUnit) -> None:
    """
    Parse an offset-aware string without a format while requesting UTC explicitly.

    Passing `time_zone="UTC"` is what distinguishes this test from the variant
    that relies purely on inference; without it the two bodies are identical.
    """
    s = pl.Series(["2020-01-02T04:00:00+02:00"])
    result = s.str.to_datetime(time_unit=time_unit, time_zone="UTC")
    assert result.dtype == pl.Datetime(time_unit, "UTC")
    assert result.item() == datetime(2020, 1, 2, 2, 0, tzinfo=timezone.utc)
303
304
305
def test_str_to_datetime_infer_tz_aware() -> None:
    """Parse an offset-aware string into an explicitly requested time zone."""
    s = pl.Series(["2020-01-02T04:00:00+02:00"])
    parsed = s.str.to_datetime(time_unit="us", time_zone="Europe/Vienna")
    expected = datetime(2020, 1, 2, 3, tzinfo=ZoneInfo("Europe/Vienna"))
    assert parsed.item() == expected
312
313
314
@pytest.mark.parametrize(
    "result",
    [
        pl.Series(["2020-01-01T00:00:00+00:00"]).str.strptime(
            pl.Datetime("us", "UTC"), format="%Y-%m-%dT%H:%M:%S%z"
        ),
        pl.Series(["2020-01-01T00:00:00+00:00"]).str.strptime(
            pl.Datetime("us"), format="%Y-%m-%dT%H:%M:%S%z"
        ),
        pl.Series(["2020-01-01T00:00:00+00:00"]).str.strptime(pl.Datetime("us", "UTC")),
        pl.Series(["2020-01-01T00:00:00+00:00"]).str.strptime(pl.Datetime("us")),
        pl.Series(["2020-01-01T00:00:00+00:00"]).str.to_datetime(
            time_zone="UTC", format="%Y-%m-%dT%H:%M:%S%z"
        ),
        pl.Series(["2020-01-01T00:00:00+00:00"]).str.to_datetime(
            format="%Y-%m-%dT%H:%M:%S%z"
        ),
        pl.Series(["2020-01-01T00:00:00+00:00"]).str.to_datetime(time_zone="UTC"),
        pl.Series(["2020-01-01T00:00:00+00:00"]).str.to_datetime(),
    ],
)
def test_parsing_offset_aware_with_utc_dtype(result: pl.Series) -> None:
    """
    Check that each parsing variant yields the same UTC series.

    The parametrized series are built when the decorator is evaluated, i.e.
    at import/collection time rather than inside the test body.
    """
    utc_midnight = datetime(2020, 1, 1, tzinfo=timezone.utc)
    assert_series_equal(result, pl.Series([utc_midnight]))
338
339
340
def test_datetime_strptime_patterns_consistent() -> None:
    """Non-strict parsing of year-first strings; only the `Z`-suffixed one is null."""
    # note that all should be year first
    values = [
        "2018-09-05",
        "2018-09-05T04:05:01",
        "2018-09-05T04:24:01.9",
        "2018-09-05T04:24:02.11",
        "2018-09-05T14:24:02.123",
        "2018-09-05T14:24:02.123Z",
        "2019-04-18T02:45:55.555000000",
        "2019-04-18T22:45:55.555123",
    ]
    frame = pl.Series("date", values).to_frame()
    to_parsed = pl.col("date").str.to_datetime(strict=False).alias("parsed")
    parsed = frame.with_columns(to_parsed)["parsed"]

    assert parsed.null_count() == 1
    # Index 5 is the "...123Z" entry.
    assert parsed[5] is None
360
361
362
def test_datetime_strptime_patterns_inconsistent() -> None:
    """Non-strict parsing where only the first string matches the inferred pattern."""
    # note that the pattern is inferred from the first element to
    # be DatetimeDMY, and so the others (correctly) parse as `null`.
    values = [
        "09-05-2019",
        "2018-09-05",
        "2018-09-05T04:05:01",
        "2018-09-05T04:24:01.9",
        "2018-09-05T04:24:02.11",
        "2018-09-05T14:24:02.123",
        "2018-09-05T14:24:02.123Z",
        "2019-04-18T02:45:55.555000000",
        "2019-04-18T22:45:55.555123",
    ]
    frame = pl.Series("date", values).to_frame()
    to_parsed = pl.col("date").str.to_datetime(strict=False).alias("parsed")
    parsed = frame.with_columns(to_parsed)["parsed"]

    assert parsed.null_count() == 8
    assert parsed[0] is not None
384
385
386
@pytest.mark.parametrize(
    (
        "ts",
        "format",
        "exp_year",
        "exp_month",
        "exp_day",
        "exp_hour",
        "exp_minute",
        "exp_second",
    ),
    [
        ("-0031-04-24 22:13:20", "%Y-%m-%d %H:%M:%S", -31, 4, 24, 22, 13, 20),
        ("-0031-04-24", "%Y-%m-%d", -31, 4, 24, 0, 0, 0),
    ],
)
def test_parse_negative_dates(
    ts: str,
    format: str,
    exp_year: int,
    exp_month: int,
    exp_day: int,
    exp_hour: int,
    exp_minute: int,
    exp_second: int,
) -> None:
    """Parse a negative-year string and check each datetime component separately."""
    result = pl.Series([ts]).str.to_datetime(format, time_unit="ms")

    # Python datetime.datetime doesn't support negative dates, so comparing
    # with `result.item()` directly won't work; compare component by component.
    expected_components = {
        "year": exp_year,
        "month": exp_month,
        "day": exp_day,
        "hour": exp_hour,
        "minute": exp_minute,
        "second": exp_second,
    }
    for component, expected_value in expected_components.items():
        extract = getattr(result.dt, component)
        assert extract().item() == expected_value
422
423
424
def test_short_formats() -> None:
    """Non-strict `to_date` with the short formats `%Y` and `%bar`."""
    s = pl.Series(["20202020", "2020"])

    with_year_format = s.str.to_date("%Y", strict=False).to_list()
    assert with_year_format == [None, date(2020, 1, 1)]

    with_bar_format = s.str.to_date("%bar", strict=False).to_list()
    assert with_bar_format == [None, None]
431
432
433
@pytest.mark.parametrize(
    ("time_string", "fmt", "datatype", "expected"),
    [
        ("Jul/2020", "%b/%Y", pl.Date, date(2020, 7, 1)),
        ("Jan/2020", "%b/%Y", pl.Date, date(2020, 1, 1)),
        ("02/Apr/2020", "%d/%b/%Y", pl.Date, date(2020, 4, 2)),
        ("Dec/2020", "%b/%Y", pl.Datetime, datetime(2020, 12, 1, 0, 0)),
        ("Nov/2020", "%b/%Y", pl.Datetime, datetime(2020, 11, 1, 0, 0)),
        ("02/Feb/2020", "%d/%b/%Y", pl.Datetime, datetime(2020, 2, 2, 0, 0)),
    ],
)
def test_strptime_abbrev_month(
    time_string: str, fmt: str, datatype: PolarsTemporalType, expected: date
) -> None:
    """Parse strings containing an abbreviated month name (`%b`)."""
    parsed = pl.Series([time_string]).str.strptime(datatype, fmt)
    assert parsed.item() == expected
450
451
452
def test_full_month_name() -> None:
    """Parse a string containing a full month name (`%B`)."""
    raw = pl.Series(["2022-December-01"])
    parsed = raw.str.to_datetime("%Y-%B-%d")
    assert parsed[0] == datetime(2022, 12, 1)
455
456
457
@pytest.mark.parametrize(
    ("datatype", "expected"),
    [
        (pl.Datetime, datetime(2022, 1, 1)),
        (pl.Date, date(2022, 1, 1)),
    ],
)
def test_single_digit_month(
    datatype: PolarsTemporalType, expected: datetime | date
) -> None:
    """Parse a string whose month and day are not zero-padded."""
    raw = pl.Series(["2022-1-1"])
    parsed = raw.str.strptime(datatype, "%Y-%m-%d")
    assert parsed[0] == expected
469
470
471
def test_invalid_date_parsing_4898() -> None:
    """Non-strict parsing of a string with day 50 is expected to give null."""
    s = pl.Series(["2022-09-18", "2022-09-50"])
    parsed = s.str.to_date("%Y-%m-%d", strict=False)
    assert parsed.to_list() == [date(2022, 9, 18), None]
475
476
477
def test_strptime_invalid_timezone() -> None:
    """Replacing the time zone of a parsed series with 'foo' is expected to raise."""
    raw = pl.Series(["2020-01-01 00:00:00+01:00"])
    # Parsing happens outside the `raises` block; only the replacement may fail.
    parsed = raw.str.to_datetime("%Y-%m-%d %H:%M:%S%z")
    with pytest.raises(ComputeError, match=r"unable to parse time zone: 'foo'"):
        parsed.dt.replace_time_zone("foo")
481
482
483
def test_to_datetime_ambiguous_or_non_existent() -> None:
    """Check the errors raised for ambiguous and non-existent local datetimes."""
    ambiguous_msg = (
        "datetime '2021-11-07 01:00:00' is ambiguous in time zone 'US/Central'"
    )
    non_existent_msg = (
        "datetime '2021-03-28 02:30:00' is non-existent in time zone 'Europe/Warsaw'"
    )

    # Ambiguous datetime, default `ambiguous` setting.
    with pytest.raises(ComputeError, match=ambiguous_msg):
        pl.Series(["2021-11-07 01:00"]).str.to_datetime(
            time_unit="us", time_zone="US/Central"
        )

    # Non-existent datetime, default `ambiguous` setting.
    with pytest.raises(ComputeError, match=non_existent_msg):
        pl.Series(["2021-03-28 02:30"]).str.to_datetime(
            time_unit="us", time_zone="Europe/Warsaw"
        )

    # Non-existent datetime, `ambiguous="null"` given as a string.
    with pytest.raises(ComputeError, match=non_existent_msg):
        pl.Series(["2021-03-28 02:30"]).str.to_datetime(
            time_unit="us",
            time_zone="Europe/Warsaw",
            ambiguous="null",
        )

    # Non-existent datetime, `ambiguous` given as a series.
    with pytest.raises(ComputeError, match=non_existent_msg):
        pl.Series(["2021-03-28 02:30"] * 2).str.to_datetime(
            time_unit="us",
            time_zone="Europe/Warsaw",
            ambiguous=pl.Series(["null", "null"]),
        )
516
517
518
@pytest.mark.parametrize(
    ("ts", "fmt", "expected"),
    [
        ("2020-01-01T00:00:00Z", None, datetime(2020, 1, 1, tzinfo=timezone.utc)),
        ("2020-01-01T00:00:00Z", "%+", datetime(2020, 1, 1, tzinfo=timezone.utc)),
        (
            "2020-01-01T00:00:00+01:00",
            "%Y-%m-%dT%H:%M:%S%z",
            datetime(2020, 1, 1, tzinfo=timezone(timedelta(seconds=3600))),
        ),
        (
            "2020-01-01T00:00:00+01:00",
            "%Y-%m-%dT%H:%M:%S%:z",
            datetime(2020, 1, 1, tzinfo=timezone(timedelta(seconds=3600))),
        ),
        (
            "2020-01-01T00:00:00+01:00",
            "%Y-%m-%dT%H:%M:%S%#z",
            datetime(2020, 1, 1, tzinfo=timezone(timedelta(seconds=3600))),
        ),
    ],
)
def test_to_datetime_tz_aware_strptime(
    ts: str, fmt: str | None, expected: datetime
) -> None:
    """
    Parse offset-aware strings with several offset directives.

    `fmt` is `None` for the first case (no explicit format), hence the
    `str | None` annotation.
    """
    parsed = pl.Series([ts]).str.to_datetime(fmt)
    assert parsed.item() == expected
543
544
545
@pytest.mark.parametrize("format", ["%+", "%Y-%m-%dT%H:%M:%S%z"])
def test_crossing_dst(format: str) -> None:
    """Parse two strings with different UTC offsets and check each element."""
    raw = pl.Series(["2021-03-27T23:59:59+01:00", "2021-03-28T23:59:59+02:00"])
    parsed = raw.str.to_datetime(format)

    utc = ZoneInfo("UTC")
    assert parsed[0] == datetime(2021, 3, 27, 22, 59, 59, tzinfo=utc)
    assert parsed[1] == datetime(2021, 3, 28, 21, 59, 59, tzinfo=utc)
551
552
553
@pytest.mark.parametrize("format", ["%+", "%Y-%m-%dT%H:%M:%S%z"])
def test_crossing_dst_tz_aware(format: str) -> None:
    """Parse two strings with different UTC offsets and compare the whole series."""
    raw = pl.Series(["2021-03-27T23:59:59+01:00", "2021-03-28T23:59:59+02:00"])
    parsed = raw.str.to_datetime(format)

    expected_values = [
        datetime(2021, 3, 27, 22, 59, 59, tzinfo=timezone.utc),
        datetime(2021, 3, 28, 21, 59, 59, tzinfo=timezone.utc),
    ]
    assert_series_equal(parsed, pl.Series(expected_values))
564
565
566
@pytest.mark.parametrize(
    ("data", "format", "expected"),
    [
        (
            "2023-02-05T05:10:10.074000",
            "%Y-%m-%dT%H:%M:%S%.f",
            datetime(2023, 2, 5, 5, 10, 10, 74000),
        ),
    ],
)
def test_strptime_subseconds_datetime(
    data: str, format: str, expected: datetime
) -> None:
    """
    Parse a datetime string with fractional seconds using `%.f`.

    The parametrized `expected` value is a `datetime` (not a `time`), so the
    parameter is annotated accordingly.
    """
    parsed = pl.Series([data]).str.to_datetime(format)
    assert parsed.item() == expected
580
581
582
@pytest.mark.parametrize(
    ("string", "fmt"),
    [
        pytest.param("2023-05-04|7", "%Y-%m-%d|%H", id="hour but no minute"),
        pytest.param("2023-05-04|7", "%Y-%m-%d|%k", id="padded hour but no minute"),
        pytest.param("2023-05-04|10", "%Y-%m-%d|%M", id="minute but no hour"),
        pytest.param("2023-05-04|10", "%Y-%m-%d|%S", id="second but no hour"),
        pytest.param(
            "2000-Jan-01 01 00 01", "%Y-%b-%d %I %M %S", id="12-hour clock but no AM/PM"
        ),
        pytest.param(
            "2000-Jan-01 01 00 01",
            "%Y-%b-%d %l %M %S",
            id="padded 12-hour clock but no AM/PM",
        ),
    ],
)
def test_strptime_incomplete_formats(string: str, fmt: str) -> None:
    """Formats that omit part of the time are expected to be rejected."""
    s = pl.Series([string])
    with pytest.raises(ComputeError, match="Invalid format string"):
        s.str.to_datetime(fmt)
605
606
607
@pytest.mark.parametrize(
    ("string", "fmt", "expected"),
    [
        ("2023-05-04|7:3", "%Y-%m-%d|%H:%M", datetime(2023, 5, 4, 7, 3)),
        ("2023-05-04|10:03", "%Y-%m-%d|%H:%M", datetime(2023, 5, 4, 10, 3)),
        (
            "2000-Jan-01 01 00 01 am",
            "%Y-%b-%d %I %M %S %P",
            datetime(2000, 1, 1, 1, 0, 1),
        ),
        (
            "2000-Jan-01 01 00 01 am",
            "%Y-%b-%d %_I %M %S %P",
            datetime(2000, 1, 1, 1, 0, 1),
        ),
        (
            "2000-Jan-01 01 00 01 am",
            "%Y-%b-%d %l %M %S %P",
            datetime(2000, 1, 1, 1, 0, 1),
        ),
        (
            "2000-Jan-01 01 00 01 AM",
            "%Y-%b-%d %I %M %S %p",
            datetime(2000, 1, 1, 1, 0, 1),
        ),
        (
            "2000-Jan-01 01 00 01 AM",
            "%Y-%b-%d %_I %M %S %p",
            datetime(2000, 1, 1, 1, 0, 1),
        ),
        (
            "2000-Jan-01 01 00 01 AM",
            "%Y-%b-%d %l %M %S %p",
            datetime(2000, 1, 1, 1, 0, 1),
        ),
    ],
)
def test_strptime_complete_formats(string: str, fmt: str, expected: datetime) -> None:
    """Formats that fully specify the time are expected to parse successfully."""
    # These formats carry both hour and minute (and AM/PM for 12-hour clocks).
    parsed = pl.Series([string]).str.to_datetime(fmt)
    assert parsed.item() == expected
648
649
650
@pytest.mark.parametrize(
    ("data", "format", "expected"),
    [
        ("00:00:00.000005000", "%H:%M:%S%.f", time(0, 0, 0, 5)),
        ("01:23:10.000500", "%H:%M:%S%.6f", time(1, 23, 10, 500)),
        ("08:10:11.000", "%H:%M:%S%.3f", time(8, 10, 11)),
        ("15:50:25", "%T", time(15, 50, 25)),
        ("22:35", "%R", time(22, 35)),
    ],
)
def test_to_time_inferred(data: str, format: str, expected: time) -> None:
    """Check `to_time` with the explicit format and then with `None`."""
    source = pl.DataFrame({"tmstr": [data]})
    wanted = source.with_columns(tm=pl.Series("tm", values=[expected]))

    # Explicit format first ...
    explicit = source.with_columns(tm=pl.col("tmstr").str.to_time(format))
    assert_frame_equal(explicit, wanted)

    # ... then no format at all.
    inferred = source.with_columns(tm=pl.col("tmstr").str.to_time(None))
    assert_frame_equal(inferred, wanted)
666
667
668
@pytest.mark.parametrize(
    ("data", "format", "expected"),
    [
        ("05:10:11.740000", "%H:%M:%S%.f", time(5, 10, 11, 740000)),
        ("13:20:12.000074", "%T%.6f", time(13, 20, 12, 74)),
        ("21:30:13.007400", "%H:%M:%S%.3f", time(21, 30, 13, 7400)),
    ],
)
def test_to_time_subseconds(data: str, format: str, expected: time) -> None:
    """Check `to_time` on strings with fractional seconds, with and without a format."""
    s = pl.Series([data])

    # Both parses run before either assertion, as in a tuple of results.
    without_format = s.str.to_time().item()
    with_format = s.str.to_time(format).item()

    assert without_format == expected
    assert with_format == expected
683
684
685
def test_to_time_format_warning() -> None:
    """
    Using `.%f` in a format is expected to emit a `ChronoFormatWarning`.

    `match` is a regular expression, so the dot is escaped to require the
    literal text ".%f" in the warning message rather than any character
    followed by "%f".
    """
    s = pl.Series(["05:10:10.074000"])
    with pytest.warns(ChronoFormatWarning, match=r"\.%f"):
        result = s.str.to_time("%H:%M:%S.%f").item()
    assert result == time(5, 10, 10, 74)
690
691
692
@pytest.mark.parametrize("exact", [True, False])
def test_to_datetime_ambiguous_earliest(exact: bool) -> None:
    """
    Check `ambiguous="earliest"` / `"latest"` for a repeated local time.

    Aware Python datetimes that share the same `tzinfo` compare without
    regard to `fold`, so the local-time assertions alone cannot tell the two
    resolutions apart. Each result is therefore also converted to UTC, where
    the two candidate instants differ by one hour.
    """
    london = ZoneInfo("Europe/London")
    s = pl.Series(["2020-10-25 01:00"])

    earliest = s.str.to_datetime(
        time_zone="Europe/London", ambiguous="earliest", exact=exact
    )
    assert earliest.item() == datetime(2020, 10, 25, 1, fold=0, tzinfo=london)
    # First occurrence of 01:00 local time (UTC+1 still in effect).
    assert earliest.dt.convert_time_zone("UTC").item() == datetime(
        2020, 10, 25, 0, tzinfo=timezone.utc
    )

    latest = s.str.to_datetime(
        time_zone="Europe/London", ambiguous="latest", exact=exact
    )
    assert latest.item() == datetime(2020, 10, 25, 1, fold=1, tzinfo=london)
    # Second occurrence of 01:00 local time (back on UTC+0).
    assert latest.dt.convert_time_zone("UTC").item() == datetime(
        2020, 10, 25, 1, tzinfo=timezone.utc
    )

    # Without an explicit `ambiguous` choice the parse is expected to raise.
    with pytest.raises(ComputeError):
        s.str.to_datetime(
            time_zone="Europe/London",
            exact=exact,
        ).item()
713
714
715
def test_to_datetime_naive_format_and_time_zone() -> None:
    """Parse a naive string while requesting a time zone, with and without a format."""
    raw = pl.Series(["2020-01-01"])
    expected = pl.Series([datetime(2020, 1, 1)]).dt.replace_time_zone("Asia/Kathmandu")

    # format-specified path
    with_format = raw.str.to_datetime(format="%Y-%m-%d", time_zone="Asia/Kathmandu")
    assert_series_equal(with_format, expected)

    # format-inferred path
    inferred = raw.str.to_datetime(time_zone="Asia/Kathmandu")
    assert_series_equal(inferred, expected)
725
726
727
@pytest.mark.parametrize("exact", [True, False])
def test_strptime_ambiguous_earliest(exact: bool) -> None:
    """
    Check `ambiguous="earliest"` / `"latest"` in `strptime` for a repeated local time.

    Aware Python datetimes that share the same `tzinfo` compare without
    regard to `fold`, so the local-time assertions alone cannot tell the two
    resolutions apart. Each result is therefore also converted to UTC, where
    the two candidate instants differ by one hour.
    """
    london = ZoneInfo("Europe/London")
    dtype = pl.Datetime("us", "Europe/London")
    s = pl.Series(["2020-10-25 01:00"])

    earliest = s.str.strptime(dtype, ambiguous="earliest", exact=exact)
    assert earliest.item() == datetime(2020, 10, 25, 1, fold=0, tzinfo=london)
    # First occurrence of 01:00 local time (UTC+1 still in effect).
    assert earliest.dt.convert_time_zone("UTC").item() == datetime(
        2020, 10, 25, 0, tzinfo=timezone.utc
    )

    latest = s.str.strptime(dtype, ambiguous="latest", exact=exact)
    assert latest.item() == datetime(2020, 10, 25, 1, fold=1, tzinfo=london)
    # Second occurrence of 01:00 local time (back on UTC+0).
    assert latest.dt.convert_time_zone("UTC").item() == datetime(
        2020, 10, 25, 1, tzinfo=timezone.utc
    )

    # Without an explicit `ambiguous` choice the parse is expected to raise.
    with pytest.raises(ComputeError):
        s.str.strptime(
            dtype,
            exact=exact,
        ).item()
752
753
754
@pytest.mark.parametrize("time_unit", ["ms", "us", "ns"])
def test_to_datetime_out_of_range_13401(time_unit: TimeUnit) -> None:
    """A seconds value of 66 raises in strict mode and gives null otherwise."""
    fmt = "%Y-%B-%d %H:%M:%S"
    s = pl.Series(["2020-January-01 12:34:66"])

    with pytest.raises(InvalidOperationError, match="conversion .* failed"):
        s.str.to_datetime(fmt, time_unit=time_unit)

    lenient = s.str.to_datetime(fmt, strict=False, time_unit=time_unit)
    assert lenient.item() is None
763
764
765
def test_out_of_ns_range_no_tu_specified_13592() -> None:
    """Parse a year-920 datetime without a time unit; a "us" dtype is expected."""
    frame = pl.DataFrame({"dates": ["2022-08-31 00:00:00.0", "0920-09-18 00:00:00.0"]})
    to_datetime = pl.col("dates").str.to_datetime(format="%Y-%m-%d %H:%M:%S%.f")
    result = frame.select(to_datetime)["dates"]

    expected_values = [datetime(2022, 8, 31, 0, 0), datetime(920, 9, 18, 0, 0)]
    expected = pl.Series("dates", expected_values, dtype=pl.Datetime("us"))
    assert_series_equal(result, expected)
776
777
778
def test_wrong_format_percent() -> None:
    """The format "d%" (trailing percent sign) is expected to raise."""
    s = pl.Series(["2019-01-01"])
    with pytest.raises(InvalidOperationError):
        s.str.strptime(pl.Date, format="d%")
781
782
783
def test_polars_parser_fooled_by_trailing_nonsense_22167() -> None:
    """Formats whose tail does not match the input are expected to raise."""
    # (input string, format) pairs, checked in order.
    cases = [
        (
            "2025-04-06T18:57:42.77756192Z",
            "%Y-%m-%dT%H:%M:%S.%9fcabbagebananapotato",
        ),
        ("2025-04-06T18:57:42.77756192Z", "%Y-%m-%dT%H:%M:%S.%9f#z"),
        ("2025-04-06T18:57:42.77Z", "%Y-%m-%dT%H:%M:%S.%3f#z"),
        ("2025-04-06T18:57:42.77123Z", "%Y-%m-%dT%H:%M:%S.%6f#z"),
    ]
    for value, fmt in cases:
        with pytest.raises(InvalidOperationError):
            pl.Series([value]).str.to_datetime(fmt)
800
801
802
def test_strptime_empty_input_22214() -> None:
    """`strptime` on an empty string series yields an empty result for each dtype."""
    empty = pl.Series("x", [], pl.String)

    targets = (
        (pl.Time, "%H:%M:%S%.f"),
        (pl.Date, "%Y-%m-%d"),
        (pl.Datetime, "%Y-%m-%d %H:%M%#z"),
    )
    for dtype, fmt in targets:
        assert empty.str.strptime(dtype, fmt).is_empty()
808
809
810
@pytest.mark.parametrize(
    "value",
    [
        "31/12/2022",
        "banana",
        "12-345-678",
        "12-345-67",
        "12-345-6789",
        "123*45*678",
        "123x45x678",
        "123x45x678x",
    ],
)
def test_matching_strings_but_different_format_22495(value: str) -> None:
    """Strings that do not fit `%Y-%m-%d` give null under non-strict parsing."""
    s = pl.Series("my_strings", [value])
    parsed = s.str.to_date("%Y-%m-%d", strict=False)
    assert parsed.item() is None
827
828
829
def test_date_parse_omit_day_month() -> None:
    """Round-trip month starts through `%Y %B` / `%Y %b` and parse month names."""
    fmt_B = "%Y %B"
    fmt_b = "%Y %b"

    # Format twelve month starts with both directives, then parse them back.
    months = pl.select(
        date=pl.date_range(pl.date(2022, 1, 1), pl.date(2022, 12, 1), "1mo")
    )
    formatted = months.with_columns(
        strdateB=pl.col("date").dt.strftime(fmt_B),
        strdateb=pl.col("date").dt.strftime(fmt_b),
    )
    round_tripped = formatted.with_columns(
        round_tripB=pl.col("strdateB").str.strptime(pl.Date, fmt_B),
        round_tripb=pl.col("strdateb").str.strptime(pl.Date, fmt_b),
    )
    both_match = pl.all_horizontal(
        pl.col("date") == pl.col("round_tripB"),
        pl.col("date") == pl.col("round_tripb"),
    )
    mismatches = round_tripped.filter(~both_match)
    assert mismatches.height == 0

    # Literal full month names, parsed with `%Y %B`.
    month_strings = pl.Series(
        [
            "2022 January",
            "2022 February",
            "2022 March",
            "2022 April",
            "2022 May",
            "2022 June",
            "2022 July",
            "2022 August",
            "2022 September",
            "2022 October",
            "2022 November",
            "2022 December",
        ]
    )
    parsed = month_strings.str.strptime(pl.Date, "%Y %B")
    first_of_each_month = pl.Series([date(2022, month, 1) for month in range(1, 13)])
    assert_series_equal(parsed, first_of_each_month)
885
886
887
@pytest.mark.parametrize("length", [1, 5])
def test_eager_inference_on_expr(length: int) -> None:
    """Format-less `strptime` works on a Series but raises via an expression."""
    s = pl.Series("a", ["2025-04-06T18:57:42.77123Z"] * length)

    # Series path: parsing without a format is expected to succeed.
    expected_value = datetime(
        2025, 4, 6, 18, 57, 42, 771230, tzinfo=timezone(timedelta(hours=0))
    )
    expected = pl.Series("a", [expected_value] * length)
    assert_series_equal(s.str.strptime(pl.Datetime), expected)

    # Expression path: the same call inside `select` is expected to raise.
    message = (
        "`strptime` / `to_datetime` was called with no format and no time zone, "
        "but a time zone is part of the data"
    )
    with pytest.raises(ComputeError, match=message):
        s.to_frame().select(pl.col("a").str.strptime(pl.Datetime))
909
910