CoCalc -- test_strptime.py

GitHub Repository: pola-rs/polars
Path: blob/main/py-polars/tests/unit/operations/namespaces/test_strptime.py
⁶⁹⁴⁰ views
1
"""
2
Module for testing `.str.strptime` of the string namespace.
3

4
This method gets its own module due to its complexity.
5
"""
6

7
from __future__ import annotations
8

9
from datetime import date, datetime, time, timedelta, timezone
10
from typing import TYPE_CHECKING
11
from zoneinfo import ZoneInfo
12

13
import pytest
14

15
import polars as pl
16
from polars.exceptions import ChronoFormatWarning, ComputeError, InvalidOperationError
17
from polars.testing import assert_frame_equal, assert_series_equal
18

19
if TYPE_CHECKING:
20
    from polars._typing import PolarsTemporalType, TimeUnit
21

22

23
def test_str_strptime() -> None:
    """Parse strings into Date, Datetime and Time series with `str.strptime`."""
    # Date
    date_strings = pl.Series(["2020-01-01", "2020-02-02"])
    parsed_dates = date_strings.str.strptime(pl.Date, "%Y-%m-%d")
    assert_series_equal(parsed_dates, pl.Series([date(2020, 1, 1), date(2020, 2, 2)]))

    # Datetime
    datetime_strings = pl.Series(["2020-01-01 00:00:00", "2020-02-02 03:20:10"])
    parsed_datetimes = datetime_strings.str.strptime(pl.Datetime, "%Y-%m-%d %H:%M:%S")
    expected_datetimes = pl.Series(
        [datetime(2020, 1, 1, 0, 0, 0), datetime(2020, 2, 2, 3, 20, 10)]
    )
    assert_series_equal(parsed_datetimes, expected_datetimes)

    # Time: the expected values are given as raw integers cast to `pl.Time`
    # (12010000000000 corresponds to 03:20:10 in nanoseconds).
    time_strings = pl.Series(["00:00:00", "03:20:10"])
    parsed_times = time_strings.str.strptime(pl.Time, "%H:%M:%S")
    expected_times = pl.Series([0, 12010000000000], dtype=pl.Time)
    assert_series_equal(parsed_times, expected_times)
37

38

39
def test_date_parse_omit_day() -> None:
    """A `%Y-%m` format (no day) parses to the first day of the month."""
    frame = pl.DataFrame({"month": ["2022-01"]})
    month = pl.col("month")

    as_date = frame.select(month.str.to_date(format="%Y-%m")).item()
    assert as_date == date(2022, 1, 1)

    as_datetime = frame.select(month.str.to_datetime(format="%Y-%m")).item()
    assert as_datetime == datetime(2022, 1, 1)
47

48

49
def test_to_datetime_precision() -> None:
    """Check the resulting time unit and sub-second truncation of `to_datetime`."""
    raw = pl.Series(
        "date", ["2022-09-12 21:54:36.789321456", "2022-09-13 12:34:56.987456321"]
    )

    # Without an explicit format or time unit the result is asserted to be "us".
    inferred = raw.str.to_datetime()
    assert inferred.cast(pl.Date).is_not_null().all()
    assert getattr(inferred.dtype, "time_unit", None) == "us"

    # (time unit, fractional-second format suffix, expected nanosecond component)
    cases: list[tuple[TimeUnit, str, list[int]]] = [
        ("ms", "%.3f", [789000000, 987000000]),
        ("us", "%.6f", [789321000, 987456000]),
        ("ns", "%.9f", [789321456, 987456321]),
    ]
    for time_unit, suffix, expected_values in cases:
        parsed = raw.str.to_datetime(
            f"%Y-%m-%d %H:%M:%S{suffix}", time_unit=time_unit
        )
        assert getattr(parsed.dtype, "time_unit", None) == time_unit
        assert parsed.dt.nanosecond().to_list() == expected_values
72

73

74
@pytest.mark.parametrize(
    ("time_unit", "expected"),
    [("ms", "123000000"), ("us", "123456000"), ("ns", "123456789")],
)
@pytest.mark.parametrize("format", ["%Y-%m-%d %H:%M:%S%.f", None])
def test_to_datetime_precision_with_time_unit(
    time_unit: TimeUnit, expected: str, format: str | None
) -> None:
    """
    Sub-second digits are truncated according to the requested time unit.

    Runs both with an explicit format and with `format=None` (inference), which
    is why `format` is annotated as optional.
    """
    s = pl.Series(["2020-01-01 00:00:00.123456789"])
    parsed = s.str.to_datetime(format, time_unit=time_unit)
    # "%f" renders the fractional part; compare it as a string.
    result = parsed.dt.to_string("%f")[0]
    assert result == expected
85

86

87
@pytest.mark.parametrize(
    ("tz_string", "timedelta"),
    [("+01:00", timedelta(minutes=60)), ("-01:30", timedelta(hours=-1, minutes=-30))],
)
def test_timezone_aware_strptime(tz_string: str, timedelta: timedelta) -> None:
    """Strings with a `%z` offset round-trip to datetimes carrying that offset."""
    hours = (6, 7, 8)
    times = pl.DataFrame(
        {
            "delivery_datetime": [
                f"2021-12-05 {hour:02d}:00:00{tz_string}" for hour in hours
            ]
        }
    )

    parsed = times.with_columns(
        pl.col("delivery_datetime").str.to_datetime(format="%Y-%m-%d %H:%M:%S%z")
    ).to_dict(as_series=False)

    # NOTE: `timedelta` here is the parametrized offset value, not the class.
    fixed_offset = timezone(timedelta)
    expected = {
        "delivery_datetime": [
            datetime(2021, 12, 5, hour, 0, tzinfo=fixed_offset) for hour in hours
        ]
    }
    assert parsed == expected
110

111

112
def test_to_date_non_exact_strptime() -> None:
    """Compare `exact=True` and `exact=False` when dates are embedded in other text."""
    fmt = "%Y-%m-%d"
    s = pl.Series("a", ["2022-01-16", "2022-01-17", "foo2022-01-18", "b2022-01-19ar"])

    # exact + non-strict: strings with surrounding text become null
    exact_result = s.str.to_date(fmt, strict=False, exact=True)
    exact_expected = pl.Series(
        "a", [date(2022, 1, 16), date(2022, 1, 17), None, None]
    )
    assert_series_equal(exact_result, exact_expected)

    # non-exact: the date is found inside the surrounding text
    loose_result = s.str.to_date(fmt, strict=False, exact=False)
    loose_expected = pl.Series("a", [date(2022, 1, day) for day in (16, 17, 18, 19)])
    assert_series_equal(loose_result, loose_expected)

    # exact + strict: the non-matching strings raise
    with pytest.raises(InvalidOperationError):
        s.str.to_date(fmt, strict=True, exact=True)
129

130

131
@pytest.mark.parametrize(
    ("time_string", "expected"),
    [
        ("01-02-2024", date(2024, 2, 1)),
        ("01.02.2024", date(2024, 2, 1)),
        ("01/02/2024", date(2024, 2, 1)),
        ("2024-02-01", date(2024, 2, 1)),
        ("2024/02/01", date(2024, 2, 1)),
        ("31-12-2024", date(2024, 12, 31)),
        ("31.12.2024", date(2024, 12, 31)),
        ("31/12/2024", date(2024, 12, 31)),
        ("2024-12-31", date(2024, 12, 31)),
        ("2024/12/31", date(2024, 12, 31)),
    ],
)
def test_to_date_all_inferred_date_patterns(time_string: str, expected: date) -> None:
    """`to_date` without a format infers day-first and year-first patterns."""
    inferred = pl.Series([time_string]).str.to_date()
    assert inferred[0] == expected
149

150

151
@pytest.mark.parametrize(
    ("time_string", "expected"),
    [
        ("2024-12-04 09:08:00", datetime(2024, 12, 4, 9, 8, 0)),
        ("2024-12-4 9:8:0", datetime(2024, 12, 4, 9, 8, 0)),
        ("2024/12/04 9:8", datetime(2024, 12, 4, 9, 8, 0)),
        ("4/12/2024 9:8", datetime(2024, 12, 4, 9, 8, 0)),
    ],
)
def test_to_datetime_infer_missing_digit_in_time_16092(
    time_string: str, expected: datetime
) -> None:
    """Format inference accepts non-zero-padded date and time components."""
    inferred = pl.Series([time_string]).str.to_datetime()
    assert inferred[0] == expected
165

166

167
@pytest.mark.parametrize(
    ("value", "attr"),
    [
        ("a", "to_date"),
        ("ab", "to_date"),
        ("a", "to_datetime"),
        ("ab", "to_datetime"),
    ],
)
def test_non_exact_short_elements_10223(value: str, attr: str) -> None:
    """Non-exact parsing of a very short, unparseable element raises."""
    s = pl.Series(["2019-01-01", value])
    with pytest.raises((InvalidOperationError, ComputeError)):
        # `attr` names the string-namespace method under test.
        parse = getattr(s.str, attr)
        parse(exact=False)
179

180

181
@pytest.mark.parametrize(
    ("offset", "time_zone", "tzinfo", "format"),
    [
        ("+01:00", "UTC", timezone(timedelta(hours=1)), "%Y-%m-%dT%H:%M%z"),
        ("", None, None, "%Y-%m-%dT%H:%M"),
    ],
)
def test_to_datetime_non_exact_strptime(
    offset: str, time_zone: str | None, tzinfo: timezone | None, format: str
) -> None:
    """Compare exact and non-exact `to_datetime`, with and without a UTC offset."""
    s = pl.Series(
        "a",
        [
            f"2022-01-16T00:00{offset}",
            f"2022-01-17T00:00{offset}",
            f"foo2022-01-18T00:00{offset}",
            f"b2022-01-19T00:00{offset}ar",
        ],
    )
    expected_dtype = pl.Datetime("us", time_zone)
    midnights = [datetime(2022, 1, day, tzinfo=tzinfo) for day in (16, 17, 18, 19)]

    # exact + non-strict: elements with surrounding text become null
    exact_result = s.str.to_datetime(format, strict=False, exact=True)
    exact_expected = pl.Series("a", [midnights[0], midnights[1], None, None])
    assert_series_equal(exact_result, exact_expected)
    assert exact_result.dtype == expected_dtype

    # non-exact: every element yields a datetime
    loose_result = s.str.to_datetime(format, strict=False, exact=False)
    loose_expected = pl.Series("a", midnights)
    assert_series_equal(loose_result, loose_expected)
    assert loose_result.dtype == expected_dtype

    # exact + strict: the non-matching elements raise
    with pytest.raises(InvalidOperationError):
        s.str.to_datetime(format, strict=True, exact=True)
229

230

231
def test_to_datetime_dates_datetimes() -> None:
    """A mix of date-only and date-time strings parses into datetimes."""
    mixed = pl.Series("date", ["2021-04-22", "2022-01-04 00:00:00"])
    parsed = mixed.str.to_datetime().to_list()
    expected = [datetime(2021, 4, 22, 0, 0), datetime(2022, 1, 4, 0, 0)]
    assert parsed == expected
237

238

239
@pytest.mark.parametrize(
    ("time_string", "expected"),
    [
        ("09-05-2019", datetime(2019, 5, 9)),
        ("2018-09-05", datetime(2018, 9, 5)),
        ("2018-09-05T04:05:01", datetime(2018, 9, 5, 4, 5, 1)),
        ("2018-09-05T04:24:01.9", datetime(2018, 9, 5, 4, 24, 1, 900000)),
        ("2018-09-05T04:24:02.11", datetime(2018, 9, 5, 4, 24, 2, 110000)),
        ("2018-09-05T14:24:02.123", datetime(2018, 9, 5, 14, 24, 2, 123000)),
        ("2019-04-18T02:45:55.555000000", datetime(2019, 4, 18, 2, 45, 55, 555000)),
        ("2019-04-18T22:45:55.555123", datetime(2019, 4, 18, 22, 45, 55, 555123)),
        (
            "2018-09-05T04:05:01+01:00",
            datetime(2018, 9, 5, 4, 5, 1, tzinfo=timezone(timedelta(hours=1))),
        ),
        (
            "2018-09-05T04:24:01.9+01:00",
            datetime(2018, 9, 5, 4, 24, 1, 900000, tzinfo=timezone(timedelta(hours=1))),
        ),
        (
            "2018-09-05T04:24:02.11+01:00",
            datetime(2018, 9, 5, 4, 24, 2, 110000, tzinfo=timezone(timedelta(hours=1))),
        ),
        (
            "2018-09-05T14:24:02.123+01:00",
            datetime(
                2018, 9, 5, 14, 24, 2, 123000, tzinfo=timezone(timedelta(hours=1))
            ),
        ),
        (
            "2019-04-18T02:45:55.555000000+01:00",
            datetime(
                2019, 4, 18, 2, 45, 55, 555000, tzinfo=timezone(timedelta(hours=1))
            ),
        ),
        (
            "2019-04-18T22:45:55.555123+01:00",
            datetime(
                2019, 4, 18, 22, 45, 55, 555123, tzinfo=timezone(timedelta(hours=1))
            ),
        ),
    ],
)
def test_to_datetime_patterns_single(time_string: str, expected: datetime) -> None:
    """
    Each string, parsed alone with an inferred format, yields the expected datetime.

    `expected` is a `datetime` (naive or offset-aware) in every case, so it is
    annotated as such.
    """
    result = pl.Series([time_string]).str.to_datetime().item()
    assert result == expected
285

286

287
@pytest.mark.parametrize("time_unit", ["ms", "us", "ns"])
def test_infer_tz_aware_time_unit(time_unit: TimeUnit) -> None:
    """Offset-aware input with an inferred format yields a UTC dtype in the given unit."""
    s = pl.Series(["2020-01-02T04:00:00+02:00"])
    parsed = s.str.to_datetime(time_unit=time_unit)

    assert parsed.dtype == pl.Datetime(time_unit, "UTC")
    # 04:00 at +02:00 is 02:00 UTC
    assert parsed.item() == datetime(2020, 1, 2, 2, 0, tzinfo=timezone.utc)
294

295

296
@pytest.mark.parametrize("time_unit", ["ms", "us", "ns"])
def test_infer_tz_aware_with_utc(time_unit: TimeUnit) -> None:
    """
    Offset-aware input with an inferred format and an explicit UTC time zone.

    Unlike `test_infer_tz_aware_time_unit`, which relies on inference alone,
    this test passes `time_zone="UTC"` explicitly so that the "with UTC" path
    is actually exercised instead of duplicating the other test.
    """
    result = pl.Series(["2020-01-02T04:00:00+02:00"]).str.to_datetime(
        time_unit=time_unit, time_zone="UTC"
    )
    assert result.dtype == pl.Datetime(time_unit, "UTC")
    # 04:00 at +02:00 is 02:00 UTC
    assert result.item() == datetime(2020, 1, 2, 2, 0, tzinfo=timezone.utc)
303

304

305
def test_str_to_datetime_infer_tz_aware() -> None:
    """Offset-aware input is expressed in the requested target time zone."""
    s = pl.Series(["2020-01-02T04:00:00+02:00"])
    parsed = s.str.to_datetime(time_unit="us", time_zone="Europe/Vienna")
    expected = datetime(2020, 1, 2, 3, tzinfo=ZoneInfo("Europe/Vienna"))
    assert parsed.item() == expected
312

313

314
@pytest.mark.parametrize(
    "result",
    [
        # `strptime`, explicit format, with and without a UTC dtype
        pl.Series(["2020-01-01T00:00:00+00:00"]).str.strptime(
            pl.Datetime("us", "UTC"), format="%Y-%m-%dT%H:%M:%S%z"
        ),
        pl.Series(["2020-01-01T00:00:00+00:00"]).str.strptime(
            pl.Datetime("us"), format="%Y-%m-%dT%H:%M:%S%z"
        ),
        # `strptime`, inferred format
        pl.Series(["2020-01-01T00:00:00+00:00"]).str.strptime(pl.Datetime("us", "UTC")),
        pl.Series(["2020-01-01T00:00:00+00:00"]).str.strptime(pl.Datetime("us")),
        # `to_datetime`, explicit format, with and without `time_zone`
        pl.Series(["2020-01-01T00:00:00+00:00"]).str.to_datetime(
            time_zone="UTC", format="%Y-%m-%dT%H:%M:%S%z"
        ),
        pl.Series(["2020-01-01T00:00:00+00:00"]).str.to_datetime(
            format="%Y-%m-%dT%H:%M:%S%z"
        ),
        # `to_datetime`, inferred format
        pl.Series(["2020-01-01T00:00:00+00:00"]).str.to_datetime(time_zone="UTC"),
        pl.Series(["2020-01-01T00:00:00+00:00"]).str.to_datetime(),
    ],
)
def test_parsing_offset_aware_with_utc_dtype(result: pl.Series) -> None:
    """Every way of parsing the offset-aware string gives the same UTC series."""
    utc_midnight = datetime(2020, 1, 1, tzinfo=timezone.utc)
    assert_series_equal(result, pl.Series([utc_midnight]))
338

339

340
def test_datetime_strptime_patterns_consistent() -> None:
    """Year-first strings parse with one inferred pattern; only the `Z` entry is null."""
    # note that all should be year first
    raw = pl.Series(
        "date",
        [
            "2018-09-05",
            "2018-09-05T04:05:01",
            "2018-09-05T04:24:01.9",
            "2018-09-05T04:24:02.11",
            "2018-09-05T14:24:02.123",
            "2018-09-05T14:24:02.123Z",
            "2019-04-18T02:45:55.555000000",
            "2019-04-18T22:45:55.555123",
        ],
    )
    parse_expr = pl.col("date").str.to_datetime(strict=False).alias("parsed")
    parsed = raw.to_frame().with_columns(parse_expr)["parsed"]

    assert parsed.null_count() == 1
    # index 5 is the only entry with a trailing "Z"
    assert parsed[5] is None
360

361

362
def test_datetime_strptime_patterns_inconsistent() -> None:
    """With a day-first first element, the remaining year-first strings parse as null."""
    # note that the pattern is inferred from the first element to
    # be DatetimeDMY, and so the others (correctly) parse as `null`.
    raw = pl.Series(
        "date",
        [
            "09-05-2019",
            "2018-09-05",
            "2018-09-05T04:05:01",
            "2018-09-05T04:24:01.9",
            "2018-09-05T04:24:02.11",
            "2018-09-05T14:24:02.123",
            "2018-09-05T14:24:02.123Z",
            "2019-04-18T02:45:55.555000000",
            "2019-04-18T22:45:55.555123",
        ],
    )
    parse_expr = pl.col("date").str.to_datetime(strict=False).alias("parsed")
    parsed = raw.to_frame().with_columns(parse_expr)["parsed"]

    assert parsed.null_count() == 8
    assert parsed[0] is not None
384

385

386
@pytest.mark.parametrize(
    (
        "ts",
        "format",
        "exp_year",
        "exp_month",
        "exp_day",
        "exp_hour",
        "exp_minute",
        "exp_second",
    ),
    [
        ("-0031-04-24 22:13:20", "%Y-%m-%d %H:%M:%S", -31, 4, 24, 22, 13, 20),
        ("-0031-04-24", "%Y-%m-%d", -31, 4, 24, 0, 0, 0),
    ],
)
def test_parse_negative_dates(
    ts: str,
    format: str,
    exp_year: int,
    exp_month: int,
    exp_day: int,
    exp_hour: int,
    exp_minute: int,
    exp_second: int,
) -> None:
    """Negative years parse correctly; each component is checked separately."""
    parsed = pl.Series([ts]).str.to_datetime(format, time_unit="ms")

    # Python datetime.datetime doesn't support negative dates, so comparing
    # with `parsed.item()` directly won't work.
    components = parsed.dt
    assert components.year().item() == exp_year
    assert components.month().item() == exp_month
    assert components.day().item() == exp_day
    assert components.hour().item() == exp_hour
    assert components.minute().item() == exp_minute
    assert components.second().item() == exp_second
422

423

424
def test_short_formats() -> None:
    """Very short format strings: `%Y` and the `%bar` format under non-strict parsing."""
    s = pl.Series(["20202020", "2020"])

    year_only = s.str.to_date("%Y", strict=False).to_list()
    assert year_only == [None, date(2020, 1, 1)]

    bar_format = s.str.to_date("%bar", strict=False).to_list()
    assert bar_format == [None, None]
431

432

433
@pytest.mark.parametrize(
    ("time_string", "fmt", "datatype", "expected"),
    [
        ("Jul/2020", "%b/%Y", pl.Date, date(2020, 7, 1)),
        ("Jan/2020", "%b/%Y", pl.Date, date(2020, 1, 1)),
        ("02/Apr/2020", "%d/%b/%Y", pl.Date, date(2020, 4, 2)),
        ("Dec/2020", "%b/%Y", pl.Datetime, datetime(2020, 12, 1, 0, 0)),
        ("Nov/2020", "%b/%Y", pl.Datetime, datetime(2020, 11, 1, 0, 0)),
        ("02/Feb/2020", "%d/%b/%Y", pl.Datetime, datetime(2020, 2, 2, 0, 0)),
    ],
)
def test_strptime_abbrev_month(
    time_string: str, fmt: str, datatype: PolarsTemporalType, expected: date
) -> None:
    """Abbreviated month names (`%b`) parse for both Date and Datetime targets."""
    parsed = pl.Series([time_string]).str.strptime(datatype, fmt)
    assert parsed.item() == expected
450

451

452
def test_full_month_name() -> None:
    """A full month name (`%B`) is parsed by `to_datetime`."""
    parsed = pl.Series(["2022-December-01"]).str.to_datetime("%Y-%B-%d")
    first_value = parsed[0]
    assert first_value == datetime(2022, 12, 1)
455

456

457
@pytest.mark.parametrize(
    ("datatype", "expected"),
    [
        (pl.Datetime, datetime(2022, 1, 1)),
        (pl.Date, date(2022, 1, 1)),
    ],
)
def test_single_digit_month(
    datatype: PolarsTemporalType, expected: datetime | date
) -> None:
    """Non-zero-padded month and day values match `%m` and `%d`."""
    parsed = pl.Series(["2022-1-1"]).str.strptime(datatype, "%Y-%m-%d")
    first_value = parsed[0]
    assert first_value == expected
469

470

471
def test_invalid_date_parsing_4898() -> None:
    """An impossible day-of-month becomes null under non-strict parsing."""
    s = pl.Series(["2022-09-18", "2022-09-50"])
    parsed = s.str.to_date("%Y-%m-%d", strict=False).to_list()
    assert parsed == [date(2022, 9, 18), None]
475

476

477
def test_strptime_invalid_timezone() -> None:
    """Replacing the time zone of a parsed series with an unknown zone raises."""
    raw = pl.Series(["2020-01-01 00:00:00+01:00"])
    parsed = raw.str.to_datetime("%Y-%m-%d %H:%M:%S%z")
    with pytest.raises(ComputeError, match=r"unable to parse time zone: 'foo'"):
        parsed.dt.replace_time_zone("foo")
481

482

483
def test_to_datetime_ambiguous_or_non_existent() -> None:
    """Ambiguous and non-existent local datetimes raise `ComputeError`."""
    ambiguous_msg = (
        "datetime '2021-11-07 01:00:00' is ambiguous in time zone 'US/Central'"
    )
    non_existent_msg = (
        "datetime '2021-03-28 02:30:00' is non-existent in time zone 'Europe/Warsaw'"
    )

    # ambiguous datetime with the default `ambiguous` setting
    with pytest.raises(ComputeError, match=ambiguous_msg):
        pl.Series(["2021-11-07 01:00"]).str.to_datetime(
            time_unit="us", time_zone="US/Central"
        )

    # non-existent datetime with the default `ambiguous` setting
    with pytest.raises(ComputeError, match=non_existent_msg):
        pl.Series(["2021-03-28 02:30"]).str.to_datetime(
            time_unit="us", time_zone="Europe/Warsaw"
        )

    # `ambiguous="null"` does not suppress the non-existent error
    with pytest.raises(ComputeError, match=non_existent_msg):
        pl.Series(["2021-03-28 02:30"]).str.to_datetime(
            time_unit="us",
            time_zone="Europe/Warsaw",
            ambiguous="null",
        )

    # same when `ambiguous` is given per element as a Series
    with pytest.raises(ComputeError, match=non_existent_msg):
        pl.Series(["2021-03-28 02:30"] * 2).str.to_datetime(
            time_unit="us",
            time_zone="Europe/Warsaw",
            ambiguous=pl.Series(["null", "null"]),
        )
516

517

518
@pytest.mark.parametrize(
    ("ts", "fmt", "expected"),
    [
        ("2020-01-01T00:00:00Z", None, datetime(2020, 1, 1, tzinfo=timezone.utc)),
        ("2020-01-01T00:00:00Z", "%+", datetime(2020, 1, 1, tzinfo=timezone.utc)),
        (
            "2020-01-01T00:00:00+01:00",
            "%Y-%m-%dT%H:%M:%S%z",
            datetime(2020, 1, 1, tzinfo=timezone(timedelta(seconds=3600))),
        ),
        (
            "2020-01-01T00:00:00+01:00",
            "%Y-%m-%dT%H:%M:%S%:z",
            datetime(2020, 1, 1, tzinfo=timezone(timedelta(seconds=3600))),
        ),
        (
            "2020-01-01T00:00:00+01:00",
            "%Y-%m-%dT%H:%M:%S%#z",
            datetime(2020, 1, 1, tzinfo=timezone(timedelta(seconds=3600))),
        ),
    ],
)
def test_to_datetime_tz_aware_strptime(
    ts: str, fmt: str | None, expected: datetime
) -> None:
    """
    Offset-aware strings parse with several offset format specifiers.

    The first case passes `fmt=None` to exercise format inference, which is why
    `fmt` is annotated as optional.
    """
    result = pl.Series([ts]).str.to_datetime(fmt).item()
    assert result == expected
543

544

545
@pytest.mark.parametrize("format", ["%+", "%Y-%m-%dT%H:%M:%S%z"])
def test_crossing_dst(format: str) -> None:
    """Inputs with different UTC offsets are each converted to the right UTC instant."""
    stamps = ["2021-03-27T23:59:59+01:00", "2021-03-28T23:59:59+02:00"]
    parsed = pl.Series(stamps).str.to_datetime(format)

    utc = ZoneInfo("UTC")
    assert parsed[0] == datetime(2021, 3, 27, 22, 59, 59, tzinfo=utc)
    assert parsed[1] == datetime(2021, 3, 28, 21, 59, 59, tzinfo=utc)
551

552

553
@pytest.mark.parametrize("format", ["%+", "%Y-%m-%dT%H:%M:%S%z"])
def test_crossing_dst_tz_aware(format: str) -> None:
    """Series-level check that inputs with different offsets come out as UTC values."""
    stamps = ["2021-03-27T23:59:59+01:00", "2021-03-28T23:59:59+02:00"]
    parsed = pl.Series(stamps).str.to_datetime(format)

    utc_values = [
        datetime(2021, 3, 27, 22, 59, 59, tzinfo=timezone.utc),
        datetime(2021, 3, 28, 21, 59, 59, tzinfo=timezone.utc),
    ]
    assert_series_equal(parsed, pl.Series(utc_values))
564

565

566
@pytest.mark.parametrize(
    ("data", "format", "expected"),
    [
        (
            "2023-02-05T05:10:10.074000",
            "%Y-%m-%dT%H:%M:%S%.f",
            datetime(2023, 2, 5, 5, 10, 10, 74000),
        ),
    ],
)
def test_strptime_subseconds_datetime(
    data: str, format: str, expected: datetime
) -> None:
    """
    Fractional seconds (`%.f`) are preserved when parsing a datetime.

    `expected` is a `datetime`, so it is annotated as such rather than `time`.
    """
    s = pl.Series([data])
    result = s.str.to_datetime(format).item()
    assert result == expected
580

581

582
@pytest.mark.parametrize(
    ("string", "fmt"),
    [
        pytest.param("2023-05-04|7", "%Y-%m-%d|%H", id="hour but no minute"),
        pytest.param("2023-05-04|7", "%Y-%m-%d|%k", id="padded hour but no minute"),
        pytest.param("2023-05-04|10", "%Y-%m-%d|%M", id="minute but no hour"),
        pytest.param("2023-05-04|10", "%Y-%m-%d|%S", id="second but no hour"),
        pytest.param(
            "2000-Jan-01 01 00 01", "%Y-%b-%d %I %M %S", id="12-hour clock but no AM/PM"
        ),
        pytest.param(
            "2000-Jan-01 01 00 01",
            "%Y-%b-%d %l %M %S",
            id="padded 12-hour clock but no AM/PM",
        ),
    ],
)
def test_strptime_incomplete_formats(string: str, fmt: str) -> None:
    """Formats that leave the time of day under-specified are rejected."""
    s = pl.Series([string])
    with pytest.raises(ComputeError, match="Invalid format string"):
        s.str.to_datetime(fmt)
605

606

607
@pytest.mark.parametrize(
    ("string", "fmt", "expected"),
    [
        ("2023-05-04|7:3", "%Y-%m-%d|%H:%M", datetime(2023, 5, 4, 7, 3)),
        ("2023-05-04|10:03", "%Y-%m-%d|%H:%M", datetime(2023, 5, 4, 10, 3)),
        (
            "2000-Jan-01 01 00 01 am",
            "%Y-%b-%d %I %M %S %P",
            datetime(2000, 1, 1, 1, 0, 1),
        ),
        (
            "2000-Jan-01 01 00 01 am",
            "%Y-%b-%d %_I %M %S %P",
            datetime(2000, 1, 1, 1, 0, 1),
        ),
        (
            "2000-Jan-01 01 00 01 am",
            "%Y-%b-%d %l %M %S %P",
            datetime(2000, 1, 1, 1, 0, 1),
        ),
        (
            "2000-Jan-01 01 00 01 AM",
            "%Y-%b-%d %I %M %S %p",
            datetime(2000, 1, 1, 1, 0, 1),
        ),
        (
            "2000-Jan-01 01 00 01 AM",
            "%Y-%b-%d %_I %M %S %p",
            datetime(2000, 1, 1, 1, 0, 1),
        ),
        (
            "2000-Jan-01 01 00 01 AM",
            "%Y-%b-%d %l %M %S %p",
            datetime(2000, 1, 1, 1, 0, 1),
        ),
    ],
)
def test_strptime_complete_formats(string: str, fmt: str, expected: datetime) -> None:
    """Formats that fully specify the time of day parse successfully."""
    # Counterpart to the incomplete-format cases: these formats are complete
    # and should work.
    parsed = pl.Series([string]).str.to_datetime(fmt)
    assert parsed.item() == expected
648

649

650
@pytest.mark.parametrize(
    ("data", "format", "expected"),
    [
        ("00:00:00.000005000", "%H:%M:%S%.f", time(0, 0, 0, 5)),
        ("01:23:10.000500", "%H:%M:%S%.6f", time(1, 23, 10, 500)),
        ("08:10:11.000", "%H:%M:%S%.3f", time(8, 10, 11)),
        ("15:50:25", "%T", time(15, 50, 25)),
        ("22:35", "%R", time(22, 35)),
    ],
)
def test_to_time_inferred(data: str, format: str, expected: time) -> None:
    """`to_time` gives the same result with an explicit format and with inference."""
    frame = pl.DataFrame({"tmstr": [data]})
    expected_frame = frame.with_columns(tm=pl.Series("tm", values=[expected]))

    # explicit format first, then `None` to trigger inference
    for candidate in (format, None):
        parsed = frame.with_columns(tm=pl.col("tmstr").str.to_time(candidate))
        assert_frame_equal(parsed, expected_frame)
666

667

668
@pytest.mark.parametrize(
    ("data", "format", "expected"),
    [
        ("05:10:11.740000", "%H:%M:%S%.f", time(5, 10, 11, 740000)),
        ("13:20:12.000074", "%T%.6f", time(13, 20, 12, 74)),
        ("21:30:13.007400", "%H:%M:%S%.3f", time(21, 30, 13, 7400)),
    ],
)
def test_to_time_subseconds(data: str, format: str, expected: time) -> None:
    """Sub-second digits survive `to_time`, both inferred and with an explicit format."""
    s = pl.Series([data])

    # Both parses run before either assertion, as in the tuple-based original.
    inferred = s.str.to_time().item()
    explicit = s.str.to_time(format).item()

    assert inferred == expected
    assert explicit == expected
683

684

685
def test_to_time_format_warning() -> None:
    """Parsing with `.%f` in the format emits `ChronoFormatWarning`; the result is checked too."""
    s = pl.Series(["05:10:10.074000"])
    with pytest.warns(ChronoFormatWarning, match=".%f"):
        parsed = s.str.to_time("%H:%M:%S.%f")
        result = parsed.item()
    assert result == time(5, 10, 10, 74)
690

691

692
@pytest.mark.parametrize("exact", [True, False])
def test_to_datetime_ambiguous_earliest(exact: bool) -> None:
    """`ambiguous` selects which occurrence of a repeated local hour `to_datetime` returns."""
    london = ZoneInfo("Europe/London")

    # "earliest" -> first occurrence (fold=0)
    earliest = (
        pl.Series(["2020-10-25 01:00"])
        .str.to_datetime(time_zone="Europe/London", ambiguous="earliest", exact=exact)
        .item()
    )
    assert earliest == datetime(2020, 10, 25, 1, fold=0, tzinfo=london)

    # "latest" -> second occurrence (fold=1)
    latest = (
        pl.Series(["2020-10-25 01:00"])
        .str.to_datetime(time_zone="Europe/London", ambiguous="latest", exact=exact)
        .item()
    )
    assert latest == datetime(2020, 10, 25, 1, fold=1, tzinfo=london)

    # without `ambiguous`, the ambiguous datetime raises
    with pytest.raises(ComputeError):
        pl.Series(["2020-10-25 01:00"]).str.to_datetime(
            time_zone="Europe/London",
            exact=exact,
        ).item()
713

714

715
def test_to_datetime_naive_format_and_time_zone() -> None:
    """A naive string plus `time_zone` matches `replace_time_zone` on the naive datetime."""
    zone = "Asia/Kathmandu"
    raw = pl.Series(["2020-01-01"])

    # format-specified path
    with_format = raw.str.to_datetime(format="%Y-%m-%d", time_zone=zone)
    expected = pl.Series([datetime(2020, 1, 1)]).dt.replace_time_zone(zone)
    assert_series_equal(with_format, expected)

    # format-inferred path
    inferred = raw.str.to_datetime(time_zone=zone)
    assert_series_equal(inferred, expected)
725

726

727
@pytest.mark.parametrize("exact", [True, False])
def test_strptime_ambiguous_earliest(exact: bool) -> None:
    """`ambiguous` selects which occurrence of a repeated local hour `strptime` returns."""
    london = ZoneInfo("Europe/London")
    target_dtype = pl.Datetime("us", "Europe/London")

    # "earliest" -> first occurrence (fold=0)
    earliest = (
        pl.Series(["2020-10-25 01:00"])
        .str.strptime(target_dtype, ambiguous="earliest", exact=exact)
        .item()
    )
    assert earliest == datetime(2020, 10, 25, 1, fold=0, tzinfo=london)

    # "latest" -> second occurrence (fold=1)
    latest = (
        pl.Series(["2020-10-25 01:00"])
        .str.strptime(target_dtype, ambiguous="latest", exact=exact)
        .item()
    )
    assert latest == datetime(2020, 10, 25, 1, fold=1, tzinfo=london)

    # without `ambiguous`, the ambiguous datetime raises
    with pytest.raises(ComputeError):
        pl.Series(["2020-10-25 01:00"]).str.strptime(
            pl.Datetime("us", "Europe/London"),
            exact=exact,
        ).item()
752

753

754
@pytest.mark.parametrize("time_unit", ["ms", "us", "ns"])
def test_to_datetime_out_of_range_13401(time_unit: TimeUnit) -> None:
    """An out-of-range seconds value raises when strict and is null when non-strict."""
    fmt = "%Y-%B-%d %H:%M:%S"
    s = pl.Series(["2020-January-01 12:34:66"])

    with pytest.raises(InvalidOperationError, match="conversion .* failed"):
        s.str.to_datetime(fmt, time_unit=time_unit)

    lenient = s.str.to_datetime(fmt, strict=False, time_unit=time_unit)
    assert lenient.item() is None
763

764

765
def test_out_of_ns_range_no_tu_specified_13592() -> None:
    """With no time unit given, a year-920 date parses into a microsecond datetime."""
    frame = pl.DataFrame(
        {"dates": ["2022-08-31 00:00:00.0", "0920-09-18 00:00:00.0"]}
    )
    parse_expr = pl.col("dates").str.to_datetime(format="%Y-%m-%d %H:%M:%S%.f")
    parsed = frame.select(parse_expr)["dates"]

    expected = pl.Series(
        "dates",
        [datetime(2022, 8, 31, 0, 0), datetime(920, 9, 18, 0, 0)],
        dtype=pl.Datetime("us"),
    )
    assert_series_equal(parsed, expected)
776

777

778
def test_wrong_format_percent() -> None:
    """The format string `d%` raises `InvalidOperationError`."""
    s = pl.Series(["2019-01-01"])
    with pytest.raises(InvalidOperationError):
        s.str.strptime(pl.Date, format="d%")
781

782

783
def test_polars_parser_fooled_by_trailing_nonsense_22167() -> None:
    """Formats whose trailing literal text does not match the input must raise."""
    # (input string, format with trailing text that the input does not contain)
    cases = [
        (
            "2025-04-06T18:57:42.77756192Z",
            "%Y-%m-%dT%H:%M:%S.%9fcabbagebananapotato",
        ),
        ("2025-04-06T18:57:42.77756192Z", "%Y-%m-%dT%H:%M:%S.%9f#z"),
        ("2025-04-06T18:57:42.77Z", "%Y-%m-%dT%H:%M:%S.%3f#z"),
        ("2025-04-06T18:57:42.77123Z", "%Y-%m-%dT%H:%M:%S.%6f#z"),
    ]
    for value, fmt in cases:
        with pytest.raises(InvalidOperationError):
            pl.Series([value]).str.to_datetime(fmt)
800

801

802
def test_strptime_empty_input_22214() -> None:
    """`strptime` on an empty string series returns an empty result for each dtype."""
    empty = pl.Series("x", [], pl.String)

    as_time = empty.str.strptime(pl.Time, "%H:%M:%S%.f")
    assert as_time.is_empty()

    as_date = empty.str.strptime(pl.Date, "%Y-%m-%d")
    assert as_date.is_empty()

    as_datetime = empty.str.strptime(pl.Datetime, "%Y-%m-%d %H:%M%#z")
    assert as_datetime.is_empty()
808

809

810
@pytest.mark.parametrize(
    "value",
    [
        "31/12/2022",
        "banana",
        "12-345-678",
        "12-345-67",
        "12-345-6789",
        "123*45*678",
        "123x45x678",
        "123x45x678x",
    ],
)
def test_matching_strings_but_different_format_22495(value: str) -> None:
    """Strings that do not fit `%Y-%m-%d` become null under non-strict parsing."""
    s = pl.Series("my_strings", [value])
    parsed = s.str.to_date("%Y-%m-%d", strict=False)
    assert parsed.item() is None
827

828

829
def test_date_parse_omit_day_month() -> None:
    """Year + month-name formats (`%B`, `%b`) round-trip and parse to the first of the month."""
    full_fmt = "%Y %B"
    abbrev_fmt = "%Y %b"

    # Round trip: format every month start of 2022, then parse it back.
    months = pl.select(
        date=pl.date_range(pl.date(2022, 1, 1), pl.date(2022, 12, 1), "1mo")
    )
    formatted = months.with_columns(
        strdateB=pl.col("date").dt.strftime(full_fmt),
        strdateb=pl.col("date").dt.strftime(abbrev_fmt),
    )
    round_tripped = formatted.with_columns(
        round_tripB=pl.col("strdateB").str.strptime(pl.Date, full_fmt),
        round_tripb=pl.col("strdateb").str.strptime(pl.Date, abbrev_fmt),
    )
    all_match = pl.all_horizontal(
        pl.col("date") == pl.col("round_tripB"),
        pl.col("date") == pl.col("round_tripb"),
    )
    mismatches = round_tripped.filter(~all_match)
    assert mismatches.height == 0

    # Direct parse of literal "<year> <full month name>" strings.
    month_names = (
        "January",
        "February",
        "March",
        "April",
        "May",
        "June",
        "July",
        "August",
        "September",
        "October",
        "November",
        "December",
    )
    s = pl.Series([f"2022 {name}" for name in month_names])
    result = s.str.strptime(pl.Date, "%Y %B")
    expected = pl.Series([date(2022, month, 1) for month in range(1, 13)])
    assert_series_equal(result, expected)
885

886

887
@pytest.mark.parametrize("length", [1, 5])
def test_eager_inference_on_expr(length: int) -> None:
    """Series-level `strptime` infers the time zone; the expression form raises."""
    s = pl.Series("a", ["2025-04-06T18:57:42.77123Z"] * length)

    # Calling on the Series: the trailing "Z" yields a zero-offset datetime.
    zero_offset = timezone(timedelta(hours=0))
    stamp = datetime(2025, 4, 6, 18, 57, 42, 771230, tzinfo=zero_offset)
    assert_series_equal(
        s.str.strptime(pl.Datetime),
        pl.Series("a", [stamp] * length),
    )

    # Calling through an expression in `select`: the same parse raises.
    expected_error = "`strptime` / `to_datetime` was called with no format and no time zone, but a time zone is part of the data"
    with pytest.raises(ComputeError, match=expected_error):
        s.to_frame().select(pl.col("a").str.strptime(pl.Datetime))
909

910
Product

Resources

Company