CoCalc -- test_temporal.py

GitHub Repository: pola-rs/polars
Path: blob/main/py-polars/tests/unit/sql/test_temporal.py
⁶⁹³⁹ views
1
from __future__ import annotations
2

3
from datetime import date, datetime, time
4
from typing import Any, Literal
5

6
import pytest
7

8
import polars as pl
9
from polars.exceptions import InvalidOperationError, SQLInterfaceError, SQLSyntaxError
10
from polars.testing import assert_frame_equal
11

12

13
def test_date_func() -> None:
14
    df = pl.DataFrame(
15
        {
16
            "date": [
17
                date(2021, 3, 15),
18
                date(2021, 3, 28),
19
                date(2021, 4, 4),
20
            ],
21
            "version": ["0.0.1", "0.7.3", "0.7.4"],
22
        }
23
    )
24
    with pl.SQLContext(df=df, eager=True) as ctx:
25
        result = ctx.execute("SELECT date < DATE('2021-03-20') from df")
26

27
    expected = pl.DataFrame({"date": [True, False, False]})
28
    assert_frame_equal(result, expected)
29

30
    result = pl.select(pl.sql_expr("CAST(DATE('2023-03', '%Y-%m') as STRING)"))
31
    expected = pl.DataFrame({"literal": ["2023-03-01"]})
32
    assert_frame_equal(result, expected)
33

34
    with pytest.raises(
35
        SQLSyntaxError,
36
        match=r"DATE expects 1-2 arguments \(found 0\)",
37
    ):
38
        df.sql("SELECT DATE() FROM self")
39

40
    with pytest.raises(InvalidOperationError):
41
        df.sql("SELECT DATE('2077-07-07','not_a_valid_strftime_format') FROM self")
42

43

44
@pytest.mark.parametrize("time_unit", ["ms", "us", "ns"])
45
def test_datetime_to_time(time_unit: Literal["ns", "us", "ms"]) -> None:
46
    df = pl.DataFrame(  # noqa: F841
47
        {
48
            "dtm": [
49
                datetime(2099, 12, 31, 23, 59, 59),
50
                datetime(1999, 12, 31, 12, 30, 30),
51
                datetime(1969, 12, 31, 1, 1, 1),
52
                datetime(1899, 12, 31, 0, 0, 0),
53
            ],
54
        },
55
        schema={"dtm": pl.Datetime(time_unit)},
56
    )
57

58
    res = pl.sql("SELECT dtm::time AS tm from df").collect()
59
    assert res["tm"].to_list() == [
60
        time(23, 59, 59),
61
        time(12, 30, 30),
62
        time(1, 1, 1),
63
        time(0, 0, 0),
64
    ]
65

66

67
@pytest.mark.parametrize(
68
    ("parts", "dtype", "expected"),
69
    [
70
        (["decade", "decades"], pl.Int32, [202, 202, 200]),
71
        (["isoyear"], pl.Int32, [2024, 2020, 2005]),
72
        (["year", "y"], pl.Int32, [2024, 2020, 2006]),
73
        (["quarter"], pl.Int8, [1, 4, 1]),
74
        (["month", "months", "mon", "mons"], pl.Int8, [1, 12, 1]),
75
        (["week", "weeks"], pl.Int8, [1, 53, 52]),
76
        (["doy"], pl.Int16, [7, 365, 1]),
77
        (["isodow"], pl.Int8, [7, 3, 7]),
78
        (["dow"], pl.Int8, [0, 3, 0]),
79
        (["day", "days", "d"], pl.Int8, [7, 30, 1]),
80
        (["hour", "hours", "h"], pl.Int8, [1, 10, 23]),
81
        (["minute", "min", "mins", "m"], pl.Int8, [2, 30, 59]),
82
        (["second", "seconds", "secs", "sec"], pl.Int8, [3, 45, 59]),
83
        (
84
            ["millisecond", "milliseconds", "ms"],
85
            pl.Float64,
86
            [3123.456, 45987.654, 59555.555],
87
        ),
88
        (
89
            ["microsecond", "microseconds", "us"],
90
            pl.Float64,
91
            [3123456.0, 45987654.0, 59555555.0],
92
        ),
93
        (
94
            ["nanosecond", "nanoseconds", "ns"],
95
            pl.Float64,
96
            [3123456000.0, 45987654000.0, 59555555000.0],
97
        ),
98
        (
99
            ["time"],
100
            pl.Time,
101
            [time(1, 2, 3, 123456), time(10, 30, 45, 987654), time(23, 59, 59, 555555)],
102
        ),
103
        (
104
            ["epoch"],
105
            pl.Float64,
106
            [1704589323.123456, 1609324245.987654, 1136159999.555555],
107
        ),
108
    ],
109
)
110
def test_extract(parts: list[str], dtype: pl.DataType, expected: list[Any]) -> None:
111
    df = pl.DataFrame(
112
        {
113
            "dt": [
114
                # note: these values test several edge-cases, such as isoyear,
115
                # the mon/sun wrapping of dow vs isodow, epoch rounding, etc,
116
                # and the results have been validated against postgresql.
117
                datetime(2024, 1, 7, 1, 2, 3, 123456),
118
                datetime(2020, 12, 30, 10, 30, 45, 987654),
119
                datetime(2006, 1, 1, 23, 59, 59, 555555),
120
            ],
121
        }
122
    )
123
    with pl.SQLContext(frame_data=df, eager=True) as ctx:
124
        for part in parts:
125
            for fn in (
126
                f"EXTRACT({part} FROM dt)",
127
                f"DATE_PART('{part}',dt)",
128
            ):
129
                res = ctx.execute(f"SELECT {fn} AS {part} FROM frame_data").to_series()
130
                assert res.dtype == dtype
131
                assert res.to_list() == expected
132

133

134
def test_extract_errors() -> None:
135
    df = pl.DataFrame({"dt": [datetime(2024, 1, 7, 1, 2, 3, 123456)]})
136

137
    with pl.SQLContext(frame_data=df, eager=True) as ctx:
138
        for part in ("femtosecond", "stroopwafel"):
139
            with pytest.raises(
140
                SQLSyntaxError,
141
                match=f"EXTRACT/DATE_PART does not support '{part}' part",
142
            ):
143
                ctx.execute(f"SELECT EXTRACT({part} FROM dt) FROM frame_data")
144

145
        with pytest.raises(
146
            SQLSyntaxError,
147
            match=r"EXTRACT/DATE_PART does not support 'week\(tuesday\)' part",
148
        ):
149
            ctx.execute("SELECT DATE_PART('week(tuesday)', dt) FROM frame_data")
150

151

152
@pytest.mark.parametrize(
153
    ("dt", "expected"),
154
    [
155
        (date(1, 1, 1), [1, 1]),
156
        (date(100, 1, 1), [1, 1]),
157
        (date(101, 1, 1), [1, 2]),
158
        (date(1000, 1, 1), [1, 10]),
159
        (date(1001, 1, 1), [2, 11]),
160
        (date(1899, 12, 31), [2, 19]),
161
        (date(1900, 12, 31), [2, 19]),
162
        (date(1901, 1, 1), [2, 20]),
163
        (date(2000, 12, 31), [2, 20]),
164
        (date(2001, 1, 1), [3, 21]),
165
        (date(5555, 5, 5), [6, 56]),
166
        (date(9999, 12, 31), [10, 100]),
167
    ],
168
)
169
def test_extract_century_millennium(dt: date, expected: list[int]) -> None:
170
    with pl.SQLContext(frame_data=pl.DataFrame({"dt": [dt]}), eager=True) as ctx:
171
        res = ctx.execute(
172
            """
173
            SELECT
174
              EXTRACT(MILLENNIUM FROM dt) AS c1,
175
              DATE_PART('century',dt) AS c2,
176
              EXTRACT(millennium FROM dt) AS c3,
177
              DATE_PART('CENTURY',dt) AS c4,
178
            FROM frame_data
179
            """
180
        )
181
        assert_frame_equal(
182
            left=res,
183
            right=pl.DataFrame(
184
                data=[expected + expected],
185
                schema=["c1", "c2", "c3", "c4"],
186
                orient="row",
187
            ).cast(pl.Int32),
188
        )
189

190

191
@pytest.mark.parametrize(
192
    ("constraint", "expected"),
193
    [
194
        ("dtm >= '2020-12-30T10:30:45.987'", [0, 2]),
195
        ("dtm::date > '2006-01-01'", [0, 2]),
196
        ("dtm > '2006-01-01'", [0, 1, 2]),  # << implies '2006-01-01 00:00:00'
197
        ("dtm <= '2006-01-01'", []),  # << implies '2006-01-01 00:00:00'
198
        ("dt != '1960-01-07'", [0, 1]),
199
        ("tm != '22:10:30'", [0, 2]),
200
        ("tm >= '11:00:00' AND tm < '22:00'", [0]),
201
        ("tm >= '11:00' AND tm < '22:00:00.000'", [0]),
202
        ("tm BETWEEN '12:00' AND '23:59:58'", [0, 1]),
203
        ("tm BETWEEN '12:00:00' AND '23:59:58'", [0, 1]),
204
        ("dt BETWEEN '2050-01-01' AND '2100-12-31'", [1]),
205
        ("dt::datetime = '1960-01-07'", [2]),
206
        ("dt::datetime = '1960-01-07 00:00'", [2]),
207
        ("dt::datetime = '1960-01-07 00:00:00'", [2]),
208
        ("dtm BETWEEN '2020-12-30 10:30:44' AND '2023-01-01 00:00'", [2]),
209
        ("dt IN ('1960-01-07','2077-01-01','2222-02-22')", [1, 2]),
210
        (
211
            "dtm = '2024-01-07 01:02:03.123456000' OR dtm = '2020-12-30 10:30:45.987654'",
212
            [0, 2],
213
        ),
214
    ],
215
)
216
def test_implicit_temporal_strings(constraint: str, expected: list[int]) -> None:
217
    df = pl.DataFrame(
218
        {
219
            "idx": [0, 1, 2],
220
            "dtm": [
221
                datetime(2024, 1, 7, 1, 2, 3, 123456),
222
                datetime(2006, 1, 1, 23, 59, 59, 555555),
223
                datetime(2020, 12, 30, 10, 30, 45, 987654),
224
            ],
225
            "dt": [
226
                date(2020, 12, 30),
227
                date(2077, 1, 1),
228
                date(1960, 1, 7),
229
            ],
230
            "tm": [
231
                time(17, 30, 45),
232
                time(22, 10, 30),
233
                time(10, 25, 15),
234
            ],
235
        }
236
    )
237
    res = df.sql(f"SELECT idx FROM self WHERE {constraint}")
238
    actual = sorted(res["idx"])
239
    assert actual == expected
240

241

242
@pytest.mark.parametrize(
243
    "dtval",
244
    [
245
        # none of these are valid dates
246
        "2020-12-30T10:30:45",
247
        "yyyy-mm-dd",
248
        "2222-22-22",
249
        "10:30:45",
250
        "foo",
251
    ],
252
)
253
def test_implicit_temporal_string_errors(dtval: str) -> None:
254
    df = pl.DataFrame({"dt": [date(2020, 12, 30)]})
255

256
    with pytest.raises(
257
        InvalidOperationError,
258
        match="(conversion.*failed)|(cannot compare.*string.*temporal)",
259
    ):
260
        df.sql(f"SELECT * FROM self WHERE dt = '{dtval}'")
261

262

263
def test_strftime() -> None:
264
    df = pl.DataFrame(
265
        {
266
            "dtm": [
267
                None,
268
                datetime(1980, 9, 30, 1, 25, 50),
269
                datetime(2077, 7, 17, 11, 30, 55),
270
            ],
271
            "dt": [date(1978, 7, 5), date(1969, 12, 31), date(2020, 4, 10)],
272
            "tm": [time(10, 10, 10), time(22, 33, 55), None],
273
        }
274
    )
275
    res = df.sql(
276
        """
277
        SELECT
278
          STRFTIME(dtm,'%m.%d.%Y/%T') AS s_dtm,
279
          STRFTIME(dt ,'%B %d, %Y') AS s_dt,
280
          STRFTIME(tm ,'%S.%M.%H') AS s_tm,
281
        FROM self
282
        """
283
    )
284
    assert res.to_dict(as_series=False) == {
285
        "s_dtm": [None, "09.30.1980/01:25:50", "07.17.2077/11:30:55"],
286
        "s_dt": ["July 05, 1978", "December 31, 1969", "April 10, 2020"],
287
        "s_tm": ["10.10.10", "55.33.22", None],
288
    }
289

290
    with pytest.raises(
291
        SQLSyntaxError,
292
        match=r"STRFTIME expects 2 arguments \(found 4\)",
293
    ):
294
        pl.sql_expr("STRFTIME(dtm,'%Y-%m-%d','[extra]','[param]')")
295

296

297
def test_strptime() -> None:
298
    df = pl.DataFrame(
299
        {
300
            "s_dtm": [None, "09.30.1980/01:25:50", "07.17.2077/11:30:55"],
301
            "s_dt": ["July 5, 1978", "December 31, 1969", "April 10, 2020"],
302
            "s_tm": ["10.10.10", "55.33.22", None],
303
        }
304
    )
305
    res = df.sql(
306
        """
307
        SELECT
308
          STRPTIME(s_dtm,'%m.%d.%Y/%T') AS dtm,
309
          STRPTIME(s_dt ,'%B %d, %Y')::date AS dt,
310
          STRPTIME(s_tm ,'%S.%M.%H')::time AS tm
311
        FROM self
312
        """
313
    )
314
    assert res.to_dict(as_series=False) == {
315
        "dtm": [
316
            None,
317
            datetime(1980, 9, 30, 1, 25, 50),
318
            datetime(2077, 7, 17, 11, 30, 55),
319
        ],
320
        "dt": [date(1978, 7, 5), date(1969, 12, 31), date(2020, 4, 10)],
321
        "tm": [time(10, 10, 10), time(22, 33, 55), None],
322
    }
323
    with pytest.raises(
324
        SQLSyntaxError,
325
        match=r"STRPTIME expects 2 arguments \(found 3\)",
326
    ):
327
        pl.sql_expr("STRPTIME(s,'%Y.%m.%d',false) AS dt")
328

329

330
def test_temporal_stings_to_datetime() -> None:
331
    df = pl.DataFrame(
332
        {
333
            "s_dt": ["2077-10-10", "1942-01-08", "2000-07-05"],
334
            "s_dtm1": [
335
                "1999-12-31 10:30:45",
336
                "2020-06-10",
337
                "2022-08-07T00:01:02.654321",
338
            ],
339
            "s_dtm2": ["31-12-1999 10:30", "10-06-2020 00:00", "07-08-2022 00:01"],
340
            "s_tm": ["02:04:06", "12:30:45.999", "23:59:59.123456"],
341
        }
342
    )
343
    res = df.sql(
344
        """
345
        SELECT
346
          DATE(s_dt) AS dt1,
347
          DATETIME(s_dt) AS dt2,
348
          DATETIME(s_dtm1) AS dtm1,
349
          DATETIME(s_dtm2,'%d-%m-%Y %H:%M') AS dtm2,
350
          TIME(s_tm) AS tm
351
        FROM self
352
        """
353
    )
354
    assert res.schema == {
355
        "dt1": pl.Date,
356
        "dt2": pl.Datetime("us"),
357
        "dtm1": pl.Datetime("us"),
358
        "dtm2": pl.Datetime("us"),
359
        "tm": pl.Time,
360
    }
361
    assert res.rows() == [
362
        (
363
            date(2077, 10, 10),
364
            datetime(2077, 10, 10, 0, 0),
365
            datetime(1999, 12, 31, 10, 30, 45),
366
            datetime(1999, 12, 31, 10, 30),
367
            time(2, 4, 6),
368
        ),
369
        (
370
            date(1942, 1, 8),
371
            datetime(1942, 1, 8, 0, 0),
372
            datetime(2020, 6, 10, 0, 0),
373
            datetime(2020, 6, 10, 0, 0),
374
            time(12, 30, 45, 999000),
375
        ),
376
        (
377
            date(2000, 7, 5),
378
            datetime(2000, 7, 5, 0, 0),
379
            datetime(2022, 8, 7, 0, 1, 2, 654321),
380
            datetime(2022, 8, 7, 0, 1),
381
            time(23, 59, 59, 123456),
382
        ),
383
    ]
384

385
    for fn in ("DATE", "TIME", "DATETIME"):
386
        with pytest.raises(
387
            SQLSyntaxError,
388
            match=rf"{fn} expects 1-2 arguments \(found 3\)",
389
        ):
390
            pl.sql_expr(rf"{fn}(s,fmt,misc) AS xyz")
391

392

393
def test_temporal_typed_literals() -> None:
394
    res = pl.sql(
395
        """
396
        SELECT
397
          DATE '2020-12-30' AS dt,
398
          TIME '00:01:02' AS tm1,
399
          TIME '23:59:59.123456' AS tm2,
400
          TIMESTAMP '1930-01-01 12:30:00' AS dtm1,
401
          TIMESTAMP '2077-04-27T23:45:30.123456' AS dtm2
402
        FROM
403
          (VALUES (0)) tbl (x)
404
        """,
405
        eager=True,
406
    )
407
    assert res.to_dict(as_series=False) == {
408
        "dt": [date(2020, 12, 30)],
409
        "tm1": [time(0, 1, 2)],
410
        "tm2": [time(23, 59, 59, 123456)],
411
        "dtm1": [datetime(1930, 1, 1, 12, 30)],
412
        "dtm2": [datetime(2077, 4, 27, 23, 45, 30, 123456)],
413
    }
414

415

416
@pytest.mark.parametrize("fn", ["DATE", "TIME", "TIMESTAMP"])
417
def test_typed_literals_errors(fn: str) -> None:
418
    with pytest.raises(SQLSyntaxError, match=f"invalid {fn} literal '999'"):
419
        pl.sql_expr(f"{fn} '999'")
420

421

422
@pytest.mark.parametrize(
423
    ("unit", "expected"),
424
    [
425
        ("ms", [1704589323123, 1609324245987, 1136159999555]),
426
        ("us", [1704589323123456, 1609324245987654, 1136159999555555]),
427
        ("ns", [1704589323123456000, 1609324245987654000, 1136159999555555000]),
428
    ],
429
)
430
def test_timestamp_time_unit(unit: str | None, expected: list[int]) -> None:
431
    df = pl.DataFrame(
432
        {
433
            "ts": [
434
                datetime(2024, 1, 7, 1, 2, 3, 123456),
435
                datetime(2020, 12, 30, 10, 30, 45, 987654),
436
                datetime(2006, 1, 1, 23, 59, 59, 555555),
437
            ],
438
        }
439
    )
440
    precision = {"ms": 3, "us": 6, "ns": 9}
441

442
    with pl.SQLContext(frame_data=df, eager=True) as ctx:
443
        prec = f"({precision[unit]})" if unit else ""
444
        res = ctx.execute(f"SELECT ts::timestamp{prec} FROM frame_data").to_series()
445

446
        assert res.dtype == pl.Datetime(time_unit=unit)  # type: ignore[arg-type]
447
        assert res.to_physical().to_list() == expected
448

449

450
def test_timestamp_time_unit_errors() -> None:
451
    df = pl.DataFrame({"ts": [datetime(2024, 1, 7, 1, 2, 3, 123456)]})
452

453
    with pl.SQLContext(frame_data=df, eager=True) as ctx:
454
        for prec in (0, 15):
455
            with pytest.raises(
456
                SQLSyntaxError,
457
                match=rf"invalid temporal type precision \(expected 1-9, found {prec}\)",
458
            ):
459
                ctx.execute(f"SELECT ts::timestamp({prec}) FROM frame_data")
460

461
        with pytest.raises(
462
            SQLInterfaceError,
463
            match="sql parser error: Expected: literal int, found: - ",
464
        ):
465
            ctx.execute("SELECT ts::timestamp(-3) FROM frame_data")
466

467
Product

Resources

Company