Path: blob/main/py-polars/tests/unit/sql/test_temporal.py
6939 views
from __future__ import annotations12from datetime import date, datetime, time3from typing import Any, Literal45import pytest67import polars as pl8from polars.exceptions import InvalidOperationError, SQLInterfaceError, SQLSyntaxError9from polars.testing import assert_frame_equal101112def test_date_func() -> None:13df = pl.DataFrame(14{15"date": [16date(2021, 3, 15),17date(2021, 3, 28),18date(2021, 4, 4),19],20"version": ["0.0.1", "0.7.3", "0.7.4"],21}22)23with pl.SQLContext(df=df, eager=True) as ctx:24result = ctx.execute("SELECT date < DATE('2021-03-20') from df")2526expected = pl.DataFrame({"date": [True, False, False]})27assert_frame_equal(result, expected)2829result = pl.select(pl.sql_expr("CAST(DATE('2023-03', '%Y-%m') as STRING)"))30expected = pl.DataFrame({"literal": ["2023-03-01"]})31assert_frame_equal(result, expected)3233with pytest.raises(34SQLSyntaxError,35match=r"DATE expects 1-2 arguments \(found 0\)",36):37df.sql("SELECT DATE() FROM self")3839with pytest.raises(InvalidOperationError):40df.sql("SELECT DATE('2077-07-07','not_a_valid_strftime_format') FROM self")414243@pytest.mark.parametrize("time_unit", ["ms", "us", "ns"])44def test_datetime_to_time(time_unit: Literal["ns", "us", "ms"]) -> None:45df = pl.DataFrame( # noqa: F84146{47"dtm": [48datetime(2099, 12, 31, 23, 59, 59),49datetime(1999, 12, 31, 12, 30, 30),50datetime(1969, 12, 31, 1, 1, 1),51datetime(1899, 12, 31, 0, 0, 0),52],53},54schema={"dtm": pl.Datetime(time_unit)},55)5657res = pl.sql("SELECT dtm::time AS tm from df").collect()58assert res["tm"].to_list() == [59time(23, 59, 59),60time(12, 30, 30),61time(1, 1, 1),62time(0, 0, 0),63]646566@pytest.mark.parametrize(67("parts", "dtype", "expected"),68[69(["decade", "decades"], pl.Int32, [202, 202, 200]),70(["isoyear"], pl.Int32, [2024, 2020, 2005]),71(["year", "y"], pl.Int32, [2024, 2020, 2006]),72(["quarter"], pl.Int8, [1, 4, 1]),73(["month", "months", "mon", "mons"], pl.Int8, [1, 12, 1]),74(["week", "weeks"], pl.Int8, [1, 53, 52]),75(["doy"], pl.Int16, [7, 365, 1]),76(["isodow"], pl.Int8, [7, 3, 7]),77(["dow"], pl.Int8, [0, 3, 0]),78(["day", "days", "d"], pl.Int8, [7, 30, 1]),79(["hour", "hours", "h"], pl.Int8, [1, 10, 23]),80(["minute", "min", "mins", "m"], pl.Int8, [2, 30, 59]),81(["second", "seconds", "secs", "sec"], pl.Int8, [3, 45, 59]),82(83["millisecond", "milliseconds", "ms"],84pl.Float64,85[3123.456, 45987.654, 59555.555],86),87(88["microsecond", "microseconds", "us"],89pl.Float64,90[3123456.0, 45987654.0, 59555555.0],91),92(93["nanosecond", "nanoseconds", "ns"],94pl.Float64,95[3123456000.0, 45987654000.0, 59555555000.0],96),97(98["time"],99pl.Time,100[time(1, 2, 3, 123456), time(10, 30, 45, 987654), time(23, 59, 59, 555555)],101),102(103["epoch"],104pl.Float64,105[1704589323.123456, 1609324245.987654, 1136159999.555555],106),107],108)109def test_extract(parts: list[str], dtype: pl.DataType, expected: list[Any]) -> None:110df = pl.DataFrame(111{112"dt": [113# note: these values test several edge-cases, such as isoyear,114# the mon/sun wrapping of dow vs isodow, epoch rounding, etc,115# and the results have been validated against postgresql.116datetime(2024, 1, 7, 1, 2, 3, 123456),117datetime(2020, 12, 30, 10, 30, 45, 987654),118datetime(2006, 1, 1, 23, 59, 59, 555555),119],120}121)122with pl.SQLContext(frame_data=df, eager=True) as ctx:123for part in parts:124for fn in (125f"EXTRACT({part} FROM dt)",126f"DATE_PART('{part}',dt)",127):128res = ctx.execute(f"SELECT {fn} AS {part} FROM frame_data").to_series()129assert res.dtype == dtype130assert res.to_list() == expected131132133def test_extract_errors() -> None:134df = pl.DataFrame({"dt": [datetime(2024, 1, 7, 1, 2, 3, 123456)]})135136with pl.SQLContext(frame_data=df, eager=True) as ctx:137for part in ("femtosecond", "stroopwafel"):138with pytest.raises(139SQLSyntaxError,140match=f"EXTRACT/DATE_PART does not support '{part}' part",141):142ctx.execute(f"SELECT EXTRACT({part} FROM dt) FROM frame_data")143144with pytest.raises(145SQLSyntaxError,146match=r"EXTRACT/DATE_PART does not support 'week\(tuesday\)' part",147):148ctx.execute("SELECT DATE_PART('week(tuesday)', dt) FROM frame_data")149150151@pytest.mark.parametrize(152("dt", "expected"),153[154(date(1, 1, 1), [1, 1]),155(date(100, 1, 1), [1, 1]),156(date(101, 1, 1), [1, 2]),157(date(1000, 1, 1), [1, 10]),158(date(1001, 1, 1), [2, 11]),159(date(1899, 12, 31), [2, 19]),160(date(1900, 12, 31), [2, 19]),161(date(1901, 1, 1), [2, 20]),162(date(2000, 12, 31), [2, 20]),163(date(2001, 1, 1), [3, 21]),164(date(5555, 5, 5), [6, 56]),165(date(9999, 12, 31), [10, 100]),166],167)168def test_extract_century_millennium(dt: date, expected: list[int]) -> None:169with pl.SQLContext(frame_data=pl.DataFrame({"dt": [dt]}), eager=True) as ctx:170res = ctx.execute(171"""172SELECT173EXTRACT(MILLENNIUM FROM dt) AS c1,174DATE_PART('century',dt) AS c2,175EXTRACT(millennium FROM dt) AS c3,176DATE_PART('CENTURY',dt) AS c4,177FROM frame_data178"""179)180assert_frame_equal(181left=res,182right=pl.DataFrame(183data=[expected + expected],184schema=["c1", "c2", "c3", "c4"],185orient="row",186).cast(pl.Int32),187)188189190@pytest.mark.parametrize(191("constraint", "expected"),192[193("dtm >= '2020-12-30T10:30:45.987'", [0, 2]),194("dtm::date > '2006-01-01'", [0, 2]),195("dtm > '2006-01-01'", [0, 1, 2]), # << implies '2006-01-01 00:00:00'196("dtm <= '2006-01-01'", []), # << implies '2006-01-01 00:00:00'197("dt != '1960-01-07'", [0, 1]),198("tm != '22:10:30'", [0, 2]),199("tm >= '11:00:00' AND tm < '22:00'", [0]),200("tm >= '11:00' AND tm < '22:00:00.000'", [0]),201("tm BETWEEN '12:00' AND '23:59:58'", [0, 1]),202("tm BETWEEN '12:00:00' AND '23:59:58'", [0, 1]),203("dt BETWEEN '2050-01-01' AND '2100-12-31'", [1]),204("dt::datetime = '1960-01-07'", [2]),205("dt::datetime = '1960-01-07 00:00'", [2]),206("dt::datetime = '1960-01-07 00:00:00'", [2]),207("dtm BETWEEN '2020-12-30 10:30:44' AND '2023-01-01 00:00'", [2]),208("dt IN ('1960-01-07','2077-01-01','2222-02-22')", [1, 2]),209(210"dtm = '2024-01-07 01:02:03.123456000' OR dtm = '2020-12-30 10:30:45.987654'",211[0, 2],212),213],214)215def test_implicit_temporal_strings(constraint: str, expected: list[int]) -> None:216df = pl.DataFrame(217{218"idx": [0, 1, 2],219"dtm": [220datetime(2024, 1, 7, 1, 2, 3, 123456),221datetime(2006, 1, 1, 23, 59, 59, 555555),222datetime(2020, 12, 30, 10, 30, 45, 987654),223],224"dt": [225date(2020, 12, 30),226date(2077, 1, 1),227date(1960, 1, 7),228],229"tm": [230time(17, 30, 45),231time(22, 10, 30),232time(10, 25, 15),233],234}235)236res = df.sql(f"SELECT idx FROM self WHERE {constraint}")237actual = sorted(res["idx"])238assert actual == expected239240241@pytest.mark.parametrize(242"dtval",243[244# none of these are valid dates245"2020-12-30T10:30:45",246"yyyy-mm-dd",247"2222-22-22",248"10:30:45",249"foo",250],251)252def test_implicit_temporal_string_errors(dtval: str) -> None:253df = pl.DataFrame({"dt": [date(2020, 12, 30)]})254255with pytest.raises(256InvalidOperationError,257match="(conversion.*failed)|(cannot compare.*string.*temporal)",258):259df.sql(f"SELECT * FROM self WHERE dt = '{dtval}'")260261262def test_strftime() -> None:263df = pl.DataFrame(264{265"dtm": [266None,267datetime(1980, 9, 30, 1, 25, 50),268datetime(2077, 7, 17, 11, 30, 55),269],270"dt": [date(1978, 7, 5), date(1969, 12, 31), date(2020, 4, 10)],271"tm": [time(10, 10, 10), time(22, 33, 55), None],272}273)274res = df.sql(275"""276SELECT277STRFTIME(dtm,'%m.%d.%Y/%T') AS s_dtm,278STRFTIME(dt ,'%B %d, %Y') AS s_dt,279STRFTIME(tm ,'%S.%M.%H') AS s_tm,280FROM self281"""282)283assert res.to_dict(as_series=False) == {284"s_dtm": [None, "09.30.1980/01:25:50", "07.17.2077/11:30:55"],285"s_dt": ["July 05, 1978", "December 31, 1969", "April 10, 2020"],286"s_tm": ["10.10.10", "55.33.22", None],287}288289with pytest.raises(290SQLSyntaxError,291match=r"STRFTIME expects 2 arguments \(found 4\)",292):293pl.sql_expr("STRFTIME(dtm,'%Y-%m-%d','[extra]','[param]')")294295296def test_strptime() -> None:297df = pl.DataFrame(298{299"s_dtm": [None, "09.30.1980/01:25:50", "07.17.2077/11:30:55"],300"s_dt": ["July 5, 1978", "December 31, 1969", "April 10, 2020"],301"s_tm": ["10.10.10", "55.33.22", None],302}303)304res = df.sql(305"""306SELECT307STRPTIME(s_dtm,'%m.%d.%Y/%T') AS dtm,308STRPTIME(s_dt ,'%B %d, %Y')::date AS dt,309STRPTIME(s_tm ,'%S.%M.%H')::time AS tm310FROM self311"""312)313assert res.to_dict(as_series=False) == {314"dtm": [315None,316datetime(1980, 9, 30, 1, 25, 50),317datetime(2077, 7, 17, 11, 30, 55),318],319"dt": [date(1978, 7, 5), date(1969, 12, 31), date(2020, 4, 10)],320"tm": [time(10, 10, 10), time(22, 33, 55), None],321}322with pytest.raises(323SQLSyntaxError,324match=r"STRPTIME expects 2 arguments \(found 3\)",325):326pl.sql_expr("STRPTIME(s,'%Y.%m.%d',false) AS dt")327328329def test_temporal_stings_to_datetime() -> None:330df = pl.DataFrame(331{332"s_dt": ["2077-10-10", "1942-01-08", "2000-07-05"],333"s_dtm1": [334"1999-12-31 10:30:45",335"2020-06-10",336"2022-08-07T00:01:02.654321",337],338"s_dtm2": ["31-12-1999 10:30", "10-06-2020 00:00", "07-08-2022 00:01"],339"s_tm": ["02:04:06", "12:30:45.999", "23:59:59.123456"],340}341)342res = df.sql(343"""344SELECT345DATE(s_dt) AS dt1,346DATETIME(s_dt) AS dt2,347DATETIME(s_dtm1) AS dtm1,348DATETIME(s_dtm2,'%d-%m-%Y %H:%M') AS dtm2,349TIME(s_tm) AS tm350FROM self351"""352)353assert res.schema == {354"dt1": pl.Date,355"dt2": pl.Datetime("us"),356"dtm1": pl.Datetime("us"),357"dtm2": pl.Datetime("us"),358"tm": pl.Time,359}360assert res.rows() == [361(362date(2077, 10, 10),363datetime(2077, 10, 10, 0, 0),364datetime(1999, 12, 31, 10, 30, 45),365datetime(1999, 12, 31, 10, 30),366time(2, 4, 6),367),368(369date(1942, 1, 8),370datetime(1942, 1, 8, 0, 0),371datetime(2020, 6, 10, 0, 0),372datetime(2020, 6, 10, 0, 0),373time(12, 30, 45, 999000),374),375(376date(2000, 7, 5),377datetime(2000, 7, 5, 0, 0),378datetime(2022, 8, 7, 0, 1, 2, 654321),379datetime(2022, 8, 7, 0, 1),380time(23, 59, 59, 123456),381),382]383384for fn in ("DATE", "TIME", "DATETIME"):385with pytest.raises(386SQLSyntaxError,387match=rf"{fn} expects 1-2 arguments \(found 3\)",388):389pl.sql_expr(rf"{fn}(s,fmt,misc) AS xyz")390391392def test_temporal_typed_literals() -> None:393res = pl.sql(394"""395SELECT396DATE '2020-12-30' AS dt,397TIME '00:01:02' AS tm1,398TIME '23:59:59.123456' AS tm2,399TIMESTAMP '1930-01-01 12:30:00' AS dtm1,400TIMESTAMP '2077-04-27T23:45:30.123456' AS dtm2401FROM402(VALUES (0)) tbl (x)403""",404eager=True,405)406assert res.to_dict(as_series=False) == {407"dt": [date(2020, 12, 30)],408"tm1": [time(0, 1, 2)],409"tm2": [time(23, 59, 59, 123456)],410"dtm1": [datetime(1930, 1, 1, 12, 30)],411"dtm2": [datetime(2077, 4, 27, 23, 45, 30, 123456)],412}413414415@pytest.mark.parametrize("fn", ["DATE", "TIME", "TIMESTAMP"])416def test_typed_literals_errors(fn: str) -> None:417with pytest.raises(SQLSyntaxError, match=f"invalid {fn} literal '999'"):418pl.sql_expr(f"{fn} '999'")419420421@pytest.mark.parametrize(422("unit", "expected"),423[424("ms", [1704589323123, 1609324245987, 1136159999555]),425("us", [1704589323123456, 1609324245987654, 1136159999555555]),426("ns", [1704589323123456000, 1609324245987654000, 1136159999555555000]),427],428)429def test_timestamp_time_unit(unit: str | None, expected: list[int]) -> None:430df = pl.DataFrame(431{432"ts": [433datetime(2024, 1, 7, 1, 2, 3, 123456),434datetime(2020, 12, 30, 10, 30, 45, 987654),435datetime(2006, 1, 1, 23, 59, 59, 555555),436],437}438)439precision = {"ms": 3, "us": 6, "ns": 9}440441with pl.SQLContext(frame_data=df, eager=True) as ctx:442prec = f"({precision[unit]})" if unit else ""443res = ctx.execute(f"SELECT ts::timestamp{prec} FROM frame_data").to_series()444445assert res.dtype == pl.Datetime(time_unit=unit) # type: ignore[arg-type]446assert res.to_physical().to_list() == expected447448449def test_timestamp_time_unit_errors() -> None:450df = pl.DataFrame({"ts": [datetime(2024, 1, 7, 1, 2, 3, 123456)]})451452with pl.SQLContext(frame_data=df, eager=True) as ctx:453for prec in (0, 15):454with pytest.raises(455SQLSyntaxError,456match=rf"invalid temporal type precision \(expected 1-9, found {prec}\)",457):458ctx.execute(f"SELECT ts::timestamp({prec}) FROM frame_data")459460with pytest.raises(461SQLInterfaceError,462match="sql parser error: Expected: literal int, found: - ",463):464ctx.execute("SELECT ts::timestamp(-3) FROM frame_data")465466467