Path: blob/main/py-polars/tests/unit/sql/test_temporal.py
8406 views
from __future__ import annotations12from datetime import date, datetime, time3from typing import Any, Literal45import pytest67import polars as pl8from polars.exceptions import InvalidOperationError, SQLInterfaceError, SQLSyntaxError9from polars.testing import assert_frame_equal101112def test_date_func() -> None:13df = pl.DataFrame(14{15"date": [16date(2021, 3, 15),17date(2021, 3, 28),18date(2021, 4, 4),19],20"version": ["0.0.1", "0.7.3", "0.7.4"],21}22)23with pl.SQLContext(df=df, eager=True) as ctx:24result = ctx.execute("SELECT date < DATE('2021-03-20') FROM df")2526expected = pl.DataFrame({"date": [True, False, False]})27assert_frame_equal(result, expected)2829result = pl.select(pl.sql_expr("CAST(DATE('2023-03', '%Y-%m') as STRING)"))30expected = pl.DataFrame({"literal": ["2023-03-01"]})31assert_frame_equal(result, expected)3233with pytest.raises(34SQLSyntaxError,35match=r"DATE expects 1-2 arguments \(found 0\)",36):37df.sql("SELECT DATE() FROM self")3839with pytest.raises(InvalidOperationError):40df.sql("SELECT DATE('2077-07-07','not_a_valid_strftime_format') FROM self")414243@pytest.mark.parametrize("time_unit", ["ms", "us", "ns"])44def test_datetime_to_time(time_unit: Literal["ns", "us", "ms"]) -> None:45s = pl.Series(46name="dtm",47values=[48datetime(2099, 12, 31, 23, 59, 59),49datetime(1999, 12, 31, 12, 30, 30),50datetime(1969, 12, 31, 1, 1, 1),51datetime(1899, 12, 31, 0, 0, 0),52],53dtype=pl.Datetime(time_unit),54)55df = s.to_frame()56lf = df.lazy()5758select = "SELECT dtm::time AS tm"59for res in (60pl.sql(f"{select} FROM df").collect(),61pl.sql(f"{select} FROM s").collect(),62lf.sql(f"{select} FROM self").collect(),63df.sql(f"{select} FROM self"),64s.sql(f"{select} FROM self"),65):66assert isinstance(res, pl.DataFrame)67assert res["tm"].to_list() == [68time(23, 59, 59),69time(12, 30, 30),70time(1, 1, 1),71time(0, 0, 0),72]737475@pytest.mark.parametrize(76("parts", "dtype", "expected"),77[78(["decade", "decades"], pl.Int32, [202, 202, 200]),79(["isoyear"], pl.Int32, [2024, 2020, 2005]),80(["year", "y"], pl.Int32, [2024, 2020, 2006]),81(["quarter"], pl.Int8, [1, 4, 1]),82(["month", "months", "mon", "mons"], pl.Int8, [1, 12, 1]),83(["week", "weeks"], pl.Int8, [1, 53, 52]),84(["doy"], pl.Int16, [7, 365, 1]),85(["isodow"], pl.Int8, [7, 3, 7]),86(["dow"], pl.Int8, [0, 3, 0]),87(["day", "days", "d"], pl.Int8, [7, 30, 1]),88(["hour", "hours", "h"], pl.Int8, [1, 10, 23]),89(["minute", "min", "mins", "m"], pl.Int8, [2, 30, 59]),90(["second", "seconds", "secs", "sec"], pl.Int8, [3, 45, 59]),91(92["millisecond", "milliseconds", "ms"],93pl.Float64,94[3123.456, 45987.654, 59555.555],95),96(97["microsecond", "microseconds", "us"],98pl.Float64,99[3123456.0, 45987654.0, 59555555.0],100),101(102["nanosecond", "nanoseconds", "ns"],103pl.Float64,104[3123456000.0, 45987654000.0, 59555555000.0],105),106(107["time"],108pl.Time,109[time(1, 2, 3, 123456), time(10, 30, 45, 987654), time(23, 59, 59, 555555)],110),111(112["epoch"],113pl.Float64,114[1704589323.123456, 1609324245.987654, 1136159999.555555],115),116],117)118def test_extract(parts: list[str], dtype: pl.DataType, expected: list[Any]) -> None:119df = pl.DataFrame(120{121"dt": [122# note: these values test several edge-cases, such as isoyear,123# the mon/sun wrapping of dow vs isodow, epoch rounding, etc,124# and the results have been validated against postgresql.125datetime(2024, 1, 7, 1, 2, 3, 123456),126datetime(2020, 12, 30, 10, 30, 45, 987654),127datetime(2006, 1, 1, 23, 59, 59, 555555),128],129}130)131with pl.SQLContext(frame_data=df, eager=True) as ctx:132for part in parts:133for fn in (134f"EXTRACT({part} FROM dt)",135f"DATE_PART('{part}',dt)",136):137res = ctx.execute(f"SELECT {fn} AS {part} FROM frame_data").to_series()138assert res.dtype == dtype139assert res.to_list() == expected140141142def test_extract_errors() -> None:143df = pl.DataFrame({"dt": [datetime(2024, 1, 7, 1, 2, 3, 123456)]})144145with pl.SQLContext(frame_data=df, eager=True) as ctx:146for part in ("femtosecond", "stroopwafel"):147with pytest.raises(148SQLSyntaxError,149match=f"EXTRACT/DATE_PART does not support '{part}' part",150):151ctx.execute(f"SELECT EXTRACT({part} FROM dt) FROM frame_data")152153with pytest.raises(154SQLSyntaxError,155match=r"EXTRACT/DATE_PART does not support 'week\(tuesday\)' part",156):157ctx.execute("SELECT DATE_PART('week(tuesday)', dt) FROM frame_data")158159160@pytest.mark.parametrize(161("dt", "expected"),162[163(date(1, 1, 1), [1, 1]),164(date(100, 1, 1), [1, 1]),165(date(101, 1, 1), [1, 2]),166(date(1000, 1, 1), [1, 10]),167(date(1001, 1, 1), [2, 11]),168(date(1899, 12, 31), [2, 19]),169(date(1900, 12, 31), [2, 19]),170(date(1901, 1, 1), [2, 20]),171(date(2000, 12, 31), [2, 20]),172(date(2001, 1, 1), [3, 21]),173(date(5555, 5, 5), [6, 56]),174(date(9999, 12, 31), [10, 100]),175],176)177def test_extract_century_millennium(dt: date, expected: list[int]) -> None:178with pl.SQLContext(frame_data=pl.DataFrame({"dt": [dt]}), eager=True) as ctx:179res = ctx.execute(180"""181SELECT182EXTRACT(MILLENNIUM FROM dt) AS c1,183DATE_PART('century',dt) AS c2,184EXTRACT(millennium FROM dt) AS c3,185DATE_PART('CENTURY',dt) AS c4,186FROM frame_data187"""188)189assert_frame_equal(190left=res,191right=pl.DataFrame(192data=[expected + expected],193schema=["c1", "c2", "c3", "c4"],194orient="row",195).cast(pl.Int32),196)197198199@pytest.mark.parametrize(200("constraint", "expected"),201[202("dtm >= '2020-12-30T10:30:45.987'", [0, 2]),203("dtm::date > '2006-01-01'", [0, 2]),204("dtm > '2006-01-01'", [0, 1, 2]), # << implies '2006-01-01 00:00:00'205("dtm <= '2006-01-01'", []), # << implies '2006-01-01 00:00:00'206("dt != '1960-01-07'", [0, 1]),207("tm != '22:10:30'", [0, 2]),208("tm >= '11:00:00' AND tm < '22:00'", [0]),209("tm >= '11:00' AND tm < '22:00:00.000'", [0]),210("tm BETWEEN '12:00' AND '23:59:58'", [0, 1]),211("tm BETWEEN '12:00:00' AND '23:59:58'", [0, 1]),212("dt BETWEEN '2050-01-01' AND '2100-12-31'", [1]),213("dt::datetime = '1960-01-07'", [2]),214("dt::datetime = '1960-01-07 00:00'", [2]),215("dt::datetime = '1960-01-07 00:00:00'", [2]),216("dtm BETWEEN '2020-12-30 10:30:44' AND '2023-01-01 00:00'", [2]),217("dt IN ('1960-01-07','2077-01-01','2222-02-22')", [1, 2]),218(219"dtm = '2024-01-07 01:02:03.123456000' OR dtm = '2020-12-30 10:30:45.987654'",220[0, 2],221),222],223)224def test_implicit_temporal_strings(constraint: str, expected: list[int]) -> None:225df = pl.DataFrame(226{227"idx": [0, 1, 2],228"dtm": [229datetime(2024, 1, 7, 1, 2, 3, 123456),230datetime(2006, 1, 1, 23, 59, 59, 555555),231datetime(2020, 12, 30, 10, 30, 45, 987654),232],233"dt": [234date(2020, 12, 30),235date(2077, 1, 1),236date(1960, 1, 7),237],238"tm": [239time(17, 30, 45),240time(22, 10, 30),241time(10, 25, 15),242],243}244)245res = df.sql(f"SELECT idx FROM self WHERE {constraint}")246actual = sorted(res["idx"])247assert actual == expected248249250@pytest.mark.parametrize(251"dtval",252[253# none of these are valid dates254"2020-12-30T10:30:45",255"yyyy-mm-dd",256"2222-22-22",257"10:30:45",258"foo",259],260)261def test_implicit_temporal_string_errors(dtval: str) -> None:262df = pl.DataFrame({"dt": [date(2020, 12, 30)]})263264with pytest.raises(265InvalidOperationError,266match=r"(conversion.*failed)|(cannot compare.*string.*temporal)",267):268df.sql(f"SELECT * FROM self WHERE dt = '{dtval}'")269270271def test_strftime() -> None:272df = pl.DataFrame(273{274"dtm": [275None,276datetime(1980, 9, 30, 1, 25, 50),277datetime(2077, 7, 17, 11, 30, 55),278],279"dt": [date(1978, 7, 5), date(1969, 12, 31), date(2020, 4, 10)],280"tm": [time(10, 10, 10), time(22, 33, 55), None],281}282)283res = df.sql(284"""285SELECT286STRFTIME(dtm,'%m.%d.%Y/%T') AS s_dtm,287STRFTIME(dt ,'%B %d, %Y') AS s_dt,288STRFTIME(tm ,'%S.%M.%H') AS s_tm,289FROM self290"""291)292assert res.to_dict(as_series=False) == {293"s_dtm": [None, "09.30.1980/01:25:50", "07.17.2077/11:30:55"],294"s_dt": ["July 05, 1978", "December 31, 1969", "April 10, 2020"],295"s_tm": ["10.10.10", "55.33.22", None],296}297298with pytest.raises(299SQLSyntaxError,300match=r"STRFTIME expects 2 arguments \(found 4\)",301):302pl.sql_expr("STRFTIME(dtm,'%Y-%m-%d','[extra]','[param]')")303304305def test_strptime() -> None:306df = pl.DataFrame(307{308"s_dtm": [None, "09.30.1980/01:25:50", "07.17.2077/11:30:55"],309"s_dt": ["July 5, 1978", "December 31, 1969", "April 10, 2020"],310"s_tm": ["10.10.10", "55.33.22", None],311}312)313res = df.sql(314"""315SELECT316STRPTIME(s_dtm,'%m.%d.%Y/%T') AS dtm,317STRPTIME(s_dt ,'%B %d, %Y')::date AS dt,318STRPTIME(s_tm ,'%S.%M.%H')::time AS tm319FROM self320"""321)322assert res.to_dict(as_series=False) == {323"dtm": [324None,325datetime(1980, 9, 30, 1, 25, 50),326datetime(2077, 7, 17, 11, 30, 55),327],328"dt": [date(1978, 7, 5), date(1969, 12, 31), date(2020, 4, 10)],329"tm": [time(10, 10, 10), time(22, 33, 55), None],330}331with pytest.raises(332SQLSyntaxError,333match=r"STRPTIME expects 2 arguments \(found 3\)",334):335pl.sql_expr("STRPTIME(s,'%Y.%m.%d',false) AS dt")336337338def test_temporal_stings_to_datetime() -> None:339df = pl.DataFrame(340{341"s_dt": ["2077-10-10", "1942-01-08", "2000-07-05"],342"s_dtm1": [343"1999-12-31 10:30:45",344"2020-06-10",345"2022-08-07T00:01:02.654321",346],347"s_dtm2": ["31-12-1999 10:30", "10-06-2020 00:00", "07-08-2022 00:01"],348"s_tm": ["02:04:06", "12:30:45.999", "23:59:59.123456"],349}350)351res = df.sql(352"""353SELECT354DATE(s_dt) AS dt1,355DATETIME(s_dt) AS dt2,356DATETIME(s_dtm1) AS dtm1,357DATETIME(s_dtm2,'%d-%m-%Y %H:%M') AS dtm2,358TIME(s_tm) AS tm359FROM self360"""361)362assert res.schema == {363"dt1": pl.Date,364"dt2": pl.Datetime("us"),365"dtm1": pl.Datetime("us"),366"dtm2": pl.Datetime("us"),367"tm": pl.Time,368}369assert res.rows() == [370(371date(2077, 10, 10),372datetime(2077, 10, 10, 0, 0),373datetime(1999, 12, 31, 10, 30, 45),374datetime(1999, 12, 31, 10, 30),375time(2, 4, 6),376),377(378date(1942, 1, 8),379datetime(1942, 1, 8, 0, 0),380datetime(2020, 6, 10, 0, 0),381datetime(2020, 6, 10, 0, 0),382time(12, 30, 45, 999000),383),384(385date(2000, 7, 5),386datetime(2000, 7, 5, 0, 0),387datetime(2022, 8, 7, 0, 1, 2, 654321),388datetime(2022, 8, 7, 0, 1),389time(23, 59, 59, 123456),390),391]392393for fn in ("DATE", "TIME", "DATETIME"):394with pytest.raises(395SQLSyntaxError,396match=rf"{fn} expects 1-2 arguments \(found 3\)",397):398pl.sql_expr(rf"{fn}(s,fmt,misc) AS xyz")399400401def test_temporal_typed_literals() -> None:402res = pl.sql(403"""404SELECT405DATE '2020-12-30' AS dt,406TIME '00:01:02' AS tm1,407TIME '23:59:59.123456' AS tm2,408TIMESTAMP '1930-01-01 12:30:00' AS dtm1,409TIMESTAMP '2077-04-27T23:45:30.123456' AS dtm2410FROM411(VALUES (0)) tbl (x)412""",413eager=True,414)415assert res.to_dict(as_series=False) == {416"dt": [date(2020, 12, 30)],417"tm1": [time(0, 1, 2)],418"tm2": [time(23, 59, 59, 123456)],419"dtm1": [datetime(1930, 1, 1, 12, 30)],420"dtm2": [datetime(2077, 4, 27, 23, 45, 30, 123456)],421}422423424@pytest.mark.parametrize("fn", ["DATE", "TIME", "TIMESTAMP"])425def test_typed_literals_errors(fn: str) -> None:426with pytest.raises(SQLSyntaxError, match=f"invalid {fn} literal '999'"):427pl.sql_expr(f"{fn} '999'")428429430@pytest.mark.parametrize(431("unit", "expected"),432[433("ms", [1704589323123, 1609324245987, 1136159999555]),434("us", [1704589323123456, 1609324245987654, 1136159999555555]),435("ns", [1704589323123456000, 1609324245987654000, 1136159999555555000]),436],437)438def test_timestamp_time_unit(unit: str | None, expected: list[int]) -> None:439df = pl.DataFrame(440{441"ts": [442datetime(2024, 1, 7, 1, 2, 3, 123456),443datetime(2020, 12, 30, 10, 30, 45, 987654),444datetime(2006, 1, 1, 23, 59, 59, 555555),445],446}447)448precision = {"ms": 3, "us": 6, "ns": 9}449450with pl.SQLContext(frame_data=df, eager=True) as ctx:451prec = f"({precision[unit]})" if unit else ""452res = ctx.execute(f"SELECT ts::timestamp{prec} FROM frame_data").to_series()453454assert res.dtype == pl.Datetime(time_unit=unit) # type: ignore[arg-type]455assert res.to_physical().to_list() == expected456457458def test_timestamp_time_unit_errors() -> None:459df = pl.DataFrame({"ts": [datetime(2024, 1, 7, 1, 2, 3, 123456)]})460461with pl.SQLContext(frame_data=df, eager=True) as ctx:462for prec in (0, 15):463with pytest.raises(464SQLSyntaxError,465match=rf"invalid temporal type precision \(expected 1-9, found {prec}\)",466):467ctx.execute(f"SELECT ts::timestamp({prec}) FROM frame_data")468469with pytest.raises(470SQLInterfaceError,471match="sql parser error: Expected: literal int, found: - ",472):473ctx.execute("SELECT ts::timestamp(-3) FROM frame_data")474475476