Path: blob/main/py-polars/tests/unit/operations/test_join_asof.py
6939 views
from __future__ import annotations12import warnings3from datetime import date, datetime, timedelta4from typing import TYPE_CHECKING, Any56import numpy as np7import pytest89import polars as pl10from polars.exceptions import DuplicateError, InvalidOperationError11from polars.testing import assert_frame_equal1213if TYPE_CHECKING:14from polars._typing import AsofJoinStrategy, PolarsIntegerType151617def test_asof_join_singular_right_11966() -> None:18df = pl.DataFrame({"id": [1, 2, 3], "time": [0.9, 2.1, 2.8]}).sort("time")19lookup = pl.DataFrame({"time": [2.0], "value": [100]}).sort("time")20joined = df.join_asof(lookup, on="time", strategy="nearest")21expected = pl.DataFrame(22{"id": [1, 2, 3], "time": [0.9, 2.1, 2.8], "value": [100, 100, 100]}23)24assert_frame_equal(joined, expected)252627def test_asof_join_inline_cast_6438() -> None:28df_trades = pl.DataFrame(29{30"time": [31datetime(2020, 1, 1, 9, 1, 0),32datetime(2020, 1, 1, 9, 1, 0),33datetime(2020, 1, 1, 9, 3, 0),34datetime(2020, 1, 1, 9, 6, 0),35],36"stock": ["A", "B", "B", "C"],37"trade": [101, 299, 301, 500],38}39)4041df_quotes = pl.DataFrame(42{43"time": [44datetime(2020, 1, 1, 9, 0, 0),45datetime(2020, 1, 1, 9, 2, 0),46datetime(2020, 1, 1, 9, 3, 0),47datetime(2020, 1, 1, 9, 6, 0),48],49"stock": ["A", "B", "C", "A"],50"quote": [100, 300, 501, 102],51}52).with_columns([pl.col("time").dt.cast_time_unit("ns")])5354assert df_trades.join_asof(55df_quotes, on=pl.col("time").cast(pl.Datetime("ns")).set_sorted(), by="stock"56).to_dict(as_series=False) == {57"time": [58datetime(2020, 1, 1, 9, 1),59datetime(2020, 1, 1, 9, 1),60datetime(2020, 1, 1, 9, 3),61datetime(2020, 1, 1, 9, 6),62],63"time_right": [64datetime(2020, 1, 1, 9, 0),65None,66datetime(2020, 1, 1, 9, 2),67datetime(2020, 1, 1, 9, 3),68],69"stock": ["A", "B", "B", "C"],70"trade": [101, 299, 301, 500],71"quote": [100, None, 300, 501],72}737475def test_asof_join_projection_resolution_4606() -> None:76a = pl.DataFrame({"a": [1], "b": [2], "c": [3]}).lazy()77b = pl.DataFrame({"a": [1], "b": [2], "d": [4]}).lazy()78joined_tbl = a.join_asof(b, on=pl.col("a").set_sorted(), by="b")79assert joined_tbl.group_by("a").agg(80[pl.col("c").sum().alias("c")]81).collect().columns == ["a", "c"]828384def test_asof_join_schema_5211() -> None:85df1 = pl.DataFrame({"today": [1, 2]})8687df2 = pl.DataFrame({"next_friday": [1, 2]})8889assert (90df1.lazy()91.join_asof(92df2.lazy(), left_on="today", right_on="next_friday", strategy="forward"93)94.collect_schema()95) == {"today": pl.Int64, "next_friday": pl.Int64}969798def test_asof_join_schema_5684() -> None:99df_a = (100pl.DataFrame(101{102"id": [1],103"a": [1],104"b": [1],105}106)107.lazy()108.set_sorted("a")109)110111df_b = (112pl.DataFrame(113{114"id": [1, 1, 2],115"b": [-3, -3, 6],116}117)118.lazy()119.set_sorted("b")120)121122q = (123df_a.join_asof(df_b, by="id", left_on="a", right_on="b")124.drop("b")125.join_asof(df_b, by="id", left_on="a", right_on="b")126.drop("b")127)128129projected_result = q.select(pl.all()).collect()130result = q.collect()131132assert_frame_equal(projected_result, result)133assert (134q.collect_schema()135== projected_result.schema136== {"id": pl.Int64, "a": pl.Int64, "b_right": pl.Int64}137)138139140def test_join_asof_mismatched_dtypes() -> None:141# test 'on' dtype mismatch142df1 = pl.DataFrame(143{"a": pl.Series([1, 2, 3], dtype=pl.Int64), "b": ["a", "b", "c"]}144)145df2 = pl.DataFrame(146{"a": pl.Series([1.0, 2.0, 3.0], dtype=pl.Float64), "c": ["d", "e", "f"]}147)148149with pytest.raises(150pl.exceptions.SchemaError, match="datatypes of join keys don't match"151):152df1.join_asof(df2, on="a", strategy="forward")153154# test 'by' dtype mismatch155df1 = pl.DataFrame(156{157"time": pl.date_range(date(2018, 1, 1), date(2018, 1, 8), eager=True),158"group": pl.Series([1, 1, 1, 1, 2, 2, 2, 2], dtype=pl.Int32),159"value": [0, 0, None, None, 2, None, 1, None],160}161)162df2 = pl.DataFrame(163{164"time": pl.date_range(date(2018, 1, 1), date(2018, 1, 8), eager=True),165"group": pl.Series([1, 1, 1, 1, 2, 2, 2, 2], dtype=pl.Int64),166"value": [0, 0, None, None, 2, None, 1, None],167}168)169170with pytest.raises(171pl.exceptions.ComputeError, match="mismatching dtypes in 'by' parameter"172):173df1.join_asof(df2, on="time", by="group", strategy="forward")174175176def test_join_asof_floats() -> None:177df1 = pl.DataFrame({"a": [1.0, 2.0, 3.0], "b": ["lrow1", "lrow2", "lrow3"]})178df2 = pl.DataFrame({"a": [0.59, 1.49, 2.89], "b": ["rrow1", "rrow2", "rrow3"]})179180result = df1.join_asof(df2, on=pl.col("a").set_sorted(), strategy="backward")181expected = {182"a": [1.0, 2.0, 3.0],183"b": ["lrow1", "lrow2", "lrow3"],184"a_right": [0.59, 1.49, 2.89],185"b_right": ["rrow1", "rrow2", "rrow3"],186}187assert result.to_dict(as_series=False) == expected188189# with by argument190# 5740191df1 = pl.DataFrame(192{"b": np.linspace(0, 5, 7), "c": ["x" if i < 4 else "y" for i in range(7)]}193)194df2 = pl.DataFrame(195{196"val": [0.0, 2.5, 2.6, 2.7, 3.4, 4.0, 5.0],197"c": ["x", "x", "x", "y", "y", "y", "y"],198}199).with_columns(pl.col("val").alias("b").set_sorted())200assert df1.set_sorted("b").join_asof(df2, on=pl.col("b"), by="c").to_dict(201as_series=False202) == {203"b": [2040.0,2050.8333333333333334,2061.6666666666666667,2072.5,2083.3333333333333335,2094.166666666666667,2105.0,211],212"c": ["x", "x", "x", "x", "y", "y", "y"],213"val": [0.0, 0.0, 0.0, 2.5, 2.7, 4.0, 5.0],214}215216217def test_join_asof_tolerance() -> None:218df_trades = pl.DataFrame(219{220"time": [221datetime(2020, 1, 1, 9, 0, 1),222datetime(2020, 1, 1, 9, 0, 1),223datetime(2020, 1, 1, 9, 0, 3),224datetime(2020, 1, 1, 9, 0, 6),225],226"stock": ["A", "B", "B", "C"],227"trade": [101, 299, 301, 500],228}229).set_sorted("time")230231df_quotes = pl.DataFrame(232{233"time": [234datetime(2020, 1, 1, 9, 0, 0),235datetime(2020, 1, 1, 9, 0, 2),236datetime(2020, 1, 1, 9, 0, 4),237datetime(2020, 1, 1, 9, 0, 6),238],239"stock": ["A", "B", "C", "A"],240"quote": [100, 300, 501, 102],241}242).set_sorted("time")243244assert df_trades.join_asof(245df_quotes, on="time", by="stock", tolerance="2s"246).to_dict(as_series=False) == {247"time": [248datetime(2020, 1, 1, 9, 0, 1),249datetime(2020, 1, 1, 9, 0, 1),250datetime(2020, 1, 1, 9, 0, 3),251datetime(2020, 1, 1, 9, 0, 6),252],253"stock": ["A", "B", "B", "C"],254"trade": [101, 299, 301, 500],255"quote": [100, None, 300, 501],256}257258assert df_trades.join_asof(259df_quotes, on="time", by="stock", tolerance="1s"260).to_dict(as_series=False) == {261"time": [262datetime(2020, 1, 1, 9, 0, 1),263datetime(2020, 1, 1, 9, 0, 1),264datetime(2020, 1, 1, 9, 0, 3),265datetime(2020, 1, 1, 9, 0, 6),266],267"stock": ["A", "B", "B", "C"],268"trade": [101, 299, 301, 500],269"quote": [100, None, 300, None],270}271272for invalid_tolerance, match in [273("foo", "expected leading integer"),274([None], "could not extract number"),275]:276with pytest.raises(pl.exceptions.PolarsError, match=match):277df_trades.join_asof(278df_quotes,279on="time",280by="stock",281tolerance=invalid_tolerance, # type: ignore[arg-type]282)283284285def test_join_asof_tolerance_forward() -> None:286df_quotes = pl.DataFrame(287{288"time": [289datetime(2020, 1, 1, 9, 0, 0),290datetime(2020, 1, 1, 9, 0, 2),291datetime(2020, 1, 1, 9, 0, 4),292datetime(2020, 1, 1, 9, 0, 6),293datetime(2020, 1, 1, 9, 0, 7),294],295"stock": ["A", "B", "C", "A", "D"],296"quote": [100, 300, 501, 102, 10],297}298).set_sorted("time")299300df_trades = pl.DataFrame(301{302"time": [303datetime(2020, 1, 1, 9, 0, 2),304datetime(2020, 1, 1, 9, 0, 1),305datetime(2020, 1, 1, 9, 0, 3),306datetime(2020, 1, 1, 9, 0, 6),307datetime(2020, 1, 1, 9, 0, 7),308],309"stock": ["A", "B", "B", "C", "D"],310"trade": [101, 299, 301, 500, 10],311}312).set_sorted("time")313314assert df_quotes.join_asof(315df_trades, on="time", by="stock", tolerance="2s", strategy="forward"316).to_dict(as_series=False) == {317"time": [318datetime(2020, 1, 1, 9, 0, 0),319datetime(2020, 1, 1, 9, 0, 2),320datetime(2020, 1, 1, 9, 0, 4),321datetime(2020, 1, 1, 9, 0, 6),322datetime(2020, 1, 1, 9, 0, 7),323],324"stock": ["A", "B", "C", "A", "D"],325"quote": [100, 300, 501, 102, 10],326"trade": [101, 301, 500, None, 10],327}328329assert df_quotes.join_asof(330df_trades, on="time", by="stock", tolerance="1s", strategy="forward"331).to_dict(as_series=False) == {332"time": [333datetime(2020, 1, 1, 9, 0, 0),334datetime(2020, 1, 1, 9, 0, 2),335datetime(2020, 1, 1, 9, 0, 4),336datetime(2020, 1, 1, 9, 0, 6),337datetime(2020, 1, 1, 9, 0, 7),338],339"stock": ["A", "B", "C", "A", "D"],340"quote": [100, 300, 501, 102, 10],341"trade": [None, 301, None, None, 10],342}343344# Sanity check that this gives us equi-join345assert df_quotes.join_asof(346df_trades, on="time", by="stock", tolerance="0s", strategy="forward"347).to_dict(as_series=False) == {348"time": [349datetime(2020, 1, 1, 9, 0, 0),350datetime(2020, 1, 1, 9, 0, 2),351datetime(2020, 1, 1, 9, 0, 4),352datetime(2020, 1, 1, 9, 0, 6),353datetime(2020, 1, 1, 9, 0, 7),354],355"stock": ["A", "B", "C", "A", "D"],356"quote": [100, 300, 501, 102, 10],357"trade": [None, None, None, None, 10],358}359360361def test_join_asof_projection() -> None:362df1 = pl.DataFrame(363{364"df1_date": [20221011, 20221012, 20221013, 20221014, 20221016],365"df1_col1": ["foo", "bar", "foo", "bar", "foo"],366"key": ["a", "b", "b", "a", "b"],367}368).set_sorted("df1_date")369370df2 = pl.DataFrame(371{372"df2_date": [20221012, 20221015, 20221018],373"df2_col1": ["1", "2", "3"],374"key": ["a", "b", "b"],375}376).set_sorted("df2_date")377378assert (379(380df1.lazy().join_asof(df2.lazy(), left_on="df1_date", right_on="df2_date")381).select([pl.col("df2_date"), "df1_date"])382).collect().to_dict(as_series=False) == {383"df2_date": [None, 20221012, 20221012, 20221012, 20221015],384"df1_date": [20221011, 20221012, 20221013, 20221014, 20221016],385}386assert (387df1.lazy().join_asof(388df2.lazy(), by="key", left_on="df1_date", right_on="df2_date"389)390).select(["df2_date", "df1_date"]).collect().to_dict(as_series=False) == {391"df2_date": [None, None, None, 20221012, 20221015],392"df1_date": [20221011, 20221012, 20221013, 20221014, 20221016],393}394395396def test_asof_join_by_logical_types() -> None:397dates = (398pl.datetime_range(399datetime(2022, 1, 1), datetime(2022, 1, 2), interval="2h", eager=True400)401.cast(pl.Datetime("ns"))402.head(9)403)404x = pl.DataFrame({"a": dates, "b": map(float, range(9)), "c": ["1", "2", "3"] * 3})405406result = x.join_asof(x, on=pl.col("b").set_sorted(), by=["c", "a"])407408expected = {409"a": [410datetime(2022, 1, 1, 0, 0),411datetime(2022, 1, 1, 2, 0),412datetime(2022, 1, 1, 4, 0),413datetime(2022, 1, 1, 6, 0),414datetime(2022, 1, 1, 8, 0),415datetime(2022, 1, 1, 10, 0),416datetime(2022, 1, 1, 12, 0),417datetime(2022, 1, 1, 14, 0),418datetime(2022, 1, 1, 16, 0),419],420"b": [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0],421"c": ["1", "2", "3", "1", "2", "3", "1", "2", "3"],422"b_right": [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0],423}424assert result.to_dict(as_series=False) == expected425426427def test_join_asof_projection_7481() -> None:428ldf1 = pl.DataFrame({"a": [1, 2, 2], "b": "bleft"}).lazy().set_sorted("a")429ldf2 = pl.DataFrame({"a": 2, "b": [1, 2, 2]}).lazy().set_sorted("b")430431assert (432ldf1.join_asof(ldf2, left_on="a", right_on="b").select("a", "b")433).collect().to_dict(as_series=False) == {434"a": [1, 2, 2],435"b": ["bleft", "bleft", "bleft"],436}437438439def test_asof_join_sorted_by_group(capsys: Any) -> None:440df1 = pl.DataFrame(441{442"key": ["a", "a", "a", "b", "b", "b"],443"asof_key": [2.0, 1.0, 3.0, 1.0, 2.0, 3.0],444"a": [102, 101, 103, 104, 105, 106],445}446).sort(by=["key", "asof_key"])447448df2 = pl.DataFrame(449{450"key": ["a", "a", "a", "b", "b", "b"],451"asof_key": [0.9, 1.9, 2.9, 0.9, 1.9, 2.9],452"b": [201, 202, 203, 204, 205, 206],453}454).sort(by=["key", "asof_key"])455456expected = pl.DataFrame(457[458pl.Series("key", ["a", "a", "a", "b", "b", "b"], dtype=pl.String),459pl.Series("asof_key", [1.0, 2.0, 3.0, 1.0, 2.0, 3.0], dtype=pl.Float64),460pl.Series("a", [101, 102, 103, 104, 105, 106], dtype=pl.Int64),461pl.Series("b", [201, 202, 203, 204, 205, 206], dtype=pl.Int64),462]463)464465out = df1.join_asof(df2, on="asof_key", by="key")466assert_frame_equal(out, expected)467468_, err = capsys.readouterr()469assert "is not explicitly sorted" not in err470471472def test_asof_join_nearest() -> None:473# Generic join_asof474df1 = pl.DataFrame(475{476"asof_key": [-1, 1, 2, 4, 6],477"a": [1, 2, 3, 4, 5],478}479).sort(by="asof_key")480481df2 = pl.DataFrame(482{483"asof_key": [-1, 2, 4, 5],484"b": [1, 2, 3, 4],485}486).sort(by="asof_key")487488expected = pl.DataFrame(489{"asof_key": [-1, 1, 2, 4, 6], "a": [1, 2, 3, 4, 5], "b": [1, 2, 2, 3, 4]}490)491492out = df1.join_asof(df2, on="asof_key", strategy="nearest")493assert_frame_equal(out, expected)494495# Edge case: last item of right matches multiples on left496df1 = pl.DataFrame(497{498"asof_key": [9, 9, 10, 10, 10],499"a": [1, 2, 3, 4, 5],500}501).set_sorted("asof_key")502df2 = pl.DataFrame(503{504"asof_key": [1, 2, 3, 10],505"b": [1, 2, 3, 4],506}507).set_sorted("asof_key")508expected = pl.DataFrame(509{510"asof_key": [9, 9, 10, 10, 10],511"a": [1, 2, 3, 4, 5],512"b": [4, 4, 4, 4, 4],513}514)515516out = df1.join_asof(df2, on="asof_key", strategy="nearest")517assert_frame_equal(out, expected)518519520def test_asof_join_nearest_with_tolerance() -> None:521a = b = [1, 2, 3, 4, 5]522523nones = pl.Series([None, None, None, None, None], dtype=pl.Int64)524525# Case 1: complete miss526df1 = pl.DataFrame({"asof_key": [1, 2, 3, 4, 5], "a": a}).set_sorted("asof_key")527df2 = pl.DataFrame(528{529"asof_key": [7, 8, 9, 10, 11],530"b": b,531}532).set_sorted("asof_key")533expected = df1.with_columns(nones.alias("b"))534out = df1.join_asof(df2, on="asof_key", strategy="nearest", tolerance=1)535assert_frame_equal(out, expected)536537# Case 2: complete miss in other direction538df1 = pl.DataFrame({"asof_key": [7, 8, 9, 10, 11], "a": a}).set_sorted("asof_key")539df2 = pl.DataFrame(540{541"asof_key": [1, 2, 3, 4, 5],542"b": b,543}544).set_sorted("asof_key")545expected = df1.with_columns(nones.alias("b"))546out = df1.join_asof(df2, on="asof_key", strategy="nearest", tolerance=1)547assert_frame_equal(out, expected)548549# Case 3: match first item550df1 = pl.DataFrame({"asof_key": [1, 2, 3, 4, 5], "a": a}).set_sorted("asof_key")551df2 = pl.DataFrame(552{553"asof_key": [6, 7, 8, 9, 10],554"b": b,555}556).set_sorted("asof_key")557out = df1.join_asof(df2, on="asof_key", strategy="nearest", tolerance=1)558expected = df1.with_columns(pl.Series([None, None, None, None, 1]).alias("b"))559assert_frame_equal(out, expected)560561# Case 4: match last item562df1 = pl.DataFrame({"asof_key": [1, 2, 3, 4, 5], "a": a}).set_sorted("asof_key")563df2 = pl.DataFrame(564{565"asof_key": [-4, -3, -2, -1, 0],566"b": b,567}568).set_sorted("asof_key")569out = df1.join_asof(df2, on="asof_key", strategy="nearest", tolerance=1)570expected = df1.with_columns(pl.Series([5, None, None, None, None]).alias("b"))571assert_frame_equal(out, expected)572573# Case 5: match multiples, pick closer574df1 = pl.DataFrame(575{"asof_key": pl.Series([1, 2, 3, 4, 5], dtype=pl.Float64), "a": a}576).set_sorted("asof_key")577df2 = pl.DataFrame(578{579"asof_key": [0.0, 2.0, 2.4, 3.4, 10.0],580"b": b,581}582).set_sorted("asof_key")583out = df1.join_asof(df2, on="asof_key", strategy="nearest", tolerance=1)584expected = df1.with_columns(pl.Series([2, 2, 4, 4, None]).alias("b"))585assert_frame_equal(out, expected)586587# Case 6: use 0 tolerance588df1 = pl.DataFrame(589{"asof_key": pl.Series([1, 2, 3, 4, 5], dtype=pl.Float64), "a": a}590).set_sorted("asof_key")591df2 = pl.DataFrame(592{593"asof_key": [0.0, 2.0, 2.4, 3.4, 10.0],594"b": b,595}596).set_sorted("asof_key")597out = df1.join_asof(df2, on="asof_key", strategy="nearest", tolerance=0)598expected = df1.with_columns(pl.Series([None, 2, None, None, None]).alias("b"))599assert_frame_equal(out, expected)600601# Case 7: test with datetime602df1 = pl.DataFrame(603{604"asof_key": pl.Series(605[606datetime(2023, 1, 1),607datetime(2023, 1, 2),608datetime(2023, 1, 3),609datetime(2023, 1, 4),610datetime(2023, 1, 6),611]612),613"a": a,614}615).set_sorted("asof_key")616df2 = pl.DataFrame(617{618"asof_key": pl.Series(619[620datetime(2022, 1, 1),621datetime(2022, 1, 2),622datetime(2022, 1, 3),623datetime(6242023, 1, 2, 21, 30, 0625), # should match with 2023-01-02, 2023-01-03, and 2021-01-04626datetime(2023, 1, 7),627]628),629"b": b,630}631).set_sorted("asof_key")632out = df1.join_asof(df2, on="asof_key", strategy="nearest", tolerance="1d4h")633expected = df1.with_columns(pl.Series([None, 4, 4, 4, 5]).alias("b"))634assert_frame_equal(out, expected)635636# Case 8: test using timedelta tolerance637out = df1.join_asof(638df2, on="asof_key", strategy="nearest", tolerance=timedelta(days=1, hours=4)639)640assert_frame_equal(out, expected)641642# Case #9: last item is closest match643df1 = pl.DataFrame(644{645"asof_key_left": [10.00001, 20.0, 30.0],646}647).set_sorted("asof_key_left")648df2 = pl.DataFrame(649{650"asof_key_right": [10.00001, 20.0001, 29.0],651}652).set_sorted("asof_key_right")653out = df1.join_asof(654df2,655left_on="asof_key_left",656right_on="asof_key_right",657strategy="nearest",658tolerance=0.5,659)660expected = pl.DataFrame(661{662"asof_key_left": [10.00001, 20.0, 30.0],663"asof_key_right": [10.00001, 20.0001, None],664}665)666assert_frame_equal(out, expected)667668669def test_asof_join_nearest_by() -> None:670# Generic join_asof671df1 = pl.DataFrame(672{673"asof_key": [-1, 1, 2, 6, 1],674"group": [1, 1, 1, 2, 2],675"a": [1, 2, 3, 2, 5],676}677).sort(by=["group", "asof_key"])678679df2 = pl.DataFrame(680{681"asof_key": [-1, 2, 5, 1],682"group": [1, 1, 2, 2],683"b": [1, 2, 3, 4],684}685).sort(by=["group", "asof_key"])686687expected = pl.DataFrame(688{689"asof_key": [-1, 1, 2, 6, 1],690"group": [1, 1, 1, 2, 2],691"a": [1, 2, 3, 5, 2],692"b": [1, 2, 2, 4, 3],693}694).sort(by=["group", "asof_key"])695696# Edge case: last item of right matches multiples on left697df1 = pl.DataFrame(698{699"asof_key": [9, 9, 10, 10, 10],700"group": [1, 1, 1, 2, 2],701"a": [1, 2, 3, 2, 5],702}703).sort(by=["group", "asof_key"])704705df2 = pl.DataFrame(706{707"asof_key": [-1, 1, 1, 10],708"group": [1, 1, 2, 2],709"b": [1, 2, 3, 4],710}711).sort(by=["group", "asof_key"])712713expected = pl.DataFrame(714{715"asof_key": [9, 9, 10, 10, 10],716"group": [1, 1, 1, 2, 2],717"a": [1, 2, 3, 2, 5],718"b": [2, 2, 2, 4, 4],719}720)721722out = df1.join_asof(df2, on="asof_key", by="group", strategy="nearest")723assert_frame_equal(out, expected)724725a = pl.DataFrame(726{727"code": [676, 35, 676, 676, 676],728"time": [364360, 364370, 364380, 365400, 367440],729}730)731b = pl.DataFrame(732{733"code": [676, 676, 35, 676, 676],734"time": [364000, 365000, 365000, 366000, 367000],735"price": [1.0, 2.0, 50, 3.0, None],736}737)738739expected = pl.DataFrame(740{741"code": [676, 35, 676, 676, 676],742"time": [364360, 364370, 364380, 365400, 367440],743"price": [1.0, 50.0, 1.0, 2.0, None],744}745)746747out = a.join_asof(b, by="code", on="time", strategy="nearest")748assert_frame_equal(out, expected)749750# last item is closest match751df1 = pl.DataFrame(752{753"a": [1, 1, 1],754"asof_key_left": [10.00001, 20.0, 30.0],755}756).set_sorted("asof_key_left")757df2 = pl.DataFrame(758{759"a": [1, 1, 1],760"asof_key_right": [10.00001, 20.0001, 29.0],761}762).set_sorted("asof_key_right")763out = df1.join_asof(764df2,765left_on="asof_key_left",766right_on="asof_key_right",767by="a",768strategy="nearest",769)770expected = pl.DataFrame(771{772"a": [1, 1, 1],773"asof_key_left": [10.00001, 20.0, 30.0],774"asof_key_right": [10.00001, 20.0001, 29.0],775}776)777assert_frame_equal(out, expected)778779780def test_asof_join_nearest_by_with_tolerance() -> None:781df1 = pl.DataFrame(782{783"group": [7841,7851,7861,7871,7881,7892,7902,7912,7922,7932,7943,7953,7963,7973,7983,7994,8004,8014,8024,8034,8045,8055,8065,8075,8085,8096,8106,8116,8126,8136,814],815"asof_key": pl.Series(816[8171,8182,8193,8204,8215,8227,8238,8249,82510,82611,8271,8282,8293,8304,8315,8321,8332,8343,8354,8365,8371,8382,8393,8404,8415,8421,8432,8443,8454,8465,847],848dtype=pl.Float32,849),850"a": [8511,8522,8533,8544,8555,8561,8572,8583,8594,8605,8611,8622,8633,8644,8655,8661,8672,8683,8694,8705,8711,8722,8733,8744,8755,8761,8772,8783,8794,8805,881],882}883)884885df2 = pl.DataFrame(886{887"group": [8881,8891,8901,8911,8921,8932,8942,8952,8962,8972,8983,8993,9003,9013,9023,9034,9044,9054,9064,9074,9085,9095,9105,9115,9125,9136,9146,9156,9166,9176,918],919"asof_key": pl.Series(920[9217,9228,9239,92410,92511,9261,9272,9283,9294,9305,9316,9327,9338,9349,93510,9365,937-3,938-2,939-1,9400,9410,9422,9432.4,9443.4,94510,946-3,9473,9488,9499,95010,951],952dtype=pl.Float32,953),954"b": [9551,9562,9573,9584,9595,9601,9612,9623,9634,9645,9651,9662,9673,9684,9695,9701,9712,9723,9734,9745,9751,9762,9773,9784,9795,9801,9812,9823,9834,9845,985],986}987)988989expected = df1.with_columns(990pl.Series(991[992None,993None,994None,995None,996None,997None,998None,999None,1000None,1001None,1002None,1003None,1004None,1005None,10061,10075,1008None,1009None,10101,10111,10122,10132,10144,10154,1016None,1017None,10182,10192,10202,1021None,1022]1023).alias("b")1024)1025df1 = df1.sort(by=["group", "asof_key"])1026df2 = df2.sort(by=["group", "asof_key"])1027expected = expected.sort(by=["group", "a"])10281029out = df1.join_asof(1030df2, by="group", on="asof_key", strategy="nearest", tolerance=1.01031).sort(by=["group", "a"])1032assert_frame_equal(out, expected)10331034# last item is closest match1035df1 = pl.DataFrame(1036{1037"a": [1, 1, 1],1038"asof_key_left": [10.00001, 20.0, 30.0],1039}1040).set_sorted("asof_key_left")1041df2 = pl.DataFrame(1042{1043"a": [1, 1, 1],1044"asof_key_right": [10.00001, 20.0001, 29.0],1045}1046).set_sorted("asof_key_right")1047out = df1.join_asof(1048df2,1049left_on="asof_key_left",1050right_on="asof_key_right",1051by="a",1052strategy="nearest",1053tolerance=0.5,1054)1055expected = pl.DataFrame(1056{1057"a": [1, 1, 1],1058"asof_key_left": [10.00001, 20.0, 30.0],1059"asof_key_right": [10.00001, 20.0001, None],1060}1061)1062assert_frame_equal(out, expected)106310641065def test_asof_join_nearest_by_date() -> None:1066df1 = pl.DataFrame(1067{1068"asof_key": [1069date(2019, 12, 30),1070date(2020, 1, 1),1071date(2020, 1, 2),1072date(2020, 1, 6),1073date(2020, 1, 1),1074],1075"group": [1, 1, 1, 2, 2],1076"a": [1, 2, 3, 2, 5],1077}1078).sort(by=["group", "asof_key"])10791080df2 = pl.DataFrame(1081{1082"asof_key": [1083date(2020, 1, 1),1084date(2020, 1, 2),1085date(2020, 1, 5),1086date(2020, 1, 1),1087],1088"group": [1, 1, 2, 2],1089"b": [1, 2, 3, 4],1090}1091).sort(by=["group", "asof_key"])10921093expected = pl.DataFrame(1094{1095"asof_key": [1096date(2019, 12, 30),1097date(2020, 1, 1),1098date(2020, 1, 2),1099date(2020, 1, 6),1100date(2020, 1, 1),1101],1102"group": [1, 1, 1, 2, 2],1103"a": [1, 2, 3, 2, 5],1104"b": [1, 1, 2, 3, 4],1105}1106).sort(by=["group", "asof_key"])11071108out = df1.join_asof(df2, on="asof_key", by="group", strategy="nearest")1109assert_frame_equal(out, expected)111011111112@pytest.mark.may_fail_auto_streaming # See #18927.1113def test_asof_join_string() -> None:1114left = pl.DataFrame({"x": [None, "a", "b", "c", None, "d", None]}).set_sorted("x")1115right = pl.DataFrame({"x": ["apple", None, "chutney"], "y": [0, 1, 2]}).set_sorted(1116"x"1117)1118forward = left.join_asof(right, on="x", strategy="forward")1119backward = left.join_asof(right, on="x", strategy="backward")1120forward_expected = pl.DataFrame(1121{1122"x": [None, "a", "b", "c", None, "d", None],1123"y": [None, 0, 2, 2, None, None, None],1124}1125)1126backward_expected = pl.DataFrame(1127{1128"x": [None, "a", "b", "c", None, "d", None],1129"y": [None, None, 0, 0, None, 2, None],1130}1131)1132assert_frame_equal(forward, forward_expected)1133assert_frame_equal(backward, backward_expected)113411351136def test_join_asof_by_argument_parsing() -> None:1137df1 = pl.DataFrame(1138{1139"n": [10, 20, 30, 40, 50, 60],1140"id1": [0, 0, 3, 3, 5, 5],1141"id2": [1, 2, 1, 2, 1, 2],1142"x": ["a", "b", "c", "d", "e", "f"],1143}1144).sort(by="n")11451146df2 = pl.DataFrame(1147{1148"n": [25, 8, 5, 23, 15, 35],1149"id1": [0, 0, 3, 3, 5, 5],1150"id2": [1, 2, 1, 2, 1, 2],1151"y": ["A", "B", "C", "D", "E", "F"],1152}1153).sort(by="n")11541155# any sequency for by argument is allowed, so we should see the same results here1156by_list = df1.join_asof(df2, on="n", by=["id1", "id2"])1157by_tuple = df1.join_asof(df2, on="n", by=("id1", "id2"))1158assert_frame_equal(by_list, by_tuple)11591160# same for using the by_left and by_right kwargs1161by_list2 = df1.join_asof(1162df2, on="n", by_left=["id1", "id2"], by_right=["id1", "id2"]1163)1164by_tuple2 = df1.join_asof(1165df2, on="n", by_left=("id1", "id2"), by_right=("id1", "id2")1166)1167assert_frame_equal(by_list2, by_list)1168assert_frame_equal(by_tuple2, by_list)116911701171def test_join_asof_invalid_args() -> None:1172df1 = pl.DataFrame(1173{1174"a": [1, 2, 3],1175"b": [1, 2, 3],1176}1177).set_sorted("a")1178df2 = pl.DataFrame(1179{1180"a": [1, 2, 3],1181"c": [1, 2, 3],1182}1183).set_sorted("a")11841185with pytest.raises(TypeError, match="expected `on` to be str or Expr, got 'list'"):1186df1.join_asof(df2, on=["a"]) # type: ignore[arg-type]1187with pytest.raises(1188TypeError, match="expected `left_on` to be str or Expr, got 'list'"1189):1190df1.join_asof(df2, left_on=["a"], right_on="a") # type: ignore[arg-type]1191with pytest.raises(1192TypeError, match="expected `right_on` to be str or Expr, got 'list'"1193):1194df1.join_asof(df2, left_on="a", right_on=["a"]) # type: ignore[arg-type]119511961197def test_join_as_of_by_schema() -> None:1198a = pl.DataFrame({"a": [1], "b": [2], "c": [3]}).lazy()1199b = pl.DataFrame({"a": [1], "b": [2], "d": [4]}).lazy()1200q = a.join_asof(b, on=pl.col("a").set_sorted(), by="b")1201assert q.collect_schema().names() == q.collect().columns120212031204def test_asof_join_by_schema() -> None:1205# different `by` names.1206df1 = pl.DataFrame({"on1": 0, "by1": 0})1207df2 = pl.DataFrame({"on1": 0, "by2": 0})12081209q = df1.lazy().join_asof(1210df2.lazy(),1211on="on1",1212by_left="by1",1213by_right="by2",1214)12151216assert q.collect_schema() == q.collect().schema121712181219def test_raise_invalid_by_arg_13020() -> None:1220df1 = pl.DataFrame({"asOfDate": [date(2020, 1, 1)]})1221df2 = pl.DataFrame(1222{1223"endityId": [date(2020, 1, 1)],1224"eventDate": ["A"],1225}1226)1227with pytest.raises(pl.exceptions.InvalidOperationError, match="expected both"):1228df1.sort("asOfDate").join_asof(1229df2.sort("eventDate"),1230left_on="asOfDate",1231right_on="eventDate",1232by_left=None,1233by_right=["entityId"],1234)123512361237def test_join_asof_no_exact_matches() -> None:1238trades = pl.DataFrame(1239{1240"time": [1241"2016-05-25 13:30:00.023",1242"2016-05-25 13:30:00.038",1243"2016-05-25 13:30:00.048",1244"2016-05-25 13:30:00.048",1245"2016-05-25 13:30:00.048",1246],1247"ticker": ["MSFT", "MSFT", "GOOG", "GOOG", "AAPL"],1248"price": [51.95, 51.95, 720.77, 720.92, 98.0],1249"quantity": [75, 155, 100, 100, 100],1250}1251).with_columns(pl.col("time").str.to_datetime())12521253quotes = pl.DataFrame(1254{1255"time": [1256"2016-05-25 13:30:00.023",1257"2016-05-25 13:30:00.023",1258"2016-05-25 13:30:00.030",1259"2016-05-25 13:30:00.041",1260"2016-05-25 13:30:00.048",1261"2016-05-25 13:30:00.049",1262"2016-05-25 13:30:00.072",1263"2016-05-25 13:30:00.075",1264],1265"ticker": ["GOOG", "MSFT", "MSFT", "MSFT", "GOOG", "AAPL", "GOOG", "MSFT"],1266"bid": [720.50, 51.95, 51.97, 51.99, 720.50, 97.99, 720.50, 52.01],1267"ask": [720.93, 51.96, 51.98, 52.00, 720.93, 98.01, 720.88, 52.03],1268}1269).with_columns(pl.col("time").str.to_datetime())12701271assert trades.join_asof(1272quotes, on="time", by="ticker", tolerance="10ms", allow_exact_matches=False1273).to_dict(as_series=False) == {1274"time": [1275datetime(2016, 5, 25, 13, 30, 0, 23000),1276datetime(2016, 5, 25, 13, 30, 0, 38000),1277datetime(2016, 5, 25, 13, 30, 0, 48000),1278datetime(2016, 5, 25, 13, 30, 0, 48000),1279datetime(2016, 5, 25, 13, 30, 0, 48000),1280],1281"ticker": ["MSFT", "MSFT", "GOOG", "GOOG", "AAPL"],1282"price": [51.95, 51.95, 720.77, 720.92, 98.0],1283"quantity": [75, 155, 100, 100, 100],1284"bid": [None, 51.97, None, None, None],1285"ask": [None, 51.98, None, None, None],1286}128712881289def test_join_asof_not_sorted() -> None:1290df = pl.DataFrame({"a": [1, 1, 1, 2, 2, 2], "b": [2, 1, 3, 1, 2, 3]})1291with pytest.raises(InvalidOperationError, match="is not sorted"):1292df.join_asof(df, on="b")12931294# When 'by' is provided, we do not check sortedness, but a warning is received1295with pytest.warns(1296UserWarning,1297match="Sortedness of columns cannot be checked when 'by' groups provided",1298):1299df.join_asof(df, on="b", by="a")13001301# When sortedness is False, we should get no warning1302with warnings.catch_warnings(record=True) as w:1303df.join_asof(df, on="b", check_sortedness=False)1304df.join_asof(df, on="b", by="a", check_sortedness=False)1305assert len(w) == 0 # no warnings caught130613071308@pytest.mark.parametrize("left_dtype", [pl.Int64, pl.UInt64, pl.Int128])1309@pytest.mark.parametrize("right_dtype", [pl.Int64, pl.UInt64, pl.Int128])1310@pytest.mark.parametrize("strategy", ["backward", "forward", "nearest"])1311def test_join_asof_large_int_21276(1312left_dtype: PolarsIntegerType,1313right_dtype: PolarsIntegerType,1314strategy: AsofJoinStrategy,1315) -> None:1316large_int64 = 1608129000134000123 # it only happen when "on" column is large1317left = pl.DataFrame({"ts": pl.Series([large_int64 + 2], dtype=left_dtype)})1318right = pl.DataFrame(1319{1320"ts": pl.Series([large_int64 + 1, large_int64 + 3], dtype=right_dtype),1321"value": [111, 333],1322}1323)1324result = left.join_asof(right, on="ts", strategy=strategy)1325idx = 0 if strategy == "backward" else 11326expected = pl.DataFrame(1327{1328"ts": left["ts"],1329"value": right["value"].gather(idx),1330}1331)1332assert_frame_equal(result, expected)133313341335@pytest.mark.parametrize("by", ["constant", None])1336def test_join_asof_slice_23583(by: str | None) -> None:1337lhs = pl.LazyFrame(1338{1339"index": [0],1340"constant": 0,1341"date": [date(2025, 1, 1)],1342},1343).set_sorted("date")13441345rhs = pl.LazyFrame(1346{1347"index": [0, 1],1348"constant": 0,1349"date": [date(1970, 1, 1), date(2025, 1, 1)],1350},1351).set_sorted("date")13521353q = (1354lhs.join_asof(rhs, on="date", by=by, check_sortedness=False)1355.head(1)1356.select(pl.exclude("constant_right"))1357)13581359expect = pl.DataFrame(1360{1361"index": [0],1362"constant": 0,1363"date": [date(2025, 1, 1)],1364"index_right": [1],1365},1366)13671368assert_frame_equal(q.collect(optimizations=pl.QueryOptFlags.none()), expect)1369assert_frame_equal(q.collect(), expect)137013711372def test_join_asof_23751() -> None:1373a = pl.DataFrame(1374[1375pl.Series([1, 2, 3, 4, 5]).alias("index") * int(1e10),1376pl.Series([1, -1, 1, 1, -1]).alias("side"),1377]1378)13791380b = pl.DataFrame(1381[1382pl.Series([0, 1, 1, 3, 3, 5]).alias("index_right").cast(pl.UInt64)1383* int(1e10),1384pl.Series([-1, 1, -1, 1, 1, -1]).alias("side"),1385pl.Series([0, 10, 20, 30, 40, 50]).alias("value"),1386]1387)13881389assert a.join_asof(b, left_on="index", right_on="index_right", by="side").to_dict(1390as_series=False1391) == {1392"index": [10000000000, 20000000000, 30000000000, 40000000000, 50000000000],1393"side": [1, -1, 1, 1, -1],1394"index_right": [139510000000000,139610000000000,139730000000000,139830000000000,139950000000000,1400],1401"value": [10, 20, 40, 40, 50],1402}140314041405def test_join_asof_nosuffix_dup_col_23834() -> None:1406a = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})1407b = pl.DataFrame({"b": [1, 2, 3], "c": [9, 10, 11]})1408with pytest.raises(DuplicateError):1409a.join_asof(b, left_on="a", right_on="b", suffix="")141014111412def test_join_asof_planner_schema_24000() -> None:1413a = pl.DataFrame([pl.Series("index", [1, 2, 3]) * 10])1414b = pl.DataFrame(1415[1416pl.Series("value", [10, 20, 30]),1417pl.Series("index_right", [1, 2, 3]).cast(pl.UInt64) * 10,1418]1419)1420q = a.lazy().join_asof(b.lazy(), left_on="index", right_on="index_right")14211422assert q.collect().schema == q.collect_schema()14231424b = pl.DataFrame(1425[1426pl.Series("index_right", [1, 2, 3]).cast(pl.UInt64) * 10,1427pl.Series("value", [10, 20, 30]),1428]1429)1430q = a.lazy().join_asof(b.lazy(), left_on="index", right_on="index_right")14311432assert q.collect().schema == q.collect_schema()143314341435