Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/py-polars/tests/unit/operations/test_join_asof.py
8424 views
1
from __future__ import annotations
2
3
import itertools
4
import math
5
import random
6
import warnings
7
from datetime import date, datetime, timedelta
8
from typing import TYPE_CHECKING, Any
9
10
import numpy as np
11
import pytest
12
from hypothesis import given
13
14
import polars as pl
15
from polars.exceptions import DuplicateError, InvalidOperationError
16
from polars.testing import assert_frame_equal
17
from polars.testing.parametric.strategies.core import dataframes
18
19
if TYPE_CHECKING:
20
from polars._typing import AsofJoinStrategy, PolarsIntegerType
21
22
23
def test_asof_join_singular_right_11966() -> None:
24
df = pl.DataFrame({"id": [1, 2, 3], "time": [0.9, 2.1, 2.8]}).sort("time")
25
lookup = pl.DataFrame({"time": [2.0], "value": [100]}).sort("time")
26
joined = df.join_asof(lookup, on="time", strategy="nearest")
27
expected = pl.DataFrame(
28
{"id": [1, 2, 3], "time": [0.9, 2.1, 2.8], "value": [100, 100, 100]}
29
)
30
assert_frame_equal(joined, expected)
31
32
33
def test_asof_join_inline_cast_6438() -> None:
34
df_trades = pl.DataFrame(
35
{
36
"time": [
37
datetime(2020, 1, 1, 9, 1, 0),
38
datetime(2020, 1, 1, 9, 1, 0),
39
datetime(2020, 1, 1, 9, 3, 0),
40
datetime(2020, 1, 1, 9, 6, 0),
41
],
42
"stock": ["A", "B", "B", "C"],
43
"trade": [101, 299, 301, 500],
44
}
45
)
46
47
df_quotes = pl.DataFrame(
48
{
49
"time": [
50
datetime(2020, 1, 1, 9, 0, 0),
51
datetime(2020, 1, 1, 9, 2, 0),
52
datetime(2020, 1, 1, 9, 3, 0),
53
datetime(2020, 1, 1, 9, 6, 0),
54
],
55
"stock": ["A", "B", "C", "A"],
56
"quote": [100, 300, 501, 102],
57
}
58
).with_columns([pl.col("time").dt.cast_time_unit("ns")])
59
60
assert df_trades.join_asof(
61
df_quotes, on=pl.col("time").cast(pl.Datetime("ns")).set_sorted(), by="stock"
62
).to_dict(as_series=False) == {
63
"time": [
64
datetime(2020, 1, 1, 9, 1),
65
datetime(2020, 1, 1, 9, 1),
66
datetime(2020, 1, 1, 9, 3),
67
datetime(2020, 1, 1, 9, 6),
68
],
69
"time_right": [
70
datetime(2020, 1, 1, 9, 0),
71
None,
72
datetime(2020, 1, 1, 9, 2),
73
datetime(2020, 1, 1, 9, 3),
74
],
75
"stock": ["A", "B", "B", "C"],
76
"trade": [101, 299, 301, 500],
77
"quote": [100, None, 300, 501],
78
}
79
80
81
def test_asof_join_projection_resolution_4606() -> None:
82
a = pl.DataFrame({"a": [1], "b": [2], "c": [3]}).lazy()
83
b = pl.DataFrame({"a": [1], "b": [2], "d": [4]}).lazy()
84
joined_tbl = a.join_asof(b, on=pl.col("a").set_sorted(), by="b")
85
assert joined_tbl.group_by("a").agg(
86
[pl.col("c").sum().alias("c")]
87
).collect().columns == ["a", "c"]
88
89
90
def test_asof_join_schema_5211() -> None:
91
df1 = pl.DataFrame({"today": [1, 2]})
92
93
df2 = pl.DataFrame({"next_friday": [1, 2]})
94
95
assert (
96
df1.lazy()
97
.join_asof(
98
df2.lazy(), left_on="today", right_on="next_friday", strategy="forward"
99
)
100
.collect_schema()
101
) == {"today": pl.Int64, "next_friday": pl.Int64}
102
103
104
def test_asof_join_schema_5684() -> None:
105
df_a = (
106
pl.DataFrame(
107
{
108
"id": [1],
109
"a": [1],
110
"b": [1],
111
}
112
)
113
.lazy()
114
.set_sorted("a")
115
)
116
117
df_b = (
118
pl.DataFrame(
119
{
120
"id": [1, 1, 2],
121
"b": [-3, -3, 6],
122
}
123
)
124
.lazy()
125
.set_sorted("b")
126
)
127
128
q = (
129
df_a.join_asof(df_b, by="id", left_on="a", right_on="b")
130
.drop("b")
131
.join_asof(df_b, by="id", left_on="a", right_on="b")
132
.drop("b")
133
)
134
135
projected_result = q.select(pl.all()).collect()
136
result = q.collect()
137
138
assert_frame_equal(projected_result, result)
139
assert (
140
q.collect_schema()
141
== projected_result.schema
142
== {"id": pl.Int64, "a": pl.Int64, "b_right": pl.Int64}
143
)
144
145
146
def test_join_asof_mismatched_dtypes() -> None:
147
# test 'on' dtype mismatch
148
df1 = pl.DataFrame(
149
{"a": pl.Series([1, 2, 3], dtype=pl.Int64), "b": ["a", "b", "c"]}
150
)
151
df2 = pl.DataFrame(
152
{"a": pl.Series([1.0, 2.0, 3.0], dtype=pl.Float64), "c": ["d", "e", "f"]}
153
)
154
155
with pytest.raises(
156
pl.exceptions.SchemaError, match="datatypes of join keys don't match"
157
):
158
df1.join_asof(df2, on="a", strategy="forward")
159
160
# test 'by' dtype mismatch
161
df1 = pl.DataFrame(
162
{
163
"time": pl.date_range(date(2018, 1, 1), date(2018, 1, 8), eager=True),
164
"group": pl.Series([1, 1, 1, 1, 2, 2, 2, 2], dtype=pl.Int32),
165
"value": [0, 0, None, None, 2, None, 1, None],
166
}
167
)
168
df2 = pl.DataFrame(
169
{
170
"time": pl.date_range(date(2018, 1, 1), date(2018, 1, 8), eager=True),
171
"group": pl.Series([1, 1, 1, 1, 2, 2, 2, 2], dtype=pl.Int64),
172
"value": [0, 0, None, None, 2, None, 1, None],
173
}
174
)
175
176
with pytest.raises(
177
pl.exceptions.ComputeError, match="mismatching dtypes in 'by' parameter"
178
):
179
df1.join_asof(df2, on="time", by="group", strategy="forward")
180
181
182
def test_join_asof_floats() -> None:
183
df1 = pl.DataFrame({"a": [1.0, 2.0, 3.0], "b": ["lrow1", "lrow2", "lrow3"]})
184
df2 = pl.DataFrame({"a": [0.59, 1.49, 2.89], "b": ["rrow1", "rrow2", "rrow3"]})
185
186
result = df1.join_asof(df2, on=pl.col("a").set_sorted(), strategy="backward")
187
expected = {
188
"a": [1.0, 2.0, 3.0],
189
"b": ["lrow1", "lrow2", "lrow3"],
190
"a_right": [0.59, 1.49, 2.89],
191
"b_right": ["rrow1", "rrow2", "rrow3"],
192
}
193
assert result.to_dict(as_series=False) == expected
194
195
# with by argument
196
# 5740
197
df1 = pl.DataFrame(
198
{"b": np.linspace(0, 5, 7), "c": ["x" if i < 4 else "y" for i in range(7)]}
199
)
200
df2 = pl.DataFrame(
201
{
202
"val": [0.0, 2.5, 2.6, 2.7, 3.4, 4.0, 5.0],
203
"c": ["x", "x", "x", "y", "y", "y", "y"],
204
}
205
).with_columns(pl.col("val").alias("b").set_sorted())
206
assert df1.set_sorted("b").join_asof(df2, on=pl.col("b"), by="c").to_dict(
207
as_series=False
208
) == {
209
"b": [
210
0.0,
211
0.8333333333333334,
212
1.6666666666666667,
213
2.5,
214
3.3333333333333335,
215
4.166666666666667,
216
5.0,
217
],
218
"c": ["x", "x", "x", "x", "y", "y", "y"],
219
"val": [0.0, 0.0, 0.0, 2.5, 2.7, 4.0, 5.0],
220
}
221
222
223
def test_join_asof_tolerance() -> None:
224
df_trades = pl.DataFrame(
225
{
226
"time": [
227
datetime(2020, 1, 1, 9, 0, 1),
228
datetime(2020, 1, 1, 9, 0, 1),
229
datetime(2020, 1, 1, 9, 0, 3),
230
datetime(2020, 1, 1, 9, 0, 6),
231
],
232
"stock": ["A", "B", "B", "C"],
233
"trade": [101, 299, 301, 500],
234
}
235
).set_sorted("time")
236
237
df_quotes = pl.DataFrame(
238
{
239
"time": [
240
datetime(2020, 1, 1, 9, 0, 0),
241
datetime(2020, 1, 1, 9, 0, 2),
242
datetime(2020, 1, 1, 9, 0, 4),
243
datetime(2020, 1, 1, 9, 0, 6),
244
],
245
"stock": ["A", "B", "C", "A"],
246
"quote": [100, 300, 501, 102],
247
}
248
).set_sorted("time")
249
250
assert df_trades.join_asof(
251
df_quotes, on="time", by="stock", tolerance="2s"
252
).to_dict(as_series=False) == {
253
"time": [
254
datetime(2020, 1, 1, 9, 0, 1),
255
datetime(2020, 1, 1, 9, 0, 1),
256
datetime(2020, 1, 1, 9, 0, 3),
257
datetime(2020, 1, 1, 9, 0, 6),
258
],
259
"stock": ["A", "B", "B", "C"],
260
"trade": [101, 299, 301, 500],
261
"quote": [100, None, 300, 501],
262
}
263
264
assert df_trades.join_asof(
265
df_quotes, on="time", by="stock", tolerance="1s"
266
).to_dict(as_series=False) == {
267
"time": [
268
datetime(2020, 1, 1, 9, 0, 1),
269
datetime(2020, 1, 1, 9, 0, 1),
270
datetime(2020, 1, 1, 9, 0, 3),
271
datetime(2020, 1, 1, 9, 0, 6),
272
],
273
"stock": ["A", "B", "B", "C"],
274
"trade": [101, 299, 301, 500],
275
"quote": [100, None, 300, None],
276
}
277
278
for invalid_tolerance, match in [
279
("foo", "expected leading integer"),
280
([None], "could not extract number"),
281
]:
282
with pytest.raises(pl.exceptions.PolarsError, match=match):
283
df_trades.join_asof(
284
df_quotes,
285
on="time",
286
by="stock",
287
tolerance=invalid_tolerance, # type: ignore[arg-type]
288
)
289
290
291
def test_join_asof_tolerance_forward() -> None:
292
df_quotes = pl.DataFrame(
293
{
294
"time": [
295
datetime(2020, 1, 1, 9, 0, 0),
296
datetime(2020, 1, 1, 9, 0, 2),
297
datetime(2020, 1, 1, 9, 0, 4),
298
datetime(2020, 1, 1, 9, 0, 6),
299
datetime(2020, 1, 1, 9, 0, 7),
300
],
301
"stock": ["A", "B", "C", "A", "D"],
302
"quote": [100, 300, 501, 102, 10],
303
}
304
).set_sorted("time")
305
306
df_trades = pl.DataFrame(
307
{
308
"time": [
309
datetime(2020, 1, 1, 9, 0, 2),
310
datetime(2020, 1, 1, 9, 0, 1),
311
datetime(2020, 1, 1, 9, 0, 3),
312
datetime(2020, 1, 1, 9, 0, 6),
313
datetime(2020, 1, 1, 9, 0, 7),
314
],
315
"stock": ["A", "B", "B", "C", "D"],
316
"trade": [101, 299, 301, 500, 10],
317
}
318
).set_sorted("time")
319
320
assert df_quotes.join_asof(
321
df_trades, on="time", by="stock", tolerance="2s", strategy="forward"
322
).to_dict(as_series=False) == {
323
"time": [
324
datetime(2020, 1, 1, 9, 0, 0),
325
datetime(2020, 1, 1, 9, 0, 2),
326
datetime(2020, 1, 1, 9, 0, 4),
327
datetime(2020, 1, 1, 9, 0, 6),
328
datetime(2020, 1, 1, 9, 0, 7),
329
],
330
"stock": ["A", "B", "C", "A", "D"],
331
"quote": [100, 300, 501, 102, 10],
332
"trade": [101, 301, 500, None, 10],
333
}
334
335
assert df_quotes.join_asof(
336
df_trades, on="time", by="stock", tolerance="1s", strategy="forward"
337
).to_dict(as_series=False) == {
338
"time": [
339
datetime(2020, 1, 1, 9, 0, 0),
340
datetime(2020, 1, 1, 9, 0, 2),
341
datetime(2020, 1, 1, 9, 0, 4),
342
datetime(2020, 1, 1, 9, 0, 6),
343
datetime(2020, 1, 1, 9, 0, 7),
344
],
345
"stock": ["A", "B", "C", "A", "D"],
346
"quote": [100, 300, 501, 102, 10],
347
"trade": [None, 301, None, None, 10],
348
}
349
350
# Sanity check that this gives us equi-join
351
assert df_quotes.join_asof(
352
df_trades, on="time", by="stock", tolerance="0s", strategy="forward"
353
).to_dict(as_series=False) == {
354
"time": [
355
datetime(2020, 1, 1, 9, 0, 0),
356
datetime(2020, 1, 1, 9, 0, 2),
357
datetime(2020, 1, 1, 9, 0, 4),
358
datetime(2020, 1, 1, 9, 0, 6),
359
datetime(2020, 1, 1, 9, 0, 7),
360
],
361
"stock": ["A", "B", "C", "A", "D"],
362
"quote": [100, 300, 501, 102, 10],
363
"trade": [None, None, None, None, 10],
364
}
365
366
367
def test_join_asof_projection() -> None:
368
df1 = pl.DataFrame(
369
{
370
"df1_date": [20221011, 20221012, 20221013, 20221014, 20221016],
371
"df1_col1": ["foo", "bar", "foo", "bar", "foo"],
372
"key": ["a", "b", "b", "a", "b"],
373
}
374
).set_sorted("df1_date")
375
376
df2 = pl.DataFrame(
377
{
378
"df2_date": [20221012, 20221015, 20221018],
379
"df2_col1": ["1", "2", "3"],
380
"key": ["a", "b", "b"],
381
}
382
).set_sorted("df2_date")
383
384
assert (
385
(
386
df1.lazy().join_asof(df2.lazy(), left_on="df1_date", right_on="df2_date")
387
).select([pl.col("df2_date"), "df1_date"])
388
).collect().to_dict(as_series=False) == {
389
"df2_date": [None, 20221012, 20221012, 20221012, 20221015],
390
"df1_date": [20221011, 20221012, 20221013, 20221014, 20221016],
391
}
392
assert (
393
df1.lazy().join_asof(
394
df2.lazy(), by="key", left_on="df1_date", right_on="df2_date"
395
)
396
).select(["df2_date", "df1_date"]).collect().to_dict(as_series=False) == {
397
"df2_date": [None, None, None, 20221012, 20221015],
398
"df1_date": [20221011, 20221012, 20221013, 20221014, 20221016],
399
}
400
401
402
def test_asof_join_by_logical_types() -> None:
403
dates = (
404
pl.datetime_range(
405
datetime(2022, 1, 1), datetime(2022, 1, 2), interval="2h", eager=True
406
)
407
.cast(pl.Datetime("ns"))
408
.head(9)
409
)
410
x = pl.DataFrame({"a": dates, "b": map(float, range(9)), "c": ["1", "2", "3"] * 3})
411
412
result = x.join_asof(x, on=pl.col("b").set_sorted(), by=["c", "a"])
413
414
expected = {
415
"a": [
416
datetime(2022, 1, 1, 0, 0),
417
datetime(2022, 1, 1, 2, 0),
418
datetime(2022, 1, 1, 4, 0),
419
datetime(2022, 1, 1, 6, 0),
420
datetime(2022, 1, 1, 8, 0),
421
datetime(2022, 1, 1, 10, 0),
422
datetime(2022, 1, 1, 12, 0),
423
datetime(2022, 1, 1, 14, 0),
424
datetime(2022, 1, 1, 16, 0),
425
],
426
"b": [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0],
427
"c": ["1", "2", "3", "1", "2", "3", "1", "2", "3"],
428
"b_right": [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0],
429
}
430
assert result.to_dict(as_series=False) == expected
431
432
433
def test_join_asof_projection_7481() -> None:
434
ldf1 = pl.DataFrame({"a": [1, 2, 2], "b": "bleft"}).lazy().set_sorted("a")
435
ldf2 = pl.DataFrame({"a": 2, "b": [1, 2, 2]}).lazy().set_sorted("b")
436
437
assert (
438
ldf1.join_asof(ldf2, left_on="a", right_on="b").select("a", "b")
439
).collect().to_dict(as_series=False) == {
440
"a": [1, 2, 2],
441
"b": ["bleft", "bleft", "bleft"],
442
}
443
444
445
def test_asof_join_sorted_by_group(capsys: Any) -> None:
446
df1 = pl.DataFrame(
447
{
448
"key": ["a", "a", "a", "b", "b", "b"],
449
"asof_key": [2.0, 1.0, 3.0, 1.0, 2.0, 3.0],
450
"a": [102, 101, 103, 104, 105, 106],
451
}
452
).sort(by=["key", "asof_key"])
453
454
df2 = pl.DataFrame(
455
{
456
"key": ["a", "a", "a", "b", "b", "b"],
457
"asof_key": [0.9, 1.9, 2.9, 0.9, 1.9, 2.9],
458
"b": [201, 202, 203, 204, 205, 206],
459
}
460
).sort(by=["key", "asof_key"])
461
462
expected = pl.DataFrame(
463
[
464
pl.Series("key", ["a", "a", "a", "b", "b", "b"], dtype=pl.String),
465
pl.Series("asof_key", [1.0, 2.0, 3.0, 1.0, 2.0, 3.0], dtype=pl.Float64),
466
pl.Series("a", [101, 102, 103, 104, 105, 106], dtype=pl.Int64),
467
pl.Series("b", [201, 202, 203, 204, 205, 206], dtype=pl.Int64),
468
]
469
)
470
471
out = df1.join_asof(df2, on="asof_key", by="key")
472
assert_frame_equal(out, expected)
473
474
_, err = capsys.readouterr()
475
assert "is not explicitly sorted" not in err
476
477
478
def test_asof_join_nearest() -> None:
479
# Generic join_asof
480
df1 = pl.DataFrame(
481
{
482
"asof_key": [-1, 1, 2, 4, 6],
483
"a": [1, 2, 3, 4, 5],
484
}
485
).sort(by="asof_key")
486
487
df2 = pl.DataFrame(
488
{
489
"asof_key": [-1, 2, 4, 5],
490
"b": [1, 2, 3, 4],
491
}
492
).sort(by="asof_key")
493
494
expected = pl.DataFrame(
495
{"asof_key": [-1, 1, 2, 4, 6], "a": [1, 2, 3, 4, 5], "b": [1, 2, 2, 3, 4]}
496
)
497
498
out = df1.join_asof(df2, on="asof_key", strategy="nearest")
499
assert_frame_equal(out, expected)
500
501
# Edge case: last item of right matches multiples on left
502
df1 = pl.DataFrame(
503
{
504
"asof_key": [9, 9, 10, 10, 10],
505
"a": [1, 2, 3, 4, 5],
506
}
507
).set_sorted("asof_key")
508
df2 = pl.DataFrame(
509
{
510
"asof_key": [1, 2, 3, 10],
511
"b": [1, 2, 3, 4],
512
}
513
).set_sorted("asof_key")
514
expected = pl.DataFrame(
515
{
516
"asof_key": [9, 9, 10, 10, 10],
517
"a": [1, 2, 3, 4, 5],
518
"b": [4, 4, 4, 4, 4],
519
}
520
)
521
522
out = df1.join_asof(df2, on="asof_key", strategy="nearest")
523
assert_frame_equal(out, expected)
524
525
526
@pytest.mark.slow
527
@pytest.mark.parametrize("allow_exact_matches", [True, False])
528
def test_asof_join_nearest_reference(allow_exact_matches: bool) -> None:
529
def asof_join_nearest_reference(
530
df_left: pl.DataFrame, df_right: pl.DataFrame
531
) -> pl.DataFrame:
532
schema = {"key": pl.Int32, "value": pl.Int32, "value_right": pl.Int32}
533
result = pl.DataFrame(schema=schema)
534
for left_row in df_left.iter_rows():
535
cross_product_sorted = (
536
pl.DataFrame(
537
{
538
"key": pl.Series([left_row[0]] * len(df_right), dtype=pl.Int32),
539
"value": pl.Series(
540
[left_row[1]] * len(df_right), dtype=pl.Int32
541
),
542
"key_right": df_right["key"],
543
"value_right": df_right["value"],
544
},
545
)
546
.with_row_index()
547
.filter(
548
pl.when(allow_exact_matches)
549
.then(pl.lit(True))
550
.otherwise(pl.col("key") != pl.col("key_right"))
551
)
552
.sort(
553
(pl.col("key") - pl.col("key_right")).abs(),
554
-pl.col("index").cast(pl.Int32),
555
)
556
.drop("index", "key_right")
557
)
558
if len(cross_product_sorted) == 0:
559
result = result.vstack(
560
pl.DataFrame([left_row + (None,)], schema=schema, orient="row"),
561
)
562
else:
563
best_match = cross_product_sorted[0]
564
result = result.vstack(best_match)
565
return result
566
567
test_dfs = []
568
rng = random.Random()
569
for n_a, n_b, n_c, n_d in itertools.product([0, 1, 2], repeat=4):
570
a = rng.randint(0, 10)
571
b = rng.randint(0, 10)
572
c = rng.randint(0, 10)
573
d = rng.randint(0, 10)
574
keys = [a] * n_a + [b] * n_b + [c] * n_c + [d] * n_d
575
values = [rng.randint(0, 100000) for _ in keys]
576
577
df = pl.DataFrame(
578
{"key": keys, "value": values}, schema={"key": pl.Int32, "value": pl.Int32}
579
).sort(by="key")
580
test_dfs.append(df)
581
582
for df_left, df_right in itertools.product(test_dfs, repeat=2):
583
expected = asof_join_nearest_reference(df_left, df_right)
584
actual = df_left.join_asof(
585
df_right,
586
on="key",
587
strategy="nearest",
588
allow_exact_matches=allow_exact_matches,
589
)
590
assert_frame_equal(actual, expected)
591
592
593
def test_asof_join_nearest_with_tolerance() -> None:
594
a = b = [1, 2, 3, 4, 5]
595
596
nones = pl.Series([None, None, None, None, None], dtype=pl.Int64)
597
598
# Case 1: complete miss
599
df1 = pl.DataFrame({"asof_key": [1, 2, 3, 4, 5], "a": a}).set_sorted("asof_key")
600
df2 = pl.DataFrame(
601
{
602
"asof_key": [7, 8, 9, 10, 11],
603
"b": b,
604
}
605
).set_sorted("asof_key")
606
expected = df1.with_columns(nones.alias("b"))
607
out = df1.join_asof(df2, on="asof_key", strategy="nearest", tolerance=1)
608
assert_frame_equal(out, expected)
609
610
# Case 2: complete miss in other direction
611
df1 = pl.DataFrame({"asof_key": [7, 8, 9, 10, 11], "a": a}).set_sorted("asof_key")
612
df2 = pl.DataFrame(
613
{
614
"asof_key": [1, 2, 3, 4, 5],
615
"b": b,
616
}
617
).set_sorted("asof_key")
618
expected = df1.with_columns(nones.alias("b"))
619
out = df1.join_asof(df2, on="asof_key", strategy="nearest", tolerance=1)
620
assert_frame_equal(out, expected)
621
622
# Case 3: match first item
623
df1 = pl.DataFrame({"asof_key": [1, 2, 3, 4, 5], "a": a}).set_sorted("asof_key")
624
df2 = pl.DataFrame(
625
{
626
"asof_key": [6, 7, 8, 9, 10],
627
"b": b,
628
}
629
).set_sorted("asof_key")
630
out = df1.join_asof(df2, on="asof_key", strategy="nearest", tolerance=1)
631
expected = df1.with_columns(pl.Series([None, None, None, None, 1]).alias("b"))
632
assert_frame_equal(out, expected)
633
634
# Case 4: match last item
635
df1 = pl.DataFrame({"asof_key": [1, 2, 3, 4, 5], "a": a}).set_sorted("asof_key")
636
df2 = pl.DataFrame(
637
{
638
"asof_key": [-4, -3, -2, -1, 0],
639
"b": b,
640
}
641
).set_sorted("asof_key")
642
out = df1.join_asof(df2, on="asof_key", strategy="nearest", tolerance=1)
643
expected = df1.with_columns(pl.Series([5, None, None, None, None]).alias("b"))
644
assert_frame_equal(out, expected)
645
646
# Case 5: match multiples, pick closer
647
df1 = pl.DataFrame(
648
{"asof_key": pl.Series([1, 2, 3, 4, 5], dtype=pl.Float64), "a": a}
649
).set_sorted("asof_key")
650
df2 = pl.DataFrame(
651
{
652
"asof_key": [0.0, 2.0, 2.4, 3.4, 10.0],
653
"b": b,
654
}
655
).set_sorted("asof_key")
656
out = df1.join_asof(df2, on="asof_key", strategy="nearest", tolerance=1)
657
expected = df1.with_columns(pl.Series([2, 2, 4, 4, None]).alias("b"))
658
assert_frame_equal(out, expected)
659
660
# Case 6: use 0 tolerance
661
df1 = pl.DataFrame(
662
{"asof_key": pl.Series([1, 2, 3, 4, 5], dtype=pl.Float64), "a": a}
663
).set_sorted("asof_key")
664
df2 = pl.DataFrame(
665
{
666
"asof_key": [0.0, 2.0, 2.4, 3.4, 10.0],
667
"b": b,
668
}
669
).set_sorted("asof_key")
670
out = df1.join_asof(df2, on="asof_key", strategy="nearest", tolerance=0)
671
expected = df1.with_columns(pl.Series([None, 2, None, None, None]).alias("b"))
672
assert_frame_equal(out, expected)
673
674
# Case 7: test with datetime
675
df1 = pl.DataFrame(
676
{
677
"asof_key": pl.Series(
678
[
679
datetime(2023, 1, 1),
680
datetime(2023, 1, 2),
681
datetime(2023, 1, 3),
682
datetime(2023, 1, 4),
683
datetime(2023, 1, 6),
684
]
685
),
686
"a": a,
687
}
688
).set_sorted("asof_key")
689
df2 = pl.DataFrame(
690
{
691
"asof_key": pl.Series(
692
[
693
datetime(2022, 1, 1),
694
datetime(2022, 1, 2),
695
datetime(2022, 1, 3),
696
datetime(
697
2023, 1, 2, 21, 30, 0
698
), # should match with 2023-01-02, 2023-01-03, and 2021-01-04
699
datetime(2023, 1, 7),
700
]
701
),
702
"b": b,
703
}
704
).set_sorted("asof_key")
705
out = df1.join_asof(df2, on="asof_key", strategy="nearest", tolerance="1d4h")
706
expected = df1.with_columns(pl.Series([None, 4, 4, 4, 5]).alias("b"))
707
assert_frame_equal(out, expected)
708
709
# Case 8: test using timedelta tolerance
710
out = df1.join_asof(
711
df2, on="asof_key", strategy="nearest", tolerance=timedelta(days=1, hours=4)
712
)
713
assert_frame_equal(out, expected)
714
715
# Case #9: last item is closest match
716
df1 = pl.DataFrame(
717
{
718
"asof_key_left": [10.00001, 20.0, 30.0],
719
}
720
).set_sorted("asof_key_left")
721
df2 = pl.DataFrame(
722
{
723
"asof_key_right": [10.00001, 20.0001, 29.0],
724
}
725
).set_sorted("asof_key_right")
726
out = df1.join_asof(
727
df2,
728
left_on="asof_key_left",
729
right_on="asof_key_right",
730
strategy="nearest",
731
tolerance=0.5,
732
)
733
expected = pl.DataFrame(
734
{
735
"asof_key_left": [10.00001, 20.0, 30.0],
736
"asof_key_right": [10.00001, 20.0001, None],
737
}
738
)
739
assert_frame_equal(out, expected)
740
741
742
def test_asof_join_nearest_by() -> None:
743
# Generic join_asof
744
df1 = pl.DataFrame(
745
{
746
"asof_key": [-1, 1, 2, 6, 1],
747
"group": [1, 1, 1, 2, 2],
748
"a": [1, 2, 3, 2, 5],
749
}
750
).sort(by=["group", "asof_key"])
751
752
df2 = pl.DataFrame(
753
{
754
"asof_key": [-1, 2, 5, 1],
755
"group": [1, 1, 2, 2],
756
"b": [1, 2, 3, 4],
757
}
758
).sort(by=["group", "asof_key"])
759
760
expected = pl.DataFrame(
761
{
762
"asof_key": [-1, 1, 2, 6, 1],
763
"group": [1, 1, 1, 2, 2],
764
"a": [1, 2, 3, 5, 2],
765
"b": [1, 2, 2, 4, 3],
766
}
767
).sort(by=["group", "asof_key"])
768
769
# Edge case: last item of right matches multiples on left
770
df1 = pl.DataFrame(
771
{
772
"asof_key": [9, 9, 10, 10, 10],
773
"group": [1, 1, 1, 2, 2],
774
"a": [1, 2, 3, 2, 5],
775
}
776
).sort(by=["group", "asof_key"])
777
778
df2 = pl.DataFrame(
779
{
780
"asof_key": [-1, 1, 1, 10],
781
"group": [1, 1, 2, 2],
782
"b": [1, 2, 3, 4],
783
}
784
).sort(by=["group", "asof_key"])
785
786
expected = pl.DataFrame(
787
{
788
"asof_key": [9, 9, 10, 10, 10],
789
"group": [1, 1, 1, 2, 2],
790
"a": [1, 2, 3, 2, 5],
791
"b": [2, 2, 2, 4, 4],
792
}
793
)
794
795
out = df1.join_asof(df2, on="asof_key", by="group", strategy="nearest")
796
assert_frame_equal(out, expected)
797
798
a = pl.DataFrame(
799
{
800
"code": [676, 35, 676, 676, 676],
801
"time": [364360, 364370, 364380, 365400, 367440],
802
}
803
)
804
b = pl.DataFrame(
805
{
806
"code": [676, 676, 35, 676, 676],
807
"time": [364000, 365000, 365000, 366000, 367000],
808
"price": [1.0, 2.0, 50, 3.0, None],
809
}
810
)
811
812
expected = pl.DataFrame(
813
{
814
"code": [676, 35, 676, 676, 676],
815
"time": [364360, 364370, 364380, 365400, 367440],
816
"price": [1.0, 50.0, 1.0, 2.0, None],
817
}
818
)
819
820
out = a.join_asof(b, by="code", on="time", strategy="nearest")
821
assert_frame_equal(out, expected)
822
823
# last item is closest match
824
df1 = pl.DataFrame(
825
{
826
"a": [1, 1, 1],
827
"asof_key_left": [10.00001, 20.0, 30.0],
828
}
829
).set_sorted("asof_key_left")
830
df2 = pl.DataFrame(
831
{
832
"a": [1, 1, 1],
833
"asof_key_right": [10.00001, 20.0001, 29.0],
834
}
835
).set_sorted("asof_key_right")
836
out = df1.join_asof(
837
df2,
838
left_on="asof_key_left",
839
right_on="asof_key_right",
840
by="a",
841
strategy="nearest",
842
)
843
expected = pl.DataFrame(
844
{
845
"a": [1, 1, 1],
846
"asof_key_left": [10.00001, 20.0, 30.0],
847
"asof_key_right": [10.00001, 20.0001, 29.0],
848
}
849
)
850
assert_frame_equal(out, expected)
851
852
853
def test_asof_join_nearest_by_with_tolerance() -> None:
854
df1 = pl.DataFrame(
855
{
856
"group": [
857
1,
858
1,
859
1,
860
1,
861
1,
862
2,
863
2,
864
2,
865
2,
866
2,
867
3,
868
3,
869
3,
870
3,
871
3,
872
4,
873
4,
874
4,
875
4,
876
4,
877
5,
878
5,
879
5,
880
5,
881
5,
882
6,
883
6,
884
6,
885
6,
886
6,
887
],
888
"asof_key": pl.Series(
889
[
890
1,
891
2,
892
3,
893
4,
894
5,
895
7,
896
8,
897
9,
898
10,
899
11,
900
1,
901
2,
902
3,
903
4,
904
5,
905
1,
906
2,
907
3,
908
4,
909
5,
910
1,
911
2,
912
3,
913
4,
914
5,
915
1,
916
2,
917
3,
918
4,
919
5,
920
],
921
dtype=pl.Float32,
922
),
923
"a": [
924
1,
925
2,
926
3,
927
4,
928
5,
929
1,
930
2,
931
3,
932
4,
933
5,
934
1,
935
2,
936
3,
937
4,
938
5,
939
1,
940
2,
941
3,
942
4,
943
5,
944
1,
945
2,
946
3,
947
4,
948
5,
949
1,
950
2,
951
3,
952
4,
953
5,
954
],
955
}
956
)
957
958
df2 = pl.DataFrame(
959
{
960
"group": [
961
1,
962
1,
963
1,
964
1,
965
1,
966
2,
967
2,
968
2,
969
2,
970
2,
971
3,
972
3,
973
3,
974
3,
975
3,
976
4,
977
4,
978
4,
979
4,
980
4,
981
5,
982
5,
983
5,
984
5,
985
5,
986
6,
987
6,
988
6,
989
6,
990
6,
991
],
992
"asof_key": pl.Series(
993
[
994
7,
995
8,
996
9,
997
10,
998
11,
999
1,
1000
2,
1001
3,
1002
4,
1003
5,
1004
6,
1005
7,
1006
8,
1007
9,
1008
10,
1009
5,
1010
-3,
1011
-2,
1012
-1,
1013
0,
1014
0,
1015
2,
1016
2.4,
1017
3.4,
1018
10,
1019
-3,
1020
3,
1021
8,
1022
9,
1023
10,
1024
],
1025
dtype=pl.Float32,
1026
),
1027
"b": [
1028
1,
1029
2,
1030
3,
1031
4,
1032
5,
1033
1,
1034
2,
1035
3,
1036
4,
1037
5,
1038
1,
1039
2,
1040
3,
1041
4,
1042
5,
1043
1,
1044
2,
1045
3,
1046
4,
1047
5,
1048
1,
1049
2,
1050
3,
1051
4,
1052
5,
1053
1,
1054
2,
1055
3,
1056
4,
1057
5,
1058
],
1059
}
1060
)
1061
1062
expected = df1.with_columns(
1063
pl.Series(
1064
[
1065
None,
1066
None,
1067
None,
1068
None,
1069
None,
1070
None,
1071
None,
1072
None,
1073
None,
1074
None,
1075
None,
1076
None,
1077
None,
1078
None,
1079
1,
1080
5,
1081
None,
1082
None,
1083
1,
1084
1,
1085
2,
1086
2,
1087
4,
1088
4,
1089
None,
1090
None,
1091
2,
1092
2,
1093
2,
1094
None,
1095
]
1096
).alias("b")
1097
)
1098
df1 = df1.sort(by=["group", "asof_key"])
1099
df2 = df2.sort(by=["group", "asof_key"])
1100
expected = expected.sort(by=["group", "a"])
1101
1102
out = df1.join_asof(
1103
df2, by="group", on="asof_key", strategy="nearest", tolerance=1.0
1104
).sort(by=["group", "a"])
1105
assert_frame_equal(out, expected)
1106
1107
# last item is closest match
1108
df1 = pl.DataFrame(
1109
{
1110
"a": [1, 1, 1],
1111
"asof_key_left": [10.00001, 20.0, 30.0],
1112
}
1113
).set_sorted("asof_key_left")
1114
df2 = pl.DataFrame(
1115
{
1116
"a": [1, 1, 1],
1117
"asof_key_right": [10.00001, 20.0001, 29.0],
1118
}
1119
).set_sorted("asof_key_right")
1120
out = df1.join_asof(
1121
df2,
1122
left_on="asof_key_left",
1123
right_on="asof_key_right",
1124
by="a",
1125
strategy="nearest",
1126
tolerance=0.5,
1127
)
1128
expected = pl.DataFrame(
1129
{
1130
"a": [1, 1, 1],
1131
"asof_key_left": [10.00001, 20.0, 30.0],
1132
"asof_key_right": [10.00001, 20.0001, None],
1133
}
1134
)
1135
assert_frame_equal(out, expected)
1136
1137
1138
def test_asof_join_nearest_by_date() -> None:
1139
df1 = pl.DataFrame(
1140
{
1141
"asof_key": [
1142
date(2019, 12, 30),
1143
date(2020, 1, 1),
1144
date(2020, 1, 2),
1145
date(2020, 1, 6),
1146
date(2020, 1, 1),
1147
],
1148
"group": [1, 1, 1, 2, 2],
1149
"a": [1, 2, 3, 2, 5],
1150
}
1151
).sort(by=["group", "asof_key"])
1152
1153
df2 = pl.DataFrame(
1154
{
1155
"asof_key": [
1156
date(2020, 1, 1),
1157
date(2020, 1, 2),
1158
date(2020, 1, 5),
1159
date(2020, 1, 1),
1160
],
1161
"group": [1, 1, 2, 2],
1162
"b": [1, 2, 3, 4],
1163
}
1164
).sort(by=["group", "asof_key"])
1165
1166
expected = pl.DataFrame(
1167
{
1168
"asof_key": [
1169
date(2019, 12, 30),
1170
date(2020, 1, 1),
1171
date(2020, 1, 2),
1172
date(2020, 1, 6),
1173
date(2020, 1, 1),
1174
],
1175
"group": [1, 1, 1, 2, 2],
1176
"a": [1, 2, 3, 2, 5],
1177
"b": [1, 1, 2, 3, 4],
1178
}
1179
).sort(by=["group", "asof_key"])
1180
1181
out = df1.join_asof(df2, on="asof_key", by="group", strategy="nearest")
1182
assert_frame_equal(out, expected)
1183
1184
1185
@pytest.mark.may_fail_auto_streaming # See #18927.
1186
def test_asof_join_string() -> None:
1187
left = pl.DataFrame({"x": [None, "a", "b", "c", None, "d", None]}).set_sorted("x")
1188
right = pl.DataFrame({"x": ["apple", None, "chutney"], "y": [0, 1, 2]}).set_sorted(
1189
"x"
1190
)
1191
forward = left.join_asof(right, on="x", strategy="forward")
1192
backward = left.join_asof(right, on="x", strategy="backward")
1193
forward_expected = pl.DataFrame(
1194
{
1195
"x": [None, "a", "b", "c", None, "d", None],
1196
"y": [None, 0, 2, 2, None, None, None],
1197
}
1198
)
1199
backward_expected = pl.DataFrame(
1200
{
1201
"x": [None, "a", "b", "c", None, "d", None],
1202
"y": [None, None, 0, 0, None, 2, None],
1203
}
1204
)
1205
assert_frame_equal(forward, forward_expected)
1206
assert_frame_equal(backward, backward_expected)
1207
1208
1209
def test_join_asof_by_argument_parsing() -> None:
1210
df1 = pl.DataFrame(
1211
{
1212
"n": [10, 20, 30, 40, 50, 60],
1213
"id1": [0, 0, 3, 3, 5, 5],
1214
"id2": [1, 2, 1, 2, 1, 2],
1215
"x": ["a", "b", "c", "d", "e", "f"],
1216
}
1217
).sort(by="n")
1218
1219
df2 = pl.DataFrame(
1220
{
1221
"n": [25, 8, 5, 23, 15, 35],
1222
"id1": [0, 0, 3, 3, 5, 5],
1223
"id2": [1, 2, 1, 2, 1, 2],
1224
"y": ["A", "B", "C", "D", "E", "F"],
1225
}
1226
).sort(by="n")
1227
1228
# any sequency for by argument is allowed, so we should see the same results here
1229
by_list = df1.join_asof(df2, on="n", by=["id1", "id2"])
1230
by_tuple = df1.join_asof(df2, on="n", by=("id1", "id2"))
1231
assert_frame_equal(by_list, by_tuple)
1232
1233
# same for using the by_left and by_right kwargs
1234
by_list2 = df1.join_asof(
1235
df2, on="n", by_left=["id1", "id2"], by_right=["id1", "id2"]
1236
)
1237
by_tuple2 = df1.join_asof(
1238
df2, on="n", by_left=("id1", "id2"), by_right=("id1", "id2")
1239
)
1240
assert_frame_equal(by_list2, by_list)
1241
assert_frame_equal(by_tuple2, by_list)
1242
1243
1244
def test_join_asof_invalid_args() -> None:
1245
df1 = pl.DataFrame(
1246
{
1247
"a": [1, 2, 3],
1248
"b": [1, 2, 3],
1249
}
1250
).set_sorted("a")
1251
df2 = pl.DataFrame(
1252
{
1253
"a": [1, 2, 3],
1254
"c": [1, 2, 3],
1255
}
1256
).set_sorted("a")
1257
1258
with pytest.raises(TypeError, match="expected `on` to be str or Expr, got 'list'"):
1259
df1.join_asof(df2, on=["a"]) # type: ignore[arg-type]
1260
with pytest.raises(
1261
TypeError, match="expected `left_on` to be str or Expr, got 'list'"
1262
):
1263
df1.join_asof(df2, left_on=["a"], right_on="a") # type: ignore[arg-type]
1264
with pytest.raises(
1265
TypeError, match="expected `right_on` to be str or Expr, got 'list'"
1266
):
1267
df1.join_asof(df2, left_on="a", right_on=["a"]) # type: ignore[arg-type]
1268
1269
1270
def test_join_as_of_by_schema() -> None:
1271
a = pl.DataFrame({"a": [1], "b": [2], "c": [3]}).lazy()
1272
b = pl.DataFrame({"a": [1], "b": [2], "d": [4]}).lazy()
1273
q = a.join_asof(b, on=pl.col("a").set_sorted(), by="b")
1274
assert q.collect_schema().names() == q.collect().columns
1275
1276
1277
def test_asof_join_by_schema() -> None:
1278
# different `by` names.
1279
df1 = pl.DataFrame({"on1": 0, "by1": 0})
1280
df2 = pl.DataFrame({"on1": 0, "by2": 0})
1281
1282
q = df1.lazy().join_asof(
1283
df2.lazy(),
1284
on="on1",
1285
by_left="by1",
1286
by_right="by2",
1287
)
1288
1289
assert q.collect_schema() == q.collect().schema
1290
1291
1292
def test_raise_invalid_by_arg_13020() -> None:
1293
df1 = pl.DataFrame({"asOfDate": [date(2020, 1, 1)]})
1294
df2 = pl.DataFrame(
1295
{
1296
"endityId": [date(2020, 1, 1)],
1297
"eventDate": ["A"],
1298
}
1299
)
1300
with pytest.raises(pl.exceptions.InvalidOperationError, match="expected both"):
1301
df1.sort("asOfDate").join_asof(
1302
df2.sort("eventDate"),
1303
left_on="asOfDate",
1304
right_on="eventDate",
1305
by_left=None,
1306
by_right=["entityId"],
1307
)
1308
1309
1310
def test_join_asof_no_exact_matches() -> None:
1311
trades = pl.DataFrame(
1312
{
1313
"time": [
1314
"2016-05-25 13:30:00.023",
1315
"2016-05-25 13:30:00.038",
1316
"2016-05-25 13:30:00.048",
1317
"2016-05-25 13:30:00.048",
1318
"2016-05-25 13:30:00.048",
1319
],
1320
"ticker": ["MSFT", "MSFT", "GOOG", "GOOG", "AAPL"],
1321
"price": [51.95, 51.95, 720.77, 720.92, 98.0],
1322
"quantity": [75, 155, 100, 100, 100],
1323
}
1324
).with_columns(pl.col("time").str.to_datetime())
1325
1326
quotes = pl.DataFrame(
1327
{
1328
"time": [
1329
"2016-05-25 13:30:00.023",
1330
"2016-05-25 13:30:00.023",
1331
"2016-05-25 13:30:00.030",
1332
"2016-05-25 13:30:00.041",
1333
"2016-05-25 13:30:00.048",
1334
"2016-05-25 13:30:00.049",
1335
"2016-05-25 13:30:00.072",
1336
"2016-05-25 13:30:00.075",
1337
],
1338
"ticker": ["GOOG", "MSFT", "MSFT", "MSFT", "GOOG", "AAPL", "GOOG", "MSFT"],
1339
"bid": [720.50, 51.95, 51.97, 51.99, 720.50, 97.99, 720.50, 52.01],
1340
"ask": [720.93, 51.96, 51.98, 52.00, 720.93, 98.01, 720.88, 52.03],
1341
}
1342
).with_columns(pl.col("time").str.to_datetime())
1343
1344
assert trades.join_asof(
1345
quotes, on="time", by="ticker", tolerance="10ms", allow_exact_matches=False
1346
).to_dict(as_series=False) == {
1347
"time": [
1348
datetime(2016, 5, 25, 13, 30, 0, 23000),
1349
datetime(2016, 5, 25, 13, 30, 0, 38000),
1350
datetime(2016, 5, 25, 13, 30, 0, 48000),
1351
datetime(2016, 5, 25, 13, 30, 0, 48000),
1352
datetime(2016, 5, 25, 13, 30, 0, 48000),
1353
],
1354
"ticker": ["MSFT", "MSFT", "GOOG", "GOOG", "AAPL"],
1355
"price": [51.95, 51.95, 720.77, 720.92, 98.0],
1356
"quantity": [75, 155, 100, 100, 100],
1357
"bid": [None, 51.97, None, None, None],
1358
"ask": [None, 51.98, None, None, None],
1359
}
1360
1361
1362
@pytest.mark.parametrize("strategy", ["backward", "forward", "nearest"])
1363
@given(
1364
df_left=dataframes(cols=1, allowed_dtypes=pl.Int32),
1365
df_right=dataframes(cols=1, allowed_dtypes=pl.Int32),
1366
)
1367
def test_join_asof_no_exact_matches_parametric(
1368
strategy: AsofJoinStrategy, df_left: pl.DataFrame, df_right: pl.DataFrame
1369
) -> None:
1370
df_left = df_left.sort("col0")
1371
df_right = df_right.sort("col0")
1372
1373
out = df_left.join_asof(
1374
df_right,
1375
on="col0",
1376
strategy=strategy,
1377
suffix="_right",
1378
coalesce=False,
1379
allow_exact_matches=False,
1380
)
1381
1382
for l_val, r_val in zip(
1383
out["col0"],
1384
out["col0_right"],
1385
strict=False,
1386
):
1387
assert l_val != r_val or r_val is None
1388
1389
1390
def test_join_asof_not_sorted() -> None:
1391
df = pl.DataFrame({"a": [1, 1, 1, 2, 2, 2], "b": [2, 1, 3, 1, 2, 3]})
1392
with pytest.raises(InvalidOperationError, match="is not sorted"):
1393
df.join_asof(df, on="b")
1394
1395
# When 'by' is provided, we do not check sortedness, but a warning is received
1396
with pytest.warns(
1397
UserWarning,
1398
match="Sortedness of columns cannot be checked when 'by' groups provided",
1399
):
1400
df.join_asof(df, on="b", by="a")
1401
1402
# When sortedness is False, we should get no warning
1403
with warnings.catch_warnings(record=True) as w:
1404
df.join_asof(df, on="b", check_sortedness=False)
1405
df.join_asof(df, on="b", by="a", check_sortedness=False)
1406
assert len(w) == 0 # no warnings caught
1407
1408
1409
@pytest.mark.parametrize(
1410
"dtypes",
1411
[
1412
(pl.Int64, pl.Int64),
1413
(pl.Int64, pl.UInt64),
1414
(pl.Int64, pl.Int128),
1415
(pl.UInt64, pl.Int64),
1416
(pl.UInt64, pl.UInt64),
1417
(pl.UInt64, pl.Int128),
1418
(pl.UInt64, pl.UInt128),
1419
(pl.Int128, pl.Int64),
1420
(pl.Int128, pl.UInt64),
1421
(pl.Int128, pl.Int128),
1422
(pl.UInt128, pl.UInt64),
1423
(pl.UInt128, pl.UInt128),
1424
],
1425
)
1426
@pytest.mark.parametrize("swap", [False, True])
1427
@pytest.mark.parametrize("strategy", ["backward", "forward", "nearest"])
1428
def test_join_asof_large_int_21276(
1429
dtypes: tuple[PolarsIntegerType, PolarsIntegerType],
1430
swap: bool,
1431
strategy: AsofJoinStrategy,
1432
) -> None:
1433
left_dtype, right_dtype = reversed(dtypes) if swap else dtypes
1434
large_int64 = 1608129000134000123 # it only happen when "on" column is large
1435
left = pl.DataFrame({"ts": pl.Series([large_int64 + 2], dtype=left_dtype)})
1436
right = pl.DataFrame(
1437
{
1438
"ts": pl.Series([large_int64 + 1, large_int64 + 3], dtype=right_dtype),
1439
"value": [111, 333],
1440
}
1441
)
1442
result = left.join_asof(right, on="ts", strategy=strategy)
1443
idx = 0 if strategy == "backward" else 1
1444
expected = pl.DataFrame(
1445
{
1446
"ts": left["ts"],
1447
"value": right["value"].gather(idx),
1448
}
1449
)
1450
assert_frame_equal(result, expected)
1451
1452
1453
@pytest.mark.parametrize("by", ["constant", None])
1454
def test_join_asof_slice_23583(by: str | None) -> None:
1455
lhs = pl.LazyFrame(
1456
{
1457
"index": [0],
1458
"constant": 0,
1459
"date": [date(2025, 1, 1)],
1460
},
1461
).set_sorted("date")
1462
1463
rhs = pl.LazyFrame(
1464
{
1465
"index": [0, 1],
1466
"constant": 0,
1467
"date": [date(1970, 1, 1), date(2025, 1, 1)],
1468
},
1469
).set_sorted("date")
1470
1471
q = (
1472
lhs.join_asof(rhs, on="date", by=by, check_sortedness=False)
1473
.head(1)
1474
.select(pl.exclude("constant_right"))
1475
)
1476
1477
expect = pl.DataFrame(
1478
{
1479
"index": [0],
1480
"constant": 0,
1481
"date": [date(2025, 1, 1)],
1482
"index_right": [1],
1483
},
1484
)
1485
1486
assert_frame_equal(q.collect(optimizations=pl.QueryOptFlags.none()), expect)
1487
assert_frame_equal(q.collect(), expect)
1488
1489
1490
def test_join_asof_23751() -> None:
1491
a = pl.DataFrame(
1492
[
1493
pl.Series([1, 2, 3, 4, 5]).alias("index") * int(1e10),
1494
pl.Series([1, -1, 1, 1, -1]).alias("side"),
1495
]
1496
)
1497
1498
b = pl.DataFrame(
1499
[
1500
pl.Series([0, 1, 1, 3, 3, 5]).alias("index_right").cast(pl.UInt64)
1501
* int(1e10),
1502
pl.Series([-1, 1, -1, 1, 1, -1]).alias("side"),
1503
pl.Series([0, 10, 20, 30, 40, 50]).alias("value"),
1504
]
1505
)
1506
1507
assert a.join_asof(b, left_on="index", right_on="index_right", by="side").to_dict(
1508
as_series=False
1509
) == {
1510
"index": [10000000000, 20000000000, 30000000000, 40000000000, 50000000000],
1511
"side": [1, -1, 1, 1, -1],
1512
"index_right": [
1513
10000000000,
1514
10000000000,
1515
30000000000,
1516
30000000000,
1517
50000000000,
1518
],
1519
"value": [10, 20, 40, 40, 50],
1520
}
1521
1522
1523
def test_join_asof_nosuffix_dup_col_23834() -> None:
1524
a = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
1525
b = pl.DataFrame({"b": [1, 2, 3], "c": [9, 10, 11]})
1526
with pytest.raises(DuplicateError):
1527
a.join_asof(b, left_on="a", right_on="b", suffix="")
1528
1529
1530
def test_join_asof_planner_schema_24000() -> None:
1531
a = pl.DataFrame([pl.Series("index", [1, 2, 3]) * 10])
1532
b = pl.DataFrame(
1533
[
1534
pl.Series("value", [10, 20, 30]),
1535
pl.Series("index_right", [1, 2, 3]).cast(pl.UInt64) * 10,
1536
]
1537
)
1538
q = a.lazy().join_asof(b.lazy(), left_on="index", right_on="index_right")
1539
1540
assert q.collect().schema == q.collect_schema()
1541
1542
b = pl.DataFrame(
1543
[
1544
pl.Series("index_right", [1, 2, 3]).cast(pl.UInt64) * 10,
1545
pl.Series("value", [10, 20, 30]),
1546
]
1547
)
1548
q = a.lazy().join_asof(b.lazy(), left_on="index", right_on="index_right")
1549
1550
assert q.collect().schema == q.collect_schema()
1551
1552
1553
@pytest.mark.parametrize(
1554
"dtype",
1555
[pl.UInt8, pl.UInt16, pl.UInt32, pl.UInt64, pl.Int8, pl.Int16, pl.Int32, pl.Int64],
1556
)
1557
def test_join_asof_int_dtypes_24383(dtype: PolarsIntegerType) -> None:
1558
lf1 = pl.LazyFrame(
1559
{
1560
"id": pl.Series([1], dtype=dtype),
1561
"date": pl.Series([date(2025, 12, 31)], dtype=pl.Date),
1562
}
1563
)
1564
1565
lf2 = pl.LazyFrame(
1566
{
1567
"id": pl.Series([1], dtype=dtype),
1568
"date": pl.Series([date(2025, 12, 31)], dtype=pl.Date),
1569
"value": pl.Series([2.5], dtype=pl.Float32),
1570
}
1571
)
1572
1573
result = lf1.join_asof(
1574
other=lf2,
1575
on="date",
1576
by="id",
1577
check_sortedness=False,
1578
)
1579
expected = pl.DataFrame(
1580
{
1581
"id": pl.Series([1], dtype=dtype),
1582
"date": pl.Series([date(2025, 12, 31)], dtype=pl.Date),
1583
"value": pl.Series([2.5], dtype=pl.Float32),
1584
}
1585
)
1586
assert result.collect_schema() == expected.schema
1587
assert_frame_equal(result.collect(), expected)
1588
1589
1590
@pytest.mark.parametrize("by", [None, "one"])
1591
def test_join_asof_on_cast_expr_24999(by: str | None) -> None:
1592
q = pl.LazyFrame({"x": date(2025, 10, 13), "one": 1}).join_asof(
1593
pl.LazyFrame({"x_right": datetime(2025, 10, 10, 2), "one": 1}),
1594
left_on=pl.col("x").cast(pl.Datetime),
1595
right_on=pl.col("x_right"),
1596
tolerance=timedelta(days=100),
1597
by=by,
1598
)
1599
1600
expect = pl.DataFrame(
1601
{
1602
"x": date(2025, 10, 13),
1603
"one": 1,
1604
"x_right": datetime(2025, 10, 10, 2),
1605
}
1606
)
1607
1608
if by is None:
1609
expect = expect.with_columns(pl.col("one").alias("one_right"))
1610
1611
assert_frame_equal(q.collect(), expect)
1612
1613
1614
@pytest.mark.parametrize(
1615
("left", "right", "expected_right"),
1616
[
1617
([], [], []),
1618
([1], [2], [2]),
1619
([], [1, 2], []),
1620
([1, 2], [], [None, None]),
1621
([1, 2], [1, 2], [2, 1]),
1622
([1, 1], [1, 1], [None, None]),
1623
([1, 2, 2, 2, 3], [1, 2, 2, 2, 3], [2, 3, 3, 3, 2]),
1624
],
1625
)
1626
def test_join_asof_nearest_no_exact_matches_25468(
1627
left: list[int],
1628
right: list[int],
1629
expected_right: list[int],
1630
) -> None:
1631
df_left = pl.DataFrame({"a": left}, schema={"a": pl.Int32})
1632
df_right = pl.DataFrame({"a": right}, schema={"a": pl.Int32})
1633
expected_df = pl.DataFrame(
1634
{"a": left, "a_right": expected_right},
1635
schema={"a": pl.Int32, "a_right": pl.Int32},
1636
)
1637
result = df_left.join_asof(
1638
df_right, on="a", strategy="nearest", coalesce=False, allow_exact_matches=False
1639
)
1640
assert_frame_equal(result, expected_df)
1641
1642
1643
@pytest.mark.parametrize("strategy", ["backward", "nearest"])
1644
def test_join_asof_nans(strategy: AsofJoinStrategy) -> None:
1645
df_left = pl.LazyFrame(
1646
{
1647
"time": [0.0, 6.0, math.nan, math.nan],
1648
"value": [100, 101, 102, 103],
1649
}
1650
)
1651
df_right = pl.LazyFrame(
1652
{
1653
"time": [0.0, 5.0, 10.0, 42.0],
1654
"value": [100, 200, 300, 400],
1655
}
1656
)
1657
actual = df_left.join_asof(
1658
df_right,
1659
on="time",
1660
strategy=strategy,
1661
).collect()
1662
expected = pl.DataFrame(
1663
{
1664
"time": [0.0, 6.0, math.nan, math.nan],
1665
"value": [100, 101, 102, 103],
1666
"value_right": [100, 200, 400, 400],
1667
}
1668
)
1669
assert_frame_equal(actual, expected)
1670
1671