CoCalc -- test

GitHub Repository: pola-rs/polars
Path: blob/main/py-polars/tests/unit/sql/test_joins.py
⁶⁹³⁹ views
1
from __future__ import annotations
2

3
from io import BytesIO
4
from pathlib import Path
5
from typing import Any
6

7
import pytest
8

9
import polars as pl
10
from polars.exceptions import SQLInterfaceError, SQLSyntaxError
11
from polars.testing import assert_frame_equal
12

13

14
@pytest.fixture
15
def foods_ipc_path() -> Path:
16
    return Path(__file__).parent.parent / "io" / "files" / "foods1.ipc"
17

18

19
@pytest.mark.parametrize(
20
    ("sql", "expected"),
21
    [
22
        (
23
            "SELECT * FROM tbl_a LEFT SEMI JOIN tbl_b USING (a,c)",
24
            pl.DataFrame({"a": [2], "b": [0], "c": ["y"]}),
25
        ),
26
        (
27
            "SELECT * FROM tbl_a SEMI JOIN tbl_b USING (a,c)",
28
            pl.DataFrame({"a": [2], "b": [0], "c": ["y"]}),
29
        ),
30
        (
31
            "SELECT * FROM tbl_a LEFT SEMI JOIN tbl_b USING (a)",
32
            pl.DataFrame({"a": [1, 2, 3], "b": [4, 0, 6], "c": ["w", "y", "z"]}),
33
        ),
34
        (
35
            "SELECT * FROM tbl_a LEFT ANTI JOIN tbl_b USING (a)",
36
            pl.DataFrame(schema={"a": pl.Int64, "b": pl.Int64, "c": pl.String}),
37
        ),
38
        (
39
            "SELECT * FROM tbl_a ANTI JOIN tbl_b USING (a)",
40
            pl.DataFrame(schema={"a": pl.Int64, "b": pl.Int64, "c": pl.String}),
41
        ),
42
        (
43
            "SELECT * FROM tbl_a LEFT SEMI JOIN tbl_b USING (b) LEFT SEMI JOIN tbl_c USING (c)",
44
            pl.DataFrame({"a": [1, 3], "b": [4, 6], "c": ["w", "z"]}),
45
        ),
46
        (
47
            "SELECT * FROM tbl_a LEFT ANTI JOIN tbl_b USING (b) LEFT SEMI JOIN tbl_c USING (c)",
48
            pl.DataFrame({"a": [2], "b": [0], "c": ["y"]}),
49
        ),
50
        (
51
            "SELECT * FROM tbl_a RIGHT ANTI JOIN tbl_b USING (b) LEFT SEMI JOIN tbl_c USING (c)",
52
            pl.DataFrame({"a": [2], "b": [5], "c": ["y"]}),
53
        ),
54
        (
55
            "SELECT * FROM tbl_a RIGHT SEMI JOIN tbl_b USING (b) RIGHT SEMI JOIN tbl_c USING (c)",
56
            pl.DataFrame({"c": ["z"], "d": [25.5]}),
57
        ),
58
        (
59
            "SELECT * FROM tbl_a RIGHT SEMI JOIN tbl_b USING (b) RIGHT ANTI JOIN tbl_c USING (c)",
60
            pl.DataFrame({"c": ["w", "y"], "d": [10.5, -50.0]}),
61
        ),
62
    ],
63
)
64
def test_join_anti_semi(sql: str, expected: pl.DataFrame) -> None:
65
    frames = {
66
        "tbl_a": pl.DataFrame({"a": [1, 2, 3], "b": [4, 0, 6], "c": ["w", "y", "z"]}),
67
        "tbl_b": pl.DataFrame({"a": [3, 2, 1], "b": [6, 5, 4], "c": ["x", "y", "z"]}),
68
        "tbl_c": pl.DataFrame({"c": ["w", "y", "z"], "d": [10.5, -50.0, 25.5]}),
69
    }
70
    ctx = pl.SQLContext(frames, eager=True)
71
    assert_frame_equal(expected, ctx.execute(sql))
72

73

74
def test_join_cross() -> None:
75
    frames = {
76
        "tbl_a": pl.DataFrame({"a": [1, 2, 3], "b": [4, 0, 6], "c": ["w", "y", "z"]}),
77
        "tbl_b": pl.DataFrame({"a": [3, 2, 1], "b": [6, 5, 4], "c": ["x", "y", "z"]}),
78
    }
79
    with pl.SQLContext(frames, eager=True) as ctx:
80
        out = ctx.execute(
81
            """
82
            SELECT *
83
            FROM tbl_a
84
            CROSS JOIN tbl_b
85
            ORDER BY a, b, c
86
            """
87
        )
88
        assert out.rows() == [
89
            (1, 4, "w", 3, 6, "x"),
90
            (1, 4, "w", 2, 5, "y"),
91
            (1, 4, "w", 1, 4, "z"),
92
            (2, 0, "y", 3, 6, "x"),
93
            (2, 0, "y", 2, 5, "y"),
94
            (2, 0, "y", 1, 4, "z"),
95
            (3, 6, "z", 3, 6, "x"),
96
            (3, 6, "z", 2, 5, "y"),
97
            (3, 6, "z", 1, 4, "z"),
98
        ]
99

100

101
def test_join_cross_11927() -> None:
102
    df1 = pl.DataFrame({"id": [1, 2, 3]})  # noqa: F841
103
    df2 = pl.DataFrame({"id": [3, 4, 5]})  # noqa: F841
104
    df3 = pl.DataFrame({"id": [4, 5, 6]})  # noqa: F841
105

106
    res = pl.sql("SELECT df1.id FROM df1 CROSS JOIN df2 WHERE df1.id = df2.id")
107
    assert_frame_equal(res.collect(), pl.DataFrame({"id": [3]}))
108

109
    res = pl.sql("SELECT * FROM df1 CROSS JOIN df3 WHERE df1.id = df3.id")
110
    assert res.collect().is_empty()
111

112

113
@pytest.mark.parametrize(
114
    "join_clause",
115
    [
116
        "ON foods1.category = foods2.category",
117
        "ON foods2.category = foods1.category",
118
        "USING (category)",
119
    ],
120
)
121
def test_join_inner(foods_ipc_path: Path, join_clause: str) -> None:
122
    foods1 = pl.scan_ipc(foods_ipc_path)
123
    foods2 = foods1  # noqa: F841
124
    schema = foods1.collect_schema()
125

126
    sort_clause = ", ".join(f'{c} ASC, "{c}:foods2" DESC' for c in schema)
127
    out = pl.sql(
128
        f"""
129
        SELECT *
130
        FROM foods1
131
        INNER JOIN foods2 {join_clause}
132
        ORDER BY {sort_clause}
133
        LIMIT 2
134
        """,
135
        eager=True,
136
    )
137

138
    assert_frame_equal(
139
        out,
140
        pl.DataFrame(
141
            {
142
                "category": ["fruit", "fruit"],
143
                "calories": [30, 30],
144
                "fats_g": [0.0, 0.0],
145
                "sugars_g": [3, 5],
146
                "category:foods2": ["fruit", "fruit"],
147
                "calories:foods2": [130, 130],
148
                "fats_g:foods2": [0.0, 0.0],
149
                "sugars_g:foods2": [25, 25],
150
            }
151
        ),
152
        check_dtypes=False,
153
    )
154

155

156
@pytest.mark.parametrize(
157
    "join_clause",
158
    [
159
        """
160
        INNER JOIN tbl_b USING (a,b)
161
        INNER JOIN tbl_c USING (c)
162
        """,
163
        """
164
        INNER JOIN tbl_b ON tbl_a.a = tbl_b.a AND tbl_a.b = tbl_b.b
165
        INNER JOIN tbl_c ON tbl_a.c = tbl_c.c
166
        """,
167
    ],
168
)
169
def test_join_inner_multi(join_clause: str) -> None:
170
    frames = {
171
        "tbl_a": pl.DataFrame({"a": [1, 2, 3], "b": [4, None, 6]}),
172
        "tbl_b": pl.DataFrame({"a": [3, 2, 1], "b": [6, 5, 4], "c": ["x", "y", "z"]}),
173
        "tbl_c": pl.DataFrame({"c": ["w", "y", "z"], "d": [10.5, -50.0, 25.5]}),
174
    }
175
    with pl.SQLContext(frames) as ctx:
176
        assert ctx.tables() == ["tbl_a", "tbl_b", "tbl_c"]
177
        for select_cols in ("a, b, c, d", "tbl_a.a, tbl_a.b, tbl_b.c, tbl_c.d"):
178
            out = ctx.execute(
179
                f"SELECT {select_cols} FROM tbl_a {join_clause} ORDER BY a DESC"
180
            )
181
            assert out.collect().rows() == [(1, 4, "z", 25.5)]
182

183

184
def test_join_inner_15663() -> None:
185
    df_a = pl.DataFrame({"LOCID": [1, 2, 3], "VALUE": [0.1, 0.2, 0.3]})  # noqa: F841
186
    df_b = pl.DataFrame({"LOCID": [1, 2, 3], "VALUE": [25.6, 53.4, 12.7]})  # noqa: F841
187
    expected = pl.DataFrame(
188
        {
189
            "LOCID": [1, 2, 3],
190
            "VALUE_A": [0.1, 0.2, 0.3],
191
            "VALUE_B": [25.6, 53.4, 12.7],
192
        }
193
    )
194
    with pl.SQLContext(register_globals=True, eager=True) as ctx:
195
        query = """
196
        SELECT
197
            a.LOCID,
198
            a.VALUE AS VALUE_A,
199
            b.VALUE AS VALUE_B
200
        FROM df_a AS a
201
        INNER JOIN df_b AS b
202
        USING (LOCID)
203
        ORDER BY LOCID
204
        """
205
        actual = ctx.execute(query)
206
        assert_frame_equal(expected, actual)
207

208

209
@pytest.mark.parametrize(
210
    "join_clause",
211
    [
212
        """
213
        LEFT JOIN tbl_b USING (a,b)
214
        LEFT JOIN tbl_c USING (c)
215
        """,
216
        """
217
        LEFT JOIN tbl_b ON tbl_a.a = tbl_b.a AND tbl_a.b = tbl_b.b
218
        LEFT JOIN tbl_c ON tbl_a.c = tbl_c.c
219
        """,
220
    ],
221
)
222
def test_join_left_multi(join_clause: str) -> None:
223
    frames = {
224
        "tbl_a": pl.DataFrame({"a": [1, 2, 3], "b": [4, None, 6]}),
225
        "tbl_b": pl.DataFrame({"a": [3, 2, 1], "b": [6, 5, 4], "c": ["x", "y", "z"]}),
226
        "tbl_c": pl.DataFrame({"c": ["w", "y", "z"], "d": [10.5, -50.0, 25.5]}),
227
    }
228
    with pl.SQLContext(frames) as ctx:
229
        for select_cols in ("a, b, c, d", "tbl_a.a, tbl_a.b, tbl_b.c, tbl_c.d"):
230
            out = ctx.execute(
231
                f"SELECT {select_cols} FROM tbl_a {join_clause} ORDER BY a DESC"
232
            )
233
            assert out.collect().rows() == [
234
                (3, 6, "x", None),
235
                (2, None, None, None),
236
                (1, 4, "z", 25.5),
237
            ]
238

239

240
def test_join_left_multi_nested() -> None:
241
    frames = {
242
        "tbl_a": pl.DataFrame({"a": [1, 2, 3], "b": [4, None, 6]}),
243
        "tbl_b": pl.DataFrame({"a": [3, 2, 1], "b": [6, 5, 4], "c": ["x", "y", "z"]}),
244
        "tbl_c": pl.DataFrame({"c": ["w", "y", "z"], "d": [10.5, -50.0, 25.5]}),
245
    }
246
    with pl.SQLContext(frames) as ctx:
247
        for select_cols in ("a, b, c, d", "tbl_x.a, tbl_x.b, tbl_x.c, tbl_c.d"):
248
            out = ctx.execute(
249
                f"""
250
                SELECT {select_cols} FROM (SELECT *
251
                    FROM tbl_a
252
                    LEFT JOIN tbl_b ON tbl_a.a = tbl_b.a AND tbl_a.b = tbl_b.b
253
                ) tbl_x
254
                LEFT JOIN tbl_c ON tbl_x.c = tbl_c.c
255
                ORDER BY tbl_x.a ASC
256
                """
257
            ).collect()
258

259
            assert out.rows() == [
260
                (1, 4, "z", 25.5),
261
                (2, None, None, None),
262
                (3, 6, "x", None),
263
            ]
264

265

266
def test_join_misc_13618() -> None:
267
    import polars as pl
268

269
    df = pl.DataFrame(
270
        {
271
            "A": [1, 2, 3, 4, 5],
272
            "B": [5, 4, 3, 2, 1],
273
            "fruits": ["banana", "banana", "apple", "apple", "banana"],
274
            "cars": ["beetle", "audi", "beetle", "beetle", "beetle"],
275
        }
276
    )
277
    res = (
278
        pl.SQLContext(t=df, t1=df, eager=True)
279
        .execute(
280
            """
281
            SELECT t.A, t.fruits, t1.B, t1.cars
282
            FROM t
283
            JOIN t1 ON t.A = t1.B
284
            ORDER BY t.A DESC
285
            """
286
        )
287
        .to_dict(as_series=False)
288
    )
289
    assert res == {
290
        "A": [5, 4, 3, 2, 1],
291
        "fruits": ["banana", "apple", "apple", "banana", "banana"],
292
        "B": [5, 4, 3, 2, 1],
293
        "cars": ["beetle", "audi", "beetle", "beetle", "beetle"],
294
    }
295

296

297
def test_join_misc_16255() -> None:
298
    df1 = pl.read_csv(BytesIO(b"id,data\n1,open"))  # noqa: F841
299
    df2 = pl.read_csv(BytesIO(b"id,data\n1,closed"))  # noqa: F841
300
    res = pl.sql(
301
        """
302
        SELECT a.id, a.data AS d1, b.data AS d2
303
        FROM df1 AS a JOIN df2 AS b
304
        ON a.id = b.id
305
        """,
306
        eager=True,
307
    )
308
    assert res.rows() == [(1, "open", "closed")]
309

310

311
@pytest.mark.parametrize(
312
    "constraint", ["tbl.a != tbl.b", "tbl.a > tbl.b", "a >= b", "a < b", "b <= a"]
313
)
314
def test_non_equi_joins(constraint: str) -> None:
315
    # no support (yet) for non equi-joins in polars joins
316
    # TODO: integrate awareness of new IEJoin
317
    with (
318
        pytest.raises(
319
            SQLInterfaceError,
320
            match=r"only equi-join constraints \(combined with 'AND'\) are currently supported",
321
        ),
322
        pl.SQLContext({"tbl": pl.DataFrame({"a": [1, 2, 3], "b": [4, 3, 2]})}) as ctx,
323
    ):
324
        ctx.execute(
325
            f"""
326
            SELECT *
327
            FROM tbl
328
            LEFT JOIN tbl ON {constraint}  -- not an equi-join
329
            """
330
        )
331

332

333
def test_implicit_joins() -> None:
334
    # no support for this yet; ensure we catch it
335
    with (
336
        pytest.raises(
337
            SQLInterfaceError,
338
            match=r"not currently supported .* use explicit JOIN syntax instead",
339
        ),
340
        pl.SQLContext(
341
            {
342
                "tbl": pl.DataFrame(
343
                    {"a": [1, 2, 3], "b": [4, 3, 2], "c": ["x", "y", "z"]}
344
                )
345
            }
346
        ) as ctx,
347
    ):
348
        ctx.execute(
349
            """
350
            SELECT t1.*
351
            FROM tbl AS t1, tbl AS t2
352
            WHERE t1.a = t2.b
353
            """
354
        )
355

356

357
@pytest.mark.parametrize(
358
    ("query", "expected"),
359
    [
360
        # INNER joins
361
        (
362
            "SELECT df1.* FROM df1 INNER JOIN df2 USING (a)",
363
            {"a": [1, 3], "b": ["x", "z"], "c": [100, 300]},
364
        ),
365
        (
366
            "SELECT df2.* FROM df1 INNER JOIN df2 USING (a)",
367
            {"a": [1, 3], "b": ["qq", "pp"], "c": [400, 500]},
368
        ),
369
        (
370
            "SELECT df1.* FROM df2 INNER JOIN df1 USING (a)",
371
            {"a": [1, 3], "b": ["x", "z"], "c": [100, 300]},
372
        ),
373
        (
374
            "SELECT df2.* FROM df2 INNER JOIN df1 USING (a)",
375
            {"a": [1, 3], "b": ["qq", "pp"], "c": [400, 500]},
376
        ),
377
        # LEFT joins
378
        (
379
            "SELECT df1.* FROM df1 LEFT JOIN df2 USING (a)",
380
            {"a": [1, 2, 3], "b": ["x", "y", "z"], "c": [100, 200, 300]},
381
        ),
382
        (
383
            "SELECT df2.* FROM df1 LEFT JOIN df2 USING (a)",
384
            {"a": [1, 3, None], "b": ["qq", "pp", None], "c": [400, 500, None]},
385
        ),
386
        (
387
            "SELECT df1.* FROM df2 LEFT JOIN df1 USING (a)",
388
            {"a": [1, 3, None], "b": ["x", "z", None], "c": [100, 300, None]},
389
        ),
390
        (
391
            "SELECT df2.* FROM df2 LEFT JOIN df1 USING (a)",
392
            {"a": [1, 3, 4], "b": ["qq", "pp", "oo"], "c": [400, 500, 600]},
393
        ),
394
        # RIGHT joins
395
        (
396
            "SELECT df1.* FROM df1 RIGHT JOIN df2 USING (a)",
397
            {"a": [1, 3, None], "b": ["x", "z", None], "c": [100, 300, None]},
398
        ),
399
        (
400
            "SELECT df2.* FROM df1 RIGHT JOIN df2 USING (a)",
401
            {"a": [1, 3, 4], "b": ["qq", "pp", "oo"], "c": [400, 500, 600]},
402
        ),
403
        (
404
            "SELECT df1.* FROM df2 RIGHT JOIN df1 USING (a)",
405
            {"a": [1, 2, 3], "b": ["x", "y", "z"], "c": [100, 200, 300]},
406
        ),
407
        (
408
            "SELECT df2.* FROM df2 RIGHT JOIN df1 USING (a)",
409
            {"a": [1, 3, None], "b": ["qq", "pp", None], "c": [400, 500, None]},
410
        ),
411
        # FULL joins
412
        (
413
            "SELECT df1.* FROM df1 FULL JOIN df2 USING (a)",
414
            {
415
                "a": [1, 2, 3, None],
416
                "b": ["x", "y", "z", None],
417
                "c": [100, 200, 300, None],
418
            },
419
        ),
420
        (
421
            "SELECT df2.* FROM df1 FULL JOIN df2 USING (a)",
422
            {
423
                "a": [1, 3, 4, None],
424
                "b": ["qq", "pp", "oo", None],
425
                "c": [400, 500, 600, None],
426
            },
427
        ),
428
        (
429
            "SELECT df1.* FROM df2 FULL JOIN df1 USING (a)",
430
            {
431
                "a": [1, 2, 3, None],
432
                "b": ["x", "y", "z", None],
433
                "c": [100, 200, 300, None],
434
            },
435
        ),
436
        (
437
            "SELECT df2.* FROM df2 FULL JOIN df1 USING (a)",
438
            {
439
                "a": [1, 3, 4, None],
440
                "b": ["qq", "pp", "oo", None],
441
                "c": [400, 500, 600, None],
442
            },
443
        ),
444
    ],
445
)
446
def test_wildcard_resolution_and_join_order(
447
    query: str, expected: dict[str, Any]
448
) -> None:
449
    df1 = pl.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"], "c": [100, 200, 300]})  # noqa: F841
450
    df2 = pl.DataFrame({"a": [1, 3, 4], "b": ["qq", "pp", "oo"], "c": [400, 500, 600]})  # noqa: F841
451

452
    res = pl.sql(query).collect()
453
    assert_frame_equal(
454
        res,
455
        pl.DataFrame(expected),
456
        check_row_order=False,
457
    )
458

459

460
def test_natural_joins_01() -> None:
461
    df1 = pl.DataFrame(
462
        {
463
            "CharacterID": [1, 2, 3, 4],
464
            "FirstName": ["Jernau Morat", "Cheradenine", "Byr", "Diziet"],
465
            "LastName": ["Gurgeh", "Zakalwe", "Genar-Hofoen", "Sma"],
466
        }
467
    )
468
    df2 = pl.DataFrame(
469
        {
470
            "CharacterID": [1, 2, 3, 5],
471
            "Role": ["Protagonist", "Protagonist", "Protagonist", "Antagonist"],
472
            "Book": [
473
                "Player of Games",
474
                "Use of Weapons",
475
                "Excession",
476
                "Consider Phlebas",
477
            ],
478
        }
479
    )
480
    df3 = pl.DataFrame(
481
        {
482
            "CharacterID": [1, 2, 3, 4],
483
            "Affiliation": ["Culture", "Culture", "Culture", "Shellworld"],
484
            "Species": ["Pan-human", "Human", "Human", "Oct"],
485
        }
486
    )
487
    df4 = pl.DataFrame(
488
        {
489
            "CharacterID": [1, 2, 3, 6],
490
            "Ship": [
491
                "Limiting Factor",
492
                "Xenophobe",
493
                "Grey Area",
494
                "Falling Outside The Normal Moral Constraints",
495
            ],
496
            "Drone": ["Flere-Imsaho", "Skaffen-Amtiskaw", "Eccentric", "Psychopath"],
497
        }
498
    )
499
    with pl.SQLContext(
500
        {"df1": df1, "df2": df2, "df3": df3, "df4": df4}, eager=True
501
    ) as ctx:
502
        # note: use of 'COLUMNS' is a neat way to drop
503
        # all non-coalesced "<name>:<suffix>" cols
504
        res = ctx.execute(
505
            """
506
            SELECT COLUMNS('^[^:]*$')
507
            FROM df1
508
            NATURAL LEFT JOIN df2
509
            NATURAL INNER JOIN df3
510
            NATURAL LEFT JOIN df4
511
            ORDER BY ALL
512
            """
513
        )
514
        assert res.rows(named=True) == [
515
            {
516
                "CharacterID": 1,
517
                "FirstName": "Jernau Morat",
518
                "LastName": "Gurgeh",
519
                "Role": "Protagonist",
520
                "Book": "Player of Games",
521
                "Affiliation": "Culture",
522
                "Species": "Pan-human",
523
                "Ship": "Limiting Factor",
524
                "Drone": "Flere-Imsaho",
525
            },
526
            {
527
                "CharacterID": 2,
528
                "FirstName": "Cheradenine",
529
                "LastName": "Zakalwe",
530
                "Role": "Protagonist",
531
                "Book": "Use of Weapons",
532
                "Affiliation": "Culture",
533
                "Species": "Human",
534
                "Ship": "Xenophobe",
535
                "Drone": "Skaffen-Amtiskaw",
536
            },
537
            {
538
                "CharacterID": 3,
539
                "FirstName": "Byr",
540
                "LastName": "Genar-Hofoen",
541
                "Role": "Protagonist",
542
                "Book": "Excession",
543
                "Affiliation": "Culture",
544
                "Species": "Human",
545
                "Ship": "Grey Area",
546
                "Drone": "Eccentric",
547
            },
548
            {
549
                "CharacterID": 4,
550
                "FirstName": "Diziet",
551
                "LastName": "Sma",
552
                "Role": None,
553
                "Book": None,
554
                "Affiliation": "Shellworld",
555
                "Species": "Oct",
556
                "Ship": None,
557
                "Drone": None,
558
            },
559
        ]
560

561
    # misc errors
562
    with pytest.raises(SQLSyntaxError, match=r"did you mean COLUMNS\(\*\)\?"):
563
        pl.sql("SELECT * FROM df1 NATURAL JOIN df2 WHERE COLUMNS('*') >= 5")
564

565
    with pytest.raises(SQLSyntaxError, match=r"COLUMNS expects a regex"):
566
        pl.sql("SELECT COLUMNS(1234) FROM df1 NATURAL JOIN df2")
567

568

569
@pytest.mark.parametrize(
570
    ("cols_constraint", "expect_data"),
571
    [
572
        (">= 5", [(8, 8, 6)]),
573
        ("< 7", [(5, 4, 4)]),
574
        ("< 8", [(5, 4, 4), (7, 4, 4), (0, 7, 2)]),
575
        ("!= 4", [(8, 8, 6), (2, 8, 6), (0, 7, 2)]),
576
    ],
577
)
578
def test_natural_joins_02(cols_constraint: str, expect_data: list[tuple[int]]) -> None:
579
    df1 = pl.DataFrame(  # noqa: F841
580
        {
581
            "x": [1, 5, 3, 8, 6, 7, 4, 0, 2],
582
            "y": [3, 4, 6, 8, 3, 4, 1, 7, 8],
583
        }
584
    )
585
    df2 = pl.DataFrame(  # noqa: F841
586
        {
587
            "y": [0, 4, 0, 8, 0, 4, 0, 7, None],
588
            "z": [9, 8, 7, 6, 5, 4, 3, 2, 1],
589
        },
590
    )
591
    actual = pl.sql(
592
        f"""
593
        SELECT * EXCLUDE "y:df2"
594
        FROM df1 NATURAL JOIN df2
595
        WHERE COLUMNS(*) {cols_constraint}
596
        """
597
    ).collect()
598

599
    expected = pl.DataFrame(expect_data, schema=actual.columns, orient="row")
600
    assert_frame_equal(actual, expected, check_row_order=False)
601

602

603
@pytest.mark.parametrize(
604
    "join_clause",
605
    [
606
        """
607
        df2 JOIN df3 ON
608
        df2.CharacterID = df3.CharacterID
609
        """,
610
        """
611
        df2 INNER JOIN (
612
          df3 JOIN df4 ON df3.CharacterID = df4.CharacterID
613
        ) AS r0 ON df2.CharacterID = df3.CharacterID
614
        """,
615
    ],
616
)
617
def test_nested_join(join_clause: str) -> None:
618
    df1 = pl.DataFrame(
619
        {
620
            "CharacterID": [1, 2, 3, 4],
621
            "FirstName": ["Jernau Morat", "Cheradenine", "Byr", "Diziet"],
622
            "LastName": ["Gurgeh", "Zakalwe", "Genar-Hofoen", "Sma"],
623
        }
624
    )
625
    df2 = pl.DataFrame(
626
        {
627
            "CharacterID": [1, 2, 3, 5],
628
            "Role": ["Protagonist", "Protagonist", "Protagonist", "Antagonist"],
629
            "Book": [
630
                "Player of Games",
631
                "Use of Weapons",
632
                "Excession",
633
                "Consider Phlebas",
634
            ],
635
        }
636
    )
637
    df3 = pl.DataFrame(
638
        {
639
            "CharacterID": [1, 2, 5, 6],
640
            "Affiliation": ["Culture", "Culture", "Culture", "Shellworld"],
641
            "Species": ["Pan-human", "Human", "Human", "Oct"],
642
        }
643
    )
644
    df4 = pl.DataFrame(
645
        {
646
            "CharacterID": [1, 2, 3, 6],
647
            "Ship": [
648
                "Limiting Factor",
649
                "Xenophobe",
650
                "Grey Area",
651
                "Falling Outside The Normal Moral Constraints",
652
            ],
653
            "Drone": ["Flere-Imsaho", "Skaffen-Amtiskaw", "Eccentric", "Psychopath"],
654
        }
655
    )
656

657
    with pl.SQLContext(
658
        {"df1": df1, "df2": df2, "df3": df3, "df4": df4}, eager=True
659
    ) as ctx:
660
        res = ctx.execute(
661
            f"""
662
            SELECT df1.CharacterID, df1.FirstName, df2.Role, df3.Species
663
            FROM df1
664
            INNER JOIN ({join_clause}) AS r99
665
            ON df1.CharacterID = df2.CharacterID
666
            ORDER BY ALL
667
            """
668
        )
669
        assert res.rows(named=True) == [
670
            {
671
                "CharacterID": 1,
672
                "FirstName": "Jernau Morat",
673
                "Role": "Protagonist",
674
                "Species": "Pan-human",
675
            },
676
            {
677
                "CharacterID": 2,
678
                "FirstName": "Cheradenine",
679
                "Role": "Protagonist",
680
                "Species": "Human",
681
            },
682
        ]
683

684

685
def test_sql_forbid_nested_join_unnamed_relation() -> None:
686
    df = pl.DataFrame({"a": 1})
687

688
    with (
689
        pl.SQLContext({"left": df, "right": df}) as ctx,
690
        pytest.raises(SQLInterfaceError, match="cannot join on unnamed relation"),
691
    ):
692
        ctx.execute(
693
            """\
694
SELECT *
695
FROM left
696
JOIN (right JOIN right ON right.a = right.a)
697
ON left.a = right.a
698
"""
699
        )
700

701

702
def test_nulls_equal_19624() -> None:
703
    df1 = pl.DataFrame({"a": [1, 2, None, None]})
704
    df2 = pl.DataFrame({"a": [1, 1, 2, 2, None], "b": [0, 1, 2, 3, 4]})
705

706
    # left join
707
    result_df = df1.join(df2, how="left", on="a", nulls_equal=False, validate="1:m")
708
    expected_df = pl.DataFrame(
709
        {"a": [1, 1, 2, 2, None, None], "b": [0, 1, 2, 3, None, None]}
710
    )
711
    assert_frame_equal(result_df, expected_df)
712
    result_df = df2.join(df1, how="left", on="a", nulls_equal=False, validate="m:1")
713
    expected_df = pl.DataFrame({"a": [1, 1, 2, 2, None], "b": [0, 1, 2, 3, 4]})
714
    assert_frame_equal(result_df, expected_df)
715

716
    # inner join
717
    result_df = df1.join(df2, how="inner", on="a", nulls_equal=False, validate="1:m")
718
    expected_df = pl.DataFrame({"a": [1, 1, 2, 2], "b": [0, 1, 2, 3]})
719
    assert_frame_equal(result_df, expected_df)
720
    result_df = df2.join(df1, how="inner", on="a", nulls_equal=False, validate="m:1")
721
    expected_df = pl.DataFrame({"a": [1, 1, 2, 2], "b": [0, 1, 2, 3]})
722
    assert_frame_equal(result_df, expected_df)
723

724
Product

Resources

Company