Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/py-polars/tests/unit/sql/test_joins.py
6939 views
1
from __future__ import annotations
2
3
from io import BytesIO
4
from pathlib import Path
5
from typing import Any
6
7
import pytest
8
9
import polars as pl
10
from polars.exceptions import SQLInterfaceError, SQLSyntaxError
11
from polars.testing import assert_frame_equal
12
13
14
@pytest.fixture
15
def foods_ipc_path() -> Path:
16
return Path(__file__).parent.parent / "io" / "files" / "foods1.ipc"
17
18
19
@pytest.mark.parametrize(
20
("sql", "expected"),
21
[
22
(
23
"SELECT * FROM tbl_a LEFT SEMI JOIN tbl_b USING (a,c)",
24
pl.DataFrame({"a": [2], "b": [0], "c": ["y"]}),
25
),
26
(
27
"SELECT * FROM tbl_a SEMI JOIN tbl_b USING (a,c)",
28
pl.DataFrame({"a": [2], "b": [0], "c": ["y"]}),
29
),
30
(
31
"SELECT * FROM tbl_a LEFT SEMI JOIN tbl_b USING (a)",
32
pl.DataFrame({"a": [1, 2, 3], "b": [4, 0, 6], "c": ["w", "y", "z"]}),
33
),
34
(
35
"SELECT * FROM tbl_a LEFT ANTI JOIN tbl_b USING (a)",
36
pl.DataFrame(schema={"a": pl.Int64, "b": pl.Int64, "c": pl.String}),
37
),
38
(
39
"SELECT * FROM tbl_a ANTI JOIN tbl_b USING (a)",
40
pl.DataFrame(schema={"a": pl.Int64, "b": pl.Int64, "c": pl.String}),
41
),
42
(
43
"SELECT * FROM tbl_a LEFT SEMI JOIN tbl_b USING (b) LEFT SEMI JOIN tbl_c USING (c)",
44
pl.DataFrame({"a": [1, 3], "b": [4, 6], "c": ["w", "z"]}),
45
),
46
(
47
"SELECT * FROM tbl_a LEFT ANTI JOIN tbl_b USING (b) LEFT SEMI JOIN tbl_c USING (c)",
48
pl.DataFrame({"a": [2], "b": [0], "c": ["y"]}),
49
),
50
(
51
"SELECT * FROM tbl_a RIGHT ANTI JOIN tbl_b USING (b) LEFT SEMI JOIN tbl_c USING (c)",
52
pl.DataFrame({"a": [2], "b": [5], "c": ["y"]}),
53
),
54
(
55
"SELECT * FROM tbl_a RIGHT SEMI JOIN tbl_b USING (b) RIGHT SEMI JOIN tbl_c USING (c)",
56
pl.DataFrame({"c": ["z"], "d": [25.5]}),
57
),
58
(
59
"SELECT * FROM tbl_a RIGHT SEMI JOIN tbl_b USING (b) RIGHT ANTI JOIN tbl_c USING (c)",
60
pl.DataFrame({"c": ["w", "y"], "d": [10.5, -50.0]}),
61
),
62
],
63
)
64
def test_join_anti_semi(sql: str, expected: pl.DataFrame) -> None:
65
frames = {
66
"tbl_a": pl.DataFrame({"a": [1, 2, 3], "b": [4, 0, 6], "c": ["w", "y", "z"]}),
67
"tbl_b": pl.DataFrame({"a": [3, 2, 1], "b": [6, 5, 4], "c": ["x", "y", "z"]}),
68
"tbl_c": pl.DataFrame({"c": ["w", "y", "z"], "d": [10.5, -50.0, 25.5]}),
69
}
70
ctx = pl.SQLContext(frames, eager=True)
71
assert_frame_equal(expected, ctx.execute(sql))
72
73
74
def test_join_cross() -> None:
75
frames = {
76
"tbl_a": pl.DataFrame({"a": [1, 2, 3], "b": [4, 0, 6], "c": ["w", "y", "z"]}),
77
"tbl_b": pl.DataFrame({"a": [3, 2, 1], "b": [6, 5, 4], "c": ["x", "y", "z"]}),
78
}
79
with pl.SQLContext(frames, eager=True) as ctx:
80
out = ctx.execute(
81
"""
82
SELECT *
83
FROM tbl_a
84
CROSS JOIN tbl_b
85
ORDER BY a, b, c
86
"""
87
)
88
assert out.rows() == [
89
(1, 4, "w", 3, 6, "x"),
90
(1, 4, "w", 2, 5, "y"),
91
(1, 4, "w", 1, 4, "z"),
92
(2, 0, "y", 3, 6, "x"),
93
(2, 0, "y", 2, 5, "y"),
94
(2, 0, "y", 1, 4, "z"),
95
(3, 6, "z", 3, 6, "x"),
96
(3, 6, "z", 2, 5, "y"),
97
(3, 6, "z", 1, 4, "z"),
98
]
99
100
101
def test_join_cross_11927() -> None:
102
df1 = pl.DataFrame({"id": [1, 2, 3]}) # noqa: F841
103
df2 = pl.DataFrame({"id": [3, 4, 5]}) # noqa: F841
104
df3 = pl.DataFrame({"id": [4, 5, 6]}) # noqa: F841
105
106
res = pl.sql("SELECT df1.id FROM df1 CROSS JOIN df2 WHERE df1.id = df2.id")
107
assert_frame_equal(res.collect(), pl.DataFrame({"id": [3]}))
108
109
res = pl.sql("SELECT * FROM df1 CROSS JOIN df3 WHERE df1.id = df3.id")
110
assert res.collect().is_empty()
111
112
113
@pytest.mark.parametrize(
114
"join_clause",
115
[
116
"ON foods1.category = foods2.category",
117
"ON foods2.category = foods1.category",
118
"USING (category)",
119
],
120
)
121
def test_join_inner(foods_ipc_path: Path, join_clause: str) -> None:
122
foods1 = pl.scan_ipc(foods_ipc_path)
123
foods2 = foods1 # noqa: F841
124
schema = foods1.collect_schema()
125
126
sort_clause = ", ".join(f'{c} ASC, "{c}:foods2" DESC' for c in schema)
127
out = pl.sql(
128
f"""
129
SELECT *
130
FROM foods1
131
INNER JOIN foods2 {join_clause}
132
ORDER BY {sort_clause}
133
LIMIT 2
134
""",
135
eager=True,
136
)
137
138
assert_frame_equal(
139
out,
140
pl.DataFrame(
141
{
142
"category": ["fruit", "fruit"],
143
"calories": [30, 30],
144
"fats_g": [0.0, 0.0],
145
"sugars_g": [3, 5],
146
"category:foods2": ["fruit", "fruit"],
147
"calories:foods2": [130, 130],
148
"fats_g:foods2": [0.0, 0.0],
149
"sugars_g:foods2": [25, 25],
150
}
151
),
152
check_dtypes=False,
153
)
154
155
156
@pytest.mark.parametrize(
157
"join_clause",
158
[
159
"""
160
INNER JOIN tbl_b USING (a,b)
161
INNER JOIN tbl_c USING (c)
162
""",
163
"""
164
INNER JOIN tbl_b ON tbl_a.a = tbl_b.a AND tbl_a.b = tbl_b.b
165
INNER JOIN tbl_c ON tbl_a.c = tbl_c.c
166
""",
167
],
168
)
169
def test_join_inner_multi(join_clause: str) -> None:
170
frames = {
171
"tbl_a": pl.DataFrame({"a": [1, 2, 3], "b": [4, None, 6]}),
172
"tbl_b": pl.DataFrame({"a": [3, 2, 1], "b": [6, 5, 4], "c": ["x", "y", "z"]}),
173
"tbl_c": pl.DataFrame({"c": ["w", "y", "z"], "d": [10.5, -50.0, 25.5]}),
174
}
175
with pl.SQLContext(frames) as ctx:
176
assert ctx.tables() == ["tbl_a", "tbl_b", "tbl_c"]
177
for select_cols in ("a, b, c, d", "tbl_a.a, tbl_a.b, tbl_b.c, tbl_c.d"):
178
out = ctx.execute(
179
f"SELECT {select_cols} FROM tbl_a {join_clause} ORDER BY a DESC"
180
)
181
assert out.collect().rows() == [(1, 4, "z", 25.5)]
182
183
184
def test_join_inner_15663() -> None:
185
df_a = pl.DataFrame({"LOCID": [1, 2, 3], "VALUE": [0.1, 0.2, 0.3]}) # noqa: F841
186
df_b = pl.DataFrame({"LOCID": [1, 2, 3], "VALUE": [25.6, 53.4, 12.7]}) # noqa: F841
187
expected = pl.DataFrame(
188
{
189
"LOCID": [1, 2, 3],
190
"VALUE_A": [0.1, 0.2, 0.3],
191
"VALUE_B": [25.6, 53.4, 12.7],
192
}
193
)
194
with pl.SQLContext(register_globals=True, eager=True) as ctx:
195
query = """
196
SELECT
197
a.LOCID,
198
a.VALUE AS VALUE_A,
199
b.VALUE AS VALUE_B
200
FROM df_a AS a
201
INNER JOIN df_b AS b
202
USING (LOCID)
203
ORDER BY LOCID
204
"""
205
actual = ctx.execute(query)
206
assert_frame_equal(expected, actual)
207
208
209
@pytest.mark.parametrize(
210
"join_clause",
211
[
212
"""
213
LEFT JOIN tbl_b USING (a,b)
214
LEFT JOIN tbl_c USING (c)
215
""",
216
"""
217
LEFT JOIN tbl_b ON tbl_a.a = tbl_b.a AND tbl_a.b = tbl_b.b
218
LEFT JOIN tbl_c ON tbl_a.c = tbl_c.c
219
""",
220
],
221
)
222
def test_join_left_multi(join_clause: str) -> None:
223
frames = {
224
"tbl_a": pl.DataFrame({"a": [1, 2, 3], "b": [4, None, 6]}),
225
"tbl_b": pl.DataFrame({"a": [3, 2, 1], "b": [6, 5, 4], "c": ["x", "y", "z"]}),
226
"tbl_c": pl.DataFrame({"c": ["w", "y", "z"], "d": [10.5, -50.0, 25.5]}),
227
}
228
with pl.SQLContext(frames) as ctx:
229
for select_cols in ("a, b, c, d", "tbl_a.a, tbl_a.b, tbl_b.c, tbl_c.d"):
230
out = ctx.execute(
231
f"SELECT {select_cols} FROM tbl_a {join_clause} ORDER BY a DESC"
232
)
233
assert out.collect().rows() == [
234
(3, 6, "x", None),
235
(2, None, None, None),
236
(1, 4, "z", 25.5),
237
]
238
239
240
def test_join_left_multi_nested() -> None:
241
frames = {
242
"tbl_a": pl.DataFrame({"a": [1, 2, 3], "b": [4, None, 6]}),
243
"tbl_b": pl.DataFrame({"a": [3, 2, 1], "b": [6, 5, 4], "c": ["x", "y", "z"]}),
244
"tbl_c": pl.DataFrame({"c": ["w", "y", "z"], "d": [10.5, -50.0, 25.5]}),
245
}
246
with pl.SQLContext(frames) as ctx:
247
for select_cols in ("a, b, c, d", "tbl_x.a, tbl_x.b, tbl_x.c, tbl_c.d"):
248
out = ctx.execute(
249
f"""
250
SELECT {select_cols} FROM (SELECT *
251
FROM tbl_a
252
LEFT JOIN tbl_b ON tbl_a.a = tbl_b.a AND tbl_a.b = tbl_b.b
253
) tbl_x
254
LEFT JOIN tbl_c ON tbl_x.c = tbl_c.c
255
ORDER BY tbl_x.a ASC
256
"""
257
).collect()
258
259
assert out.rows() == [
260
(1, 4, "z", 25.5),
261
(2, None, None, None),
262
(3, 6, "x", None),
263
]
264
265
266
def test_join_misc_13618() -> None:
267
import polars as pl
268
269
df = pl.DataFrame(
270
{
271
"A": [1, 2, 3, 4, 5],
272
"B": [5, 4, 3, 2, 1],
273
"fruits": ["banana", "banana", "apple", "apple", "banana"],
274
"cars": ["beetle", "audi", "beetle", "beetle", "beetle"],
275
}
276
)
277
res = (
278
pl.SQLContext(t=df, t1=df, eager=True)
279
.execute(
280
"""
281
SELECT t.A, t.fruits, t1.B, t1.cars
282
FROM t
283
JOIN t1 ON t.A = t1.B
284
ORDER BY t.A DESC
285
"""
286
)
287
.to_dict(as_series=False)
288
)
289
assert res == {
290
"A": [5, 4, 3, 2, 1],
291
"fruits": ["banana", "apple", "apple", "banana", "banana"],
292
"B": [5, 4, 3, 2, 1],
293
"cars": ["beetle", "audi", "beetle", "beetle", "beetle"],
294
}
295
296
297
def test_join_misc_16255() -> None:
298
df1 = pl.read_csv(BytesIO(b"id,data\n1,open")) # noqa: F841
299
df2 = pl.read_csv(BytesIO(b"id,data\n1,closed")) # noqa: F841
300
res = pl.sql(
301
"""
302
SELECT a.id, a.data AS d1, b.data AS d2
303
FROM df1 AS a JOIN df2 AS b
304
ON a.id = b.id
305
""",
306
eager=True,
307
)
308
assert res.rows() == [(1, "open", "closed")]
309
310
311
@pytest.mark.parametrize(
312
"constraint", ["tbl.a != tbl.b", "tbl.a > tbl.b", "a >= b", "a < b", "b <= a"]
313
)
314
def test_non_equi_joins(constraint: str) -> None:
315
# no support (yet) for non equi-joins in polars joins
316
# TODO: integrate awareness of new IEJoin
317
with (
318
pytest.raises(
319
SQLInterfaceError,
320
match=r"only equi-join constraints \(combined with 'AND'\) are currently supported",
321
),
322
pl.SQLContext({"tbl": pl.DataFrame({"a": [1, 2, 3], "b": [4, 3, 2]})}) as ctx,
323
):
324
ctx.execute(
325
f"""
326
SELECT *
327
FROM tbl
328
LEFT JOIN tbl ON {constraint} -- not an equi-join
329
"""
330
)
331
332
333
def test_implicit_joins() -> None:
334
# no support for this yet; ensure we catch it
335
with (
336
pytest.raises(
337
SQLInterfaceError,
338
match=r"not currently supported .* use explicit JOIN syntax instead",
339
),
340
pl.SQLContext(
341
{
342
"tbl": pl.DataFrame(
343
{"a": [1, 2, 3], "b": [4, 3, 2], "c": ["x", "y", "z"]}
344
)
345
}
346
) as ctx,
347
):
348
ctx.execute(
349
"""
350
SELECT t1.*
351
FROM tbl AS t1, tbl AS t2
352
WHERE t1.a = t2.b
353
"""
354
)
355
356
357
@pytest.mark.parametrize(
358
("query", "expected"),
359
[
360
# INNER joins
361
(
362
"SELECT df1.* FROM df1 INNER JOIN df2 USING (a)",
363
{"a": [1, 3], "b": ["x", "z"], "c": [100, 300]},
364
),
365
(
366
"SELECT df2.* FROM df1 INNER JOIN df2 USING (a)",
367
{"a": [1, 3], "b": ["qq", "pp"], "c": [400, 500]},
368
),
369
(
370
"SELECT df1.* FROM df2 INNER JOIN df1 USING (a)",
371
{"a": [1, 3], "b": ["x", "z"], "c": [100, 300]},
372
),
373
(
374
"SELECT df2.* FROM df2 INNER JOIN df1 USING (a)",
375
{"a": [1, 3], "b": ["qq", "pp"], "c": [400, 500]},
376
),
377
# LEFT joins
378
(
379
"SELECT df1.* FROM df1 LEFT JOIN df2 USING (a)",
380
{"a": [1, 2, 3], "b": ["x", "y", "z"], "c": [100, 200, 300]},
381
),
382
(
383
"SELECT df2.* FROM df1 LEFT JOIN df2 USING (a)",
384
{"a": [1, 3, None], "b": ["qq", "pp", None], "c": [400, 500, None]},
385
),
386
(
387
"SELECT df1.* FROM df2 LEFT JOIN df1 USING (a)",
388
{"a": [1, 3, None], "b": ["x", "z", None], "c": [100, 300, None]},
389
),
390
(
391
"SELECT df2.* FROM df2 LEFT JOIN df1 USING (a)",
392
{"a": [1, 3, 4], "b": ["qq", "pp", "oo"], "c": [400, 500, 600]},
393
),
394
# RIGHT joins
395
(
396
"SELECT df1.* FROM df1 RIGHT JOIN df2 USING (a)",
397
{"a": [1, 3, None], "b": ["x", "z", None], "c": [100, 300, None]},
398
),
399
(
400
"SELECT df2.* FROM df1 RIGHT JOIN df2 USING (a)",
401
{"a": [1, 3, 4], "b": ["qq", "pp", "oo"], "c": [400, 500, 600]},
402
),
403
(
404
"SELECT df1.* FROM df2 RIGHT JOIN df1 USING (a)",
405
{"a": [1, 2, 3], "b": ["x", "y", "z"], "c": [100, 200, 300]},
406
),
407
(
408
"SELECT df2.* FROM df2 RIGHT JOIN df1 USING (a)",
409
{"a": [1, 3, None], "b": ["qq", "pp", None], "c": [400, 500, None]},
410
),
411
# FULL joins
412
(
413
"SELECT df1.* FROM df1 FULL JOIN df2 USING (a)",
414
{
415
"a": [1, 2, 3, None],
416
"b": ["x", "y", "z", None],
417
"c": [100, 200, 300, None],
418
},
419
),
420
(
421
"SELECT df2.* FROM df1 FULL JOIN df2 USING (a)",
422
{
423
"a": [1, 3, 4, None],
424
"b": ["qq", "pp", "oo", None],
425
"c": [400, 500, 600, None],
426
},
427
),
428
(
429
"SELECT df1.* FROM df2 FULL JOIN df1 USING (a)",
430
{
431
"a": [1, 2, 3, None],
432
"b": ["x", "y", "z", None],
433
"c": [100, 200, 300, None],
434
},
435
),
436
(
437
"SELECT df2.* FROM df2 FULL JOIN df1 USING (a)",
438
{
439
"a": [1, 3, 4, None],
440
"b": ["qq", "pp", "oo", None],
441
"c": [400, 500, 600, None],
442
},
443
),
444
],
445
)
446
def test_wildcard_resolution_and_join_order(
447
query: str, expected: dict[str, Any]
448
) -> None:
449
df1 = pl.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"], "c": [100, 200, 300]}) # noqa: F841
450
df2 = pl.DataFrame({"a": [1, 3, 4], "b": ["qq", "pp", "oo"], "c": [400, 500, 600]}) # noqa: F841
451
452
res = pl.sql(query).collect()
453
assert_frame_equal(
454
res,
455
pl.DataFrame(expected),
456
check_row_order=False,
457
)
458
459
460
def test_natural_joins_01() -> None:
461
df1 = pl.DataFrame(
462
{
463
"CharacterID": [1, 2, 3, 4],
464
"FirstName": ["Jernau Morat", "Cheradenine", "Byr", "Diziet"],
465
"LastName": ["Gurgeh", "Zakalwe", "Genar-Hofoen", "Sma"],
466
}
467
)
468
df2 = pl.DataFrame(
469
{
470
"CharacterID": [1, 2, 3, 5],
471
"Role": ["Protagonist", "Protagonist", "Protagonist", "Antagonist"],
472
"Book": [
473
"Player of Games",
474
"Use of Weapons",
475
"Excession",
476
"Consider Phlebas",
477
],
478
}
479
)
480
df3 = pl.DataFrame(
481
{
482
"CharacterID": [1, 2, 3, 4],
483
"Affiliation": ["Culture", "Culture", "Culture", "Shellworld"],
484
"Species": ["Pan-human", "Human", "Human", "Oct"],
485
}
486
)
487
df4 = pl.DataFrame(
488
{
489
"CharacterID": [1, 2, 3, 6],
490
"Ship": [
491
"Limiting Factor",
492
"Xenophobe",
493
"Grey Area",
494
"Falling Outside The Normal Moral Constraints",
495
],
496
"Drone": ["Flere-Imsaho", "Skaffen-Amtiskaw", "Eccentric", "Psychopath"],
497
}
498
)
499
with pl.SQLContext(
500
{"df1": df1, "df2": df2, "df3": df3, "df4": df4}, eager=True
501
) as ctx:
502
# note: use of 'COLUMNS' is a neat way to drop
503
# all non-coalesced "<name>:<suffix>" cols
504
res = ctx.execute(
505
"""
506
SELECT COLUMNS('^[^:]*$')
507
FROM df1
508
NATURAL LEFT JOIN df2
509
NATURAL INNER JOIN df3
510
NATURAL LEFT JOIN df4
511
ORDER BY ALL
512
"""
513
)
514
assert res.rows(named=True) == [
515
{
516
"CharacterID": 1,
517
"FirstName": "Jernau Morat",
518
"LastName": "Gurgeh",
519
"Role": "Protagonist",
520
"Book": "Player of Games",
521
"Affiliation": "Culture",
522
"Species": "Pan-human",
523
"Ship": "Limiting Factor",
524
"Drone": "Flere-Imsaho",
525
},
526
{
527
"CharacterID": 2,
528
"FirstName": "Cheradenine",
529
"LastName": "Zakalwe",
530
"Role": "Protagonist",
531
"Book": "Use of Weapons",
532
"Affiliation": "Culture",
533
"Species": "Human",
534
"Ship": "Xenophobe",
535
"Drone": "Skaffen-Amtiskaw",
536
},
537
{
538
"CharacterID": 3,
539
"FirstName": "Byr",
540
"LastName": "Genar-Hofoen",
541
"Role": "Protagonist",
542
"Book": "Excession",
543
"Affiliation": "Culture",
544
"Species": "Human",
545
"Ship": "Grey Area",
546
"Drone": "Eccentric",
547
},
548
{
549
"CharacterID": 4,
550
"FirstName": "Diziet",
551
"LastName": "Sma",
552
"Role": None,
553
"Book": None,
554
"Affiliation": "Shellworld",
555
"Species": "Oct",
556
"Ship": None,
557
"Drone": None,
558
},
559
]
560
561
# misc errors
562
with pytest.raises(SQLSyntaxError, match=r"did you mean COLUMNS\(\*\)\?"):
563
pl.sql("SELECT * FROM df1 NATURAL JOIN df2 WHERE COLUMNS('*') >= 5")
564
565
with pytest.raises(SQLSyntaxError, match=r"COLUMNS expects a regex"):
566
pl.sql("SELECT COLUMNS(1234) FROM df1 NATURAL JOIN df2")
567
568
569
@pytest.mark.parametrize(
570
("cols_constraint", "expect_data"),
571
[
572
(">= 5", [(8, 8, 6)]),
573
("< 7", [(5, 4, 4)]),
574
("< 8", [(5, 4, 4), (7, 4, 4), (0, 7, 2)]),
575
("!= 4", [(8, 8, 6), (2, 8, 6), (0, 7, 2)]),
576
],
577
)
578
def test_natural_joins_02(cols_constraint: str, expect_data: list[tuple[int]]) -> None:
579
df1 = pl.DataFrame( # noqa: F841
580
{
581
"x": [1, 5, 3, 8, 6, 7, 4, 0, 2],
582
"y": [3, 4, 6, 8, 3, 4, 1, 7, 8],
583
}
584
)
585
df2 = pl.DataFrame( # noqa: F841
586
{
587
"y": [0, 4, 0, 8, 0, 4, 0, 7, None],
588
"z": [9, 8, 7, 6, 5, 4, 3, 2, 1],
589
},
590
)
591
actual = pl.sql(
592
f"""
593
SELECT * EXCLUDE "y:df2"
594
FROM df1 NATURAL JOIN df2
595
WHERE COLUMNS(*) {cols_constraint}
596
"""
597
).collect()
598
599
expected = pl.DataFrame(expect_data, schema=actual.columns, orient="row")
600
assert_frame_equal(actual, expected, check_row_order=False)
601
602
603
@pytest.mark.parametrize(
604
"join_clause",
605
[
606
"""
607
df2 JOIN df3 ON
608
df2.CharacterID = df3.CharacterID
609
""",
610
"""
611
df2 INNER JOIN (
612
df3 JOIN df4 ON df3.CharacterID = df4.CharacterID
613
) AS r0 ON df2.CharacterID = df3.CharacterID
614
""",
615
],
616
)
617
def test_nested_join(join_clause: str) -> None:
618
df1 = pl.DataFrame(
619
{
620
"CharacterID": [1, 2, 3, 4],
621
"FirstName": ["Jernau Morat", "Cheradenine", "Byr", "Diziet"],
622
"LastName": ["Gurgeh", "Zakalwe", "Genar-Hofoen", "Sma"],
623
}
624
)
625
df2 = pl.DataFrame(
626
{
627
"CharacterID": [1, 2, 3, 5],
628
"Role": ["Protagonist", "Protagonist", "Protagonist", "Antagonist"],
629
"Book": [
630
"Player of Games",
631
"Use of Weapons",
632
"Excession",
633
"Consider Phlebas",
634
],
635
}
636
)
637
df3 = pl.DataFrame(
638
{
639
"CharacterID": [1, 2, 5, 6],
640
"Affiliation": ["Culture", "Culture", "Culture", "Shellworld"],
641
"Species": ["Pan-human", "Human", "Human", "Oct"],
642
}
643
)
644
df4 = pl.DataFrame(
645
{
646
"CharacterID": [1, 2, 3, 6],
647
"Ship": [
648
"Limiting Factor",
649
"Xenophobe",
650
"Grey Area",
651
"Falling Outside The Normal Moral Constraints",
652
],
653
"Drone": ["Flere-Imsaho", "Skaffen-Amtiskaw", "Eccentric", "Psychopath"],
654
}
655
)
656
657
with pl.SQLContext(
658
{"df1": df1, "df2": df2, "df3": df3, "df4": df4}, eager=True
659
) as ctx:
660
res = ctx.execute(
661
f"""
662
SELECT df1.CharacterID, df1.FirstName, df2.Role, df3.Species
663
FROM df1
664
INNER JOIN ({join_clause}) AS r99
665
ON df1.CharacterID = df2.CharacterID
666
ORDER BY ALL
667
"""
668
)
669
assert res.rows(named=True) == [
670
{
671
"CharacterID": 1,
672
"FirstName": "Jernau Morat",
673
"Role": "Protagonist",
674
"Species": "Pan-human",
675
},
676
{
677
"CharacterID": 2,
678
"FirstName": "Cheradenine",
679
"Role": "Protagonist",
680
"Species": "Human",
681
},
682
]
683
684
685
def test_sql_forbid_nested_join_unnamed_relation() -> None:
686
df = pl.DataFrame({"a": 1})
687
688
with (
689
pl.SQLContext({"left": df, "right": df}) as ctx,
690
pytest.raises(SQLInterfaceError, match="cannot join on unnamed relation"),
691
):
692
ctx.execute(
693
"""\
694
SELECT *
695
FROM left
696
JOIN (right JOIN right ON right.a = right.a)
697
ON left.a = right.a
698
"""
699
)
700
701
702
def test_nulls_equal_19624() -> None:
703
df1 = pl.DataFrame({"a": [1, 2, None, None]})
704
df2 = pl.DataFrame({"a": [1, 1, 2, 2, None], "b": [0, 1, 2, 3, 4]})
705
706
# left join
707
result_df = df1.join(df2, how="left", on="a", nulls_equal=False, validate="1:m")
708
expected_df = pl.DataFrame(
709
{"a": [1, 1, 2, 2, None, None], "b": [0, 1, 2, 3, None, None]}
710
)
711
assert_frame_equal(result_df, expected_df)
712
result_df = df2.join(df1, how="left", on="a", nulls_equal=False, validate="m:1")
713
expected_df = pl.DataFrame({"a": [1, 1, 2, 2, None], "b": [0, 1, 2, 3, 4]})
714
assert_frame_equal(result_df, expected_df)
715
716
# inner join
717
result_df = df1.join(df2, how="inner", on="a", nulls_equal=False, validate="1:m")
718
expected_df = pl.DataFrame({"a": [1, 1, 2, 2], "b": [0, 1, 2, 3]})
719
assert_frame_equal(result_df, expected_df)
720
result_df = df2.join(df1, how="inner", on="a", nulls_equal=False, validate="m:1")
721
expected_df = pl.DataFrame({"a": [1, 1, 2, 2], "b": [0, 1, 2, 3]})
722
assert_frame_equal(result_df, expected_df)
723
724