Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/py-polars/tests/unit/sql/test_unnest.py
7884 views
1
from __future__ import annotations
2
3
import pytest
4
5
import polars as pl
6
from polars.exceptions import SQLInterfaceError, SQLSyntaxError
7
from polars.testing import assert_frame_equal
8
from tests.unit.sql.asserts import assert_sql_matches
9
10
# ---------------------------------------------------------------------------------
11
# NOTE: 'UNNEST' is available as both a table function and a select-level function
12
# ---------------------------------------------------------------------------------
13
14
15
@pytest.mark.parametrize(
16
"array_keyword",
17
["ARRAY", ""],
18
)
19
def test_unnest_table_function(array_keyword: str) -> None:
20
with pl.SQLContext(df=None, eager=True) as ctx:
21
res = ctx.execute(
22
f"""
23
SELECT * FROM
24
UNNEST(
25
{array_keyword}[1, 2, 3, 4],
26
{array_keyword}['ww','xx','yy','zz'],
27
{array_keyword}[23.0, 24.5, 28.0, 27.5],
28
) AS tbl (x,y,z);
29
"""
30
)
31
assert_frame_equal(
32
res,
33
pl.DataFrame(
34
{
35
"x": [1, 2, 3, 4],
36
"y": ["ww", "xx", "yy", "zz"],
37
"z": [23.0, 24.5, 28.0, 27.5],
38
}
39
),
40
)
41
42
43
def test_unnest_table_function_errors() -> None:
44
with pl.SQLContext(df=None, eager=True) as ctx:
45
with pytest.raises(
46
SQLSyntaxError,
47
match=r'UNNEST table alias must also declare column names, eg: "frame data" \(a,b,c\)',
48
):
49
ctx.execute('SELECT * FROM UNNEST([1, 2, 3]) AS "frame data"')
50
51
with pytest.raises(
52
SQLSyntaxError,
53
match="UNNEST table alias requires 1 column name, found 2",
54
):
55
ctx.execute("SELECT * FROM UNNEST([1, 2, 3]) AS tbl (a, b)")
56
57
with pytest.raises(
58
SQLSyntaxError,
59
match="UNNEST table alias requires 2 column names, found 1",
60
):
61
ctx.execute("SELECT * FROM UNNEST([1,2,3], [3,4,5]) AS tbl (a)")
62
63
with pytest.raises(
64
SQLSyntaxError,
65
match=r"UNNEST table must have an alias",
66
):
67
ctx.execute("SELECT * FROM UNNEST([1, 2, 3])")
68
69
with pytest.raises(
70
SQLInterfaceError,
71
match=r"UNNEST tables do not \(yet\) support WITH OFFSET|ORDINALITY",
72
):
73
ctx.execute("SELECT * FROM UNNEST([1, 2, 3]) tbl (colx) WITH OFFSET")
74
75
76
def test_unnest_select_expressions() -> None:
77
# Multiple expressions should be exploded inline/together
78
df = pl.DataFrame(
79
{
80
"id": [1, 2],
81
"data1": ["a,b", "c,d,e,f"],
82
"data2": ["x,x", "y,y,z,z"],
83
}
84
)
85
query = """
86
SELECT
87
id,
88
UNNEST(STRING_TO_ARRAY(data1,',')) AS d1,
89
UNNEST(STRING_TO_ARRAY(data2,',')) AS d2
90
FROM self
91
ORDER BY ALL
92
"""
93
expected = pl.DataFrame(
94
{
95
"id": [1, 1, 2, 2, 2, 2],
96
"d1": ["a", "b", "c", "d", "e", "f"],
97
"d2": ["x", "x", "y", "y", "z", "z"],
98
}
99
)
100
res = df.sql(query)
101
assert_frame_equal(res, expected)
102
103
assert_sql_matches(
104
df,
105
query=query,
106
compare_with="duckdb",
107
expected=expected,
108
)
109
110
111
def test_unnest_aggregates() -> None:
112
df = pl.DataFrame({"a": [i // 100 for i in range(1, 1000)]})
113
query = """
114
SELECT
115
UNNEST(ARRAY_AGG(DISTINCT a)) AS x,
116
UNNEST(ARRAY_AGG(DISTINCT a ORDER BY a)) AS y,
117
UNNEST(ARRAY_AGG(DISTINCT a ORDER BY a DESC)) AS z
118
FROM self
119
"""
120
res = df.sql(query)
121
assert res.to_dict(as_series=False) == {
122
"x": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
123
"y": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
124
"z": [9, 8, 7, 6, 5, 4, 3, 2, 1, 0],
125
}
126
127
128
def test_unnest_select_height_filter_order_by() -> None:
129
# Note: SQL UNNEST equates to `pl.Dataframe.explode()`
130
# (ordering is applied after the explode/unnest)
131
df = pl.DataFrame(
132
{
133
"list_long": [[1, 2, 3], [4, 5, 6]],
134
"sort_key": [2, 1],
135
"filter_mask": [False, True],
136
"filter_mask_all_true": True,
137
}
138
)
139
140
# Unnest/explode is applied at the dataframe level, sort is applied afterward
141
assert_frame_equal(
142
df.sql("SELECT UNNEST(list_long) as list FROM self ORDER BY sort_key"),
143
pl.Series("list", [4, 5, 6, 1, 2, 3]).to_frame(),
144
)
145
146
# No NULLS: since order is applied after explode on the dataframe level
147
assert_frame_equal(
148
df.sql(
149
"SELECT UNNEST(list_long) as list FROM self ORDER BY sort_key NULLS FIRST"
150
),
151
pl.Series("list", [4, 5, 6, 1, 2, 3]).to_frame(),
152
)
153
154
# Literals are broadcasted to output height of UNNEST:
155
assert_frame_equal(
156
df.sql("SELECT UNNEST(list_long) as list, 1 as x FROM self ORDER BY sort_key"),
157
pl.select(pl.Series("list", [4, 5, 6, 1, 2, 3]), x=1),
158
)
159
160
# Note: Filter applies before projections in SQL
161
assert_frame_equal(
162
df.sql(
163
"SELECT UNNEST(list_long) as list FROM self WHERE filter_mask ORDER BY sort_key"
164
),
165
pl.Series("list", [4, 5, 6]).to_frame(),
166
)
167
assert_frame_equal(
168
df.sql(
169
"SELECT UNNEST(list_long) as list FROM self WHERE filter_mask_all_true ORDER BY sort_key"
170
),
171
pl.Series("list", [4, 5, 6, 1, 2, 3]).to_frame(),
172
)
173
174