Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/py-polars/tests/unit/test_rows.py
6939 views
1
from datetime import date
2
3
import pytest
4
5
import polars as pl
6
from polars.exceptions import NoRowsReturnedError, TooManyRowsReturnedError
7
from tests.unit.conftest import INTEGER_DTYPES
8
9
10
def test_row_tuple() -> None:
    """Fetch single rows by index and by predicate, as tuples and as dicts."""
    df = pl.DataFrame({"a": ["foo", "bar", "2"], "b": [1, 2, 3], "c": [1.0, 2.0, 3.0]})

    # positional access, including negative indexing
    expected_by_index = {
        0: ("foo", 1, 1.0),
        1: ("bar", 2, 2.0),
        -1: ("2", 3, 3.0),
    }
    for idx, expected in expected_by_index.items():
        assert df.row(idx) == expected

    # positional access with named (dict) output
    assert df.row(0, named=True) == {"a": "foo", "b": 1, "c": 1.0}

    # predicate-based access
    assert df.row(by_predicate=pl.col("a") == "bar") == ("bar", 2, 2.0)
    assert df.row(by_predicate=pl.col("b").is_in([2, 4, 6])) == ("bar", 2, 2.0)

    # predicate-based access with named output
    named_row = df.row(by_predicate=pl.col("a") == "bar", named=True)
    assert named_row == {"a": "bar", "b": 2, "c": 2.0}

    # a predicate matching more than one row is an error
    with pytest.raises(TooManyRowsReturnedError):
        df.row(by_predicate=pl.col("b").is_in([1, 3, 5]))

    # a predicate matching no rows is an error
    with pytest.raises(NoRowsReturnedError):
        df.row(by_predicate=pl.col("a") == "???")

    # 'index' and 'by_predicate' are mutually exclusive
    with pytest.raises(ValueError):
        df.row(0, by_predicate=pl.col("a") == "bar")

    # 'by_predicate' is keyword-only
    with pytest.raises(TypeError):
        df.row(None, pl.col("a") == "bar")  # type: ignore[call-overload]

    # a predicate cannot be passed positionally as 'index'
    with pytest.raises(TypeError):
        df.row(pl.col("a") == "bar")  # type: ignore[call-overload]

    # one of 'index' / 'by_predicate' is required
    with pytest.raises(ValueError):
        df.row()
52
53
54
def test_rows() -> None:
    """Materialize every row as a tuple or a dict, including Null-typed columns."""
    frame = pl.DataFrame({"a": [1, 2], "b": [1, 2]})

    # tuple rows, in frame order and reversed
    assert frame.rows() == [(1, 1), (2, 2)]
    assert frame.reverse().rows() == [(2, 2), (1, 1)]

    # dict rows
    assert frame.rows(named=True) == [{"a": 1, "b": 1}, {"a": 2, "b": 2}]

    # a column of Null dtype surfaces as None in every row
    frame = frame.with_columns(c=pl.lit(None))
    assert frame.schema == {"a": pl.Int64, "b": pl.Int64, "c": pl.Null}
    assert frame.rows() == [(1, 1, None), (2, 2, None)]
    assert frame.rows(named=True) == [
        {"a": 1, "b": 1, "c": None},
        {"a": 2, "b": 2, "c": None},
    ]
73
74
75
def test_rows_by_key() -> None:
    """Group rows into a dict keyed by one or more columns, across option combos."""
    frame = pl.DataFrame(
        {
            "w": ["a", "b", "b", "a"],
            "x": ["q", "q", "q", "k"],
            "y": [1.0, 2.5, 3.0, 4.5],
            "z": [9, 8, 7, 6],
        }
    )

    # --- tuple (unnamed) rows ---------------------------------------------
    assert frame.rows_by_key("w") == {
        "a": [("q", 1.0, 9), ("k", 4.5, 6)],
        "b": [("q", 2.5, 8), ("q", 3.0, 7)],
    }
    # unique=True keeps a single row per key (here, the last one seen)
    assert frame.rows_by_key("w", unique=True) == {
        "a": ("k", 4.5, 6),
        "b": ("q", 3.0, 7),
    }
    assert frame.rows_by_key("w", include_key=True) == {
        "a": [("a", "q", 1.0, 9), ("a", "k", 4.5, 6)],
        "b": [("b", "q", 2.5, 8), ("b", "q", 3.0, 7)],
    }
    # with the key included, the result matches a group_by over the same column
    assert frame.rows_by_key("w", include_key=True) == {
        key[0]: grp.rows() for key, grp in frame.group_by(["w"])
    }
    assert frame.rows_by_key("w", include_key=True, unique=True) == {
        "a": ("a", "k", 4.5, 6),
        "b": ("b", "q", 3.0, 7),
    }
    # composite keys become tuples
    assert frame.rows_by_key(["x", "w"]) == {
        ("q", "a"): [(1.0, 9)],
        ("q", "b"): [(2.5, 8), (3.0, 7)],
        ("k", "a"): [(4.5, 6)],
    }
    assert frame.rows_by_key(["w", "x"], include_key=True) == {
        ("a", "q"): [("a", "q", 1.0, 9)],
        ("a", "k"): [("a", "k", 4.5, 6)],
        ("b", "q"): [("b", "q", 2.5, 8), ("b", "q", 3.0, 7)],
    }
    assert frame.rows_by_key(["w", "x"], include_key=True, unique=True) == {
        ("a", "q"): ("a", "q", 1.0, 9),
        ("b", "q"): ("b", "q", 3.0, 7),
        ("a", "k"): ("a", "k", 4.5, 6),
    }

    # --- dict (named) rows ------------------------------------------------
    assert frame.rows_by_key("w", named=True) == {
        "a": [{"x": "q", "y": 1.0, "z": 9}, {"x": "k", "y": 4.5, "z": 6}],
        "b": [{"x": "q", "y": 2.5, "z": 8}, {"x": "q", "y": 3.0, "z": 7}],
    }
    assert frame.rows_by_key("w", named=True, unique=True) == {
        "a": {"x": "k", "y": 4.5, "z": 6},
        "b": {"x": "q", "y": 3.0, "z": 7},
    }
    assert frame.rows_by_key("w", named=True, include_key=True) == {
        "a": [
            {"w": "a", "x": "q", "y": 1.0, "z": 9},
            {"w": "a", "x": "k", "y": 4.5, "z": 6},
        ],
        "b": [
            {"w": "b", "x": "q", "y": 2.5, "z": 8},
            {"w": "b", "x": "q", "y": 3.0, "z": 7},
        ],
    }
    assert frame.rows_by_key("w", named=True, include_key=True) == {
        key[0]: grp.rows(named=True) for key, grp in frame.group_by(["w"])
    }
    assert frame.rows_by_key("w", named=True, include_key=True, unique=True) == {
        "a": {"w": "a", "x": "k", "y": 4.5, "z": 6},
        "b": {"w": "b", "x": "q", "y": 3.0, "z": 7},
    }
    assert frame.rows_by_key(["x", "w"], named=True) == {
        ("q", "a"): [{"y": 1.0, "z": 9}],
        ("q", "b"): [{"y": 2.5, "z": 8}, {"y": 3.0, "z": 7}],
        ("k", "a"): [{"y": 4.5, "z": 6}],
    }
    assert frame.rows_by_key(["w", "x"], named=True, include_key=True) == {
        ("a", "q"): [{"w": "a", "x": "q", "y": 1.0, "z": 9}],
        ("a", "k"): [{"w": "a", "x": "k", "y": 4.5, "z": 6}],
        ("b", "q"): [
            {"w": "b", "x": "q", "y": 2.5, "z": 8},
            {"w": "b", "x": "q", "y": 3.0, "z": 7},
        ],
    }
    assert frame.rows_by_key(["w", "x"], named=True, include_key=True, unique=True) == {
        ("a", "q"): {"w": "a", "x": "q", "y": 1.0, "z": 9},
        ("b", "q"): {"w": "b", "x": "q", "y": 3.0, "z": 7},
        ("a", "k"): {"w": "a", "x": "k", "y": 4.5, "z": 6},
    }
165
166
167
def test_iter_rows() -> None:
    """Iterate rows lazily (tuples and dicts), across buffer sizes and chunks."""
    frame = pl.DataFrame(
        {
            "a": [1, 2, 3],
            "b": [True, False, None],
        }
    ).with_columns(pl.Series(["a:b", "c:d", "e:f"]).str.split_exact(":", 1).alias("c"))

    # struct values produced by split_exact on the "c" column
    structs = [
        {"field_0": "a", "field_1": "b"},
        {"field_0": "c", "field_1": "d"},
        {"field_0": "e", "field_1": "f"},
    ]
    expected_tuples = [
        (1, True, structs[0]),
        (2, False, structs[1]),
        (3, None, structs[2]),
    ]

    # default iteration yields tuples, then exhausts
    rows_iter = frame.iter_rows()
    for expected in expected_tuples:
        assert next(rows_iter) == expected
    with pytest.raises(StopIteration):
        next(rows_iter)

    # an explicit row-buffer size must not change the yielded rows
    for buffer_size in (0, 1, 2, 3, 4):
        rows_iter = frame.iter_rows(buffer_size=buffer_size)
        for expected in expected_tuples:
            assert next(rows_iter) == expected
        with pytest.raises(StopIteration):
            next(rows_iter)

        # named iteration yields dicts
        named_iter = frame.iter_rows(named=True, buffer_size=buffer_size)
        assert next(named_iter) == {"a": 1, "b": True, "c": structs[0]}
        assert next(named_iter) == {"a": 2, "b": False, "c": structs[1]}
        assert next(named_iter) == {"a": 3, "b": None, "c": structs[2]}
        with pytest.raises(StopIteration):
            next(named_iter)

    # iteration also works over a frame with multiple chunks
    chunked = pl.concat(
        [
            pl.DataFrame({"id": [0, 1], "values": ["a", "b"]}),
            pl.DataFrame({"id": [2, 3], "values": ["c", "d"]}),
        ],
        rechunk=False,
    )
    assert chunked.n_chunks() == 2
    assert chunked.to_dicts() == [
        {"id": 0, "values": "a"},
        {"id": 1, "values": "b"},
        {"id": 2, "values": "c"},
        {"id": 3, "values": "d"},
    ]
224
225
226
@pytest.mark.parametrize("primitive", INTEGER_DTYPES)
def test_row_constructor_schema(primitive: pl.DataType) -> None:
    """Row-oriented init honours an explicit integer dtype given in the schema."""
    df = pl.DataFrame(data=[[1], [2], [3]], schema={"d": primitive}, orient="row")
    assert df.dtypes == [primitive]
    assert df.to_dict(as_series=False) == {"d": [1, 2, 3]}
232
233
234
def test_row_constructor_uint64() -> None:
    """Row-oriented init accepts a UInt64 value above the Int64 maximum."""
    above_i64_max = 2**63 + 1  # == 9223372036854775809
    df = pl.DataFrame(
        data=[[0], [above_i64_max]], schema={"x": pl.UInt64}, orient="row"
    )
    assert df.rows() == [(0,), (9223372036854775809,)]
240
241
242
def test_physical_row_encoding() -> None:
    """Group-by on a list-of-date column aggregates rows correctly."""
    records = [
        {
            "ts": date(2023, 7, 1),
            "files": "AGG_202307.xlsx",
            "period_bins": [date(2023, 7, 1), date(2024, 1, 1)],
        },
    ]

    grouped = pl.from_dicts(records).group_by("period_bins")
    assert grouped.all().to_dicts() == [
        {
            "period_bins": [date(2023, 7, 1), date(2024, 1, 1)],
            "ts": [date(2023, 7, 1)],
            "files": ["AGG_202307.xlsx"],
        }
    ]
260
261