Book a Demo!
CoCalc Logo Icon
Store | Features | Docs | Share | Support | News | About | Policies | Sign Up | Sign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/py-polars/tests/unit/operations/map/test_map_elements.py
8408 views
1
from __future__ import annotations
2
3
import json
4
from datetime import date, datetime, timedelta
5
from typing import Any, NamedTuple
6
7
import numpy as np
8
import pytest
9
10
import polars as pl
11
from polars.exceptions import PolarsInefficientMapWarning
12
from polars.testing import assert_frame_equal, assert_series_equal
13
from tests.unit.conftest import NUMERIC_DTYPES, TEMPORAL_DTYPES
14
15
# Module-wide filter: the inefficient-map warning is ignored by default; tests
# that care about it capture it explicitly with ``pytest.warns``.
pytestmark = pytest.mark.filterwarnings(
    "ignore::polars.exceptions.PolarsInefficientMapWarning"
)
18
19
20
@pytest.mark.may_fail_auto_streaming  # dtype not set
@pytest.mark.may_fail_cloud  # reason: eager - return_dtype must be set
def test_map_elements_infer_list() -> None:
    """Wrapping every element in a list must give a ``List`` dtype per column."""
    frame = pl.DataFrame(
        {
            "int": [1, 2],
            "str": ["a", "b"],
            "bool": [True, None],
        }
    )
    # No return_dtype is passed, so the List dtype has to be inferred.
    wrapped = frame.select([pl.all().map_elements(lambda x: [x])])
    assert wrapped.dtypes == [pl.List] * 3
31
32
33
def test_map_elements_upcast_null_dtype_empty_list() -> None:
    """Empty-list results must carry the inner dtype given by ``return_dtype``."""
    list_dtype = pl.List(pl.Int64)
    source = pl.DataFrame({"a": [1, 2]})

    result = source.select(
        pl.col("a").map_elements(lambda _: [], return_dtype=list_dtype)
    )

    expected = pl.DataFrame({"a": [[], []]}, schema={"a": list_dtype})
    assert_frame_equal(result, expected)
41
42
43
@pytest.mark.may_fail_cloud  # reason: eager - return_dtype must be set
def test_map_elements_arithmetic_consistency() -> None:
    """Adding a float to an imploded group through ``map_elements`` yields floats."""
    frame = pl.DataFrame({"A": ["a", "a"], "B": [2, 3]})

    # The expression is built inside the ``warns`` block so that the
    # inefficiency warning is captured by it.
    with pytest.warns(PolarsInefficientMapWarning, match="with this one instead"):
        plus_one = (
            pl.col("B")
            .implode()
            .map_elements(lambda x: x + 1.0, return_dtype=pl.List(pl.Float64))
        )
        aggregated = frame.group_by("A").agg(plus_one)

    assert aggregated["B"].to_list() == [[3.0, 4.0]]
55
56
57
@pytest.mark.may_fail_cloud  # reason: eager - return_dtype must be set
def test_map_elements_struct() -> None:
    """Pull each field back out of a struct column via ``map_elements``."""
    data: dict[str, list[Any]] = {
        "A": ["a", "a", None],
        "B": [2, 3, None],
        "C": [True, False, None],
        "D": [12.0, None, None],
        "E": [None, [1], [2, 3]],
    }
    df = pl.DataFrame(data)
    with_struct = df.with_columns(pl.struct(df.columns).alias("struct"))

    out = with_struct.select(
        pl.col("struct").map_elements(lambda x: x["A"]).alias("A_field"),
        pl.col("struct").map_elements(lambda x: x["B"]).alias("B_field"),
        pl.col("struct").map_elements(lambda x: x["C"]).alias("C_field"),
        pl.col("struct").map_elements(lambda x: x["D"]).alias("D_field"),
        pl.col("struct").map_elements(lambda x: x["E"]).alias("E_field"),
    )

    # The expected frame holds the same values as the input, under the
    # "<name>_field" column names.
    expected = pl.DataFrame(
        {f"{name}_field": values for name, values in data.items()}
    )
    assert_frame_equal(out, expected)
87
88
89
@pytest.mark.may_fail_cloud  # reason: eager - return_dtype must be set
def test_map_elements_numpy_int_out() -> None:
    """Results of ``np.left_shift`` returned from the UDF compare equal to ints."""
    # Case 1: shift a plain integer column by a constant.
    single = pl.DataFrame({"col1": [2, 4, 8, 16]})
    shifted_const = single.with_columns(
        pl.col("col1").map_elements(lambda x: np.left_shift(x, 8)).alias("result")
    )
    assert_frame_equal(
        shifted_const,
        pl.DataFrame({"col1": [2, 4, 8, 16], "result": [512, 1024, 2048, 4096]}),
    )

    # Case 2: shift amount comes from a second field of a struct column.
    paired = pl.DataFrame({"col1": [2, 4, 8, 16], "shift": [1, 1, 2, 2]})
    shifted_by_field = paired.select(
        pl.struct(["col1", "shift"])
        .map_elements(lambda cols: np.left_shift(cols["col1"], cols["shift"]))
        .alias("result")
    )
    assert_frame_equal(shifted_by_field, pl.DataFrame({"result": [4, 8, 32, 64]}))
106
107
108
def test_datelike_identity() -> None:
    """An identity UDF must round-trip datetime, timedelta and date values."""
    samples = (
        datetime(year=2000, month=1, day=1),
        timedelta(hours=2),
        date(year=2000, month=1, day=1),
    )
    for sample in samples:
        s = pl.Series([sample])
        assert s.map_elements(lambda x: x).to_list() == s.to_list()
115
116
117
def test_map_elements_list_any_value_fallback() -> None:
    """``json.loads`` output (lists of dicts) maps to a list-of-struct column."""
    # NOTE: the trailing "()" in this pattern is an empty regex group, so the
    # literal parentheses are not part of what is matched.
    hint = r'(?s)with this one instead:.*pl.col\("text"\).str.json_decode()'
    list_of_xy = pl.List(pl.Struct({"x": pl.Int64, "y": pl.Int64}))

    single_row = pl.DataFrame({"text": ['[{"x": 1, "y": 2}, {"x": 3, "y": 4}]']})
    with pytest.warns(PolarsInefficientMapWarning, match=hint):
        decoded = single_row.select(
            pl.col("text").map_elements(json.loads, return_dtype=list_of_xy)
        )
    assert decoded.to_dict(as_series=False) == {
        "text": [[{"x": 1, "y": 2}, {"x": 3, "y": 4}]]
    }

    # starts with empty list '[]'
    leading_empty = pl.DataFrame(
        {
            "text": [
                "[]",
                '[{"x": 1, "y": 2}, {"x": 3, "y": 4}]',
                '[{"x": 1, "y": 2}]',
            ]
        }
    )
    with pytest.warns(PolarsInefficientMapWarning, match=hint):
        decoded = leading_empty.select(
            pl.col("text").map_elements(json.loads, return_dtype=list_of_xy)
        )
    assert decoded.to_dict(as_series=False) == {
        "text": [[], [{"x": 1, "y": 2}, {"x": 3, "y": 4}], [{"x": 1, "y": 2}]]
    }
152
153
154
def test_map_elements_all_types() -> None:
    """Run an identity ``map_elements`` over a series of every listed dtype.

    The dtypes are ``NUMERIC_DTYPES`` and ``TEMPORAL_DTYPES`` plus
    ``pl.Decimal(None, 2)``. The primary goal is that none of them panics;
    in addition, each result must hold one value per input value.
    """
    values = [1, 2, 3, 4, 5]
    dtypes = NUMERIC_DTYPES + TEMPORAL_DTYPES + [pl.Decimal(None, 2)]
    for dtype in dtypes:
        result = pl.Series(values, dtype=dtype).map_elements(lambda x: x)
        # Name the dtype in the message so a failure inside the loop is
        # attributable without re-running.
        assert result.len() == len(values), f"unexpected length for dtype {dtype}"
159
160
161
def test_map_elements_type_propagation() -> None:
    """A Float64 ``map_elements`` inside ``when/then/otherwise`` stays float."""
    frame = pl.from_dict(
        {
            "a": [1, 2, 3],
            "b": [{"c": 1, "d": 2}, {"c": 2, "d": 3}, {"c": None, "d": None}],
        }
    )

    # Field "c" of the first struct in each imploded group, as a float.
    first_c = (
        pl.col("b")
        .implode()
        .map_elements(
            lambda s: float(s[0]["c"]) if s[0]["c"] is not None else None,
            return_dtype=pl.Float64,
        )
    )
    guarded = pl.when(~pl.col("b").has_nulls()).then(first_c).otherwise(None)

    out = frame.group_by("a", maintain_order=True).agg([guarded])
    assert out.to_dict(as_series=False) == {"a": [1, 2, 3], "b": [1.0, 2.0, None]}
185
186
187
@pytest.mark.may_fail_auto_streaming  # dtype not set
@pytest.mark.may_fail_cloud  # reason: eager - return_dtype must be set
def test_empty_list_in_map_elements() -> None:
    """Row-wise set intersections may be empty lists, including the first row."""
    frame = pl.DataFrame(
        {"a": [[1], [1, 2], [3, 4], [5, 6]], "b": [[3], [1, 2], [1, 2], [4, 5]]}
    )

    intersect = pl.struct(["a", "b"]).map_elements(
        lambda row: list(set(row["a"]) & set(row["b"]))
    )
    common = frame.select(intersect)

    assert common.to_dict(as_series=False) == {"a": [[], [1, 2], [], [5]]}
199
200
201
@pytest.mark.parametrize("value", [1, True, "abc", [1, 2], {"a": 1}])
@pytest.mark.parametrize("return_value", [1, True, "abc", [1, 2], {"a": 1}])
def test_map_elements_skip_nulls(value: Any, return_value: Any) -> None:
    """``skip_nulls`` decides whether the UDF is applied to the null entry."""
    s = pl.Series([value, None])

    # (skip_nulls flag, expected output): the null stays null when skipped,
    # otherwise it is replaced by the UDF's constant result.
    cases = (
        (True, [return_value, None]),
        (False, [return_value, return_value]),
    )
    for skip, expected in cases:
        result = s.map_elements(lambda x: return_value, skip_nulls=skip).to_list()
        assert result == expected
211
212
213
@pytest.mark.may_fail_cloud  # reason: Object type not supported
def test_map_elements_object_dtypes() -> None:
    """``map_elements`` over an Object column, returning Object and Boolean."""
    frame = pl.DataFrame({"a": pl.Series([1, 2, "a", 4, 5], dtype=pl.Object)})

    # The lambdas are kept as ``lambda x: ...`` on purpose: the warning
    # pattern below matches that text.
    with pytest.warns(
        PolarsInefficientMapWarning,
        match=r"(?s)Replace this expression.*lambda x:",
    ):
        out = frame.with_columns(
            pl.col("a").map_elements(lambda x: x * 2, return_dtype=pl.Object),
            pl.col("a")
            .map_elements(
                lambda x: isinstance(x, (int, float)), return_dtype=pl.Boolean
            )
            .alias("is_numeric1"),
            pl.col("a")
            .map_elements(
                lambda x: isinstance(x, (int, float)), return_dtype=pl.Boolean
            )
            .alias("is_numeric_infer"),
        )

    assert out.to_dict(as_series=False) == {
        "a": [2, 4, "aa", 8, 10],
        "is_numeric1": [True, True, False, True, True],
        "is_numeric_infer": [True, True, False, True, True],
    }
238
239
240
def test_map_elements_explicit_list_output_type() -> None:
    """A UDF returning a Series with ``return_dtype=List(Int64)`` gives lists."""
    list_dtype = pl.List(pl.Int64)
    frame = pl.DataFrame({"str": ["a", "b"]})

    out = frame.with_columns(
        pl.col("str").map_elements(
            lambda _: pl.Series([1, 2, 3]), return_dtype=list_dtype
        )
    )

    assert out.dtypes == [list_dtype]
    assert out.to_dict(as_series=False) == {"str": [[1, 2, 3], [1, 2, 3]]}
249
250
251
@pytest.mark.may_fail_auto_streaming  # dtype not set
def test_map_elements_dict() -> None:
    """Dicts from ``json.loads`` become structs; absent keys come back as None."""
    hint = r'(?s)with this one instead:.*pl.col\("abc"\).str.json_decode()'
    struct_dtype = pl.Struct({"A": pl.String, "B": pl.String})

    # Rows with disjoint keys.
    disjoint = pl.DataFrame({"abc": ['{"A":"Value1"}', '{"B":"Value2"}']})
    with pytest.warns(PolarsInefficientMapWarning, match=hint):
        decoded = disjoint.select(
            pl.col("abc").map_elements(json.loads, return_dtype=struct_dtype)
        )
    assert decoded.to_dict(as_series=False) == {
        "abc": [{"A": "Value1", "B": None}, {"A": None, "B": "Value2"}]
    }

    # First row has both keys, second row only one of them.
    overlapping = pl.DataFrame(
        {"abc": ['{"A":"Value1", "B":"Value2"}', '{"B":"Value3"}']}
    )
    with pytest.warns(PolarsInefficientMapWarning, match=hint):
        decoded = overlapping.select(
            pl.col("abc").map_elements(json.loads, return_dtype=struct_dtype)
        )
    assert decoded.to_dict(as_series=False) == {
        "abc": [{"A": "Value1", "B": "Value2"}, {"A": None, "B": "Value3"}]
    }
279
280
281
def test_map_elements_pass_name() -> None:
    """With ``pass_name=True`` the UDF can look up the series name in a dict."""
    frame = pl.DataFrame(
        {
            "bar": [1, 1, 2],
            "foo": [1, 2, 3],
        }
    )
    name_lookup = {"foo": "foo1"}

    def element_mapper(s: pl.Series) -> pl.Series:
        # Fails with KeyError unless the series carries the column name.
        return pl.Series([name_lookup[s.name]])

    mapped = (
        pl.col("foo")
        .implode()
        .map_elements(element_mapper, pass_name=True, return_dtype=pl.List(pl.String))
    )
    out = frame.group_by("bar", maintain_order=True).agg(mapped)

    assert out.to_dict(as_series=False) == {
        "bar": [1, 2],
        "foo": [["foo1"], ["foo1"]],
    }
299
300
301
@pytest.mark.may_fail_cloud  # reason: eager - return_dtype must be set
def test_map_elements_binary() -> None:
    """``bytes.hex`` applied to a binary column yields the hex strings."""
    frame = pl.DataFrame({"bin": [b"\x11" * 12, b"\x22" * 12, b"\xaa" * 12]})

    hexed = frame.select(pl.col("bin").map_elements(bytes.hex))

    expected = {
        "bin": [
            "111111111111111111111111",
            "222222222222222222222222",
            "aaaaaaaaaaaaaaaaaaaaaaaa",
        ]
    }
    assert hexed.to_dict(as_series=False) == expected
312
313
314
def test_map_elements_set_datetime_output_8984() -> None:
    """A datetime returned with ``return_dtype=pl.Datetime`` round-trips."""
    payload = datetime(2001, 1, 1)
    frame = pl.DataFrame({"a": [""]})

    out = frame.select(
        pl.col("a").map_elements(lambda _: payload, return_dtype=pl.Datetime),
    )

    assert out["a"].to_list() == [payload]
320
321
322
@pytest.mark.may_fail_cloud  # reason: eager - return_dtype must be set
def test_map_elements_dict_order_10128() -> None:
    """A dict returned by the UDF comes back with keys "c", "b", "a" intact."""
    returned = pl.lit("").map_elements(lambda x: {"c": 1, "b": 2, "a": 3})
    out = pl.select(returned)
    assert out.to_dict(as_series=False) == {"literal": [{"c": 1, "b": 2, "a": 3}]}
326
327
328
@pytest.mark.may_fail_cloud  # reason: eager - return_dtype must be set
def test_map_elements_10237() -> None:
    """A comparison UDF that is False for every row returns all-False."""
    frame = pl.DataFrame({"a": [1, 2, 3]})
    flags = frame.select(pl.all().map_elements(lambda x: x > 50))["a"]
    assert flags.to_list() == [False] * 3
334
335
336
@pytest.mark.may_fail_cloud  # reason: eager - return_dtype must be set
def test_map_elements_on_empty_col_10639() -> None:
    """Aggregating ``map_elements`` over an empty frame gives empty columns."""
    empty = pl.DataFrame({"A": [], "B": []}, schema={"A": pl.Float32, "B": pl.Float32})
    nothing = {"B": [], "Foo": []}

    # Same aggregation under both strategies.
    threaded = empty.group_by("B").agg(
        pl.col("A")
        .map_elements(lambda x: x, return_dtype=pl.Int32, strategy="threading")
        .alias("Foo")
    )
    assert threaded.to_dict(as_series=False) == nothing

    thread_local = empty.group_by("B").agg(
        pl.col("A")
        .map_elements(lambda x: x, return_dtype=pl.Int32, strategy="thread_local")
        .alias("Foo")
    )
    assert thread_local.to_dict(as_series=False) == nothing
358
359
360
def test_map_elements_chunked_14390() -> None:
    """``map_elements`` must handle a series made of more than one chunk."""
    piece = pl.Series([1])
    chunked = pl.concat([piece, piece], rechunk=False)
    # Precondition: the input really is multi-chunk.
    assert chunked.n_chunks() > 1

    with pytest.warns(PolarsInefficientMapWarning):
        as_text = chunked.map_elements(str, return_dtype=pl.String)
        assert_series_equal(as_text, pl.Series(["1", "1"]), check_names=False)
369
370
371
def test_cabbage_strategy_14396() -> None:
    """An unknown ``strategy`` value raises ValueError with a clear message."""
    frame = pl.DataFrame({"x": [1, 2, 3]})
    with pytest.raises(ValueError, match="strategy 'cabbage' is not supported"):
        with pytest.warns(PolarsInefficientMapWarning):
            frame.select(pl.col("x").map_elements(lambda x: 2 * x, strategy="cabbage"))  # type: ignore[arg-type]
378
379
380
def test_map_elements_list_dtype_18472() -> None:
    """Stripping strings inside lists works when the first list holds only None."""
    padded = pl.Series([[None], ["abc ", None]])

    stripped = padded.map_elements(lambda s: [i.strip() if i else None for i in s])

    assert_series_equal(stripped, pl.Series([[None], ["abc", None]]))
385
386
387
def test_map_elements_list_return_dtype() -> None:
    """Lists returned by the UDF take the requested ``List(UInt16)`` dtype."""
    return_dtype = pl.List(pl.UInt16)
    source = pl.Series([[1], [2, 3]])

    incremented = source.map_elements(
        lambda s: [i + 1 for i in s],
        return_dtype=return_dtype,
    )

    assert_series_equal(incremented, pl.Series([[2], [3, 4]], dtype=return_dtype))
397
398
399
def test_map_elements_list_of_named_tuple_15425() -> None:
    """Lists of NamedTuple instances are returned as lists of structs."""

    class Foo(NamedTuple):
        x: int

    list_of_foo = pl.List(pl.Struct({"x": pl.Int64}))
    frame = pl.DataFrame({"a": [0, 1, 2]})

    # Row value n produces n tuples, so the first row yields an empty list.
    result = frame.select(
        pl.col("a").map_elements(
            lambda x: [Foo(i) for i in range(x)],
            return_dtype=list_of_foo,
        )
    )

    expected = pl.DataFrame({"a": [[], [{"x": 0}], [{"x": 0}, {"x": 1}]]})
    assert_frame_equal(result, expected)
412
413
414
def test_map_elements_list_dtype_24006() -> None:
    """Inferred and explicit list dtypes agree when the first result is None."""
    values = [None, [1, 2], [2, 3]]
    dtype = pl.List(pl.Int64)
    indices = pl.Series([0, 1, 2])

    inferred = indices.map_elements(lambda x: values[x])
    explicit = indices.map_elements(lambda x: values[x], return_dtype=dtype)

    assert_series_equal(inferred, explicit)
    assert_series_equal(inferred, pl.Series(values, dtype=dtype))
423
424
425
def test_map_elements_reentrant_mutable_no_deadlock() -> None:
    """Rechunking the mapped series in place from inside the UDF must return.

    There is no assertion: the test passes as long as the call completes.
    """
    series = pl.Series("a", [1, 2, 3])
    series.map_elements(lambda _: series.rechunk(in_place=True)[0])
428
429