Book a Demo!
CoCalc Logo Icon
Store | Features | Docs | Share | Support | News | About | Policies | Sign Up | Sign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/py-polars/tests/unit/functions/test_concat.py
8421 views
1
import io
2
from typing import IO
3
4
import pytest
5
6
import polars as pl
7
from polars.testing import assert_frame_equal
8
9
10
@pytest.mark.may_fail_cloud  # reason: @serialize-stack-overflow
@pytest.mark.slow
def test_concat_expressions_stack_overflow() -> None:
    """Select a concat of many literal expressions and check the output shape."""
    literal_count = 10000
    literals = [pl.lit(value) for value in range(literal_count)]

    selected = pl.select(pl.concat(literals))
    assert selected.shape == (literal_count, 1)
18
19
20
@pytest.mark.may_fail_cloud  # reason: @serialize-stack-overflow
@pytest.mark.slow
def test_concat_lf_stack_overflow() -> None:
    """Nest many lazy concats one at a time and check the collected shape."""
    n = 1000
    stacked = pl.DataFrame({"a": 0}).lazy()

    for i in range(n):
        appended = pl.DataFrame({"a": i}).lazy()
        stacked = pl.concat([stacked, appended])

    # One seed row plus one row per loop iteration.
    collected = stacked.collect()
    assert collected.shape == (n + 1, 1)
29
30
31
def test_concat_horizontally_strict() -> None:
    """Check strict vs. non-strict horizontal concat of frames of unequal height."""
    one_row = pl.DataFrame({"c": [11], "d": [42]})  # 1 vs N (may broadcast)
    two_rows = pl.DataFrame({"c": [11, 12], "d": [42, 24]})  # 2 vs N
    three_rows = pl.DataFrame({"a": [0, 1, 2], "b": [1, 2, 3]})
    shape_error = pl.exceptions.ShapeError

    # Eager inputs: strict mode must reject both height mismatches.
    for shorter in (one_row, two_rows):
        with pytest.raises(shape_error):
            pl.concat([shorter, three_rows], how="horizontal", strict=True)

    # Lazy inputs: same expectation once the plan is built and collected.
    for shorter in (one_row, two_rows):
        with pytest.raises(shape_error):
            pl.concat(
                [shorter.lazy(), three_rows.lazy()], how="horizontal", strict=True
            ).collect()

    # Non-strict mode: the shorter frame's columns are expected to be null-padded.
    padded = pl.concat([one_row, three_rows], how="horizontal", strict=False)
    assert padded.to_dict(as_series=False) == {
        "a": [0, 1, 2],
        "b": [1, 2, 3],
        "c": [11, None, None],
        "d": [42, None, None],
    }

    padded = pl.concat([two_rows, three_rows], how="horizontal", strict=False)
    assert padded.to_dict(as_series=False) == {
        "a": [0, 1, 2],
        "b": [1, 2, 3],
        "c": [11, 12, None],
        "d": [42, 24, None],
    }
62
63
64
def test_concat_vertically_relaxed() -> None:
    """`vertical_relaxed` upcasts mismatched column dtypes in either input order."""
    narrow = pl.DataFrame(
        data={"a": [1, 2, 3], "b": [True, False, None]},
        schema={"a": pl.Int8, "b": pl.Boolean},
    )
    wide = pl.DataFrame(
        data={"a": [43, 2, 3], "b": [32, 1, None]},
        schema={"a": pl.Int16, "b": pl.Int64},
    )
    integer_schema = {"a": pl.Int16, "b": pl.Int64}

    stacked = pl.concat([narrow, wide], how="vertical_relaxed")
    assert stacked.schema == integer_schema
    assert stacked.to_dict(as_series=False) == {
        "a": [1, 2, 3, 43, 2, 3],
        "b": [1, 0, None, 32, 1, None],
    }

    stacked = pl.concat([wide, narrow], how="vertical_relaxed")
    assert stacked.schema == integer_schema
    assert stacked.to_dict(as_series=False) == {
        "a": [43, 2, 3, 1, 2, 3],
        "b": [32, 1, None, 1, 0, None],
    }

    # Integer columns combined with float columns.
    ints = pl.DataFrame({"a": [1, 2], "b": [2, 1]})
    floats = pl.DataFrame({"a": [1.0, 0.2], "b": [None, 0.1]})
    float_schema = {"a": pl.Float64, "b": pl.Float64}

    stacked = pl.concat([ints, floats], how="vertical_relaxed")
    assert stacked.schema == float_schema
    assert stacked.to_dict(as_series=False) == {
        "a": [1.0, 2.0, 1.0, 0.2],
        "b": [2.0, 1.0, None, 0.1],
    }

    stacked = pl.concat([floats, ints], how="vertical_relaxed")
    assert stacked.schema == float_schema
    assert stacked.to_dict(as_series=False) == {
        "a": [1.0, 0.2, 1.0, 2.0],
        "b": [None, 0.1, 2.0, 1.0],
    }
101
102
103
def test_concat_group_by() -> None:
    """Concat of two columns inside a group_by agg gives one list per group."""
    frame = pl.DataFrame(
        {
            "g": [0, 0, 0, 0, 1, 1, 1, 1],
            "a": [0, 1, 2, 3, 4, 5, 6, 7],
            "b": [8, 9, 10, 11, 12, 13, 14, 15],
        }
    )
    combined = pl.concat([pl.col.a, pl.col.b])
    aggregated = frame.group_by("g").agg(combined)

    expected = pl.DataFrame(
        {
            "g": [0, 1],
            "a": [[0, 1, 2, 3, 8, 9, 10, 11], [4, 5, 6, 7, 12, 13, 14, 15]],
        }
    )
    # Row order is deliberately not compared.
    assert_frame_equal(aggregated, expected, check_row_order=False)
123
124
125
def test_concat_19877() -> None:
    """Concat of two column expressions in a select stacks them into one column."""
    source = pl.DataFrame({"a": [1, 2], "b": [3, 4]})
    stacked_expr = pl.concat([pl.col("a"), pl.col("b")])

    selected = source.select(stacked_expr)
    expected = pl.DataFrame({"a": [1, 2, 3, 4]})
    assert_frame_equal(selected, expected)
129
130
131
def test_concat_zip_series_21980() -> None:
    """A concat expression can be selected alongside an unnamed Series."""
    single_row = pl.DataFrame({"x": 1, "y": 2})
    stacked_expr = pl.concat([pl.col.x, pl.col.y])

    selected = single_row.select(stacked_expr, pl.Series([3, 4]))
    expected = pl.DataFrame({"x": [1, 2], "": [3, 4]})
    assert_frame_equal(selected, expected)
135
136
137
def test_concat_invalid_schema_err_20355() -> None:
    """Collecting a concat of mismatched lazy schemas on the streaming engine raises."""
    two_columns = pl.LazyFrame({"x": [1], "y": [None]})
    one_column = pl.LazyFrame({"y": [1]})

    with pytest.raises(pl.exceptions.InvalidOperationError):
        pl.concat([two_columns, one_column]).collect(engine="streaming")
142
143
144
def test_concat_df() -> None:
    """Cover basic DataFrame concat: rechunking, generators, and error cases."""
    base = pl.DataFrame({"a": [2, 1, 3], "b": [1, 2, 3], "c": [1, 2, 3]})

    doubled = pl.concat([base, base], rechunk=True)
    assert doubled.shape == (6, 3)
    assert doubled.n_chunks() == 1
    assert doubled.rows() == base.rows() * 2

    unmerged = pl.concat([base, base], rechunk=False)
    assert unmerged.n_chunks() == 2

    # concat from generator of frames
    from_generator = pl.concat(items=(base for _ in range(2)))
    assert_frame_equal(doubled, from_generator)

    # check that a frame is not modified following concat of itself
    repeated = pl.from_records(((1, 2), (1, 2)))
    _ = pl.concat([repeated, repeated, repeated])

    assert repeated.shape == (2, 2)
    assert repeated.rows() == [(1, 1), (2, 2)]

    # misc error conditions
    with pytest.raises(ValueError):
        _ = pl.concat([])

    with pytest.raises(ValueError):
        pl.concat([base, base], how="rubbish")  # type: ignore[arg-type]
170
171
172
def test_concat_to_empty() -> None:
    """Concat of an empty frame with a populated one keeps the populated data."""
    empty = pl.DataFrame([])
    populated = pl.DataFrame({"a": [1]})

    combined = pl.concat([empty, populated])
    assert combined.to_dict(as_series=False) == {"a": [1]}
176
177
178
def test_concat_multiple_parquet_inmem() -> None:
    """Reading several in-memory parquet sources matches the eager concat."""
    first = pl.DataFrame(
        {
            "a": [1, 2, 3],
            "b": ["xyz", "abc", "wow"],
        }
    )
    second = pl.DataFrame(
        {
            "a": [5, 6, 7],
            "b": ["a", "few", "entries"],
        }
    )
    expected = pl.concat([first, second])

    first_buf = io.BytesIO()
    second_buf = io.BytesIO()
    first.write_parquet(first_buf)
    second.write_parquet(second_buf)

    sources: list[IO[bytes]] = [first_buf, second_buf]

    # Rewind both buffers before handing them to the reader.
    for buf in sources:
        buf.seek(0)
    assert_frame_equal(pl.read_parquet(sources), expected)

    for buf in sources:
        buf.seek(0)
    assert_frame_equal(pl.read_parquet(sources, use_pyarrow=True), expected)

    # Same checks with the raw bytes instead of file-like objects.
    for buf in sources:
        buf.seek(0)
    first_bytes = first_buf.read()
    second_bytes = second_buf.read()

    assert_frame_equal(pl.read_parquet([first_bytes, second_bytes]), expected)
    assert_frame_equal(
        pl.read_parquet([first_bytes, second_bytes], use_pyarrow=True), expected
    )
219
220
221
def test_concat_series() -> None:
    """Concat of a Series with itself doubles the length and leaves the input alone."""
    original = pl.Series("a", [2, 1, 3])

    doubled = pl.concat([original, original])
    assert doubled.len() == 6
    # the input Series must remain unchanged
    assert original.len() == 3
227
228
229
def test_concat_null_20501() -> None:
    """Lazy concat of a string column with an all-null column keeps both rows."""
    with_text = pl.DataFrame({"id": [1], "value": ["foo"]})
    with_null = pl.DataFrame({"id": [2], "value": [None]})

    collected = pl.concat([with_text.lazy(), with_null.lazy()]).collect()
    expected = {
        "id": [1, 2],
        "value": ["foo", None],
    }
    assert collected.to_dict(as_series=False) == expected
237
238
239
def test_concat_single_element() -> None:
    """Concat of a one-item list hands back the very same object."""
    frame = pl.DataFrame({"a": [1, 2, 3]})
    concatenated_frame = pl.concat([frame])
    assert concatenated_frame is frame

    series = pl.Series("test", [1, 2, 3])
    concatenated_series = pl.concat([series])
    assert concatenated_series is series
247
248
249
def test_concat_diagonal() -> None:
    """Diagonal concat unions the columns and null-fills the missing cells."""
    frames = [
        pl.DataFrame({"a": [1, 2], "b": [3, 4]}),
        pl.DataFrame({"a": [5, 6], "c": [7, 8]}),
        pl.DataFrame({"b": [9, 10], "c": [11, 12]}),
    ]

    merged = pl.concat(frames, how="diagonal")

    expected = pl.DataFrame(
        {
            "a": [1, 2, 5, 6, None, None],
            "b": [3, 4, None, None, 9, 10],
            "c": [None, None, 7, 8, 11, 12],
        }
    )
    assert_frame_equal(merged, expected)
263
264
265
def test_concat_diagonal_relaxed() -> None:
    """`diagonal_relaxed` unions the columns and upcasts conflicting dtypes."""
    ints = pl.DataFrame(
        {"a": [1, 2], "c": [10, 20]}, schema={"a": pl.Int32, "c": pl.Int64}
    )
    floats = pl.DataFrame(
        {"a": [3.5, 4.5], "b": [30.1, 40.2]}, schema={"a": pl.Float64, "b": pl.Float32}
    )
    untyped = pl.DataFrame({"b": [5, 6], "c": [50, 60]})

    merged = pl.concat([ints, floats, untyped], how="diagonal_relaxed")

    # Check the resolved dtype of every output column.
    merged_schema = merged.schema
    assert merged_schema["a"] == pl.Float64
    assert merged_schema["b"] == pl.Float64
    assert merged_schema["c"] == pl.Int64

    expected = pl.DataFrame(
        {
            "a": [1.0, 2.0, 3.5, 4.5, None, None],
            "c": [10, 20, None, None, 50, 60],
            "b": [None, None, 30.1, 40.2, 5.0, 6.0],
        }
    )
    assert_frame_equal(merged, expected)
289
290
291
def test_concat_horizontal() -> None:
    """Horizontal concat of unequal-height frames null-pads the shorter columns."""
    three_rows = pl.DataFrame({"a": [1, 2, 3]})
    two_rows = pl.DataFrame({"b": [4, 5]})
    four_rows = pl.DataFrame({"c": [6, 7, 8, 9]})

    side_by_side = pl.concat([three_rows, two_rows, four_rows], how="horizontal")

    expected = pl.DataFrame(
        {"a": [1, 2, 3, None], "b": [4, 5, None, None], "c": [6, 7, 8, 9]}
    )
    assert_frame_equal(side_by_side, expected)
301
302
303
def test_concat_align_no_common_columns() -> None:
    """`align` concat of frames with disjoint columns raises."""
    only_a = pl.DataFrame({"a": [1, 2]})
    only_b = pl.DataFrame({"b": [3, 4]})
    expected_error = pl.exceptions.InvalidOperationError

    with pytest.raises(expected_error, match="requires at least one common column"):
        pl.concat([only_a, only_b], how="align")
311
312
313
def test_concat_align_lazy_frames() -> None:
    """`align` concat of LazyFrames stays lazy and lines rows up on the shared column."""
    left = pl.DataFrame({"id": [1, 2], "x": [3, 4]}).lazy()
    right = pl.DataFrame({"id": [2, 3], "y": [5, 6]}).lazy()

    aligned = pl.concat([left, right], how="align")
    assert isinstance(aligned, pl.LazyFrame)

    expected = pl.DataFrame({"id": [1, 2, 3], "x": [3, 4, None], "y": [None, 5, 6]})
    # Row order is deliberately not compared.
    assert_frame_equal(aligned.collect(), expected, check_row_order=False)
323
324
325
def test_concat_lazyframe_horizontal() -> None:
    """Horizontal concat of LazyFrames stays lazy and null-pads the shorter input."""
    two_rows = pl.DataFrame({"a": [1, 2]}).lazy()
    three_rows = pl.DataFrame({"b": [3, 4, 5]}).lazy()

    side_by_side = pl.concat([two_rows, three_rows], how="horizontal")
    assert isinstance(side_by_side, pl.LazyFrame)

    expected = pl.DataFrame({"a": [1, 2, None], "b": [3, 4, 5]})
    assert_frame_equal(side_by_side.collect(), expected)
335
336
337
def test_concat_lazyframe_diagonal() -> None:
    """Diagonal concat of LazyFrames stays lazy and null-fills missing columns."""
    with_b = pl.DataFrame({"a": [1, 2], "b": [3, 4]}).lazy()
    with_c = pl.DataFrame({"a": [5, 6], "c": [7, 8]}).lazy()

    merged = pl.concat([with_b, with_c], how="diagonal")
    assert isinstance(merged, pl.LazyFrame)

    expected = pl.DataFrame(
        {"a": [1, 2, 5, 6], "b": [3, 4, None, None], "c": [None, None, 7, 8]}
    )
    assert_frame_equal(merged.collect(), expected)
349
350
351
def test_concat_series_invalid_strategy() -> None:
    """Series concat with a non-vertical strategy raises a ValueError."""
    left = pl.Series("a", [1, 2, 3])
    right = pl.Series("b", [4, 5, 6])
    message = "Series only supports 'vertical' concat strategy"

    with pytest.raises(ValueError, match=message):
        pl.concat([left, right], how="horizontal")

    with pytest.raises(ValueError, match=message):
        pl.concat([left, right], how="diagonal")
364
365
366
def test_concat_invalid_how_parameter() -> None:
    """An unknown `how` value for DataFrame concat raises a ValueError."""
    frames = [pl.DataFrame({"a": [1, 2]}), pl.DataFrame({"a": [3, 4]})]

    with pytest.raises(ValueError, match="DataFrame `how` must be one of"):
        pl.concat(frames, how="invalid_strategy")  # type: ignore[arg-type]
372
373
374
def test_concat_unsupported_type() -> None:
    """Concat of plain integers raises a TypeError."""
    not_frames = [1, 2, 3]
    with pytest.raises(TypeError, match="did not expect type"):
        pl.concat(not_frames)  # type: ignore[type-var]
377
378
379
def test_concat_expressions() -> None:
    """An aliased concat of two column expressions stacks both columns."""
    stacked = pl.concat([pl.col("a"), pl.col("b")])
    source = pl.DataFrame({"a": [1, 2], "b": [3, 4]})

    selected = source.select(stacked.alias("concatenated"))

    assert_frame_equal(selected, pl.DataFrame({"concatenated": [1, 2, 3, 4]}))
389
390
391
def test_concat_with_empty_dataframes() -> None:
    """Concat with a zero-row frame on either side returns the populated data."""
    no_rows = pl.DataFrame(schema={"a": pl.Int64, "b": pl.String})
    populated = pl.DataFrame({"a": [1, 2], "b": ["x", "y"]})

    empty_first = pl.concat([no_rows, populated])
    assert_frame_equal(empty_first, populated)

    empty_last = pl.concat([populated, no_rows])
    assert_frame_equal(empty_last, populated)
400
401
402
def test_concat_with_empty_dataframes_nonstrict_25727() -> None:
    """Non-strict horizontal concat of a lazy frame with an empty lazy frame."""
    populated = pl.LazyFrame({"a": [1, 2], "b": ["x", "y"]})

    # Second input has no selected columns.
    no_columns = populated.select([])
    combined = pl.concat([populated, no_columns], how="horizontal", strict=False)
    assert_frame_equal(combined, pl.LazyFrame({"a": [1, 2], "b": ["x", "y"]}))

    # First input has a schema but no data; its column is expected to be all null.
    schema_only = pl.LazyFrame(schema={"c": pl.Int64})
    combined = pl.concat([schema_only, populated], how="horizontal", strict=False)
    expected = pl.LazyFrame(
        {"c": [None, None], "a": [1, 2], "b": ["x", "y"]},
        schema={"c": pl.Int64, "a": pl.Int64, "b": pl.String},
    )
    assert_frame_equal(combined, expected)
415
416