Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/py-polars/tests/unit/test_match_to_schema.py
6939 views
1
import pytest
2
3
import polars as pl
4
from polars.testing import assert_frame_equal
5
6
7
def test_match_to_schema() -> None:
    """Matching an identical schema returns the frame unchanged; mismatches raise."""
    columns = [
        pl.Series("a", [1, 2, 3], pl.Int64),
        pl.Series("b", ["A", "B", "C"], pl.String),
    ]
    df = pl.DataFrame(columns)

    # The frame's own schema, then an equivalent hand-written mapping.
    matching_schemas = (
        df.schema,
        {"a": pl.Int64(), "b": pl.String()},
    )
    for schema in matching_schemas:
        result = df.lazy().match_to_schema(schema).collect()
        assert_frame_equal(df, result)

    # Swapped dtypes first, then column names that do not exist in the frame.
    mismatching_schemas = (
        {"a": pl.String(), "b": pl.Int64()},
        {"x": pl.Int64(), "y": pl.String()},
    )
    for schema in mismatching_schemas:
        with pytest.raises(pl.exceptions.SchemaError):
            df.lazy().match_to_schema(schema).collect()
26
27
28
def test_match_to_schema_missing_columns() -> None:
    """Check that `missing_columns="insert"` fills absent columns with nulls.

    Exercises the global string setting, the per-column mapping form, an
    explicit dict schema, and an input frame that lacks two of the three
    target columns.
    """
    df = pl.DataFrame(
        [
            pl.Series("a", [1, 2, 3], pl.Int64),
            pl.Series("b", ["A", "B", "C"], pl.String),
        ]
    )

    # Target: the input plus an all-null Datetime column "c".
    expected = df.with_columns(c=pl.lit(None, dtype=pl.Datetime()))

    result = (
        df.lazy()
        .match_to_schema(
            expected.schema,
            missing_columns="insert",
        )
        .collect()
    )
    assert_frame_equal(expected, result)

    result = (
        df.lazy()
        .match_to_schema(
            expected.schema,
            missing_columns={"c": "insert"},
        )
        .collect()
    )
    # The per-column mapping must yield the same frame as the global setting;
    # without this check the result would be overwritten unverified below.
    assert_frame_equal(expected, result)

    result = (
        df.lazy()
        .match_to_schema(
            {"a": pl.Int64(), "b": pl.String(), "c": pl.Datetime()},
            missing_columns="insert",
        )
        .collect()
    )
    assert_frame_equal(df.with_columns(c=pl.lit(None, dtype=pl.Datetime())), result)

    # Only "b" is present now, so both "a" and "c" have to be inserted.
    df = pl.DataFrame(
        [
            pl.Series("b", ["A", "B", "C"], pl.String),
        ]
    )

    result = (
        df.lazy()
        .match_to_schema(
            {"a": pl.Int64(), "b": pl.String(), "c": pl.Datetime()},
            missing_columns="insert",
        )
        .collect()
    )
    assert_frame_equal(
        df.select(
            a=pl.lit(None, dtype=pl.Int64()),
            b=pl.col.b,
            c=pl.lit(None, dtype=pl.Datetime()),
        ),
        result,
    )
89
90
91
def test_match_to_schema_extra_columns() -> None:
    """Extra input columns raise by default and are dropped with "ignore"."""
    series = [
        pl.Series("a", [1, 2, 3], pl.Int64),
        pl.Series("b", ["A", "B", "C"], pl.String),
        pl.Series("c", [["A", "B"], ["C"], ["D"]], pl.List(pl.String)),
    ]
    df = pl.DataFrame(series)

    # With the default setting the error message names the surplus columns.
    only_a = df.select(["a"])
    with pytest.raises(pl.exceptions.SchemaError, match=r'`match_to_schema`: "b", "c"'):
        df.lazy().match_to_schema(only_a.schema).collect()

    # With "ignore", the output holds exactly the columns the schema lists.
    for kept in (["a"], ["a", "b", "c"], ["a", "c"]):
        expected = df.select(kept)
        matched = df.lazy().match_to_schema(expected.schema, extra_columns="ignore")
        assert_frame_equal(expected, matched.collect())
121
122
123
def test_match_to_schema_missing_struct_fields() -> None:
    """A struct field absent from the input needs `missing_struct_fields="insert"`."""
    struct_dtype = pl.Struct({"x": pl.String()})
    df = pl.DataFrame(
        [
            pl.Series("a", [1, 2, 3], pl.Int64),
            pl.Series("b", [{"x": "A"}, {"x": "B"}, {"x": "C"}], struct_dtype),
            pl.Series("c", [["A", "B"], ["C"], ["D"]], pl.List(pl.String)),
        ]
    )

    # Target frame: struct column "b" gains an all-null Datetime field "y".
    null_y = pl.repeat(pl.lit(None, dtype=pl.Datetime()), pl.len())
    expected = df.with_columns(pl.col("b").struct.with_fields(y=null_y))
    target = expected.schema

    # The default setting rejects the narrower struct.
    with pytest.raises(pl.exceptions.SchemaError):
        df.lazy().match_to_schema(target).collect()

    # The column-level option does not reach into struct fields.
    with pytest.raises(pl.exceptions.SchemaError):
        df.lazy().match_to_schema(target, missing_columns="insert").collect()

    matched = df.lazy().match_to_schema(target, missing_struct_fields="insert")
    assert_frame_equal(expected, matched.collect())
152
153
154
def test_match_to_schema_extra_struct_fields() -> None:
    """A surplus struct field in the input needs `extra_struct_fields="ignore"`."""
    struct_dtype = pl.Struct({"x": pl.String()})
    expected = pl.DataFrame(
        [
            pl.Series("a", [1, 2, 3], pl.Int64),
            pl.Series("b", [{"x": "A"}, {"x": "B"}, {"x": "C"}], struct_dtype),
            pl.Series("c", [["A", "B"], ["C"], ["D"]], pl.List(pl.String)),
        ]
    )
    target = expected.schema

    # Input frame: the same data, but struct "b" carries an additional
    # all-null Datetime field "y" that the target schema does not know.
    null_y = pl.repeat(pl.lit(None, dtype=pl.Datetime()), pl.len())
    df = expected.with_columns(pl.col("b").struct.with_fields(y=null_y))

    # The default setting rejects the wider struct.
    with pytest.raises(pl.exceptions.SchemaError):
        df.lazy().match_to_schema(target).collect()

    # The column-level option does not reach into struct fields.
    with pytest.raises(pl.exceptions.SchemaError):
        df.lazy().match_to_schema(target, extra_columns="ignore").collect()

    matched = df.lazy().match_to_schema(target, extra_struct_fields="ignore")
    assert_frame_equal(expected, matched.collect())
183
184
185
def test_match_to_schema_int_upcast() -> None:
    """An Int32 column matches an Int64 schema only with `integer_cast="upcast"`."""
    struct_dtype = pl.Struct({"x": pl.String()})
    df = pl.DataFrame(
        [
            pl.Series("a", [1, 2, 3], pl.Int32),
            pl.Series("b", [{"x": "A"}, {"x": "B"}, {"x": "C"}], struct_dtype),
            pl.Series("c", [["A", "B"], ["C"], ["D"]], pl.List(pl.String)),
        ]
    )

    expected = df.with_columns(pl.col("a").cast(pl.Int64))
    target = expected.schema

    # The default setting rejects the narrower integer dtype.
    with pytest.raises(pl.exceptions.SchemaError):
        df.lazy().match_to_schema(target).collect()

    # Permitting float upcasts has no effect on an integer column.
    with pytest.raises(pl.exceptions.SchemaError):
        df.lazy().match_to_schema(target, float_cast="upcast").collect()

    matched = df.lazy().match_to_schema(target, integer_cast="upcast")
    assert_frame_equal(expected, matched.collect())
206
207
208
def test_match_to_schema_float_upcast() -> None:
    """A Float32 column matches a Float64 schema only with `float_cast="upcast"`."""
    struct_dtype = pl.Struct({"x": pl.String()})
    df = pl.DataFrame(
        [
            pl.Series("a", [1.0, 2.0, 3.0], pl.Float32()),
            pl.Series("b", [{"x": "A"}, {"x": "B"}, {"x": "C"}], struct_dtype),
            pl.Series("c", [["A", "B"], ["C"], ["D"]], pl.List(pl.String)),
        ]
    )

    expected = df.with_columns(pl.col("a").cast(pl.Float64()))
    target = expected.schema

    # The default setting rejects the narrower float dtype.
    with pytest.raises(pl.exceptions.SchemaError):
        df.lazy().match_to_schema(target).collect()

    # Permitting integer upcasts has no effect on a float column.
    with pytest.raises(pl.exceptions.SchemaError):
        df.lazy().match_to_schema(target, integer_cast="upcast").collect()

    matched = df.lazy().match_to_schema(target, float_cast="upcast")
    assert_frame_equal(expected, matched.collect())
229
230