Book a Demo!
CoCalc Logo Icon
Store | Features | Docs | Share | Support | News | About | Policies | Sign Up | Sign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/py-polars/tests/unit/operations/test_drop.py
6939 views
1
from typing import Any
2
3
import pytest
4
5
import polars as pl
6
import polars.selectors as cs
7
from polars.testing import assert_frame_equal
8
9
10
def test_drop() -> None:
    """Check `DataFrame.drop` and `DataFrame.drop_in_place` on a single column."""
    data = {"a": [2, 1, 3], "b": ["a", "b", "c"], "c": [1, 2, 3]}

    # `drop` returns a new frame without the named column.
    df = pl.DataFrame(data)
    df = df.drop("a")
    assert df.shape == (3, 2)
    # A shape check alone would also pass if the wrong column were removed.
    assert df.columns == ["b", "c"]

    # `drop_in_place` hands back the removed column and mutates the frame.
    df = pl.DataFrame(data)
    s = df.drop_in_place("a")
    assert s.name == "a"
    assert s.to_list() == [2, 1, 3]
    assert df.columns == ["b", "c"]
18
19
20
def test_drop_explode_6641() -> None:
    """Lazy explode -> struct -> drop -> concat_list keeps the struct values.

    The numeric suffix in the test name presumably refers to an upstream
    issue number.
    """
    data = {
        "chromosome": ["chr1"] * 2,
        "identifier": [["chr1:10426:10429:ACC>A"], ["chr1:10426:10429:ACC>*"]],
        "alternate": [["A"], ["T"]],
        "quality": pl.Series([None, None], dtype=pl.Float32()),
    }
    lf = pl.DataFrame(data).lazy()

    # The list columns are exploded, packed into a struct and then dropped.
    list_columns = ["identifier", "alternate"]
    query = (
        lf.explode(list_columns)
        .with_columns(pl.struct(list_columns).alias("test"))
        .drop(list_columns)
        .select(pl.concat_list([pl.col("test"), pl.col("test")]))
    )
    result = query.collect().to_dict(as_series=False)

    # Each output row holds the same struct twice (it was concatenated with itself).
    first = {"identifier": "chr1:10426:10429:ACC>A", "alternate": "A"}
    second = {"identifier": "chr1:10426:10429:ACC>*", "alternate": "T"}
    expected = {"test": [[first, first], [second, second]]}
    assert result == expected
48
49
50
@pytest.mark.parametrize("subset", ["foo", ["foo"], {"foo"}])
def test_drop_nulls(subset: Any) -> None:
    """`drop_nulls` removes rows with nulls, optionally limited to `subset`."""
    frame = pl.DataFrame(
        {"foo": [1, 2, 3], "bar": [6, None, 8], "ham": ["a", "b", "c"]}
    )

    # Without a subset, the row holding the null in "bar" is removed.
    without_nulls = pl.DataFrame({"foo": [1, 3], "bar": [6, 8], "ham": ["a", "c"]})
    assert_frame_equal(frame.drop_nulls(), without_nulls)

    # Rows are only dropped when null in column 'foo', which has no nulls,
    # so the frame comes back unchanged.
    assert_frame_equal(frame.drop_nulls(subset), frame)
79
80
81
def test_drop_nulls_lazy() -> None:
    """`LazyFrame.drop_nulls` works without a subset and with a selector."""
    source = pl.LazyFrame(
        {
            "foo": [1, 2, 3],
            "bar": [6, None, 8],
            "ham": ["a", "b", "c"],
        }
    )
    expected = pl.LazyFrame(
        {
            "foo": [1, 3],
            "bar": [6, 8],
            "ham": ["a", "c"],
        }
    )

    assert_frame_equal(source.drop_nulls(), expected)

    # Restricting the check via a selector gives the same result here.
    assert_frame_equal(source.drop_nulls(cs.contains("a")), expected)
90
91
92
def test_drop_nulls_misc() -> None:
    """Expression-level `drop_nulls` strips the nulls from a column."""
    values = [None, 1, 2, 3, None, 4, 5, None]
    result = pl.DataFrame({"nrs": values}).select(pl.col("nrs").drop_nulls())
    assert result.to_dict(as_series=False) == {"nrs": [1, 2, 3, 4, 5]}
97
98
99
def test_drop_nulls_empty_subset() -> None:
    """An empty subset (list or tuple) leaves the frame unchanged."""
    frame = pl.DataFrame({"a": [1, None]})
    for empty_subset in ([], ()):
        assert_frame_equal(frame.drop_nulls(empty_subset), frame)
103
104
105
def test_drop_columns() -> None:
    """`drop` accepts a list, a selector, a single name, varargs and a set."""
    data = {"a": [1], "b": [2], "c": [3]}

    # Lazy frames: (argument passed to `drop`, column names left afterwards).
    lazy_cases = [
        (["a", "b"], ["c"]),
        (~cs.starts_with("c"), ["c"]),
        ("a", ["b", "c"]),
    ]
    for columns, remaining in lazy_cases:
        out = pl.LazyFrame(data).drop(columns)
        assert out.collect_schema().names() == remaining

    # Eager frames: several positional names.
    out2 = pl.DataFrame(data).drop("a", "b")
    assert out2.collect_schema().names() == ["c"]

    # Eager frames: a set naming every column leaves nothing behind.
    out2 = pl.DataFrame(data).drop({"a", "b", "c"})
    assert out2.collect_schema().names() == []
120
121
122
@pytest.mark.parametrize("lazy", [True, False])
def test_drop_nans(lazy: bool) -> None:
    """`drop_nans` removes NaN rows but keeps null rows, eager and lazy alike."""
    frame_cls = pl.LazyFrame if lazy else pl.DataFrame
    nan = float("nan")
    df = frame_cls(
        {
            "a": [1.0, nan, 3.0, 4.0, None],
            "b": [10000, 20000, 30000, 40000, None],
            "c": [-90.5, 25.0, 0.0, nan, None],
        }
    )

    # Rows with a NaN in "a" or "c" are gone; the all-null row stays.
    nan_free = frame_cls(
        {
            "a": [1.0, 3.0, None],
            "b": [10000, 30000, None],
            "c": [-90.5, 0.0, None],
        }
    )
    assert_frame_equal(nan_free, df.drop_nans())

    # Restricting to "c" keeps the row whose NaN sits in "a".
    nan_free_in_c = frame_cls(
        {
            "a": [1.0, nan, 3.0, None],
            "b": [10000, 20000, 30000, None],
            "c": [-90.5, 25.0, 0.0, None],
        }
    )
    for subset in (["c"], cs.ends_with("c")):
        assert_frame_equal(nan_free_in_c, df.drop_nans(subset=subset))

    # Checking "a" and "c" gives the same result as checking every column.
    for subset in (["a", "c"], cs.float()):
        assert_frame_equal(nan_free, df.drop_nans(subset=subset))
160
161
162
def test_drop_nan_ignore_null_3525() -> None:
    """Expression-level `drop_nans` removes NaN but leaves nulls in place."""
    df = pl.DataFrame({"a": [1.0, float("nan"), 2.0, None, 3.0, 4.0]})
    remaining = df.select(pl.col("a").drop_nans()).to_series().to_list()
    assert remaining == [1.0, 2.0, None, 3.0, 4.0]
171
172
173
def test_drop_nans_empty_subset() -> None:
    """An empty subset (list or tuple) leaves the frame unchanged."""
    frame = pl.DataFrame({"a": [1.0, float("NaN")]})
    for empty_subset in ([], ()):
        assert_frame_equal(frame.drop_nans(empty_subset), frame)
177
178
179
def test_drop_without_parameters() -> None:
    """Calling `drop` with no columns returns the frame unchanged."""
    frame = pl.DataFrame({"a": [1, 2]})
    assert_frame_equal(frame.drop(), frame)

    # Unpacking an empty list is the same as passing no arguments at all.
    no_columns: list[str] = []
    assert_frame_equal(frame.lazy().drop(*no_columns), frame.lazy())
183
184
185
def test_drop_strict() -> None:
    """`drop` raises on unknown columns unless `strict=False` is passed."""
    df = pl.DataFrame({"a": [1, 2]})

    # Existing column: dropping the only column leaves no columns behind.
    assert df.drop("a").columns == []

    # Unknown column without `strict=False`: the call must raise.
    with pytest.raises(pl.exceptions.ColumnNotFoundError, match="b"):
        df.drop("b")

    # Non-strict mode still drops columns that exist ...
    assert df.drop("a", strict=False).columns == []
    # ... and ignores unknown names, returning the frame unchanged.
    assert_frame_equal(df.drop("b", strict=False), df)
195
196
197
def test_drop_regex_14069() -> None:
    """A regex selector passed to `drop` removes every matching column."""
    df = pl.DataFrame({"a": 1, "a2": 2, "b": 3})
    starts_with_a = cs.matches("^a.*$")
    remaining = df.drop(starts_with_a)
    assert remaining.columns == ["b"]
200
201
202
def test_drop_invalid_selector_19023() -> None:
    """Dropping `selector + []` must raise `InvalidOperationError`."""
    data = {"x": [1, 2], "x_b": [3, 4], "y_b": [10, 20], "z": ["a", "b"]}
    frame = pl.DataFrame(data=data)
    expected_error = pytest.raises(
        pl.exceptions.InvalidOperationError, match="is not a selector"
    )
    # The whole expression stays inside the context manager so the error is
    # caught regardless of whether the addition or the `drop` call raises it.
    with expected_error:
        frame.drop(pl.selectors.ends_with("_b") + [])  # type: ignore[arg-type]
208
209