Book a Demo!
CoCalc Logo Icon
Store | Features | Docs | Share | Support | News | About | Policies | Sign Up | Sign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/py-polars/tests/unit/operations/namespaces/test_categorical.py
8445 views
1
from __future__ import annotations
2
3
from io import BytesIO
4
from typing import TYPE_CHECKING
5
6
import pytest
7
8
import polars as pl
9
from polars.testing import assert_frame_equal, assert_series_equal
10
11
if TYPE_CHECKING:
12
from polars._typing import PolarsDataType
13
14
15
def test_categorical_lexical_sort() -> None:
    """Check that sorting on a Categorical column orders rows by category string."""
    frame = pl.DataFrame(
        {"cats": ["z", "z", "k", "a", "b"], "vals": [3, 1, 2, 2, 3]}
    ).with_columns(pl.col("cats").cast(pl.Categorical()))
    # Cast back to String so results can be compared against plain-string frames.
    cats_as_str = pl.col("cats").cast(pl.String)

    # Single key: the categorical column alone; the dtype must survive the sort.
    by_cats = frame.sort(["cats"])
    assert by_cats["cats"].dtype == pl.Categorical
    assert_frame_equal(
        by_cats.with_columns(cats_as_str),
        pl.DataFrame({"cats": ["a", "b", "k", "z", "z"], "vals": [2, 3, 2, 3, 1]}),
    )

    # Two keys: categorical first, "vals" breaks the tie between the "z" rows.
    by_cats_vals = frame.sort(["cats", "vals"])
    assert_frame_equal(
        by_cats_vals.with_columns(cats_as_str),
        pl.DataFrame({"cats": ["a", "b", "k", "z", "z"], "vals": [2, 3, 2, 1, 3]}),
    )

    # Two keys: "vals" first, categorical breaks ties within equal values.
    by_vals_cats = frame.sort(["vals", "cats"])
    assert_frame_equal(
        by_vals_cats.with_columns(cats_as_str),
        pl.DataFrame({"cats": ["z", "a", "k", "b", "z"], "vals": [1, 2, 2, 3, 3]}),
    )

    # Same check on a standalone Series.
    series = pl.Series(["a", "c", "a", "b", "a"], dtype=pl.Categorical())
    sorted_values = series.sort().cast(pl.String).to_list()
    assert sorted_values == ["a", "a", "a", "b", "c"]
48
49
50
def test_categorical_lexical_ordering_after_concat() -> None:
    """Check that concatenated Categorical columns still sort by category string."""
    key2_as_cat = pl.col("key2").cast(pl.Categorical())

    left = pl.DataFrame(
        [pl.Series("key1", [8, 5]), pl.Series("key2", ["fox", "baz"])]
    ).lazy()
    right = pl.DataFrame(
        [pl.Series("key1", [6, 8, 6]), pl.Series("key2", ["fox", "foo", "bar"])]
    ).lazy()

    # Each input is cast independently before the lazy frames are concatenated.
    combined = (
        pl.concat([left.with_columns(key2_as_cat), right.with_columns(key2_as_cat)])
        .select(pl.col("key2"))
        .collect()
    )

    sorted_keys = combined.sort("key2").to_dict(as_series=False)
    assert sorted_keys == {"key2": ["bar", "baz", "foo", "fox", "fox"]}
68
69
70
def test_sort_categoricals_6014_lexical() -> None:
    """Check that a single Categorical key column sorts by category string."""
    # create lexically-ordered categorical
    unsorted = pl.DataFrame({"key": ["bbb", "aaa", "ccc"]})
    frame = unsorted.with_columns(pl.col("key").cast(pl.Categorical()))

    result = frame.sort("key").to_dict(as_series=False)
    assert result == {"key": ["aaa", "bbb", "ccc"]}
78
79
80
def test_categorical_get_categories() -> None:
    """Check that `cat.get_categories` reports every value present in the Series."""
    series = pl.Series(
        "cats",
        ["foo", "bar", "foo", "foo", "ham"],
        dtype=pl.Categorical,
    )
    categories = set(series.cat.get_categories().to_list())
    # Superset rather than equality: extra categories are tolerated by this check.
    assert {"foo", "bar", "ham"} <= categories
83
84
85
def test_cat_to_local() -> None:
    """Check that `cat.to_local` returns a Series equal to the original."""
    original = pl.Series(["a", "b", "a"], dtype=pl.Categorical)
    localized = original.cat.to_local()
    assert_series_equal(original, localized)
88
89
90
def test_cat_uses_lexical_ordering() -> None:
    """Check `cat.uses_lexical_ordering` for each way of spelling Categorical."""
    # Constructing with an explicit `ordering` is expected to warn on its own.
    with pytest.warns(DeprecationWarning, match="ordering parameter"):
        physical_cat = pl.Categorical(ordering="physical")

    lexical_msg = "Categoricals are now always ordered lexically"
    for variant in (pl.Categorical, pl.Categorical(), physical_cat):
        s = pl.Series(["a", "b", None, "b"]).cast(variant)  # type: ignore[arg-type]

        # The accessor itself must warn, and still answer truthily.
        with pytest.warns(DeprecationWarning, match=lexical_msg):
            assert s.cat.uses_lexical_ordering()
102
103
104
@pytest.mark.parametrize("dtype", [pl.Categorical, pl.Enum])
def test_cat_len_bytes(dtype: PolarsDataType) -> None:
    """Check `cat.len_bytes` on a Series and in eager, lazy and group-by contexts."""
    values = ["Café", None, "Café", "345", "東京"]
    if dtype == pl.Enum:
        # The Enum is built from the distinct non-null values of the fixture.
        dtype = pl.Enum(list({v for v in values if v is not None}))
    byte_lengths = [5, None, 5, 3, 6]

    # test Series
    s = pl.Series("a", values, dtype=dtype)
    expected = pl.Series("a", byte_lengths, dtype=pl.UInt32)
    assert_series_equal(s.cat.len_bytes(), expected)

    # test DataFrame expr
    df = pl.DataFrame(s)
    expected_df = pl.DataFrame(expected)
    len_bytes_expr = pl.col("a").cat.len_bytes()
    assert_frame_equal(df.select(len_bytes_expr), expected_df)

    # test LazyFrame expr
    assert_frame_equal(df.lazy().select(len_bytes_expr).collect(), expected_df)

    # test GroupBy
    keys = [1] * 5 + [2] * 5
    # The Series is extended with itself so each key gets one copy of the values.
    doubled = s.extend(s)
    grouped = (
        pl.LazyFrame({"key": keys, "value": doubled})
        .group_by("key", maintain_order=True)
        .agg(pl.col("value").cat.len_bytes().alias("len_bytes"))
        .explode("len_bytes")
        .collect()
    )
    expected_grouped = pl.DataFrame(
        {"key": keys, "len_bytes": pl.Series(byte_lengths * 2, dtype=pl.UInt32)}
    )
    assert_frame_equal(grouped, expected_grouped)
142
143
144
@pytest.mark.parametrize("dtype", [pl.Categorical, pl.Enum])
def test_cat_len_chars(dtype: PolarsDataType) -> None:
    """Check `cat.len_chars` on a Series and in eager, lazy and group-by contexts.

    The fixture mixes ASCII, accented and CJK strings plus a null, so the
    expected character counts differ from the byte counts of the same values.
    """
    values = ["Café", None, "Café", "345", "東京"]
    if dtype == pl.Enum:
        # The Enum is built from the distinct non-null values of the fixture.
        dtype = pl.Enum(list({x for x in values if x is not None}))

    # test Series
    s = pl.Series("a", values, dtype=dtype)
    result = s.cat.len_chars()
    expected = pl.Series("a", [4, None, 4, 3, 2], dtype=pl.UInt32)
    assert_series_equal(result, expected)

    # test DataFrame expr
    df = pl.DataFrame(s)
    result_df = df.select(pl.col("a").cat.len_chars())
    expected_df = pl.DataFrame(expected)
    assert_frame_equal(result_df, expected_df)

    # test LazyFrame expr
    result_lf = df.lazy().select(pl.col("a").cat.len_chars()).collect()
    assert_frame_equal(result_lf, expected_df)

    # test GroupBy
    # The output column is named after the operation under test ("len_chars");
    # it was previously labelled "len_bytes", copied from the sibling test.
    result_df = (
        pl.LazyFrame({"key": [1, 1, 1, 1, 1, 2, 2, 2, 2, 2], "value": s.extend(s)})
        .group_by("key", maintain_order=True)
        .agg(pl.col("value").cat.len_chars().alias("len_chars"))
        .explode("len_chars")
        .collect()
    )
    expected_df = pl.DataFrame(
        {
            "key": [1, 1, 1, 1, 1, 2, 2, 2, 2, 2],
            "len_chars": pl.Series(
                [4, None, 4, 3, 2, 4, None, 4, 3, 2], dtype=pl.UInt32
            ),
        }
    )
    assert_frame_equal(result_df, expected_df)
182
183
184
@pytest.mark.parametrize("dtype", [pl.Categorical, pl.Enum])
def test_starts_ends_with(dtype: PolarsDataType) -> None:
    """Check `cat.starts_with` / `cat.ends_with` on a Series and as expressions."""
    values = ["hamburger_with_tomatoes", "nuts", "nuts", "lollypop", None]
    if dtype == pl.Enum:
        # The Enum is built from the distinct non-null values of the fixture.
        dtype = pl.Enum(list({v for v in values if v is not None}))
    prefix_error = "'prefix' must be a string; found"
    suffix_error = "'suffix' must be a string; found"

    # Series namespace: nulls propagate, everything else is a plain boolean.
    series = pl.Series("a", values, dtype=dtype)
    ends_pop = series.cat.ends_with("pop")
    assert_series_equal(ends_pop, pl.Series("a", [False, False, False, True, None]))
    starts_nu = series.cat.starts_with("nu")
    assert_series_equal(starts_nu, pl.Series("a", [False, True, True, False, None]))

    # A non-string argument is rejected with a TypeError naming the parameter.
    with pytest.raises(TypeError, match=prefix_error):
        series.cat.starts_with(None)  # type: ignore[arg-type]

    with pytest.raises(TypeError, match=suffix_error):
        series.cat.ends_with(None)  # type: ignore[arg-type]

    # Expression namespace: same checks through DataFrame.select.
    df = pl.DataFrame({"a": pl.Series(values, dtype=dtype)})
    col_a = pl.col("a")
    selected = df.select(
        col_a.cat.ends_with("pop").alias("ends_pop"),
        col_a.cat.starts_with("ham").alias("starts_ham"),
    ).to_dict(as_series=False)
    assert selected == {
        "ends_pop": [False, False, False, True, None],
        "starts_ham": [True, False, False, False, None],
    }

    with pytest.raises(TypeError, match=prefix_error):
        df.select(pl.col("a").cat.starts_with(None))  # type: ignore[arg-type]

    with pytest.raises(TypeError, match=suffix_error):
        df.select(pl.col("a").cat.ends_with(None))  # type: ignore[arg-type]
223
224
225
@pytest.mark.parametrize("dtype", [pl.Categorical, pl.Enum])
def test_cat_slice(dtype: PolarsDataType) -> None:
    """Check `cat.slice` with a negative offset and with an offset plus length."""
    values = ["foobar", "barfoo", "foobar", "x", None]
    if dtype == pl.Enum:
        # The Enum is built from the distinct non-null values of the fixture.
        dtype = pl.Enum(list({v for v in values if v is not None}))
    df = pl.DataFrame({"a": pl.Series(values, dtype=dtype)})

    # Negative offset on the Series accessor: up to the last three characters.
    tail = df["a"].cat.slice(-3).to_list()
    assert tail == ["bar", "foo", "bar", "x", None]

    # Offset 2, length 4 via an expression; "x" is too short and yields "".
    window = df.select([pl.col("a").cat.slice(2, 4)])["a"].to_list()
    assert window == ["obar", "rfoo", "obar", "", None]
239
240
241
def test_cat_order_flag_csv_read_23823() -> None:
    """Check sorting a Categorical column scanned from a CSV with a comment line."""
    # The "#not_a_row" line sits between the two data rows and must be skipped.
    csv_bytes = BytesIO(b"colx,coly\nabc,123\n#not_a_row\nxyz,456")
    lazy_frame = pl.scan_csv(
        source=csv_bytes,
        comment_prefix="#",
        schema_overrides={"colx": pl.Categorical},
    )

    sorted_frame = lazy_frame.sort("colx", descending=False).collect()

    expected = pl.DataFrame(
        {"colx": ["abc", "xyz"], "coly": [123, 456]},
        schema_overrides={"colx": pl.Categorical},
    )
    assert_frame_equal(expected, sorted_frame)
253
254