Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/py-polars/tests/unit/operations/test_qcut.py
6939 views
1
from __future__ import annotations
2
3
import pytest
4
5
import polars as pl
6
from polars.exceptions import DuplicateError
7
from polars.testing import assert_frame_equal, assert_series_equal
8
9
# Positive infinity; used below as the breakpoint expected for the last bin.
inf: float = float("inf")
10
11
12
def test_qcut() -> None:
    """Bin a small integer series at the 25% and 50% quantiles."""
    quantiles = [0.25, 0.50]
    series = pl.Series("a", [-2, -1, 0, 1, 2])

    binned = series.qcut(quantiles)

    # No labels are passed, so the categories are the interval strings.
    labels = ["(-inf, -1]", "(-inf, -1]", "(-1, 0]", "(0, inf]", "(0, inf]"]
    want = pl.Series("a", labels, dtype=pl.Categorical)
    assert_series_equal(binned, want, categorical_as_str=True)
29
30
31
def test_qcut_lazy_schema() -> None:
    """Check that qcut inside a lazy select matches a Categorical-typed frame."""
    source = pl.LazyFrame({"a": [-2, -1, 0, 1, 2]})
    binned_col = pl.col("a").qcut([0.25, 0.75])

    result = source.select(binned_col)

    categories = ["(-inf, -1]", "(-inf, -1]", "(-1, 1]", "(-1, 1]", "(1, inf]"]
    expected = pl.LazyFrame({"a": categories}, schema={"a": pl.Categorical})
    assert_frame_equal(result, expected, categorical_as_str=True)
41
42
43
def test_qcut_n() -> None:
    """Pass an integer bin count with custom labels and left-closed bins."""
    series = pl.Series("a", [-2, -1, 0, 1, 2])
    bin_labels = ["x", "y"]

    binned = series.qcut(2, labels=bin_labels, left_closed=True)

    want = pl.Series("a", ["x", "x", "y", "y", "y"], dtype=pl.Categorical)
    assert_series_equal(binned, want, categorical_as_str=True)
50
51
52
def test_qcut_include_breaks() -> None:
    """Expect a struct of breakpoint and category when include_breaks is set."""
    series = pl.int_range(-2, 3, eager=True).alias("a")

    binned = series.qcut(
        [0.0, 0.25, 0.75],
        labels=["a", "b", "c", "d"],
        include_breaks=True,
    )

    # Build the expected struct from a two-column frame named after the series.
    breakpoints = [-2.0, -1.0, 1.0, 1.0, inf]
    categories = ["a", "b", "c", "c", "d"]
    frame = pl.DataFrame(
        {"breakpoint": breakpoints, "category": categories},
        schema_overrides={"category": pl.Categorical},
    )
    want = frame.to_struct("a")
    assert_series_equal(binned, want, categorical_as_str=True)
65
66
67
# https://github.com/pola-rs/polars/issues/11255
68
def test_qcut_include_breaks_lazy_schema() -> None:
    """Unnest the lazy qcut struct and compare its two fields."""
    source = pl.LazyFrame({"a": [-2, -1, 0, 1, 2]})
    struct_expr = pl.col("a").qcut([0.25, 0.75], include_breaks=True).alias("qcut")

    result = source.select(struct_expr).unnest("qcut")

    breakpoints = [-1.0, -1.0, 1.0, 1.0, inf]
    categories = ["(-inf, -1]", "(-inf, -1]", "(-1, 1]", "(-1, 1]", "(1, inf]"]
    expected = pl.LazyFrame(
        {"breakpoint": breakpoints, "category": categories},
        schema_overrides={"category": pl.Categorical},
    )
    assert_frame_equal(result, expected, categorical_as_str=True)
83
84
85
def test_qcut_null_values() -> None:
    """Nulls in the input are expected to stay null in the binned output."""
    series = pl.Series([-1.0, None, 1.0, 2.0, None, 8.0, 4.0])
    bin_labels = ["a", "b", "c"]

    binned = series.qcut([0.2, 0.3], labels=bin_labels)

    want_values = ["a", None, "b", "c", None, "c", "c"]
    want = pl.Series(want_values, dtype=pl.Categorical)
    assert_series_equal(binned, want, categorical_as_str=True)
92
93
94
def test_qcut_full_null() -> None:
    """An all-null series is expected to bin to an all-null Categorical."""
    nulls = [None, None, None, None]
    series = pl.Series("a", nulls)

    binned = series.qcut([0.25, 0.50])

    want = pl.Series("a", [None, None, None, None], dtype=pl.Categorical)
    assert_series_equal(binned, want, categorical_as_str=True)
101
102
103
def test_qcut_full_null_with_labels() -> None:
    """An all-null series with explicit labels is still expected to be all null."""
    nulls = [None, None, None, None]
    series = pl.Series("a", nulls)
    bin_labels = ["1", "2", "3"]

    binned = series.qcut([0.25, 0.50], labels=bin_labels)

    want = pl.Series("a", [None, None, None, None], dtype=pl.Categorical)
    assert_series_equal(binned, want, categorical_as_str=True)
110
111
112
def test_qcut_allow_duplicates() -> None:
    """Check DuplicateError by default and success with ``allow_duplicates=True``."""
    series = pl.Series([1, 2, 2, 3])
    quantiles = [0.50, 0.51]

    # Without the flag, these quantiles are rejected.
    with pytest.raises(DuplicateError):
        series.qcut(quantiles)

    binned = series.qcut(quantiles, allow_duplicates=True)

    labels = ["(-inf, 2]", "(-inf, 2]", "(-inf, 2]", "(2, inf]"]
    want = pl.Series(labels, dtype=pl.Categorical)
    assert_series_equal(binned, want, categorical_as_str=True)
124
125
126
def test_qcut_over() -> None:
    """Apply qcut per group via ``over`` and compare against per-group labels."""
    groups = ["a"] * 4 + ["b"] * 4
    frame = pl.DataFrame({"group": groups, "value": range(8)})
    windowed = pl.col("value").qcut([0.5], labels=["low", "high"]).over("group")

    binned = frame.select(windowed).to_series()

    # The same low/high pattern is expected within each of the two groups.
    per_group = ["low", "low", "high", "high"]
    want = pl.Series("value", per_group + per_group, dtype=pl.Categorical)
    assert_series_equal(binned, want, categorical_as_str=True)
144
145