# Path: blob/main/py-polars/tests/unit/operations/test_qcut.py
# 6939 views
from __future__ import annotations

import pytest

import polars as pl
from polars.exceptions import DuplicateError
from polars.testing import assert_frame_equal, assert_series_equal

inf = float("inf")


def test_qcut() -> None:
    """Bin a Series at the 0.25 and 0.50 quantiles and check the interval labels."""
    s = pl.Series("a", [-2, -1, 0, 1, 2])

    result = s.qcut([0.25, 0.50])

    expected = pl.Series(
        "a",
        [
            "(-inf, -1]",
            "(-inf, -1]",
            "(-1, 0]",
            "(0, inf]",
            "(0, inf]",
        ],
        dtype=pl.Categorical,
    )
    assert_series_equal(result, expected, categorical_as_str=True)


def test_qcut_lazy_schema() -> None:
    """Check ``qcut`` inside a lazy ``select`` against a Categorical-typed frame."""
    lf = pl.LazyFrame({"a": [-2, -1, 0, 1, 2]})

    result = lf.select(pl.col("a").qcut([0.25, 0.75]))

    expected = pl.LazyFrame(
        {"a": ["(-inf, -1]", "(-inf, -1]", "(-1, 1]", "(-1, 1]", "(1, inf]"]},
        schema={"a": pl.Categorical},
    )
    assert_frame_equal(result, expected, categorical_as_str=True)


def test_qcut_n() -> None:
    """Pass an integer as the quantiles argument with labels and ``left_closed``."""
    s = pl.Series("a", [-2, -1, 0, 1, 2])

    out = s.qcut(2, labels=["x", "y"], left_closed=True)

    expected = pl.Series("a", ["x", "x", "y", "y", "y"], dtype=pl.Categorical)
    assert_series_equal(out, expected, categorical_as_str=True)


def test_qcut_include_breaks() -> None:
    """With ``include_breaks=True`` expect a struct of breakpoint and category."""
    s = pl.int_range(-2, 3, eager=True).alias("a")

    out = s.qcut([0.0, 0.25, 0.75], labels=["a", "b", "c", "d"], include_breaks=True)

    # The expected struct Series is built from a two-column frame so that the
    # field names ("breakpoint", "category") and the Categorical dtype are explicit.
    expected = pl.DataFrame(
        {
            "breakpoint": [-2.0, -1.0, 1.0, 1.0, inf],
            "category": ["a", "b", "c", "c", "d"],
        },
        schema_overrides={"category": pl.Categorical},
    ).to_struct("a")
    assert_series_equal(out, expected, categorical_as_str=True)


# https://github.com/pola-rs/polars/issues/11255
def test_qcut_include_breaks_lazy_schema() -> None:
    """Unnest the ``include_breaks`` struct in a lazy query and check both fields."""
    lf = pl.LazyFrame({"a": [-2, -1, 0, 1, 2]})

    result = lf.select(
        pl.col("a").qcut([0.25, 0.75], include_breaks=True).alias("qcut")
    ).unnest("qcut")

    expected = pl.LazyFrame(
        {
            "breakpoint": [-1.0, -1.0, 1.0, 1.0, inf],
            "category": ["(-inf, -1]", "(-inf, -1]", "(-1, 1]", "(-1, 1]", "(1, inf]"],
        },
        schema_overrides={"category": pl.Categorical},
    )
    assert_frame_equal(result, expected, categorical_as_str=True)


def test_qcut_null_values() -> None:
    """Null inputs are expected to stay null while other values get labels."""
    s = pl.Series([-1.0, None, 1.0, 2.0, None, 8.0, 4.0])

    result = s.qcut([0.2, 0.3], labels=["a", "b", "c"])

    expected = pl.Series(["a", None, "b", "c", None, "c", "c"], dtype=pl.Categorical)
    assert_series_equal(result, expected, categorical_as_str=True)


def test_qcut_full_null() -> None:
    """An all-null Series is expected to give an all-null Categorical Series."""
    s = pl.Series("a", [None, None, None, None])

    result = s.qcut([0.25, 0.50])

    expected = pl.Series("a", [None, None, None, None], dtype=pl.Categorical)
    assert_series_equal(result, expected, categorical_as_str=True)


def test_qcut_full_null_with_labels() -> None:
    """Same as the all-null case, but with explicit labels supplied."""
    s = pl.Series("a", [None, None, None, None])

    result = s.qcut([0.25, 0.50], labels=["1", "2", "3"])

    expected = pl.Series("a", [None, None, None, None], dtype=pl.Categorical)
    assert_series_equal(result, expected, categorical_as_str=True)


def test_qcut_allow_duplicates() -> None:
    """Expect ``DuplicateError`` by default and success with ``allow_duplicates``."""
    s = pl.Series([1, 2, 2, 3])

    # Without allow_duplicates these two quantiles are expected to be rejected.
    with pytest.raises(DuplicateError):
        s.qcut([0.50, 0.51])

    result = s.qcut([0.50, 0.51], allow_duplicates=True)

    # Only one break (at 2) appears in the expected labels.
    expected = pl.Series(
        ["(-inf, 2]", "(-inf, 2]", "(-inf, 2]", "(2, inf]"], dtype=pl.Categorical
    )
    assert_series_equal(result, expected, categorical_as_str=True)


def test_qcut_over() -> None:
    """Apply ``qcut`` per group via ``over`` and check labels within each group."""
    df = pl.DataFrame(
        {
            "group": ["a"] * 4 + ["b"] * 4,
            "value": range(8),
        }
    )

    out = df.select(
        pl.col("value").qcut([0.5], labels=["low", "high"]).over("group")
    ).to_series()

    # Each group of four values is expected to split into two "low", two "high".
    expected = pl.Series(
        "value",
        ["low", "low", "high", "high", "low", "low", "high", "high"],
        dtype=pl.Categorical,
    )
    assert_series_equal(out, expected, categorical_as_str=True)