# Path: blob/main/py-polars/tests/unit/operations/test_hist.py
# 8422 views
from __future__ import annotations12import numpy as np3import pytest45import polars as pl6from polars.exceptions import ComputeError7from polars.testing import assert_frame_equal89inf = float("inf")101112def test_hist_empty_data_no_inputs() -> None:13s = pl.Series([], dtype=pl.UInt8)1415# No bins or edges specified: 10 bins around unit interval16expected = pl.DataFrame(17{18"breakpoint": pl.Series(19[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0], dtype=pl.Float6420),21"category": pl.Series(22[23"[0.0, 0.1]",24"(0.1, 0.2]",25"(0.2, 0.3]",26"(0.3, 0.4]",27"(0.4, 0.5]",28"(0.5, 0.6]",29"(0.6, 0.7]",30"(0.7, 0.8]",31"(0.8, 0.9]",32"(0.9, 1.0]",33],34dtype=pl.Categorical,35),36"count": pl.Series(37[0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=pl.get_index_type()38),39}40)41result = s.hist()42assert_frame_equal(result, expected)434445def test_hist_empty_data_empty_bins() -> None:46s = pl.Series([], dtype=pl.UInt8)4748# No bins or edges specified: 10 bins around unit interval49expected = pl.DataFrame(50{51"breakpoint": pl.Series([], dtype=pl.Float64),52"category": pl.Series([], dtype=pl.Categorical),53"count": pl.Series([], dtype=pl.get_index_type()),54}55)56result = s.hist(bins=[])57assert_frame_equal(result, expected)585960def test_hist_empty_data_single_bin_edge() -> None:61s = pl.Series([], dtype=pl.UInt8)6263# No bins or edges specified: 10 bins around unit interval64expected = pl.DataFrame(65{66"breakpoint": pl.Series([], dtype=pl.Float64),67"category": pl.Series([], dtype=pl.Categorical),68"count": pl.Series([], dtype=pl.get_index_type()),69}70)71result = s.hist(bins=[2])72assert_frame_equal(result, expected)737475def test_hist_empty_data_valid_edges() -> None:76s = pl.Series([], dtype=pl.UInt8)7778# No bins or edges specified: 10 bins around unit interval79expected = pl.DataFrame(80{81"breakpoint": pl.Series([2.0, 3.0], dtype=pl.Float64),82"category": pl.Series(["[1.0, 2.0]", "(2.0, 3.0]"], dtype=pl.Categorical),83"count": pl.Series([0, 0], 
dtype=pl.get_index_type()),84}85)86result = s.hist(bins=[1, 2, 3])87assert_frame_equal(result, expected)888990def test_hist_empty_data_invalid_edges() -> None:91s = pl.Series([], dtype=pl.UInt8)92with pytest.raises(ComputeError, match="bins must increase monotonically"):93s.hist(bins=[1, 0]) # invalid order949596def test_hist_empty_data_bad_bin_count() -> None:97s = pl.Series([], dtype=pl.UInt8)98with pytest.raises(OverflowError, match="can't convert negative int to unsigned"):99s.hist(bin_count=-1) # invalid order100101102def test_hist_empty_data_zero_bin_count() -> None:103s = pl.Series([], dtype=pl.UInt8)104expected = pl.DataFrame(105{106"breakpoint": pl.Series([], dtype=pl.Float64),107"category": pl.Series([], dtype=pl.Categorical),108"count": pl.Series([], dtype=pl.get_index_type()),109}110)111result = s.hist(bin_count=0)112assert_frame_equal(result, expected)113114115def test_hist_empty_data_single_bin_count() -> None:116s = pl.Series([], dtype=pl.UInt8)117expected = pl.DataFrame(118{119"breakpoint": pl.Series([1.0], dtype=pl.Float64),120"category": pl.Series(["[0.0, 1.0]"], dtype=pl.Categorical),121"count": pl.Series([0], dtype=pl.get_index_type()),122}123)124result = s.hist(bin_count=1)125assert_frame_equal(result, expected)126127128def test_hist_empty_data_valid_bin_count() -> None:129s = pl.Series([], dtype=pl.UInt8)130expected = pl.DataFrame(131{132"breakpoint": pl.Series([0.2, 0.4, 0.6, 0.8, 1.0], dtype=pl.Float64),133"category": pl.Series(134[135"[0.0, 0.2]",136"(0.2, 0.4]",137"(0.4, 0.6]",138"(0.6, 0.8]",139"(0.8, 1.0]",140],141dtype=pl.Categorical,142),143"count": pl.Series([0, 0, 0, 0, 0], dtype=pl.get_index_type()),144}145)146result = s.hist(bin_count=5)147assert_frame_equal(result, expected)148149150def test_hist_invalid_bin_count() -> None:151s = pl.Series([-5, 2, 0, 1, 99], dtype=pl.Int32)152with pytest.raises(OverflowError, match="can't convert negative int to unsigned"):153s.hist(bin_count=-1) # invalid order154155156def 
test_hist_invalid_bins() -> None:157s = pl.Series([-5, 2, 0, 1, 99], dtype=pl.Int32)158with pytest.raises(ComputeError, match="bins must increase monotonically"):159s.hist(bins=[1, 0]) # invalid order160161162def test_hist_bin_outside_data() -> None:163s = pl.Series([-5, 2, 0, 1, 99], dtype=pl.Int32)164result = s.hist(bins=[-10, -9])165expected = pl.DataFrame(166{167"breakpoint": pl.Series([-9.0], dtype=pl.Float64),168"category": pl.Series(["[-10.0, -9.0]"], dtype=pl.Categorical),169"count": pl.Series([0], dtype=pl.get_index_type()),170}171)172assert_frame_equal(result, expected)173174175def test_hist_bins_between_data() -> None:176s = pl.Series([-5, 2, 0, 1, 99], dtype=pl.Int32)177result = s.hist(bins=[4.5, 10.5])178expected = pl.DataFrame(179{180"breakpoint": pl.Series([10.5], dtype=pl.Float64),181"category": pl.Series(["[4.5, 10.5]"], dtype=pl.Categorical),182"count": pl.Series([0], dtype=pl.get_index_type()),183}184)185assert_frame_equal(result, expected)186187188def test_hist_bins_first_edge() -> None:189s = pl.Series([-5, 2, 0, 1, 99], dtype=pl.Int32)190result = s.hist(bins=[2, 3, 4])191expected = pl.DataFrame(192{193"breakpoint": pl.Series([3.0, 4.0], dtype=pl.Float64),194"category": pl.Series(["[2.0, 3.0]", "(3.0, 4.0]"], dtype=pl.Categorical),195"count": pl.Series([1, 0], dtype=pl.get_index_type()),196}197)198assert_frame_equal(result, expected)199200201def test_hist_bins_last_edge() -> None:202s = pl.Series([-5, 2, 0, 1, 99], dtype=pl.Int32)203result = s.hist(bins=[-4, 0, 99, 100])204expected = pl.DataFrame(205{206"breakpoint": pl.Series([0.0, 99.0, 100.0], dtype=pl.Float64),207"category": pl.Series(208[209"[-4.0, 0.0]",210"(0.0, 99.0]",211"(99.0, 100.0]",212],213dtype=pl.Categorical,214),215"count": pl.Series([1, 3, 0], dtype=pl.get_index_type()),216}217)218assert_frame_equal(result, expected)219220221def test_hist_single_value_single_bin_count() -> None:222s = pl.Series([1], dtype=pl.Int32)223result = s.hist(bin_count=1)224expected = 
pl.DataFrame(225{226"breakpoint": pl.Series([1.5], dtype=pl.Float64),227"category": pl.Series(["[0.5, 1.5]"], dtype=pl.Categorical),228"count": pl.Series([1], dtype=pl.get_index_type()),229}230)231assert_frame_equal(result, expected)232233234def test_hist_single_bin_count() -> None:235s = pl.Series([-5, 2, 0, 1, 99], dtype=pl.Int32)236result = s.hist(bin_count=1)237expected = pl.DataFrame(238{239"breakpoint": pl.Series([99.0], dtype=pl.Float64),240"category": pl.Series(["[-5.0, 99.0]"], dtype=pl.Categorical),241"count": pl.Series([5], dtype=pl.get_index_type()),242}243)244assert_frame_equal(result, expected)245246247def test_hist_partial_covering() -> None:248s = pl.Series([-5, 2, 0, 1, 99], dtype=pl.Int32)249result = s.hist(bins=[-1.5, 2.5, 50, 105])250expected = pl.DataFrame(251{252"breakpoint": pl.Series([2.5, 50.0, 105.0], dtype=pl.Float64),253"category": pl.Series(254["[-1.5, 2.5]", "(2.5, 50.0]", "(50.0, 105.0]"], dtype=pl.Categorical255),256"count": pl.Series([3, 0, 1], dtype=pl.get_index_type()),257}258)259assert_frame_equal(result, expected)260261262def test_hist_full_covering() -> None:263s = pl.Series([-5, 2, 0, 1, 99], dtype=pl.Int32)264result = s.hist(bins=[-5.5, 2.5, 50, 105])265expected = pl.DataFrame(266{267"breakpoint": pl.Series([2.5, 50.0, 105.0], dtype=pl.Float64),268"category": pl.Series(269["[-5.5, 2.5]", "(2.5, 50.0]", "(50.0, 105.0]"], dtype=pl.Categorical270),271"count": pl.Series([4, 0, 1], dtype=pl.get_index_type()),272}273)274assert_frame_equal(result, expected)275276277def test_hist_more_bins_than_data() -> None:278s = pl.Series([-5, 2, 0, 1, 99], dtype=pl.Int32)279result = s.hist(bin_count=8)280281# manually compute breaks282span = 99 - (-5)283width = span / 8284breaks = [-5 + width * i for i in range(8 + 1)]285categories = [f"({breaks[i]}, {breaks[i + 1]}]" for i in range(8)]286categories[0] = f"[{categories[0][1:]}"287288expected = pl.DataFrame(289{290"breakpoint": pl.Series(breaks[1:], dtype=pl.Float64),291"category": 
pl.Series(categories, dtype=pl.Categorical),292"count": pl.Series([4, 0, 0, 0, 0, 0, 0, 1], dtype=pl.get_index_type()),293}294)295assert_frame_equal(result, expected)296297298def test_hist() -> None:299s = pl.Series("a", [1, 3, 8, 8, 2, 1, 3])300out = s.hist(bin_count=4)301expected = pl.DataFrame(302{303"breakpoint": pl.Series([2.75, 4.5, 6.25, 8.0], dtype=pl.Float64),304"category": pl.Series(305["[1.0, 2.75]", "(2.75, 4.5]", "(4.5, 6.25]", "(6.25, 8.0]"],306dtype=pl.Categorical,307),308"count": pl.Series([3, 2, 0, 2], dtype=pl.get_index_type()),309}310)311assert_frame_equal(out, expected)312313314def test_hist_all_null() -> None:315s = pl.Series([None], dtype=pl.Float64)316out = s.hist()317expected = pl.DataFrame(318{319"breakpoint": pl.Series(320[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0], dtype=pl.Float64321),322"category": pl.Series(323[324"[0.0, 0.1]",325"(0.1, 0.2]",326"(0.2, 0.3]",327"(0.3, 0.4]",328"(0.4, 0.5]",329"(0.5, 0.6]",330"(0.6, 0.7]",331"(0.7, 0.8]",332"(0.8, 0.9]",333"(0.9, 1.0]",334],335dtype=pl.Categorical,336),337"count": pl.Series([0] * 10, dtype=pl.get_index_type()),338}339)340assert_frame_equal(out, expected)341342343@pytest.mark.parametrize("n_null", [0, 5])344@pytest.mark.parametrize("n_values", [3, 10, 250])345def test_hist_rand(n_values: int, n_null: int) -> None:346s_rand = pl.Series([None] * n_null, dtype=pl.Int64)347s_values = pl.Series(np.random.randint(0, 100, n_values), dtype=pl.Int64)348if s_values.n_unique() == 1:349pytest.skip("Identical values not tested.")350s = pl.concat((s_rand, s_values))351out = s.hist(bin_count=10)352353bp = out["breakpoint"]354count = out["count"]355min_edge = s.min() - (s.max() - s.min()) * 0.001 # type: ignore[operator]356for i in range(out.height):357if i == 0:358lower = min_edge359else:360lower = bp[i - 1]361upper = bp[i]362363assert ((s <= upper) & (s > lower)).sum() == count[i]364365366def test_hist_floating_point() -> None:367# This test hits the specific floating point case where the bin 
width should be368# 5.7, but it is represented by 5.6999999. The result is that an item equal to the369# upper bound (72) exceeds the maximum bins. This tests the code path that catches370# this case.371n_values = 3372n_null = 50373374np.random.seed(2)375s_rand = pl.Series([None] * n_null, dtype=pl.Int64)376s_values = pl.Series(np.random.randint(0, 100, n_values), dtype=pl.Int64)377s = pl.concat((s_rand, s_values))378out = s.hist(bin_count=10)379min_edge = s.min() - (s.max() - s.min()) * 0.001 # type: ignore[operator]380381bp = out["breakpoint"]382count = out["count"]383for i in range(out.height):384if i == 0:385lower = min_edge386else:387lower = bp[i - 1]388upper = bp[i]389390assert ((s <= upper) & (s > lower)).sum() == count[i]391392393def test_hist_max_boundary_19998() -> None:394s = pl.Series(395[3969514.988509739183,39730738.098872148617,39841400.15705103004,39949093.06982022727,400]401)402result = s.hist(bin_count=50)403assert result["count"].sum() == 4404405406def test_hist_max_boundary_20133() -> None:407# Given a set of values that result in bin index to be a floating point number that408# is represented as 5.000000000000001409s = pl.Series(410[4116.197601318359375,41283.5203145345052,413]414)415416# When histogram is calculated417result = s.hist(bin_count=5)418419# Then there is no exception (previously was possible to get index out of bounds420# here) and all the numbers fit into at least one of the bins421assert result["count"].sum() == 2422423424def test_hist_same_values_20030() -> None:425out = pl.Series([1, 1]).hist(bin_count=2)426expected = pl.DataFrame(427{428"breakpoint": pl.Series([1.0, 1.5], dtype=pl.Float64),429"category": pl.Series(["[0.5, 1.0]", "(1.0, 1.5]"], dtype=pl.Categorical),430"count": pl.Series([2, 0], dtype=pl.get_index_type()),431}432)433assert_frame_equal(out, expected)434435436def test_hist_breakpoint_accuracy() -> None:437s = pl.Series([1, 2, 3, 4])438out = s.hist(bin_count=3)439expected = pl.DataFrame(440{441"breakpoint": 
pl.Series([2.0, 3.0, 4.0], dtype=pl.Float64),442"category": pl.Series(443["[1.0, 2.0]", "(2.0, 3.0]", "(3.0, 4.0]"], dtype=pl.Categorical444),445"count": pl.Series([2, 1, 1], dtype=pl.get_index_type()),446}447)448assert_frame_equal(out, expected)449450451def test_hist_ensure_max_value_20879() -> None:452s = pl.Series([-1 / 3, 0, 1, 2, 3, 7])453result = s.hist(bin_count=3)454expected = pl.DataFrame(455{456"breakpoint": pl.Series(457[4582.0 + 1 / 9,4594.0 + 5 / 9,4607.0,461],462dtype=pl.Float64,463),464"category": pl.Series(465[466"[-0.333333, 2.111111]",467"(2.111111, 4.555556]",468"(4.555556, 7.0]",469],470dtype=pl.Categorical,471),472"count": pl.Series([4, 1, 1], dtype=pl.get_index_type()),473}474)475assert_frame_equal(result, expected)476477478def test_hist_ignore_nans_21082() -> None:479s = pl.Series([0.0, float("nan"), 0.5, float("nan"), 1.0])480result = s.hist(bins=[-0.001, 0.25, 0.5, 0.75, 1.0])481expected = pl.DataFrame(482{483"breakpoint": pl.Series([0.25, 0.5, 0.75, 1.0], dtype=pl.Float64),484"category": pl.Series(485[486"[-0.001, 0.25]",487"(0.25, 0.5]",488"(0.5, 0.75]",489"(0.75, 1.0]",490],491dtype=pl.Categorical,492),493"count": pl.Series([1, 1, 0, 1], dtype=pl.get_index_type()),494}495)496assert_frame_equal(result, expected)497498499def test_hist_include_lower_22056() -> None:500s = pl.Series("a", [1, 5])501result = s.hist(bins=[1, 5], include_category=True)502expected = pl.DataFrame(503{504"breakpoint": pl.Series([5.0], dtype=pl.Float64),505"category": pl.Series(["[1.0, 5.0]"], dtype=pl.Categorical),506"count": pl.Series([2], dtype=pl.get_index_type()),507}508)509assert_frame_equal(result, expected)510511512def test_hist_ulp_edge_22234() -> None:513# Uniform path514s = pl.Series([1.0, 1e-16, 1.3e-16, -1.0])515result = s.hist(bin_count=2)516assert result["count"].to_list() == [1, 3]517518# Manual path519result = s.hist(bins=[-1, 0, 1])520assert result["count"].to_list() == [1, 3]521522523