# Path: blob/main/py-polars/tests/unit/operations/test_hist.py
# 6939 views
from __future__ import annotations

import numpy as np
import pytest

import polars as pl
from polars.exceptions import ComputeError
from polars.testing import assert_frame_equal

inf = float("inf")


def _assert_counts_match_breakpoints(s: pl.Series, out: pl.DataFrame) -> None:
    """
    Check every row of a histogram frame against a manual count over `s`.

    For row ``i`` the number of values of `s` falling in ``(lower, upper]`` must
    equal ``out["count"][i]``, where ``upper`` is ``out["breakpoint"][i]`` and
    ``lower`` is the previous breakpoint.
    """
    bp = out["breakpoint"]
    count = out["count"]
    # The comparison below is exclusive on the lower side, so the lower bound of
    # the first bin is placed slightly below the minimum to keep the minimum in.
    lower = s.min() - (s.max() - s.min()) * 0.001  # type: ignore[operator]
    for i in range(out.height):
        upper = bp[i]
        assert ((s <= upper) & (s > lower)).sum() == count[i]
        lower = upper


def test_hist_empty_data_no_inputs() -> None:
    s = pl.Series([], dtype=pl.UInt8)

    # No bins or edges specified: 10 bins around unit interval
    expected = pl.DataFrame(
        {
            "breakpoint": pl.Series(
                [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0], dtype=pl.Float64
            ),
            "category": pl.Series(
                [
                    "[0.0, 0.1]",
                    "(0.1, 0.2]",
                    "(0.2, 0.3]",
                    "(0.3, 0.4]",
                    "(0.4, 0.5]",
                    "(0.5, 0.6]",
                    "(0.6, 0.7]",
                    "(0.7, 0.8]",
                    "(0.8, 0.9]",
                    "(0.9, 1.0]",
                ],
                dtype=pl.Categorical,
            ),
            "count": pl.Series([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=pl.UInt32),
        }
    )
    result = s.hist()
    assert_frame_equal(result, expected)


def test_hist_empty_data_empty_bins() -> None:
    s = pl.Series([], dtype=pl.UInt8)

    # Empty list of edges: the expected frame has no rows
    expected = pl.DataFrame(
        {
            "breakpoint": pl.Series([], dtype=pl.Float64),
            "category": pl.Series([], dtype=pl.Categorical),
            "count": pl.Series([], dtype=pl.UInt32),
        }
    )
    result = s.hist(bins=[])
    assert_frame_equal(result, expected)


def test_hist_empty_data_single_bin_edge() -> None:
    s = pl.Series([], dtype=pl.UInt8)

    # A single edge does not delimit a bin: the expected frame has no rows
    expected = pl.DataFrame(
        {
            "breakpoint": pl.Series([], dtype=pl.Float64),
            "category": pl.Series([], dtype=pl.Categorical),
            "count": pl.Series([], dtype=pl.UInt32),
        }
    )
    result = s.hist(bins=[2])
    assert_frame_equal(result, expected)


def test_hist_empty_data_valid_edges() -> None:
    s = pl.Series([], dtype=pl.UInt8)

    # Three increasing edges: two bins, both with a zero count
    expected = pl.DataFrame(
        {
            "breakpoint": pl.Series([2.0, 3.0], dtype=pl.Float64),
            "category": pl.Series(["[1.0, 2.0]", "(2.0, 3.0]"], dtype=pl.Categorical),
            "count": pl.Series([0, 0], dtype=pl.UInt32),
        }
    )
    result = s.hist(bins=[1, 2, 3])
    assert_frame_equal(result, expected)


def test_hist_empty_data_invalid_edges() -> None:
    s = pl.Series([], dtype=pl.UInt8)
    with pytest.raises(ComputeError, match="bins must increase monotonically"):
        s.hist(bins=[1, 0])  # invalid order


def test_hist_empty_data_bad_bin_count() -> None:
    s = pl.Series([], dtype=pl.UInt8)
    with pytest.raises(OverflowError, match="can't convert negative int to unsigned"):
        s.hist(bin_count=-1)  # negative bin count


def test_hist_empty_data_zero_bin_count() -> None:
    s = pl.Series([], dtype=pl.UInt8)
    expected = pl.DataFrame(
        {
            "breakpoint": pl.Series([], dtype=pl.Float64),
            "category": pl.Series([], dtype=pl.Categorical),
            "count": pl.Series([], dtype=pl.UInt32),
        }
    )
    result = s.hist(bin_count=0)
    assert_frame_equal(result, expected)


def test_hist_empty_data_single_bin_count() -> None:
    s = pl.Series([], dtype=pl.UInt8)
    expected = pl.DataFrame(
        {
            "breakpoint": pl.Series([1.0], dtype=pl.Float64),
            "category": pl.Series(["[0.0, 1.0]"], dtype=pl.Categorical),
            "count": pl.Series([0], dtype=pl.UInt32),
        }
    )
    result = s.hist(bin_count=1)
    assert_frame_equal(result, expected)


def test_hist_empty_data_valid_bin_count() -> None:
    s = pl.Series([], dtype=pl.UInt8)
    expected = pl.DataFrame(
        {
            "breakpoint": pl.Series([0.2, 0.4, 0.6, 0.8, 1.0], dtype=pl.Float64),
            "category": pl.Series(
                [
                    "[0.0, 0.2]",
                    "(0.2, 0.4]",
                    "(0.4, 0.6]",
                    "(0.6, 0.8]",
                    "(0.8, 1.0]",
                ],
                dtype=pl.Categorical,
            ),
            "count": pl.Series([0, 0, 0, 0, 0], dtype=pl.UInt32),
        }
    )
    result = s.hist(bin_count=5)
    assert_frame_equal(result, expected)


def test_hist_invalid_bin_count() -> None:
    s = pl.Series([-5, 2, 0, 1, 99], dtype=pl.Int32)
    with pytest.raises(OverflowError, match="can't convert negative int to unsigned"):
        s.hist(bin_count=-1)  # negative bin count


def test_hist_invalid_bins() -> None:
    s = pl.Series([-5, 2, 0, 1, 99], dtype=pl.Int32)
    with pytest.raises(ComputeError, match="bins must increase monotonically"):
        s.hist(bins=[1, 0])  # invalid order


def test_hist_bin_outside_data() -> None:
    s = pl.Series([-5, 2, 0, 1, 99], dtype=pl.Int32)
    result = s.hist(bins=[-10, -9])
    expected = pl.DataFrame(
        {
            "breakpoint": pl.Series([-9.0], dtype=pl.Float64),
            "category": pl.Series(["[-10.0, -9.0]"], dtype=pl.Categorical),
            "count": pl.Series([0], dtype=pl.UInt32),
        }
    )
    assert_frame_equal(result, expected)


def test_hist_bins_between_data() -> None:
    s = pl.Series([-5, 2, 0, 1, 99], dtype=pl.Int32)
    result = s.hist(bins=[4.5, 10.5])
    expected = pl.DataFrame(
        {
            "breakpoint": pl.Series([10.5], dtype=pl.Float64),
            "category": pl.Series(["[4.5, 10.5]"], dtype=pl.Categorical),
            "count": pl.Series([0], dtype=pl.UInt32),
        }
    )
    assert_frame_equal(result, expected)


def test_hist_bins_first_edge() -> None:
    s = pl.Series([-5, 2, 0, 1, 99], dtype=pl.Int32)
    result = s.hist(bins=[2, 3, 4])
    expected = pl.DataFrame(
        {
            "breakpoint": pl.Series([3.0, 4.0], dtype=pl.Float64),
            "category": pl.Series(["[2.0, 3.0]", "(3.0, 4.0]"], dtype=pl.Categorical),
            "count": pl.Series([1, 0], dtype=pl.UInt32),
        }
    )
    assert_frame_equal(result, expected)


def test_hist_bins_last_edge() -> None:
    s = pl.Series([-5, 2, 0, 1, 99], dtype=pl.Int32)
    result = s.hist(bins=[-4, 0, 99, 100])
    expected = pl.DataFrame(
        {
            "breakpoint": pl.Series([0.0, 99.0, 100.0], dtype=pl.Float64),
            "category": pl.Series(
                [
                    "[-4.0, 0.0]",
                    "(0.0, 99.0]",
                    "(99.0, 100.0]",
                ],
                dtype=pl.Categorical,
            ),
            "count": pl.Series([1, 3, 0], dtype=pl.UInt32),
        }
    )
    assert_frame_equal(result, expected)


def test_hist_single_value_single_bin_count() -> None:
    s = pl.Series([1], dtype=pl.Int32)
    result = s.hist(bin_count=1)
    expected = pl.DataFrame(
        {
            "breakpoint": pl.Series([1.5], dtype=pl.Float64),
            "category": pl.Series(["[0.5, 1.5]"], dtype=pl.Categorical),
            "count": pl.Series([1], dtype=pl.UInt32),
        }
    )
    assert_frame_equal(result, expected)


def test_hist_single_bin_count() -> None:
    s = pl.Series([-5, 2, 0, 1, 99], dtype=pl.Int32)
    result = s.hist(bin_count=1)
    expected = pl.DataFrame(
        {
            "breakpoint": pl.Series([99.0], dtype=pl.Float64),
            "category": pl.Series(["[-5.0, 99.0]"], dtype=pl.Categorical),
            "count": pl.Series([5], dtype=pl.UInt32),
        }
    )
    assert_frame_equal(result, expected)


def test_hist_partial_covering() -> None:
    s = pl.Series([-5, 2, 0, 1, 99], dtype=pl.Int32)
    result = s.hist(bins=[-1.5, 2.5, 50, 105])
    expected = pl.DataFrame(
        {
            "breakpoint": pl.Series([2.5, 50.0, 105.0], dtype=pl.Float64),
            "category": pl.Series(
                ["[-1.5, 2.5]", "(2.5, 50.0]", "(50.0, 105.0]"], dtype=pl.Categorical
            ),
            "count": pl.Series([3, 0, 1], dtype=pl.UInt32),
        }
    )
    assert_frame_equal(result, expected)


def test_hist_full_covering() -> None:
    s = pl.Series([-5, 2, 0, 1, 99], dtype=pl.Int32)
    result = s.hist(bins=[-5.5, 2.5, 50, 105])
    expected = pl.DataFrame(
        {
            "breakpoint": pl.Series([2.5, 50.0, 105.0], dtype=pl.Float64),
            "category": pl.Series(
                ["[-5.5, 2.5]", "(2.5, 50.0]", "(50.0, 105.0]"], dtype=pl.Categorical
            ),
            "count": pl.Series([4, 0, 1], dtype=pl.UInt32),
        }
    )
    assert_frame_equal(result, expected)


def test_hist_more_bins_than_data() -> None:
    s = pl.Series([-5, 2, 0, 1, 99], dtype=pl.Int32)
    result = s.hist(bin_count=8)

    # manually compute breaks
    span = 99 - (-5)
    width = span / 8
    breaks = [-5 + width * i for i in range(8 + 1)]
    categories = [f"({breaks[i]}, {breaks[i + 1]}]" for i in range(8)]
    # the first category is closed on the left: swap its "(" for "["
    categories[0] = f"[{categories[0][1:]}"

    expected = pl.DataFrame(
        {
            "breakpoint": pl.Series(breaks[1:], dtype=pl.Float64),
            "category": pl.Series(categories, dtype=pl.Categorical),
            "count": pl.Series([4, 0, 0, 0, 0, 0, 0, 1], dtype=pl.UInt32),
        }
    )
    assert_frame_equal(result, expected)


def test_hist() -> None:
    s = pl.Series("a", [1, 3, 8, 8, 2, 1, 3])
    out = s.hist(bin_count=4)
    expected = pl.DataFrame(
        {
            "breakpoint": pl.Series([2.75, 4.5, 6.25, 8.0], dtype=pl.Float64),
            "category": pl.Series(
                ["[1.0, 2.75]", "(2.75, 4.5]", "(4.5, 6.25]", "(6.25, 8.0]"],
                dtype=pl.Categorical,
            ),
            "count": pl.Series([3, 2, 0, 2], dtype=pl.get_index_type()),
        }
    )
    assert_frame_equal(out, expected)


def test_hist_all_null() -> None:
    s = pl.Series([None], dtype=pl.Float64)
    out = s.hist()
    expected = pl.DataFrame(
        {
            "breakpoint": pl.Series(
                [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0], dtype=pl.Float64
            ),
            "category": pl.Series(
                [
                    "[0.0, 0.1]",
                    "(0.1, 0.2]",
                    "(0.2, 0.3]",
                    "(0.3, 0.4]",
                    "(0.4, 0.5]",
                    "(0.5, 0.6]",
                    "(0.6, 0.7]",
                    "(0.7, 0.8]",
                    "(0.8, 0.9]",
                    "(0.9, 1.0]",
                ],
                dtype=pl.Categorical,
            ),
            "count": pl.Series([0] * 10, dtype=pl.get_index_type()),
        }
    )
    assert_frame_equal(out, expected)


@pytest.mark.parametrize("n_null", [0, 5])
@pytest.mark.parametrize("n_values", [3, 10, 250])
def test_hist_rand(n_values: int, n_null: int) -> None:
    s_null = pl.Series([None] * n_null, dtype=pl.Int64)
    s_values = pl.Series(np.random.randint(0, 100, n_values), dtype=pl.Int64)
    if s_values.n_unique() == 1:
        pytest.skip("Identical values not tested.")
    s = pl.concat((s_null, s_values))
    out = s.hist(bin_count=10)

    _assert_counts_match_breakpoints(s, out)


def test_hist_floating_point() -> None:
    # This test hits the specific floating point case where the bin width should be
    # 5.7, but it is represented by 5.6999999. The result is that an item equal to the
    # upper bound (72) exceeds the maximum bins. This tests the code path that catches
    # this case.
    n_values = 3
    n_null = 50

    # A local legacy RandomState seeded with 2 is used instead of
    # `np.random.seed(2)` so the global NumPy RNG state is left untouched for
    # the other tests.
    rng = np.random.RandomState(2)
    s_null = pl.Series([None] * n_null, dtype=pl.Int64)
    s_values = pl.Series(rng.randint(0, 100, n_values), dtype=pl.Int64)
    s = pl.concat((s_null, s_values))
    out = s.hist(bin_count=10)

    _assert_counts_match_breakpoints(s, out)


def test_hist_max_boundary_19998() -> None:
    s = pl.Series(
        [
            9514.988509739183,
            30738.098872148617,
            41400.15705103004,
            49093.06982022727,
        ]
    )
    result = s.hist(bin_count=50)
    assert result["count"].sum() == 4


def test_hist_max_boundary_20133() -> None:
    # Given a set of values that result in bin index to be a floating point number that
    # is represented as 5.000000000000001
    s = pl.Series(
        [
            6.197601318359375,
            83.5203145345052,
        ]
    )

    # When histogram is calculated
    result = s.hist(bin_count=5)

    # Then there is no exception (previously was possible to get index out of bounds
    # here) and all the numbers fit into at least one of the bins
    assert result["count"].sum() == 2


def test_hist_same_values_20030() -> None:
    out = pl.Series([1, 1]).hist(bin_count=2)
    expected = pl.DataFrame(
        {
            "breakpoint": pl.Series([1.0, 1.5], dtype=pl.Float64),
            "category": pl.Series(["[0.5, 1.0]", "(1.0, 1.5]"], dtype=pl.Categorical),
            "count": pl.Series([2, 0], dtype=pl.get_index_type()),
        }
    )
    assert_frame_equal(out, expected)


def test_hist_breakpoint_accuracy() -> None:
    s = pl.Series([1, 2, 3, 4])
    out = s.hist(bin_count=3)
    expected = pl.DataFrame(
        {
            "breakpoint": pl.Series([2.0, 3.0, 4.0], dtype=pl.Float64),
            "category": pl.Series(
                ["[1.0, 2.0]", "(2.0, 3.0]", "(3.0, 4.0]"], dtype=pl.Categorical
            ),
            "count": pl.Series([2, 1, 1], dtype=pl.get_index_type()),
        }
    )
    assert_frame_equal(out, expected)


def test_hist_ensure_max_value_20879() -> None:
    s = pl.Series([-1 / 3, 0, 1, 2, 3, 7])
    result = s.hist(bin_count=3)
    expected = pl.DataFrame(
        {
            "breakpoint": pl.Series(
                [
                    2.0 + 1 / 9,
                    4.0 + 5 / 9,
                    7.0,
                ],
                dtype=pl.Float64,
            ),
            "category": pl.Series(
                [
                    "[-0.333333, 2.111111]",
                    "(2.111111, 4.555556]",
                    "(4.555556, 7.0]",
                ],
                dtype=pl.Categorical,
            ),
            "count": pl.Series([4, 1, 1], dtype=pl.get_index_type()),
        }
    )
    assert_frame_equal(result, expected)


def test_hist_ignore_nans_21082() -> None:
    s = pl.Series([0.0, float("nan"), 0.5, float("nan"), 1.0])
    result = s.hist(bins=[-0.001, 0.25, 0.5, 0.75, 1.0])
    expected = pl.DataFrame(
        {
            "breakpoint": pl.Series([0.25, 0.5, 0.75, 1.0], dtype=pl.Float64),
            "category": pl.Series(
                [
                    "[-0.001, 0.25]",
                    "(0.25, 0.5]",
                    "(0.5, 0.75]",
                    "(0.75, 1.0]",
                ],
                dtype=pl.Categorical,
            ),
            "count": pl.Series([1, 1, 0, 1], dtype=pl.get_index_type()),
        }
    )
    assert_frame_equal(result, expected)


def test_hist_include_lower_22056() -> None:
    s = pl.Series("a", [1, 5])
    result = s.hist(bins=[1, 5], include_category=True)
    expected = pl.DataFrame(
        {
            "breakpoint": pl.Series([5.0], dtype=pl.Float64),
            "category": pl.Series(["[1.0, 5.0]"], dtype=pl.Categorical),
            "count": pl.Series([2], dtype=pl.get_index_type()),
        }
    )
    assert_frame_equal(result, expected)


def test_hist_ulp_edge_22234() -> None:
    # Uniform path
    s = pl.Series([1.0, 1e-16, 1.3e-16, -1.0])
    result = s.hist(bin_count=2)
    assert result["count"].to_list() == [1, 3]

    # Manual path
    result = s.hist(bins=[-1, 0, 1])
    assert result["count"].to_list() == [1, 3]