Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/py-polars/tests/unit/operations/test_hist.py
8422 views
1
from __future__ import annotations
2
3
import numpy as np
4
import pytest
5
6
import polars as pl
7
from polars.exceptions import ComputeError
8
from polars.testing import assert_frame_equal
9
10
# Positive infinity, kept as a module-level shorthand for the tests.
inf = float("inf")
11
12
13
def test_hist_empty_data_no_inputs() -> None:
    """With no arguments, an empty series yields ten empty bins over [0, 1]."""
    empty = pl.Series([], dtype=pl.UInt8)

    # Neither `bins` nor `bin_count` is given: the expected frame has ten
    # equal-width bins covering the unit interval, all with a zero count.
    breakpoints = pl.Series(
        [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0], dtype=pl.Float64
    )
    categories = pl.Series(
        [
            "[0.0, 0.1]",
            "(0.1, 0.2]",
            "(0.2, 0.3]",
            "(0.3, 0.4]",
            "(0.4, 0.5]",
            "(0.5, 0.6]",
            "(0.6, 0.7]",
            "(0.7, 0.8]",
            "(0.8, 0.9]",
            "(0.9, 1.0]",
        ],
        dtype=pl.Categorical,
    )
    counts = pl.Series([0] * 10, dtype=pl.get_index_type())
    expected = pl.DataFrame(
        {"breakpoint": breakpoints, "category": categories, "count": counts}
    )

    result = empty.hist()
    assert_frame_equal(result, expected)
44
45
46
def test_hist_empty_data_empty_bins() -> None:
    """An empty series with an empty `bins` list yields an empty histogram."""
    empty = pl.Series([], dtype=pl.UInt8)

    # An empty edge list is passed explicitly, so no bins are expected at all.
    breakpoints = pl.Series([], dtype=pl.Float64)
    categories = pl.Series([], dtype=pl.Categorical)
    counts = pl.Series([], dtype=pl.get_index_type())
    expected = pl.DataFrame(
        {"breakpoint": breakpoints, "category": categories, "count": counts}
    )

    result = empty.hist(bins=[])
    assert_frame_equal(result, expected)
59
60
61
def test_hist_empty_data_single_bin_edge() -> None:
    """A lone bin edge defines no interval; the expected histogram is empty."""
    empty = pl.Series([], dtype=pl.UInt8)

    # Only one edge is supplied, so the expected frame has zero rows.
    breakpoints = pl.Series([], dtype=pl.Float64)
    categories = pl.Series([], dtype=pl.Categorical)
    counts = pl.Series([], dtype=pl.get_index_type())
    expected = pl.DataFrame(
        {"breakpoint": breakpoints, "category": categories, "count": counts}
    )

    result = empty.hist(bins=[2])
    assert_frame_equal(result, expected)
74
75
76
def test_hist_empty_data_valid_edges() -> None:
    """Explicit edges on an empty series give one zero-count row per bin."""
    empty = pl.Series([], dtype=pl.UInt8)

    # Three increasing edges describe two bins; both must report zero.
    breakpoints = pl.Series([2.0, 3.0], dtype=pl.Float64)
    categories = pl.Series(["[1.0, 2.0]", "(2.0, 3.0]"], dtype=pl.Categorical)
    counts = pl.Series([0, 0], dtype=pl.get_index_type())
    expected = pl.DataFrame(
        {"breakpoint": breakpoints, "category": categories, "count": counts}
    )

    result = empty.hist(bins=[1, 2, 3])
    assert_frame_equal(result, expected)
89
90
91
def test_hist_empty_data_invalid_edges() -> None:
    """Decreasing bin edges are rejected even when the series is empty."""
    empty = pl.Series([], dtype=pl.UInt8)
    descending_edges = [1, 0]  # edges in the wrong order
    with pytest.raises(ComputeError, match="bins must increase monotonically"):
        empty.hist(bins=descending_edges)
95
96
97
def test_hist_empty_data_bad_bin_count() -> None:
    """A negative `bin_count` on an empty series raises `OverflowError`."""
    empty = pl.Series([], dtype=pl.UInt8)
    negative_count = -1  # cannot be converted to an unsigned bin count
    with pytest.raises(OverflowError, match="can't convert negative int to unsigned"):
        empty.hist(bin_count=negative_count)
101
102
103
def test_hist_empty_data_zero_bin_count() -> None:
    """Requesting zero bins on an empty series yields an empty histogram."""
    empty = pl.Series([], dtype=pl.UInt8)

    breakpoints = pl.Series([], dtype=pl.Float64)
    categories = pl.Series([], dtype=pl.Categorical)
    counts = pl.Series([], dtype=pl.get_index_type())
    expected = pl.DataFrame(
        {"breakpoint": breakpoints, "category": categories, "count": counts}
    )

    result = empty.hist(bin_count=0)
    assert_frame_equal(result, expected)
114
115
116
def test_hist_empty_data_single_bin_count() -> None:
    """One requested bin on an empty series spans [0, 1] with a zero count."""
    empty = pl.Series([], dtype=pl.UInt8)

    breakpoints = pl.Series([1.0], dtype=pl.Float64)
    categories = pl.Series(["[0.0, 1.0]"], dtype=pl.Categorical)
    counts = pl.Series([0], dtype=pl.get_index_type())
    expected = pl.DataFrame(
        {"breakpoint": breakpoints, "category": categories, "count": counts}
    )

    result = empty.hist(bin_count=1)
    assert_frame_equal(result, expected)
127
128
129
def test_hist_empty_data_valid_bin_count() -> None:
    """Five requested bins on an empty series split [0, 1] into fifths."""
    empty = pl.Series([], dtype=pl.UInt8)

    breakpoints = pl.Series([0.2, 0.4, 0.6, 0.8, 1.0], dtype=pl.Float64)
    categories = pl.Series(
        [
            "[0.0, 0.2]",
            "(0.2, 0.4]",
            "(0.4, 0.6]",
            "(0.6, 0.8]",
            "(0.8, 1.0]",
        ],
        dtype=pl.Categorical,
    )
    counts = pl.Series([0] * 5, dtype=pl.get_index_type())
    expected = pl.DataFrame(
        {"breakpoint": breakpoints, "category": categories, "count": counts}
    )

    result = empty.hist(bin_count=5)
    assert_frame_equal(result, expected)
149
150
151
def test_hist_invalid_bin_count() -> None:
    """A negative `bin_count` raises `OverflowError` for non-empty data too."""
    values = pl.Series([-5, 2, 0, 1, 99], dtype=pl.Int32)
    negative_count = -1  # cannot be converted to an unsigned bin count
    with pytest.raises(OverflowError, match="can't convert negative int to unsigned"):
        values.hist(bin_count=negative_count)
155
156
157
def test_hist_invalid_bins() -> None:
    """Decreasing bin edges are rejected for non-empty data."""
    values = pl.Series([-5, 2, 0, 1, 99], dtype=pl.Int32)
    descending_edges = [1, 0]  # edges in the wrong order
    with pytest.raises(ComputeError, match="bins must increase monotonically"):
        values.hist(bins=descending_edges)
161
162
163
def test_hist_bin_outside_data() -> None:
    """A bin lying entirely below every value reports a zero count."""
    values = pl.Series([-5, 2, 0, 1, 99], dtype=pl.Int32)
    result = values.hist(bins=[-10, -9])

    breakpoints = pl.Series([-9.0], dtype=pl.Float64)
    categories = pl.Series(["[-10.0, -9.0]"], dtype=pl.Categorical)
    counts = pl.Series([0], dtype=pl.get_index_type())
    expected = pl.DataFrame(
        {"breakpoint": breakpoints, "category": categories, "count": counts}
    )
    assert_frame_equal(result, expected)
174
175
176
def test_hist_bins_between_data() -> None:
    """A bin that falls in a gap between the values reports a zero count."""
    values = pl.Series([-5, 2, 0, 1, 99], dtype=pl.Int32)
    result = values.hist(bins=[4.5, 10.5])

    breakpoints = pl.Series([10.5], dtype=pl.Float64)
    categories = pl.Series(["[4.5, 10.5]"], dtype=pl.Categorical)
    counts = pl.Series([0], dtype=pl.get_index_type())
    expected = pl.DataFrame(
        {"breakpoint": breakpoints, "category": categories, "count": counts}
    )
    assert_frame_equal(result, expected)
187
188
189
def test_hist_bins_first_edge() -> None:
    """A value equal to the first bin edge is counted in the first bin."""
    values = pl.Series([-5, 2, 0, 1, 99], dtype=pl.Int32)
    result = values.hist(bins=[2, 3, 4])

    # The value 2 sits exactly on the lowest edge and must land in the first bin.
    breakpoints = pl.Series([3.0, 4.0], dtype=pl.Float64)
    categories = pl.Series(["[2.0, 3.0]", "(3.0, 4.0]"], dtype=pl.Categorical)
    counts = pl.Series([1, 0], dtype=pl.get_index_type())
    expected = pl.DataFrame(
        {"breakpoint": breakpoints, "category": categories, "count": counts}
    )
    assert_frame_equal(result, expected)
200
201
202
def test_hist_bins_last_edge() -> None:
    """Values on a right edge are counted in the bin which that edge closes."""
    values = pl.Series([-5, 2, 0, 1, 99], dtype=pl.Int32)
    result = values.hist(bins=[-4, 0, 99, 100])

    # 0 closes the first bin, 99 closes the second; the last bin stays empty.
    breakpoints = pl.Series([0.0, 99.0, 100.0], dtype=pl.Float64)
    categories = pl.Series(
        [
            "[-4.0, 0.0]",
            "(0.0, 99.0]",
            "(99.0, 100.0]",
        ],
        dtype=pl.Categorical,
    )
    counts = pl.Series([1, 3, 0], dtype=pl.get_index_type())
    expected = pl.DataFrame(
        {"breakpoint": breakpoints, "category": categories, "count": counts}
    )
    assert_frame_equal(result, expected)
220
221
222
def test_hist_single_value_single_bin_count() -> None:
    """A single value with one bin is expected in a bin of [0.5, 1.5]."""
    single = pl.Series([1], dtype=pl.Int32)
    result = single.hist(bin_count=1)

    breakpoints = pl.Series([1.5], dtype=pl.Float64)
    categories = pl.Series(["[0.5, 1.5]"], dtype=pl.Categorical)
    counts = pl.Series([1], dtype=pl.get_index_type())
    expected = pl.DataFrame(
        {"breakpoint": breakpoints, "category": categories, "count": counts}
    )
    assert_frame_equal(result, expected)
233
234
235
def test_hist_single_bin_count() -> None:
    """One requested bin spans min..max of the data and holds every value."""
    values = pl.Series([-5, 2, 0, 1, 99], dtype=pl.Int32)
    result = values.hist(bin_count=1)

    breakpoints = pl.Series([99.0], dtype=pl.Float64)
    categories = pl.Series(["[-5.0, 99.0]"], dtype=pl.Categorical)
    counts = pl.Series([5], dtype=pl.get_index_type())
    expected = pl.DataFrame(
        {"breakpoint": breakpoints, "category": categories, "count": counts}
    )
    assert_frame_equal(result, expected)
246
247
248
def test_hist_partial_covering() -> None:
    """Values below the first edge (-5 here) are left out of the counts."""
    values = pl.Series([-5, 2, 0, 1, 99], dtype=pl.Int32)
    result = values.hist(bins=[-1.5, 2.5, 50, 105])

    breakpoints = pl.Series([2.5, 50.0, 105.0], dtype=pl.Float64)
    categories = pl.Series(
        ["[-1.5, 2.5]", "(2.5, 50.0]", "(50.0, 105.0]"], dtype=pl.Categorical
    )
    counts = pl.Series([3, 0, 1], dtype=pl.get_index_type())
    expected = pl.DataFrame(
        {"breakpoint": breakpoints, "category": categories, "count": counts}
    )
    assert_frame_equal(result, expected)
261
262
263
def test_hist_full_covering() -> None:
    """Edges that enclose all the data account for every value."""
    values = pl.Series([-5, 2, 0, 1, 99], dtype=pl.Int32)
    result = values.hist(bins=[-5.5, 2.5, 50, 105])

    breakpoints = pl.Series([2.5, 50.0, 105.0], dtype=pl.Float64)
    categories = pl.Series(
        ["[-5.5, 2.5]", "(2.5, 50.0]", "(50.0, 105.0]"], dtype=pl.Categorical
    )
    counts = pl.Series([4, 0, 1], dtype=pl.get_index_type())
    expected = pl.DataFrame(
        {"breakpoint": breakpoints, "category": categories, "count": counts}
    )
    assert_frame_equal(result, expected)
276
277
278
def test_hist_more_bins_than_data() -> None:
    """Eight bins over five values: only the outermost bins are populated."""
    n_bins = 8
    values = pl.Series([-5, 2, 0, 1, 99], dtype=pl.Int32)
    result = values.hist(bin_count=n_bins)

    # Work out the expected edges by hand: equal-width steps from the
    # smallest value (-5) up to the largest (99).
    low, high = -5, 99
    step = (high - low) / n_bins
    edges = [low + step * k for k in range(n_bins + 1)]

    # Every interval is left-open except the first, which is closed on both sides.
    labels = []
    for k in range(n_bins):
        opener = "[" if k == 0 else "("
        labels.append(f"{opener}{edges[k]}, {edges[k + 1]}]")

    breakpoints = pl.Series(edges[1:], dtype=pl.Float64)
    categories = pl.Series(labels, dtype=pl.Categorical)
    counts = pl.Series([4, 0, 0, 0, 0, 0, 0, 1], dtype=pl.get_index_type())
    expected = pl.DataFrame(
        {"breakpoint": breakpoints, "category": categories, "count": counts}
    )
    assert_frame_equal(result, expected)
297
298
299
def test_hist() -> None:
    """Basic case: four equal-width bins over a small named integer series."""
    values = pl.Series("a", [1, 3, 8, 8, 2, 1, 3])
    result = values.hist(bin_count=4)

    breakpoints = pl.Series([2.75, 4.5, 6.25, 8.0], dtype=pl.Float64)
    categories = pl.Series(
        ["[1.0, 2.75]", "(2.75, 4.5]", "(4.5, 6.25]", "(6.25, 8.0]"],
        dtype=pl.Categorical,
    )
    counts = pl.Series([3, 2, 0, 2], dtype=pl.get_index_type())
    expected = pl.DataFrame(
        {"breakpoint": breakpoints, "category": categories, "count": counts}
    )
    assert_frame_equal(result, expected)
313
314
315
def test_hist_all_null() -> None:
    """An all-null series is expected to give ten zero-count bins over [0, 1]."""
    only_null = pl.Series([None], dtype=pl.Float64)
    result = only_null.hist()

    breakpoints = pl.Series(
        [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0], dtype=pl.Float64
    )
    categories = pl.Series(
        [
            "[0.0, 0.1]",
            "(0.1, 0.2]",
            "(0.2, 0.3]",
            "(0.3, 0.4]",
            "(0.4, 0.5]",
            "(0.5, 0.6]",
            "(0.6, 0.7]",
            "(0.7, 0.8]",
            "(0.8, 0.9]",
            "(0.9, 1.0]",
        ],
        dtype=pl.Categorical,
    )
    counts = pl.Series([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=pl.get_index_type())
    expected = pl.DataFrame(
        {"breakpoint": breakpoints, "category": categories, "count": counts}
    )
    assert_frame_equal(result, expected)
342
343
344
@pytest.mark.parametrize("n_null", [0, 5])
@pytest.mark.parametrize("n_values", [3, 10, 250])
def test_hist_rand(n_values: int, n_null: int) -> None:
    """Check each bin's count against a direct recount on random integers."""
    nulls = pl.Series([None] * n_null, dtype=pl.Int64)
    randoms = pl.Series(np.random.randint(0, 100, n_values), dtype=pl.Int64)
    if randoms.n_unique() == 1:
        pytest.skip("Identical values not tested.")
    s = pl.concat((nulls, randoms))
    out = s.hist(bin_count=10)

    breakpoints = out["breakpoint"]
    counts = out["count"]

    # The first bin also contains the minimum, so start from a lower bound
    # placed slightly below the smallest value.
    lower = s.min() - (s.max() - s.min()) * 0.001  # type: ignore[operator]
    for i in range(out.height):
        upper = breakpoints[i]
        assert ((s <= upper) & (s > lower)).sum() == counts[i]
        # This bin's upper edge is the next bin's exclusive lower edge.
        lower = upper
365
366
367
def test_hist_floating_point() -> None:
    """Recount every bin for a seeded draw that hits a floating point edge case.

    This test hits the specific floating point case where the bin width should be
    5.7, but it is represented by 5.6999999. The result is that an item equal to the
    upper bound (72) exceeds the maximum bins. This tests the code path that catches
    this case.
    """
    n_values = 3
    n_null = 50

    # Use a private legacy generator seeded with 2 rather than `np.random.seed(2)`:
    # it produces the same draw, but does not reseed NumPy's process-wide RNG,
    # so the seed cannot leak into tests that run afterwards.
    rng = np.random.RandomState(2)
    s_rand = pl.Series([None] * n_null, dtype=pl.Int64)
    s_values = pl.Series(rng.randint(0, 100, n_values), dtype=pl.Int64)
    s = pl.concat((s_rand, s_values))
    out = s.hist(bin_count=10)

    # The first bin also contains the minimum, so its lower bound for the
    # recount is placed slightly below the smallest value.
    min_edge = s.min() - (s.max() - s.min()) * 0.001  # type: ignore[operator]

    bp = out["breakpoint"]
    count = out["count"]
    for i in range(out.height):
        lower = min_edge if i == 0 else bp[i - 1]
        upper = bp[i]

        assert ((s <= upper) & (s > lower)).sum() == count[i]
392
393
394
def test_hist_max_boundary_19998() -> None:
    """All four float values must be counted when split into fifty bins."""
    samples = [
        9514.988509739183,
        30738.098872148617,
        41400.15705103004,
        49093.06982022727,
    ]
    result = pl.Series(samples).hist(bin_count=50)
    total = result["count"].sum()
    assert total == 4
405
406
407
def test_hist_max_boundary_20133() -> None:
    """Both values are counted when the bin index comes out as 5.000000000000001."""
    # Given: values for which the computed bin index is a floating point number
    # represented as 5.000000000000001
    samples = [
        6.197601318359375,
        83.5203145345052,
    ]
    s = pl.Series(samples)

    # When: the histogram is calculated
    result = s.hist(bin_count=5)

    # Then: no exception is raised (an index out of bounds was previously
    # possible here) and every number lands in one of the bins
    total = result["count"].sum()
    assert total == 2
423
424
425
def test_hist_same_values_20030() -> None:
    """Identical values with two bins: both values land in the first bin."""
    repeated = pl.Series([1, 1])
    result = repeated.hist(bin_count=2)

    breakpoints = pl.Series([1.0, 1.5], dtype=pl.Float64)
    categories = pl.Series(["[0.5, 1.0]", "(1.0, 1.5]"], dtype=pl.Categorical)
    counts = pl.Series([2, 0], dtype=pl.get_index_type())
    expected = pl.DataFrame(
        {"breakpoint": breakpoints, "category": categories, "count": counts}
    )
    assert_frame_equal(result, expected)
435
436
437
def test_hist_breakpoint_accuracy() -> None:
    """Breakpoints for 1..4 in three bins must be exactly 2.0, 3.0 and 4.0."""
    values = pl.Series([1, 2, 3, 4])
    result = values.hist(bin_count=3)

    breakpoints = pl.Series([2.0, 3.0, 4.0], dtype=pl.Float64)
    categories = pl.Series(
        ["[1.0, 2.0]", "(2.0, 3.0]", "(3.0, 4.0]"], dtype=pl.Categorical
    )
    counts = pl.Series([2, 1, 1], dtype=pl.get_index_type())
    expected = pl.DataFrame(
        {"breakpoint": breakpoints, "category": categories, "count": counts}
    )
    assert_frame_equal(result, expected)
450
451
452
def test_hist_ensure_max_value_20879() -> None:
    """The last breakpoint must equal the data maximum (7.0) exactly."""
    values = pl.Series([-1 / 3, 0, 1, 2, 3, 7])
    result = values.hist(bin_count=3)

    breakpoints = pl.Series(
        [
            2.0 + 1 / 9,
            4.0 + 5 / 9,
            7.0,
        ],
        dtype=pl.Float64,
    )
    categories = pl.Series(
        [
            "[-0.333333, 2.111111]",
            "(2.111111, 4.555556]",
            "(4.555556, 7.0]",
        ],
        dtype=pl.Categorical,
    )
    counts = pl.Series([4, 1, 1], dtype=pl.get_index_type())
    expected = pl.DataFrame(
        {"breakpoint": breakpoints, "category": categories, "count": counts}
    )
    assert_frame_equal(result, expected)
477
478
479
def test_hist_ignore_nans_21082() -> None:
    """NaN entries are not counted in any bin."""
    nan = float("nan")
    values = pl.Series([0.0, nan, 0.5, nan, 1.0])
    result = values.hist(bins=[-0.001, 0.25, 0.5, 0.75, 1.0])

    # Only the three finite values appear in the expected counts.
    breakpoints = pl.Series([0.25, 0.5, 0.75, 1.0], dtype=pl.Float64)
    categories = pl.Series(
        [
            "[-0.001, 0.25]",
            "(0.25, 0.5]",
            "(0.5, 0.75]",
            "(0.75, 1.0]",
        ],
        dtype=pl.Categorical,
    )
    counts = pl.Series([1, 1, 0, 1], dtype=pl.get_index_type())
    expected = pl.DataFrame(
        {"breakpoint": breakpoints, "category": categories, "count": counts}
    )
    assert_frame_equal(result, expected)
498
499
500
def test_hist_include_lower_22056() -> None:
    """A single bin [1, 5] counts values on both its lower and upper edge."""
    values = pl.Series("a", [1, 5])
    result = values.hist(bins=[1, 5], include_category=True)

    breakpoints = pl.Series([5.0], dtype=pl.Float64)
    categories = pl.Series(["[1.0, 5.0]"], dtype=pl.Categorical)
    counts = pl.Series([2], dtype=pl.get_index_type())
    expected = pl.DataFrame(
        {"breakpoint": breakpoints, "category": categories, "count": counts}
    )
    assert_frame_equal(result, expected)
511
512
513
def test_hist_ulp_edge_22234() -> None:
    """Tiny positive values just above the 0 edge belong to the upper bin."""
    values = pl.Series([1.0, 1e-16, 1.3e-16, -1.0])
    expected_counts = [1, 3]

    # Uniform path: bin edges derived from `bin_count`
    uniform = values.hist(bin_count=2)
    assert uniform["count"].to_list() == expected_counts

    # Manual path: bin edges supplied explicitly
    manual = values.hist(bins=[-1, 0, 1])
    assert manual["count"].to_list() == expected_counts
522
523