Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/py-polars/tests/unit/operations/test_hist.py
6939 views
1
from __future__ import annotations
2
3
import numpy as np
4
import pytest
5
6
import polars as pl
7
from polars.exceptions import ComputeError
8
from polars.testing import assert_frame_equal
9
10
# Shorthand for positive infinity; not referenced in the visible part of this file.
inf = float("inf")
11
12
13
def test_hist_empty_data_no_inputs() -> None:
    """Empty series, no arguments: expect ten zero-count bins spanning [0, 1]."""
    empty = pl.Series([], dtype=pl.UInt8)
    actual = empty.hist()

    # Neither `bins` nor `bin_count` is given, so the expected frame is the
    # default layout: 10 equal-width bins over the unit interval.
    upper_edges = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    labels = [
        "[0.0, 0.1]",
        "(0.1, 0.2]",
        "(0.2, 0.3]",
        "(0.3, 0.4]",
        "(0.4, 0.5]",
        "(0.5, 0.6]",
        "(0.6, 0.7]",
        "(0.7, 0.8]",
        "(0.8, 0.9]",
        "(0.9, 1.0]",
    ]
    expected = pl.DataFrame(
        {
            "breakpoint": pl.Series(upper_edges, dtype=pl.Float64),
            "category": pl.Series(labels, dtype=pl.Categorical),
            "count": pl.Series([0] * len(labels), dtype=pl.UInt32),
        }
    )
    assert_frame_equal(actual, expected)
42
43
44
def test_hist_empty_data_empty_bins() -> None:
    """Empty series with an empty `bins` list: expect a zero-row result."""
    empty = pl.Series([], dtype=pl.UInt8)
    actual = empty.hist(bins=[])

    # An empty edge list defines no bins, so every output column is empty.
    column_dtypes = (
        ("breakpoint", pl.Float64),
        ("category", pl.Categorical),
        ("count", pl.UInt32),
    )
    expected = pl.DataFrame(
        {name: pl.Series([], dtype=dtype) for name, dtype in column_dtypes}
    )
    assert_frame_equal(actual, expected)
57
58
59
def test_hist_empty_data_single_bin_edge() -> None:
    """Empty series with a single bin edge: expect a zero-row result."""
    empty = pl.Series([], dtype=pl.UInt8)
    actual = empty.hist(bins=[2])

    # One edge cannot delimit a bin, so the expected frame has no rows.
    no_breakpoints = pl.Series([], dtype=pl.Float64)
    no_categories = pl.Series([], dtype=pl.Categorical)
    no_counts = pl.Series([], dtype=pl.UInt32)
    expected = pl.DataFrame(
        {
            "breakpoint": no_breakpoints,
            "category": no_categories,
            "count": no_counts,
        }
    )
    assert_frame_equal(actual, expected)
72
73
74
def test_hist_empty_data_valid_edges() -> None:
    """Empty series with edges [1, 2, 3]: expect two bins, both with count 0."""
    empty = pl.Series([], dtype=pl.UInt8)
    actual = empty.hist(bins=[1, 2, 3])

    # Three explicit edges delimit two bins; there is no data to count.
    labels = ["[1.0, 2.0]", "(2.0, 3.0]"]
    expected = pl.DataFrame(
        {
            "breakpoint": pl.Series([2.0, 3.0], dtype=pl.Float64),
            "category": pl.Series(labels, dtype=pl.Categorical),
            "count": pl.Series([0, 0], dtype=pl.UInt32),
        }
    )
    assert_frame_equal(actual, expected)
87
88
89
def test_hist_empty_data_invalid_edges() -> None:
    """Decreasing bin edges raise ComputeError even when the series is empty."""
    empty = pl.Series([], dtype=pl.UInt8)
    decreasing_edges = [1, 0]  # edges are not in increasing order
    with pytest.raises(ComputeError, match="bins must increase monotonically"):
        empty.hist(bins=decreasing_edges)
93
94
95
def test_hist_empty_data_bad_bin_count() -> None:
    """A negative `bin_count` raises OverflowError on an empty series."""
    empty = pl.Series([], dtype=pl.UInt8)
    negative_bin_count = -1  # negative bin count is invalid
    with pytest.raises(OverflowError, match="can't convert negative int to unsigned"):
        empty.hist(bin_count=negative_bin_count)
99
100
101
def test_hist_empty_data_zero_bin_count() -> None:
    """Empty series with `bin_count=0`: expect a zero-row result."""
    empty = pl.Series([], dtype=pl.UInt8)
    actual = empty.hist(bin_count=0)

    # Zero requested bins means every output column is empty.
    column_dtypes = (
        ("breakpoint", pl.Float64),
        ("category", pl.Categorical),
        ("count", pl.UInt32),
    )
    expected = pl.DataFrame(
        {name: pl.Series([], dtype=dtype) for name, dtype in column_dtypes}
    )
    assert_frame_equal(actual, expected)
112
113
114
def test_hist_empty_data_single_bin_count() -> None:
    """Empty series with `bin_count=1`: expect one empty bin labelled [0.0, 1.0]."""
    empty = pl.Series([], dtype=pl.UInt8)
    actual = empty.hist(bin_count=1)

    breakpoints = pl.Series([1.0], dtype=pl.Float64)
    categories = pl.Series(["[0.0, 1.0]"], dtype=pl.Categorical)
    counts = pl.Series([0], dtype=pl.UInt32)
    expected = pl.DataFrame(
        {"breakpoint": breakpoints, "category": categories, "count": counts}
    )
    assert_frame_equal(actual, expected)
125
126
127
def test_hist_empty_data_valid_bin_count() -> None:
    """Empty series with `bin_count=5`: expect five empty bins spanning [0, 1]."""
    empty = pl.Series([], dtype=pl.UInt8)
    actual = empty.hist(bin_count=5)

    upper_edges = [0.2, 0.4, 0.6, 0.8, 1.0]
    labels = [
        "[0.0, 0.2]",
        "(0.2, 0.4]",
        "(0.4, 0.6]",
        "(0.6, 0.8]",
        "(0.8, 1.0]",
    ]
    expected = pl.DataFrame(
        {
            "breakpoint": pl.Series(upper_edges, dtype=pl.Float64),
            "category": pl.Series(labels, dtype=pl.Categorical),
            "count": pl.Series([0] * len(labels), dtype=pl.UInt32),
        }
    )
    assert_frame_equal(actual, expected)
147
148
149
def test_hist_invalid_bin_count() -> None:
    """A negative `bin_count` raises OverflowError on a non-empty series."""
    values = pl.Series([-5, 2, 0, 1, 99], dtype=pl.Int32)
    negative_bin_count = -1  # negative bin count is invalid
    with pytest.raises(OverflowError, match="can't convert negative int to unsigned"):
        values.hist(bin_count=negative_bin_count)
153
154
155
def test_hist_invalid_bins() -> None:
    """Decreasing bin edges raise ComputeError on a non-empty series."""
    values = pl.Series([-5, 2, 0, 1, 99], dtype=pl.Int32)
    decreasing_edges = [1, 0]  # edges are not in increasing order
    with pytest.raises(ComputeError, match="bins must increase monotonically"):
        values.hist(bins=decreasing_edges)
159
160
161
def test_hist_bin_outside_data() -> None:
    """A single bin lying entirely below the data has a count of 0."""
    values = pl.Series([-5, 2, 0, 1, 99], dtype=pl.Int32)

    breakpoints = pl.Series([-9.0], dtype=pl.Float64)
    categories = pl.Series(["[-10.0, -9.0]"], dtype=pl.Categorical)
    counts = pl.Series([0], dtype=pl.UInt32)
    expected = pl.DataFrame(
        {"breakpoint": breakpoints, "category": categories, "count": counts}
    )

    actual = values.hist(bins=[-10, -9])
    assert_frame_equal(actual, expected)
172
173
174
def test_hist_bins_between_data() -> None:
    """A bin placed in a gap between data points has a count of 0."""
    values = pl.Series([-5, 2, 0, 1, 99], dtype=pl.Int32)

    # No value in the series lies inside [4.5, 10.5].
    breakpoints = pl.Series([10.5], dtype=pl.Float64)
    categories = pl.Series(["[4.5, 10.5]"], dtype=pl.Categorical)
    counts = pl.Series([0], dtype=pl.UInt32)
    expected = pl.DataFrame(
        {"breakpoint": breakpoints, "category": categories, "count": counts}
    )

    actual = values.hist(bins=[4.5, 10.5])
    assert_frame_equal(actual, expected)
185
186
187
def test_hist_bins_first_edge() -> None:
    """A value equal to the lowest edge is counted in the first bin."""
    values = pl.Series([-5, 2, 0, 1, 99], dtype=pl.Int32)

    # The value 2 sits exactly on the first edge and lands in "[2.0, 3.0]".
    labels = ["[2.0, 3.0]", "(3.0, 4.0]"]
    expected = pl.DataFrame(
        {
            "breakpoint": pl.Series([3.0, 4.0], dtype=pl.Float64),
            "category": pl.Series(labels, dtype=pl.Categorical),
            "count": pl.Series([1, 0], dtype=pl.UInt32),
        }
    )

    actual = values.hist(bins=[2, 3, 4])
    assert_frame_equal(actual, expected)
198
199
200
def test_hist_bins_last_edge() -> None:
    """Values that coincide with interior edges are counted per the expected frame."""
    values = pl.Series([-5, 2, 0, 1, 99], dtype=pl.Int32)

    upper_edges = [0.0, 99.0, 100.0]
    labels = [
        "[-4.0, 0.0]",
        "(0.0, 99.0]",
        "(99.0, 100.0]",
    ]
    expected = pl.DataFrame(
        {
            "breakpoint": pl.Series(upper_edges, dtype=pl.Float64),
            "category": pl.Series(labels, dtype=pl.Categorical),
            "count": pl.Series([1, 3, 0], dtype=pl.UInt32),
        }
    )

    actual = values.hist(bins=[-4, 0, 99, 100])
    assert_frame_equal(actual, expected)
218
219
220
def test_hist_single_value_single_bin_count() -> None:
    """One value with `bin_count=1`: expect the bin [0.5, 1.5] holding that value."""
    lone_value = pl.Series([1], dtype=pl.Int32)

    breakpoints = pl.Series([1.5], dtype=pl.Float64)
    categories = pl.Series(["[0.5, 1.5]"], dtype=pl.Categorical)
    counts = pl.Series([1], dtype=pl.UInt32)
    expected = pl.DataFrame(
        {"breakpoint": breakpoints, "category": categories, "count": counts}
    )

    actual = lone_value.hist(bin_count=1)
    assert_frame_equal(actual, expected)
231
232
233
def test_hist_single_bin_count() -> None:
    """`bin_count=1` on mixed data: one bin from min to max holding all values."""
    values = pl.Series([-5, 2, 0, 1, 99], dtype=pl.Int32)

    breakpoints = pl.Series([99.0], dtype=pl.Float64)
    categories = pl.Series(["[-5.0, 99.0]"], dtype=pl.Categorical)
    counts = pl.Series([5], dtype=pl.UInt32)
    expected = pl.DataFrame(
        {"breakpoint": breakpoints, "category": categories, "count": counts}
    )

    actual = values.hist(bin_count=1)
    assert_frame_equal(actual, expected)
244
245
246
def test_hist_partial_covering() -> None:
    """Bins that start above the minimum leave the value -5 uncounted."""
    values = pl.Series([-5, 2, 0, 1, 99], dtype=pl.Int32)

    # -5 is below the first edge (-1.5), so only four values are counted.
    labels = ["[-1.5, 2.5]", "(2.5, 50.0]", "(50.0, 105.0]"]
    expected = pl.DataFrame(
        {
            "breakpoint": pl.Series([2.5, 50.0, 105.0], dtype=pl.Float64),
            "category": pl.Series(labels, dtype=pl.Categorical),
            "count": pl.Series([3, 0, 1], dtype=pl.UInt32),
        }
    )

    actual = values.hist(bins=[-1.5, 2.5, 50, 105])
    assert_frame_equal(actual, expected)
259
260
261
def test_hist_full_covering() -> None:
    """Bins that span the whole data range count all five values."""
    values = pl.Series([-5, 2, 0, 1, 99], dtype=pl.Int32)

    labels = ["[-5.5, 2.5]", "(2.5, 50.0]", "(50.0, 105.0]"]
    expected = pl.DataFrame(
        {
            "breakpoint": pl.Series([2.5, 50.0, 105.0], dtype=pl.Float64),
            "category": pl.Series(labels, dtype=pl.Categorical),
            "count": pl.Series([4, 0, 1], dtype=pl.UInt32),
        }
    )

    actual = values.hist(bins=[-5.5, 2.5, 50, 105])
    assert_frame_equal(actual, expected)
274
275
276
def test_hist_more_bins_than_data() -> None:
    """Eight bins over five values: edges are recomputed by hand and compared."""
    values = pl.Series([-5, 2, 0, 1, 99], dtype=pl.Int32)
    actual = values.hist(bin_count=8)

    # Rebuild the equal-width edges between the data minimum and maximum.
    n_bins = 8
    lowest, highest = -5, 99
    width = (highest - lowest) / n_bins
    edges = [lowest + width * i for i in range(n_bins + 1)]

    # Only the first interval is closed on the left; the rest are half-open.
    labels = [
        f"{'[' if i == 0 else '('}{edges[i]}, {edges[i + 1]}]" for i in range(n_bins)
    ]

    expected = pl.DataFrame(
        {
            "breakpoint": pl.Series(edges[1:], dtype=pl.Float64),
            "category": pl.Series(labels, dtype=pl.Categorical),
            "count": pl.Series([4, 0, 0, 0, 0, 0, 0, 1], dtype=pl.UInt32),
        }
    )
    assert_frame_equal(actual, expected)
295
296
297
def test_hist() -> None:
    """Four equal-width bins over a small integer series named "a"."""
    values = pl.Series("a", [1, 3, 8, 8, 2, 1, 3])

    upper_edges = [2.75, 4.5, 6.25, 8.0]
    labels = ["[1.0, 2.75]", "(2.75, 4.5]", "(4.5, 6.25]", "(6.25, 8.0]"]
    expected = pl.DataFrame(
        {
            "breakpoint": pl.Series(upper_edges, dtype=pl.Float64),
            "category": pl.Series(labels, dtype=pl.Categorical),
            "count": pl.Series([3, 2, 0, 2], dtype=pl.get_index_type()),
        }
    )

    actual = values.hist(bin_count=4)
    assert_frame_equal(actual, expected)
311
312
313
def test_hist_all_null() -> None:
    """An all-null float series yields ten zero-count bins spanning [0, 1]."""
    only_null = pl.Series([None], dtype=pl.Float64)

    upper_edges = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    labels = [
        "[0.0, 0.1]",
        "(0.1, 0.2]",
        "(0.2, 0.3]",
        "(0.3, 0.4]",
        "(0.4, 0.5]",
        "(0.5, 0.6]",
        "(0.6, 0.7]",
        "(0.7, 0.8]",
        "(0.8, 0.9]",
        "(0.9, 1.0]",
    ]
    expected = pl.DataFrame(
        {
            "breakpoint": pl.Series(upper_edges, dtype=pl.Float64),
            "category": pl.Series(labels, dtype=pl.Categorical),
            "count": pl.Series([0] * len(labels), dtype=pl.get_index_type()),
        }
    )

    actual = only_null.hist()
    assert_frame_equal(actual, expected)
340
341
342
@pytest.mark.parametrize("n_null", [0, 5])
@pytest.mark.parametrize("n_values", [3, 10, 250])
def test_hist_rand(n_values: int, n_null: int) -> None:
    """Check each bin's count against a direct recount on random integers.

    `n_null` nulls are prepended to `n_values` random integers in [0, 100).
    The case is skipped when all random values happen to be identical.
    """
    nulls = pl.Series([None] * n_null, dtype=pl.Int64)
    randoms = pl.Series(np.random.randint(0, 100, n_values), dtype=pl.Int64)
    if randoms.n_unique() == 1:
        pytest.skip("Identical values not tested.")

    data = pl.concat((nulls, randoms))
    hist = data.hist(bin_count=10)
    breakpoints = hist["breakpoint"]
    counts = hist["count"]

    # The first bin is closed on the left, so start slightly below the minimum
    # to make the strict `>` comparison include the minimum value itself.
    min_edge = data.min() - (data.max() - data.min()) * 0.001  # type: ignore[operator]

    for i, (upper, expected_count) in enumerate(zip(breakpoints, counts)):
        lower = min_edge if i == 0 else breakpoints[i - 1]
        in_bin = (data > lower) & (data <= upper)
        assert in_bin.sum() == expected_count
363
364
365
def test_hist_floating_point() -> None:
    """Regression check for a bin width that is not exactly representable."""
    # With this seed the bin width should be 5.7 but is stored as 5.6999999, so a
    # value equal to the upper bound (72) would overshoot the last bin. This
    # exercises the code path that catches that case.
    n_values = 3
    n_null = 50

    np.random.seed(2)
    randoms = pl.Series(np.random.randint(0, 100, n_values), dtype=pl.Int64)
    nulls = pl.Series([None] * n_null, dtype=pl.Int64)
    data = pl.concat((nulls, randoms))

    hist = data.hist(bin_count=10)
    breakpoints = hist["breakpoint"]
    counts = hist["count"]

    # Start just below the minimum so the strict `>` test includes it.
    min_edge = data.min() - (data.max() - data.min()) * 0.001  # type: ignore[operator]

    for i, (upper, expected_count) in enumerate(zip(breakpoints, counts)):
        lower = min_edge if i == 0 else breakpoints[i - 1]
        in_bin = (data > lower) & (data <= upper)
        assert in_bin.sum() == expected_count
390
391
392
def test_hist_max_boundary_19998() -> None:
    """All four float values must be counted when split into 50 bins."""
    samples = [
        9514.988509739183,
        30738.098872148617,
        41400.15705103004,
        49093.06982022727,
    ]
    hist = pl.Series(samples).hist(bin_count=50)
    total_counted = hist["count"].sum()
    assert total_counted == 4
403
404
405
def test_hist_max_boundary_20133() -> None:
    """Both values must be counted when the bin index rounds just above 5."""
    # These inputs make the computed bin index come out as the floating point
    # number 5.000000000000001.
    samples = [
        6.197601318359375,
        83.5203145345052,
    ]

    hist = pl.Series(samples).hist(bin_count=5)

    # Computing the histogram must not raise (an index-out-of-bounds error was
    # previously possible here), and every value must land in some bin.
    total_counted = hist["count"].sum()
    assert total_counted == 2
421
422
423
def test_hist_same_values_20030() -> None:
    """Two identical values with `bin_count=2`: both land in the first bin."""
    repeated = pl.Series([1, 1])

    labels = ["[0.5, 1.0]", "(1.0, 1.5]"]
    expected = pl.DataFrame(
        {
            "breakpoint": pl.Series([1.0, 1.5], dtype=pl.Float64),
            "category": pl.Series(labels, dtype=pl.Categorical),
            "count": pl.Series([2, 0], dtype=pl.get_index_type()),
        }
    )

    actual = repeated.hist(bin_count=2)
    assert_frame_equal(actual, expected)
433
434
435
def test_hist_breakpoint_accuracy() -> None:
    """Three bins over [1, 4] must produce the exact breakpoints 2.0, 3.0, 4.0."""
    values = pl.Series([1, 2, 3, 4])

    labels = ["[1.0, 2.0]", "(2.0, 3.0]", "(3.0, 4.0]"]
    expected = pl.DataFrame(
        {
            "breakpoint": pl.Series([2.0, 3.0, 4.0], dtype=pl.Float64),
            "category": pl.Series(labels, dtype=pl.Categorical),
            "count": pl.Series([2, 1, 1], dtype=pl.get_index_type()),
        }
    )

    actual = values.hist(bin_count=3)
    assert_frame_equal(actual, expected)
448
449
450
def test_hist_ensure_max_value_20879() -> None:
    """The final breakpoint must equal the maximum (7.0) and include it."""
    values = pl.Series([-1 / 3, 0, 1, 2, 3, 7])

    upper_edges = [
        2.0 + 1 / 9,
        4.0 + 5 / 9,
        7.0,
    ]
    labels = [
        "[-0.333333, 2.111111]",
        "(2.111111, 4.555556]",
        "(4.555556, 7.0]",
    ]
    expected = pl.DataFrame(
        {
            "breakpoint": pl.Series(upper_edges, dtype=pl.Float64),
            "category": pl.Series(labels, dtype=pl.Categorical),
            "count": pl.Series([4, 1, 1], dtype=pl.get_index_type()),
        }
    )

    actual = values.hist(bin_count=3)
    assert_frame_equal(actual, expected)
475
476
477
def test_hist_ignore_nans_21082() -> None:
    """NaN entries are left out of every bin count."""
    nan = float("nan")
    values = pl.Series([0.0, nan, 0.5, nan, 1.0])

    # Only the three finite values appear in the expected counts.
    upper_edges = [0.25, 0.5, 0.75, 1.0]
    labels = [
        "[-0.001, 0.25]",
        "(0.25, 0.5]",
        "(0.5, 0.75]",
        "(0.75, 1.0]",
    ]
    expected = pl.DataFrame(
        {
            "breakpoint": pl.Series(upper_edges, dtype=pl.Float64),
            "category": pl.Series(labels, dtype=pl.Categorical),
            "count": pl.Series([1, 1, 0, 1], dtype=pl.get_index_type()),
        }
    )

    actual = values.hist(bins=[-0.001, 0.25, 0.5, 0.75, 1.0])
    assert_frame_equal(actual, expected)
496
497
498
def test_hist_include_lower_22056() -> None:
    """A value equal to the lowest edge is counted in the single bin [1.0, 5.0]."""
    values = pl.Series("a", [1, 5])

    breakpoints = pl.Series([5.0], dtype=pl.Float64)
    categories = pl.Series(["[1.0, 5.0]"], dtype=pl.Categorical)
    counts = pl.Series([2], dtype=pl.get_index_type())
    expected = pl.DataFrame(
        {"breakpoint": breakpoints, "category": categories, "count": counts}
    )

    actual = values.hist(bins=[1, 5], include_category=True)
    assert_frame_equal(actual, expected)
509
510
511
def test_hist_ulp_edge_22234() -> None:
    """Tiny positive values just above 0 must fall in the upper of two bins."""
    values = pl.Series([1.0, 1e-16, 1.3e-16, -1.0])
    expected_counts = [1, 3]

    # Uniform path: edges derived from `bin_count`.
    uniform_counts = values.hist(bin_count=2)["count"].to_list()
    assert uniform_counts == expected_counts

    # Manual path: edges supplied explicitly.
    manual_counts = values.hist(bins=[-1, 0, 1])["count"].to_list()
    assert manual_counts == expected_counts
520
521