CoCalc -- test

GitHub Repository: pola-rs/polars
Path: blob/main/py-polars/tests/unit/operations/test_hist.py
6939 views
1
from __future__ import annotations
2

3
import numpy as np
4
import pytest
5

6
import polars as pl
7
from polars.exceptions import ComputeError
8
from polars.testing import assert_frame_equal
9

10
# Positive infinity shorthand; none of the tests visible in this chunk use it.
inf = float("inf")
11

12

13
def test_hist_empty_data_no_inputs() -> None:
    """Empty series, default arguments: ten zero-count bins spanning [0, 1]."""
    series = pl.Series([], dtype=pl.UInt8)
    actual = series.hist()

    # No bins or edges specified: 10 bins around unit interval
    upper_edges = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    labels = [
        "[0.0, 0.1]",
        "(0.1, 0.2]",
        "(0.2, 0.3]",
        "(0.3, 0.4]",
        "(0.4, 0.5]",
        "(0.5, 0.6]",
        "(0.6, 0.7]",
        "(0.7, 0.8]",
        "(0.8, 0.9]",
        "(0.9, 1.0]",
    ]
    wanted = pl.DataFrame(
        {
            "breakpoint": pl.Series(upper_edges, dtype=pl.Float64),
            "category": pl.Series(labels, dtype=pl.Categorical),
            "count": pl.Series([0] * 10, dtype=pl.UInt32),
        }
    )
    assert_frame_equal(actual, wanted)
42

43

44
def test_hist_empty_data_empty_bins() -> None:
    """Empty series with an empty ``bins`` list yields an empty, typed frame."""
    s = pl.Series([], dtype=pl.UInt8)

    # An empty list of edges defines no bins, so every output column is empty
    # (only the column dtypes are checked in practice).
    expected = pl.DataFrame(
        {
            "breakpoint": pl.Series([], dtype=pl.Float64),
            "category": pl.Series([], dtype=pl.Categorical),
            "count": pl.Series([], dtype=pl.UInt32),
        }
    )
    result = s.hist(bins=[])
    assert_frame_equal(result, expected)
57

58

59
def test_hist_empty_data_single_bin_edge() -> None:
    """Empty series with a single bin edge yields an empty, typed frame."""
    s = pl.Series([], dtype=pl.UInt8)

    # One edge alone cannot delimit a bin, so the expected frame has no rows.
    expected = pl.DataFrame(
        {
            "breakpoint": pl.Series([], dtype=pl.Float64),
            "category": pl.Series([], dtype=pl.Categorical),
            "count": pl.Series([], dtype=pl.UInt32),
        }
    )
    result = s.hist(bins=[2])
    assert_frame_equal(result, expected)
72

73

74
def test_hist_empty_data_valid_edges() -> None:
    """Empty series with three increasing edges yields two zero-count bins."""
    s = pl.Series([], dtype=pl.UInt8)

    # Explicit edges 1, 2, 3 define the bins [1, 2] and (2, 3]; with no data
    # both counts are zero.
    expected = pl.DataFrame(
        {
            "breakpoint": pl.Series([2.0, 3.0], dtype=pl.Float64),
            "category": pl.Series(["[1.0, 2.0]", "(2.0, 3.0]"], dtype=pl.Categorical),
            "count": pl.Series([0, 0], dtype=pl.UInt32),
        }
    )
    result = s.hist(bins=[1, 2, 3])
    assert_frame_equal(result, expected)
87

88

89
def test_hist_empty_data_invalid_edges() -> None:
    """Decreasing bin edges are rejected even when the series is empty."""
    empty = pl.Series([], dtype=pl.UInt8)
    descending_edges = [1, 0]  # invalid order
    with pytest.raises(ComputeError, match="bins must increase monotonically"):
        empty.hist(bins=descending_edges)
93

94

95
def test_hist_empty_data_bad_bin_count() -> None:
    """A negative ``bin_count`` raises ``OverflowError`` on an empty series."""
    s = pl.Series([], dtype=pl.UInt8)
    with pytest.raises(OverflowError, match="can't convert negative int to unsigned"):
        s.hist(bin_count=-1)  # negative bin count
99

100

101
def test_hist_empty_data_zero_bin_count() -> None:
    """Empty series with ``bin_count=0`` yields an empty, typed frame."""
    series = pl.Series([], dtype=pl.UInt8)
    actual = series.hist(bin_count=0)

    # Zero bins requested: all three columns are present but have no rows.
    wanted = pl.DataFrame(
        {
            "breakpoint": pl.Series([], dtype=pl.Float64),
            "category": pl.Series([], dtype=pl.Categorical),
            "count": pl.Series([], dtype=pl.UInt32),
        }
    )
    assert_frame_equal(actual, wanted)
112

113

114
def test_hist_empty_data_single_bin_count() -> None:
    """Empty series with ``bin_count=1`` yields one zero-count bin [0, 1]."""
    series = pl.Series([], dtype=pl.UInt8)
    actual = series.hist(bin_count=1)

    wanted = pl.DataFrame(
        {
            "breakpoint": pl.Series([1.0], dtype=pl.Float64),
            "category": pl.Series(["[0.0, 1.0]"], dtype=pl.Categorical),
            "count": pl.Series([0], dtype=pl.UInt32),
        }
    )
    assert_frame_equal(actual, wanted)
125

126

127
def test_hist_empty_data_valid_bin_count() -> None:
    """Empty series with ``bin_count=5`` yields five zero-count bins on [0, 1]."""
    series = pl.Series([], dtype=pl.UInt8)
    actual = series.hist(bin_count=5)

    upper_edges = [0.2, 0.4, 0.6, 0.8, 1.0]
    labels = [
        "[0.0, 0.2]",
        "(0.2, 0.4]",
        "(0.4, 0.6]",
        "(0.6, 0.8]",
        "(0.8, 1.0]",
    ]
    wanted = pl.DataFrame(
        {
            "breakpoint": pl.Series(upper_edges, dtype=pl.Float64),
            "category": pl.Series(labels, dtype=pl.Categorical),
            "count": pl.Series([0, 0, 0, 0, 0], dtype=pl.UInt32),
        }
    )
    assert_frame_equal(actual, wanted)
147

148

149
def test_hist_invalid_bin_count() -> None:
    """A negative ``bin_count`` raises ``OverflowError`` on a non-empty series."""
    s = pl.Series([-5, 2, 0, 1, 99], dtype=pl.Int32)
    with pytest.raises(OverflowError, match="can't convert negative int to unsigned"):
        s.hist(bin_count=-1)  # negative bin count
153

154

155
def test_hist_invalid_bins() -> None:
    """Decreasing bin edges are rejected for a non-empty series."""
    series = pl.Series([-5, 2, 0, 1, 99], dtype=pl.Int32)
    descending_edges = [1, 0]  # invalid order
    with pytest.raises(ComputeError, match="bins must increase monotonically"):
        series.hist(bins=descending_edges)
159

160

161
def test_hist_bin_outside_data() -> None:
    """A bin lying entirely below the data range reports a count of zero."""
    series = pl.Series([-5, 2, 0, 1, 99], dtype=pl.Int32)

    # The single bin [-10, -9] sits below the smallest value (-5).
    wanted = pl.DataFrame(
        {
            "breakpoint": pl.Series([-9.0], dtype=pl.Float64),
            "category": pl.Series(["[-10.0, -9.0]"], dtype=pl.Categorical),
            "count": pl.Series([0], dtype=pl.UInt32),
        }
    )
    actual = series.hist(bins=[-10, -9])
    assert_frame_equal(actual, wanted)
172

173

174
def test_hist_bins_between_data() -> None:
    """A bin falling in a gap between data values reports a count of zero."""
    series = pl.Series([-5, 2, 0, 1, 99], dtype=pl.Int32)

    # No value lies in [4.5, 10.5]: the data jumps from 2 straight to 99.
    wanted = pl.DataFrame(
        {
            "breakpoint": pl.Series([10.5], dtype=pl.Float64),
            "category": pl.Series(["[4.5, 10.5]"], dtype=pl.Categorical),
            "count": pl.Series([0], dtype=pl.UInt32),
        }
    )
    actual = series.hist(bins=[4.5, 10.5])
    assert_frame_equal(actual, wanted)
185

186

187
def test_hist_bins_first_edge() -> None:
    """A value equal to the first (lowest) edge is counted in the first bin."""
    series = pl.Series([-5, 2, 0, 1, 99], dtype=pl.Int32)

    # The value 2 coincides with the lowest edge and lands in [2.0, 3.0].
    wanted = pl.DataFrame(
        {
            "breakpoint": pl.Series([3.0, 4.0], dtype=pl.Float64),
            "category": pl.Series(["[2.0, 3.0]", "(3.0, 4.0]"], dtype=pl.Categorical),
            "count": pl.Series([1, 0], dtype=pl.UInt32),
        }
    )
    actual = series.hist(bins=[2, 3, 4])
    assert_frame_equal(actual, wanted)
198

199

200
def test_hist_bins_last_edge() -> None:
    """Values sitting on an upper edge are counted in the bin that edge closes."""
    series = pl.Series([-5, 2, 0, 1, 99], dtype=pl.Int32)

    # 0 falls in the first bin; 1, 2 and 99 in the second; nothing in (99, 100].
    labels = [
        "[-4.0, 0.0]",
        "(0.0, 99.0]",
        "(99.0, 100.0]",
    ]
    wanted = pl.DataFrame(
        {
            "breakpoint": pl.Series([0.0, 99.0, 100.0], dtype=pl.Float64),
            "category": pl.Series(labels, dtype=pl.Categorical),
            "count": pl.Series([1, 3, 0], dtype=pl.UInt32),
        }
    )
    actual = series.hist(bins=[-4, 0, 99, 100])
    assert_frame_equal(actual, wanted)
218

219

220
def test_hist_single_value_single_bin_count() -> None:
    """One value with one requested bin gives the bin [0.5, 1.5] holding it."""
    series = pl.Series([1], dtype=pl.Int32)

    wanted = pl.DataFrame(
        {
            "breakpoint": pl.Series([1.5], dtype=pl.Float64),
            "category": pl.Series(["[0.5, 1.5]"], dtype=pl.Categorical),
            "count": pl.Series([1], dtype=pl.UInt32),
        }
    )
    actual = series.hist(bin_count=1)
    assert_frame_equal(actual, wanted)
231

232

233
def test_hist_single_bin_count() -> None:
    """With ``bin_count=1`` the single bin spans min..max and holds all values."""
    series = pl.Series([-5, 2, 0, 1, 99], dtype=pl.Int32)

    wanted = pl.DataFrame(
        {
            "breakpoint": pl.Series([99.0], dtype=pl.Float64),
            "category": pl.Series(["[-5.0, 99.0]"], dtype=pl.Categorical),
            "count": pl.Series([5], dtype=pl.UInt32),
        }
    )
    actual = series.hist(bin_count=1)
    assert_frame_equal(actual, wanted)
244

245

246
def test_hist_partial_covering() -> None:
    """Values below the first edge (-5 here) are left out of the counts."""
    series = pl.Series([-5, 2, 0, 1, 99], dtype=pl.Int32)

    labels = ["[-1.5, 2.5]", "(2.5, 50.0]", "(50.0, 105.0]"]
    wanted = pl.DataFrame(
        {
            "breakpoint": pl.Series([2.5, 50.0, 105.0], dtype=pl.Float64),
            "category": pl.Series(labels, dtype=pl.Categorical),
            "count": pl.Series([3, 0, 1], dtype=pl.UInt32),
        }
    )
    actual = series.hist(bins=[-1.5, 2.5, 50, 105])
    assert_frame_equal(actual, wanted)
259

260

261
def test_hist_full_covering() -> None:
    """Edges that enclose the whole data range account for all five values."""
    series = pl.Series([-5, 2, 0, 1, 99], dtype=pl.Int32)

    labels = ["[-5.5, 2.5]", "(2.5, 50.0]", "(50.0, 105.0]"]
    wanted = pl.DataFrame(
        {
            "breakpoint": pl.Series([2.5, 50.0, 105.0], dtype=pl.Float64),
            "category": pl.Series(labels, dtype=pl.Categorical),
            "count": pl.Series([4, 0, 1], dtype=pl.UInt32),
        }
    )
    actual = series.hist(bins=[-5.5, 2.5, 50, 105])
    assert_frame_equal(actual, wanted)
274

275

276
def test_hist_more_bins_than_data() -> None:
    """Requesting more bins (8) than values (5) still yields equal-width bins."""
    n_bins = 8
    low, high = -5, 99
    series = pl.Series([low, 2, 0, 1, high], dtype=pl.Int32)
    actual = series.hist(bin_count=n_bins)

    # manually compute breaks
    span = high - low
    width = span / n_bins
    edges = [low + width * k for k in range(n_bins + 1)]
    labels = [f"({left}, {right}]" for left, right in zip(edges, edges[1:])]
    # Only the first interval is closed on its left side.
    labels[0] = f"[{labels[0][1:]}"

    wanted = pl.DataFrame(
        {
            "breakpoint": pl.Series(edges[1:], dtype=pl.Float64),
            "category": pl.Series(labels, dtype=pl.Categorical),
            "count": pl.Series([4, 0, 0, 0, 0, 0, 0, 1], dtype=pl.UInt32),
        }
    )
    assert_frame_equal(actual, wanted)
295

296

297
def test_hist() -> None:
    """Four equal-width bins over a small integer series named ``a``."""
    series = pl.Series("a", [1, 3, 8, 8, 2, 1, 3])

    labels = ["[1.0, 2.75]", "(2.75, 4.5]", "(4.5, 6.25]", "(6.25, 8.0]"]
    wanted = pl.DataFrame(
        {
            "breakpoint": pl.Series([2.75, 4.5, 6.25, 8.0], dtype=pl.Float64),
            "category": pl.Series(labels, dtype=pl.Categorical),
            "count": pl.Series([3, 2, 0, 2], dtype=pl.get_index_type()),
        }
    )
    actual = series.hist(bin_count=4)
    assert_frame_equal(actual, wanted)
311

312

313
def test_hist_all_null() -> None:
    """An all-null series with default arguments gives ten empty bins on [0, 1]."""
    series = pl.Series([None], dtype=pl.Float64)
    actual = series.hist()

    upper_edges = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    labels = [
        "[0.0, 0.1]",
        "(0.1, 0.2]",
        "(0.2, 0.3]",
        "(0.3, 0.4]",
        "(0.4, 0.5]",
        "(0.5, 0.6]",
        "(0.6, 0.7]",
        "(0.7, 0.8]",
        "(0.8, 0.9]",
        "(0.9, 1.0]",
    ]
    wanted = pl.DataFrame(
        {
            "breakpoint": pl.Series(upper_edges, dtype=pl.Float64),
            "category": pl.Series(labels, dtype=pl.Categorical),
            "count": pl.Series([0] * 10, dtype=pl.get_index_type()),
        }
    )
    assert_frame_equal(actual, wanted)
340

341

342
@pytest.mark.parametrize("n_null", [0, 5])
@pytest.mark.parametrize("n_values", [3, 10, 250])
def test_hist_rand(n_values: int, n_null: int) -> None:
    """Recount every bin of a 10-bin histogram directly from the data.

    The series is ``n_null`` nulls followed by ``n_values`` pseudo-random
    integers in ``[0, 100)``. For each output row, the number of values in
    ``(lower, upper]`` must equal the reported count, where ``upper`` is the
    row's breakpoint and ``lower`` is the previous breakpoint.
    """
    # A local generator seeded from the parameters makes a failing case
    # reproducible and leaves NumPy's global random state untouched.
    rng = np.random.default_rng(1000 * n_null + n_values)
    s_null = pl.Series([None] * n_null, dtype=pl.Int64)
    s_values = pl.Series(rng.integers(0, 100, n_values), dtype=pl.Int64)
    if s_values.n_unique() == 1:
        pytest.skip("Identical values not tested.")
    s = pl.concat((s_null, s_values))
    out = s.hist(bin_count=10)

    bp = out["breakpoint"]
    count = out["count"]
    # The first bin is closed on the left; starting just below the minimum lets
    # the strict ``s > lower`` comparison include the minimum itself.
    min_edge = s.min() - (s.max() - s.min()) * 0.001  # type: ignore[operator]
    for i in range(out.height):
        lower = min_edge if i == 0 else bp[i - 1]
        upper = bp[i]

        assert ((s <= upper) & (s > lower)).sum() == count[i]
363

364

365
def test_hist_floating_point() -> None:
    """Recount every bin for a fixed draw that hits a floating point edge case."""
    # This test hits the specific floating point case where the bin width should be
    # 5.7, but it is represented by 5.6999999. The result is that an item equal to the
    # upper bound (72) exceeds the maximum bins. This tests the code path that catches
    # this case.
    n_values = 3
    n_null = 50

    # A private RandomState seeded with 2 yields the same draws as the legacy
    # ``np.random.seed(2)`` + ``np.random.randint`` pair, without reseeding the
    # process-wide generator that other tests may share.
    rng = np.random.RandomState(2)
    s_null = pl.Series([None] * n_null, dtype=pl.Int64)
    s_values = pl.Series(rng.randint(0, 100, n_values), dtype=pl.Int64)
    s = pl.concat((s_null, s_values))
    out = s.hist(bin_count=10)
    # The first bin is closed on the left; starting just below the minimum lets
    # the strict ``s > lower`` comparison include the minimum itself.
    min_edge = s.min() - (s.max() - s.min()) * 0.001  # type: ignore[operator]

    bp = out["breakpoint"]
    count = out["count"]
    for i in range(out.height):
        lower = min_edge if i == 0 else bp[i - 1]
        upper = bp[i]

        assert ((s <= upper) & (s > lower)).sum() == count[i]
390

391

392
def test_hist_max_boundary_19998() -> None:
    """All four floats are counted when split into 50 bins.

    The numeric suffix presumably refers to an upstream issue number.
    """
    values = [
        9514.988509739183,
        30738.098872148617,
        41400.15705103004,
        49093.06982022727,
    ]
    histogram = pl.Series(values).hist(bin_count=50)
    assert histogram["count"].sum() == 4
403

404

405
def test_hist_max_boundary_20133() -> None:
    """Both values are counted even when the bin index rounds just above 5."""
    # Given a set of values that result in bin index to be a floating point number that
    # is represented as 5.000000000000001
    values = [
        6.197601318359375,
        83.5203145345052,
    ]
    series = pl.Series(values)

    # When histogram is calculated
    histogram = series.hist(bin_count=5)

    # Then there is no exception (previously was possible to get index out of bounds
    # here) and all the numbers fit into at least one of the bins
    total = histogram["count"].sum()
    assert total == 2
421

422

423
def test_hist_same_values_20030() -> None:
    """Two identical values with two bins: both fall in the lower bin [0.5, 1.0]."""
    wanted = pl.DataFrame(
        {
            "breakpoint": pl.Series([1.0, 1.5], dtype=pl.Float64),
            "category": pl.Series(["[0.5, 1.0]", "(1.0, 1.5]"], dtype=pl.Categorical),
            "count": pl.Series([2, 0], dtype=pl.get_index_type()),
        }
    )
    series = pl.Series([1, 1])
    actual = series.hist(bin_count=2)
    assert_frame_equal(actual, wanted)
433

434

435
def test_hist_breakpoint_accuracy() -> None:
    """Three bins over 1..4 get the exact breakpoints 2.0, 3.0 and 4.0."""
    series = pl.Series([1, 2, 3, 4])

    labels = ["[1.0, 2.0]", "(2.0, 3.0]", "(3.0, 4.0]"]
    wanted = pl.DataFrame(
        {
            "breakpoint": pl.Series([2.0, 3.0, 4.0], dtype=pl.Float64),
            "category": pl.Series(labels, dtype=pl.Categorical),
            "count": pl.Series([2, 1, 1], dtype=pl.get_index_type()),
        }
    )
    actual = series.hist(bin_count=3)
    assert_frame_equal(actual, wanted)
448

449

450
def test_hist_ensure_max_value_20879() -> None:
    """The last breakpoint is exactly the series maximum (7.0) for three bins."""
    series = pl.Series([-1 / 3, 0, 1, 2, 3, 7])

    upper_edges = [
        2.0 + 1 / 9,
        4.0 + 5 / 9,
        7.0,
    ]
    labels = [
        "[-0.333333, 2.111111]",
        "(2.111111, 4.555556]",
        "(4.555556, 7.0]",
    ]
    wanted = pl.DataFrame(
        {
            "breakpoint": pl.Series(upper_edges, dtype=pl.Float64),
            "category": pl.Series(labels, dtype=pl.Categorical),
            "count": pl.Series([4, 1, 1], dtype=pl.get_index_type()),
        }
    )
    actual = series.hist(bin_count=3)
    assert_frame_equal(actual, wanted)
475

476

477
def test_hist_ignore_nans_21082() -> None:
    """NaN entries are not counted: only the three finite values appear."""
    nan = float("nan")
    series = pl.Series([0.0, nan, 0.5, nan, 1.0])

    labels = [
        "[-0.001, 0.25]",
        "(0.25, 0.5]",
        "(0.5, 0.75]",
        "(0.75, 1.0]",
    ]
    wanted = pl.DataFrame(
        {
            "breakpoint": pl.Series([0.25, 0.5, 0.75, 1.0], dtype=pl.Float64),
            "category": pl.Series(labels, dtype=pl.Categorical),
            "count": pl.Series([1, 1, 0, 1], dtype=pl.get_index_type()),
        }
    )
    actual = series.hist(bins=[-0.001, 0.25, 0.5, 0.75, 1.0])
    assert_frame_equal(actual, wanted)
496

497

498
def test_hist_include_lower_22056() -> None:
    """A value equal to the lowest edge is counted: [1, 5] holds both 1 and 5."""
    series = pl.Series("a", [1, 5])

    wanted = pl.DataFrame(
        {
            "breakpoint": pl.Series([5.0], dtype=pl.Float64),
            "category": pl.Series(["[1.0, 5.0]"], dtype=pl.Categorical),
            "count": pl.Series([2], dtype=pl.get_index_type()),
        }
    )
    actual = series.hist(bins=[1, 5], include_category=True)
    assert_frame_equal(actual, wanted)
509

510

511
def test_hist_ulp_edge_22234() -> None:
    """Tiny positive values just above an edge at 0 land in the upper bin."""
    series = pl.Series([1.0, 1e-16, 1.3e-16, -1.0])
    wanted_counts = [1, 3]

    # Uniform path
    uniform = series.hist(bin_count=2)
    assert uniform["count"].to_list() == wanted_counts

    # Manual path
    manual = series.hist(bins=[-1, 0, 1])
    assert manual["count"].to_list() == wanted_counts
520

521
Product

Resources

Company