CoCalc -- cardinality

GitHub Repository: pola-rs/polars
Path: blob/main/crates/polars-utils/src/cardinality_sketch.rs
⁶⁹³⁹ views
1
use crate::algebraic_ops::alg_add_f64;
2

3
/// Computes `2^-n` exactly by subtracting `n` from the biased exponent field
/// of the IEEE 754 double `1.0`.
///
/// `1.0` has a biased exponent of 1023 and `n` is at most 255, so the
/// subtraction never borrows out of the exponent field and the result is
/// always a normal, positive power of two.
fn inv_pow2(n: u8) -> f64 {
    // The exponent field sits directly above the 52 mantissa bits.
    const MANTISSA_BITS: u32 = 52;
    let one_bits = 1.0f64.to_bits();
    f64::from_bits(one_bits - (u64::from(n) << MANTISSA_BITS))
}
8

9
/// A fixed-size, HyperLogLog-style sketch for estimating the number of
/// distinct hashes that were inserted.
///
/// Based on:
/// HyperLogLog in Practice: Algorithmic Engineering of
/// a State of The Art Cardinality Estimation Algorithm
/// Stefan Heule, Marc Nunkesser, Alexander Hall
///
/// We use m = 256 which gives a relative error of ~6.5% of the cardinality
/// estimate. We don't bother with stuffing the counts in 6 bits, byte access is
/// fast.
///
/// The bias correction described in the paper is not implemented, so this is
/// somewhere in between HyperLogLog and HyperLogLog++.
#[derive(Clone)]
pub struct CardinalitySketch {
    /// One byte-wide register per bucket; a value of zero means no hash has
    /// landed in that bucket yet.
    buckets: Box<[u8; 256]>,
}
23

24
impl Default for CardinalitySketch {
25
    fn default() -> Self {
26
        Self::new()
27
    }
28
}
29

30
impl CardinalitySketch {
31
    pub fn new() -> Self {
32
        Self {
33
            // This compiles to alloc_zeroed directly.
34
            buckets: vec![0u8; 256].into_boxed_slice().try_into().unwrap(),
35
        }
36
    }
37

38
    /// Add a new hash to the sketch.
39
    pub fn insert(&mut self, mut h: u64) {
40
        const ARBITRARY_ODD: u64 = 0x902813a5785dc787;
41
        // We multiply by this arbitrarily chosen odd number and then take the
42
        // top bits to ensure the sketch is influenced by all bits of the hash.
43
        h = h.wrapping_mul(ARBITRARY_ODD);
44
        let idx = (h >> 56) as usize;
45
        let p = 1 + (h << 8).leading_zeros() as u8;
46
        self.buckets[idx] = self.buckets[idx].max(p);
47
    }
48

49
    pub fn combine(&mut self, other: &CardinalitySketch) {
50
        *self.buckets = std::array::from_fn(|i| std::cmp::max(self.buckets[i], other.buckets[i]));
51
    }
52

53
    pub fn estimate(&self) -> usize {
54
        let m = 256.0;
55
        let alpha_m = 0.7123 / (1.0 + 1.079 / m);
56

57
        let mut sum = 0.0;
58
        let mut num_zero = 0;
59
        for x in self.buckets.iter() {
60
            sum = alg_add_f64(sum, inv_pow2(*x));
61
            num_zero += (*x == 0) as usize;
62
        }
63

64
        let est = (alpha_m * m * m) / sum;
65
        let corr_est = if est <= 5.0 / 2.0 * m && num_zero != 0 {
66
            // Small cardinality estimate, full 64-bit logarithm is overkill.
67
            m * (m as f32 / num_zero as f32).ln() as f64
68
        } else {
69
            est
70
        };
71

72
        corr_est as usize
73
    }
74
}
75

76
Product

Resources

Company