Path: blob/main/crates/polars-utils/src/cardinality_sketch.rs
8446 views
use crate::algebraic_ops::alg_add_f64;12// Computes 2^-n by directly subtracting from the IEEE754 double exponent.3fn inv_pow2(n: u8) -> f64 {4let base = f64::to_bits(1.0);5f64::from_bits(base - ((n as u64) << 52))6}78/// HyperLogLog in Practice: Algorithmic Engineering of9/// a State of The Art Cardinality Estimation Algorithm10/// Stefan Heule, Marc Nunkesser, Alexander Hall11///12/// We use m = 256 which gives a relative error of ~6.5% of the cardinality13/// estimate. We don't bother with stuffing the counts in 6 bits, byte access is14/// fast.15///16/// The bias correction described in the paper is not implemented, so this is17/// somewhere in between HyperLogLog and HyperLogLog++.18#[derive(Clone)]19pub struct CardinalitySketch {20buckets: [u8; 256],21}2223impl Default for CardinalitySketch {24fn default() -> Self {25Self::new()26}27}2829impl CardinalitySketch {30pub fn new() -> Self {31Self {32buckets: [0u8; 256],33}34}3536/// Add a new hash to the sketch.37pub fn insert(&mut self, mut h: u64) {38const ARBITRARY_ODD: u64 = 0x902813a5785dc787;39// We multiply by this arbitrarily chosen odd number and then take the40// top bits to ensure the sketch is influenced by all bits of the hash.41h = h.wrapping_mul(ARBITRARY_ODD);42let idx = (h >> 56) as usize;43let p = 1 + (h << 8).leading_zeros() as u8;44self.buckets[idx] = self.buckets[idx].max(p);45}4647pub fn combine(&mut self, other: &CardinalitySketch) {48self.buckets = std::array::from_fn(|i| std::cmp::max(self.buckets[i], other.buckets[i]));49}5051pub fn estimate(&self) -> usize {52let m = 256.0;53let alpha_m = 0.7123 / (1.0 + 1.079 / m);5455let mut sum = 0.0;56let mut num_zero = 0;57for x in self.buckets.iter() {58sum = alg_add_f64(sum, inv_pow2(*x));59num_zero += (*x == 0) as usize;60}6162let est = (alpha_m * m * m) / sum;63let corr_est = if est <= 5.0 / 2.0 * m && num_zero != 0 {64// Small cardinality estimate, full 64-bit logarithm is overkill.65m * (m as f32 / num_zero as f32).ln() as f6466} else {67est68};6970if num_zero == self.buckets.len() {71072} else {73((corr_est + 0.5) as usize).max(1)74}75}7677pub fn into_state(self) -> [u8; 256] {78self.buckets79}8081pub fn from_state(buckets: [u8; 256]) -> Self {82Self { buckets }83}84}858687