Path: blob/main/crates/polars-compute/src/unique/primitive.rs
use std::ops::{Add, RangeInclusive, Sub};

use arrow::array::PrimitiveArray;
use arrow::bitmap::bitmask::BitMask;
use arrow::bitmap::{BitmapBuilder, MutableBitmap};
use arrow::datatypes::ArrowDataType;
use arrow::types::NativeType;
use num_traits::{FromPrimitive, ToPrimitive};
use polars_utils::total_ord::TotalOrd;

use super::RangedUniqueKernel;

/// A specialized unique kernel for [`PrimitiveArray`] for when all values are in a small known
/// range.
pub struct PrimitiveRangedUniqueState<T: NativeType> {
    seen: Seen,
    range: RangeInclusive<T>,
}

/// Tracks which slots of the value range have been seen. Slot `i` corresponds to the value
/// `range.start() + i`, and the last slot (`size - 1`) is reserved for null. Ranges of up to 128
/// slots fit into a single `u128`; larger ranges fall back to a bitmap.
enum Seen {
    Small(u128),
    Large(MutableBitmap),
}

impl Seen {
    pub fn from_size(size: usize) -> Self {
        if size <= 128 {
            Self::Small(0)
        } else {
            Self::Large(MutableBitmap::from_len_zeroed(size))
        }
    }

    fn num_seen(&self) -> usize {
        match self {
            Seen::Small(v) => v.count_ones() as usize,
            Seen::Large(v) => v.set_bits(),
        }
    }

    fn has_seen_null(&self, size: usize) -> bool {
        match self {
            Self::Small(v) => v >> (size - 1) != 0,
            Self::Large(v) => v.get(size - 1),
        }
    }
}

impl<T: NativeType> PrimitiveRangedUniqueState<T>
where
    T: Add<T, Output = T> + Sub<T, Output = T> + ToPrimitive + FromPrimitive,
{
    pub fn new(min_value: T, max_value: T) -> Self {
        let size = (max_value - min_value).to_usize().unwrap();
        // Range is inclusive
        let size = size + 1;
        // One value is left for null
        let size = size + 1;

        Self {
            seen: Seen::from_size(size),
            range: min_value..=max_value,
        }
    }

    fn size(&self) -> usize {
        // Must agree with `new`: +1 because the range is inclusive, +1 for the null slot.
        (*self.range.end() - *self.range.start())
            .to_usize()
            .unwrap()
            + 2
    }
}

impl<T: NativeType> RangedUniqueKernel for PrimitiveRangedUniqueState<T>
where
    T: Add<T, Output = T> + Sub<T, Output = T> + ToPrimitive + FromPrimitive,
{
    type Array = PrimitiveArray<T>;

    fn has_seen_all(&self) -> bool {
        let size = self.size();
        match &self.seen {
            Seen::Small(v) if size == 128 => !v == 0,
            Seen::Small(v) => *v == ((1 << size) - 1),
            Seen::Large(v) => BitMask::new(v.as_slice(), 0, size).unset_bits() == 0,
        }
    }

    fn append(&mut self, array: &Self::Array) {
        let size = self.size();
        // A validity mask without any unset bits is treated the same as no validity at all.
        match array.validity().as_ref().filter(|v| v.unset_bits() > 0) {
            None => {
                const STEP_SIZE: usize = 512;

                let mut i = 0;
                let values = array.values().as_slice();

                match self.seen {
                    Seen::Small(ref mut seen) => {
                        // Check every so often whether we have already seen all the values.
                        while *seen != ((1 << (size - 1)) - 1) && i < values.len() {
                            for v in values[i..].iter().take(STEP_SIZE) {
                                if cfg!(debug_assertions) {
                                    assert!(TotalOrd::tot_ge(v, self.range.start()));
                                    assert!(TotalOrd::tot_le(v, self.range.end()));
                                }

                                let v = *v - *self.range.start();
                                let v = unsafe { v.to_usize().unwrap_unchecked() };
                                *seen |= 1 << v;
                            }

                            i += STEP_SIZE;
                        }
                    },
                    Seen::Large(ref mut seen) => {
                        // Check every so often whether we have already seen all the values.
                        while BitMask::new(seen.as_slice(), 0, size - 1).unset_bits() > 0
                            && i < values.len()
                        {
                            for v in values[i..].iter().take(STEP_SIZE) {
                                if cfg!(debug_assertions) {
                                    assert!(TotalOrd::tot_ge(v, self.range.start()));
                                    assert!(TotalOrd::tot_le(v, self.range.end()));
                                }

                                let v = *v - *self.range.start();
                                let v = unsafe { v.to_usize().unwrap_unchecked() };
                                seen.set(v, true);
                            }

                            i += STEP_SIZE;
                        }
                    },
                }
            },
            Some(_) => {
                let iter = array.non_null_values_iter();

                match self.seen {
                    Seen::Small(ref mut seen) => {
                        // Mark the null slot as seen before walking the non-null values.
                        *seen |= 1 << (size - 1);

                        for v in iter {
                            if cfg!(debug_assertions) {
                                assert!(TotalOrd::tot_ge(&v, self.range.start()));
                                assert!(TotalOrd::tot_le(&v, self.range.end()));
                            }

                            let v = v - *self.range.start();
                            let v = unsafe { v.to_usize().unwrap_unchecked() };
                            *seen |= 1 << v;
                        }
                    },
                    Seen::Large(ref mut seen) => {
                        // Mark the null slot as seen before walking the non-null values.
                        seen.set(size - 1, true);

                        for v in iter {
                            if cfg!(debug_assertions) {
                                assert!(TotalOrd::tot_ge(&v, self.range.start()));
                                assert!(TotalOrd::tot_le(&v, self.range.end()));
                            }

                            let v = v - *self.range.start();
                            let v = unsafe { v.to_usize().unwrap_unchecked() };
                            seen.set(v, true);
                        }
                    },
                }
            },
        }
    }

    fn append_state(&mut self, other: &Self) {
        debug_assert_eq!(self.size(), other.size());
        match (&mut self.seen, &other.seen) {
            (Seen::Small(lhs), Seen::Small(rhs)) => *lhs |= rhs,
            (Seen::Large(lhs), Seen::Large(rhs)) => {
                let mut lhs = lhs;
                <&mut MutableBitmap as std::ops::BitOrAssign<&MutableBitmap>>::bitor_assign(
                    &mut lhs, rhs,
                )
            },
            _ => unreachable!(),
        }
    }

    fn finalize_unique(self) -> Self::Array {
        let size = self.size();
        let seen = self.seen;

        let has_null = seen.has_seen_null(size);
        let num_values = seen.num_seen();
        let mut values = Vec::with_capacity(num_values);

        let mut offset = 0;
        match seen {
            Seen::Small(mut v) => {
                // Repeatedly strip the lowest set bit; `offset` tracks its absolute slot index.
                while v != 0 {
                    let shift = v.trailing_zeros();
                    offset += shift as u8;
                    values.push(*self.range.start() + T::from_u8(offset).unwrap());

                    v >>= shift + 1;
                    offset += 1;
                }
            },
            Seen::Large(v) => {
                for offset in v.freeze().true_idx_iter() {
                    values.push(*self.range.start() + T::from_usize(offset).unwrap());
                }
            },
        }

        let validity = if has_null {
            let mut validity = BitmapBuilder::new();
            validity.extend_constant(values.len() - 1, true);
            validity.push(false);
            // The null has already been pushed.
            *values.last_mut().unwrap() = T::zeroed();
            Some(validity.freeze())
        } else {
            None
        };

        PrimitiveArray::new(ArrowDataType::from(T::PRIMITIVE), values.into(), validity)
    }

    fn finalize_n_unique(&self) -> usize {
        self.seen.num_seen()
    }

    fn finalize_n_unique_non_null(&self) -> usize {
        self.seen.num_seen() - usize::from(self.seen.has_seen_null(self.size()))
    }
}
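
// The tests below are an illustrative usage sketch, not part of the upstream kernel. They assume
// the usual polars-arrow constructors (`PrimitiveArray::from_slice` and the
// `From<Vec<Option<_>>>` impl) are available through the `arrow` re-export used above; adjust the
// constructors if those helpers differ in this crate.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn unique_over_small_range_without_nulls() {
        // All values are known to lie in the inclusive range 0..=9.
        let mut state = PrimitiveRangedUniqueState::<u32>::new(0, 9);
        state.append(&PrimitiveArray::from_slice([3u32, 1, 3, 7, 1]));

        assert!(!state.has_seen_all());
        assert_eq!(state.finalize_n_unique_non_null(), 3);

        // Uniques are emitted in ascending order and carry no validity mask.
        let unique = state.finalize_unique();
        assert_eq!(unique.values().as_slice(), &[1u32, 3, 7]);
        assert!(unique.validity().is_none());
    }

    #[test]
    fn unique_over_small_range_with_nulls() {
        let mut state = PrimitiveRangedUniqueState::<u32>::new(0, 9);
        state.append(&PrimitiveArray::from(vec![Some(2u32), None, Some(2)]));

        // The reserved null slot counts towards `finalize_n_unique` but not the non-null count.
        assert_eq!(state.finalize_n_unique(), 2);
        assert_eq!(state.finalize_n_unique_non_null(), 1);

        // The null comes out as the trailing, invalid entry.
        let unique = state.finalize_unique();
        assert_eq!(unique.len(), 2);
        assert_eq!(unique.validity().map(|v| v.unset_bits()), Some(1));
    }
}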