Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/crates/polars-compute/src/cardinality.rs
6939 views
1
use arrow::array::{
2
Array, BinaryArray, BinaryViewArray, BooleanArray, FixedSizeBinaryArray, PrimitiveArray,
3
Utf8Array, Utf8ViewArray,
4
};
5
use arrow::datatypes::PhysicalType;
6
use arrow::types::Offset;
7
use arrow::with_match_primitive_type_full;
8
use polars_utils::total_ord::ToTotalOrd;
9
10
use crate::hyperloglogplus::HyperLogLog;
11
12
/// Get an estimate for the *cardinality* of the array (i.e. the number of unique values)
13
///
14
/// This is not currently implemented for nested types.
15
pub fn estimate_cardinality(array: &dyn Array) -> usize {
16
if array.is_empty() {
17
return 0;
18
}
19
20
if array.null_count() == array.len() {
21
return 1;
22
}
23
24
// Estimate the cardinality with HyperLogLog
25
use PhysicalType as PT;
26
match array.dtype().to_physical_type() {
27
PT::Null => 1,
28
29
PT::Boolean => {
30
let mut cardinality = 0;
31
32
let array = array.as_any().downcast_ref::<BooleanArray>().unwrap();
33
34
cardinality += usize::from(array.has_nulls());
35
36
if let Some(unset_bits) = array.values().lazy_unset_bits() {
37
cardinality += 1 + usize::from(unset_bits != array.len());
38
} else {
39
cardinality += 2;
40
}
41
42
cardinality
43
},
44
45
PT::Primitive(primitive_type) => with_match_primitive_type_full!(primitive_type, |$T| {
46
let mut hll = HyperLogLog::new();
47
48
let array = array
49
.as_any()
50
.downcast_ref::<PrimitiveArray<$T>>()
51
.unwrap();
52
53
if array.has_nulls() {
54
for v in array.iter() {
55
let v = v.copied().unwrap_or_default();
56
hll.add(&v.to_total_ord());
57
}
58
} else {
59
for v in array.values_iter() {
60
hll.add(&v.to_total_ord());
61
}
62
}
63
64
hll.count()
65
}),
66
PT::FixedSizeBinary => {
67
let mut hll = HyperLogLog::new();
68
69
let array = array
70
.as_any()
71
.downcast_ref::<FixedSizeBinaryArray>()
72
.unwrap();
73
74
if array.has_nulls() {
75
for v in array.iter() {
76
let v = v.unwrap_or_default();
77
hll.add(v);
78
}
79
} else {
80
for v in array.values_iter() {
81
hll.add(v);
82
}
83
}
84
85
hll.count()
86
},
87
PT::Binary => {
88
binary_offset_array_estimate(array.as_any().downcast_ref::<BinaryArray<i32>>().unwrap())
89
},
90
PT::LargeBinary => {
91
binary_offset_array_estimate(array.as_any().downcast_ref::<BinaryArray<i64>>().unwrap())
92
},
93
PT::Utf8 => binary_offset_array_estimate(
94
&array
95
.as_any()
96
.downcast_ref::<Utf8Array<i32>>()
97
.unwrap()
98
.to_binary(),
99
),
100
PT::LargeUtf8 => binary_offset_array_estimate(
101
&array
102
.as_any()
103
.downcast_ref::<Utf8Array<i64>>()
104
.unwrap()
105
.to_binary(),
106
),
107
PT::BinaryView => {
108
binary_view_array_estimate(array.as_any().downcast_ref::<BinaryViewArray>().unwrap())
109
},
110
PT::Utf8View => binary_view_array_estimate(
111
&array
112
.as_any()
113
.downcast_ref::<Utf8ViewArray>()
114
.unwrap()
115
.to_binview(),
116
),
117
PT::List => unimplemented!(),
118
PT::FixedSizeList => unimplemented!(),
119
PT::LargeList => unimplemented!(),
120
PT::Struct => unimplemented!(),
121
PT::Union => unimplemented!(),
122
PT::Map => unimplemented!(),
123
PT::Dictionary(_) => unimplemented!(),
124
}
125
}
126
127
fn binary_offset_array_estimate<O: Offset>(array: &BinaryArray<O>) -> usize {
128
let mut hll = HyperLogLog::new();
129
130
if array.has_nulls() {
131
for v in array.iter() {
132
let v = v.unwrap_or_default();
133
hll.add(v);
134
}
135
} else {
136
for v in array.values_iter() {
137
hll.add(v);
138
}
139
}
140
141
hll.count()
142
}
143
144
fn binary_view_array_estimate(array: &BinaryViewArray) -> usize {
145
let mut hll = HyperLogLog::new();
146
147
if array.has_nulls() {
148
for v in array.iter() {
149
let v = v.unwrap_or_default();
150
hll.add(v);
151
}
152
} else {
153
for v in array.values_iter() {
154
hll.add(v);
155
}
156
}
157
158
hll.count()
159
}
160
161