Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/crates/polars-compute/src/unique/distinct.rs
7884 views
1
/// Implementations for {n,arg}-unique on [`Array`] that can be amortized over several invocations.
2
use arrow::array::{Array, BinaryViewArray, BooleanArray, PrimitiveArray, StaticArray};
3
use arrow::bitmap::bitmask::BitMask;
4
use arrow::datatypes::ArrowDataType;
5
use arrow::legacy::prelude::LargeBinaryArray;
6
use arrow::types::{NativeType, PrimitiveType};
7
use polars_utils::aliases::PlHashSet;
8
use polars_utils::float16::pf16;
9
use polars_utils::total_ord::{TotalEq, TotalHash, TotalOrdWrap};
10
use polars_utils::{IdxSize, UnitVec};
11
12
pub trait AmortizedUnique: Send + Sync + 'static {
13
fn new_empty(&self) -> Box<dyn AmortizedUnique>;
14
15
/// Retain indices of items that are unique.
16
///
17
/// This is always stable.
18
///
19
/// # Safety
20
///
21
/// All indices i should be 0 <= i < values.len()
22
unsafe fn retain_unique(&mut self, values: &dyn Array, idxs: &mut UnitVec<IdxSize>);
23
24
/// Get the indices of unique items in an array slice.
25
///
26
/// This is always stable.
27
fn arg_unique(
28
&mut self,
29
values: &dyn Array,
30
idxs: &mut UnitVec<IdxSize>,
31
start: IdxSize,
32
length: IdxSize,
33
);
34
35
/// Get the number of unique items in an array at `idxs`.
36
///
37
/// # Safety
38
///
39
/// All indices i should be 0 <= i < values.len()
40
unsafe fn n_unique_idx(&mut self, values: &dyn Array, idxs: &[IdxSize]) -> IdxSize;
41
42
/// Get the number of unique items in an array slice.
43
fn n_unique_slice(&mut self, values: &dyn Array, start: IdxSize, length: IdxSize) -> IdxSize;
44
}
45
46
pub fn amortized_unique_from_dtype(dtype: &ArrowDataType) -> Box<dyn AmortizedUnique> {
47
use arrow::datatypes::PhysicalType as P;
48
match dtype.to_physical_type() {
49
P::Null => Box::new(NullUnique) as _,
50
P::Boolean => Box::new(BooleanUnique) as _,
51
P::Primitive(pt) => match pt {
52
PrimitiveType::Int8 => Box::new(PrimitiveArgUnique::<i8>::default()) as _,
53
PrimitiveType::Int16 => Box::new(PrimitiveArgUnique::<i16>::default()) as _,
54
PrimitiveType::Int32 => Box::new(PrimitiveArgUnique::<i32>::default()) as _,
55
PrimitiveType::Int64 => Box::new(PrimitiveArgUnique::<i64>::default()) as _,
56
PrimitiveType::Int128 => Box::new(PrimitiveArgUnique::<i128>::default()) as _,
57
PrimitiveType::UInt8 => Box::new(PrimitiveArgUnique::<u8>::default()) as _,
58
PrimitiveType::UInt16 => Box::new(PrimitiveArgUnique::<u16>::default()) as _,
59
PrimitiveType::UInt32 => Box::new(PrimitiveArgUnique::<u32>::default()) as _,
60
PrimitiveType::UInt64 => Box::new(PrimitiveArgUnique::<u64>::default()) as _,
61
PrimitiveType::UInt128 => Box::new(PrimitiveArgUnique::<u128>::default()) as _,
62
PrimitiveType::Float16 => Box::new(PrimitiveArgUnique::<pf16>::default()) as _,
63
PrimitiveType::Float32 => Box::new(PrimitiveArgUnique::<f32>::default()) as _,
64
PrimitiveType::Float64 => Box::new(PrimitiveArgUnique::<f64>::default()) as _,
65
PrimitiveType::Int256 => unreachable!(),
66
PrimitiveType::DaysMs => unreachable!(),
67
PrimitiveType::MonthDayNano => unreachable!(),
68
PrimitiveType::MonthDayMillis => unreachable!(),
69
},
70
P::BinaryView => Box::new(BinaryViewUnique::default()) as _,
71
P::LargeBinary => Box::new(BinaryUnique::default()) as _,
72
73
P::Dictionary(_) => unreachable!(),
74
P::Binary => unreachable!(),
75
P::FixedSizeBinary => unreachable!(),
76
P::Utf8 => unreachable!(),
77
P::LargeUtf8 => unreachable!(),
78
P::List => unreachable!(),
79
P::Union => unreachable!(),
80
P::Map => unreachable!(),
81
82
// Should be handled through BinaryView.
83
P::Utf8View => unreachable!(),
84
85
// Should be handled through row encoding.
86
P::FixedSizeList => unreachable!(),
87
P::LargeList => unreachable!(),
88
P::Struct => unreachable!(),
89
}
90
}
91
92
struct NullUnique;
93
struct BooleanUnique;
94
#[derive(Default)]
95
struct PrimitiveArgUnique<T>(
96
PlHashSet<TotalOrdWrap<T>>,
97
PlHashSet<Option<TotalOrdWrap<T>>>,
98
);
99
#[derive(Default)]
100
struct BinaryViewUnique(PlHashSet<&'static [u8]>, PlHashSet<Option<&'static [u8]>>);
101
#[derive(Default)]
102
struct BinaryUnique(PlHashSet<&'static [u8]>, PlHashSet<Option<&'static [u8]>>);
103
104
impl AmortizedUnique for NullUnique {
105
fn new_empty(&self) -> Box<dyn AmortizedUnique> {
106
Box::new(NullUnique)
107
}
108
109
unsafe fn retain_unique(&mut self, _values: &dyn Array, idxs: &mut UnitVec<IdxSize>) {
110
if !idxs.is_empty() {
111
*idxs = UnitVec::from_slice(&[idxs[0]]);
112
}
113
}
114
115
fn arg_unique(
116
&mut self,
117
values: &dyn Array,
118
idxs: &mut UnitVec<IdxSize>,
119
start: IdxSize,
120
length: IdxSize,
121
) {
122
assert!(start.saturating_add(length) as usize <= values.len());
123
if length > 0 {
124
idxs.push(start);
125
}
126
}
127
128
unsafe fn n_unique_idx(&mut self, _values: &dyn Array, idxs: &[IdxSize]) -> IdxSize {
129
IdxSize::from(!idxs.is_empty())
130
}
131
132
fn n_unique_slice(&mut self, values: &dyn Array, start: IdxSize, length: IdxSize) -> IdxSize {
133
assert!(start.saturating_add(length) as usize <= values.len());
134
IdxSize::from(length > 0)
135
}
136
}
137
138
impl AmortizedUnique for BooleanUnique {
139
fn new_empty(&self) -> Box<dyn AmortizedUnique> {
140
Box::new(BooleanUnique)
141
}
142
143
unsafe fn retain_unique(&mut self, values: &dyn Array, idxs: &mut UnitVec<IdxSize>) {
144
if idxs.len() <= 1 {
145
return;
146
}
147
148
let values = values.as_any().downcast_ref::<BooleanArray>().unwrap();
149
150
if values.has_nulls() {
151
let mut seen = 0u8;
152
idxs.retain(|i| {
153
if seen == 0b111 {
154
return false;
155
}
156
157
// SAFETY: function invariant.
158
let v = match unsafe { values.get_unchecked(i as usize) } {
159
None => 1 << 0,
160
Some(false) => 1 << 1,
161
Some(true) => 1 << 2,
162
};
163
164
let keep = seen & v == 0;
165
seen |= v;
166
keep
167
});
168
} else {
169
let values = values.values();
170
if values.set_bits() == 0 || values.unset_bits() == 0 {
171
*idxs = UnitVec::from_slice(&[idxs[0]]);
172
return;
173
}
174
175
// SAFETY: function invariant.
176
let fst = unsafe { values.get_bit_unchecked(idxs[0] as usize) };
177
*idxs = match idxs[1..]
178
.iter()
179
// SAFETY: function invariant.
180
.position(|&i| fst != unsafe { values.get_bit_unchecked(i as usize) })
181
{
182
None => UnitVec::from_slice(&[idxs[0]]),
183
Some(i) => UnitVec::from_slice(&[idxs[0], idxs[1 + i]]),
184
};
185
}
186
}
187
188
fn arg_unique(
189
&mut self,
190
values: &dyn Array,
191
idxs: &mut UnitVec<IdxSize>,
192
start: IdxSize,
193
length: IdxSize,
194
) {
195
if length <= 1 {
196
if length == 1 {
197
idxs.push(start);
198
}
199
return;
200
}
201
202
assert!(start.saturating_add(length) as usize <= values.len());
203
let values = values.as_any().downcast_ref::<BooleanArray>().unwrap();
204
205
if values.has_nulls() {
206
let mut seen = 0u8;
207
idxs.extend((start..start + length).filter(|i| {
208
if seen == 0b111 {
209
return false;
210
}
211
212
// SAFETY: asserted before.
213
let v = match unsafe { values.get_unchecked(*i as usize) } {
214
None => 1 << 0,
215
Some(false) => 1 << 1,
216
Some(true) => 1 << 2,
217
};
218
219
let keep = seen & v == 0;
220
seen |= v;
221
keep
222
}));
223
} else {
224
let values = values.values();
225
if values.set_bits() == 0 || values.unset_bits() == 0 {
226
*idxs = UnitVec::from_slice(&[start]);
227
return;
228
}
229
230
let values = BitMask::from_bitmap(values);
231
let values = values.sliced(start as usize, length as usize);
232
233
let leading_zeros = values.leading_zeros();
234
if leading_zeros == values.len() {
235
*idxs = UnitVec::from_slice(&[start]);
236
} else if leading_zeros == 0 {
237
let leading_ones = values.leading_ones();
238
if leading_ones == values.len() {
239
*idxs = UnitVec::from_slice(&[start]);
240
} else {
241
*idxs = UnitVec::from_slice(&[start, start + leading_ones as IdxSize]);
242
}
243
} else {
244
*idxs = UnitVec::from_slice(&[start, start + leading_zeros as IdxSize]);
245
}
246
}
247
}
248
249
unsafe fn n_unique_idx(&mut self, values: &dyn Array, idxs: &[IdxSize]) -> IdxSize {
250
if idxs.len() <= 1 {
251
return idxs.len() as IdxSize;
252
}
253
254
let values = values.as_any().downcast_ref::<BooleanArray>().unwrap();
255
256
if values.has_nulls() {
257
let mut seen = 0u8;
258
for &i in idxs {
259
if seen == 0b111 {
260
break;
261
}
262
// SAFETY: function invariant.
263
seen |= match unsafe { values.get_unchecked(i as usize) } {
264
None => 1 << 0,
265
Some(false) => 1 << 1,
266
Some(true) => 1 << 2,
267
};
268
}
269
IdxSize::from(seen.count_ones())
270
} else {
271
let values = values.values();
272
if values.set_bits() == 0 || values.unset_bits() == 0 {
273
return 1;
274
}
275
276
// SAFETY: function invariant.
277
let fst = unsafe { values.get_bit_unchecked(idxs[0] as usize) };
278
for &i in &idxs[1..] {
279
// SAFETY: function invariant.
280
if fst != unsafe { values.get_bit_unchecked(i as usize) } {
281
return 2;
282
}
283
}
284
1
285
}
286
}
287
288
fn n_unique_slice(&mut self, values: &dyn Array, start: IdxSize, length: IdxSize) -> IdxSize {
289
if length <= 1 {
290
return length;
291
}
292
293
let values = values.as_any().downcast_ref::<BooleanArray>().unwrap();
294
assert!(start.saturating_add(length) as usize <= values.len());
295
296
if values.has_nulls() {
297
let validity = BitMask::from_bitmap(values.validity().unwrap());
298
let values = BitMask::from_bitmap(values.values());
299
300
let validity = validity.sliced(start as usize, length as usize);
301
let values = values.sliced(start as usize, length as usize);
302
303
let num_valid = validity.set_bits();
304
if num_valid == 0 {
305
return 1;
306
}
307
308
if num_valid as IdxSize == length {
309
let num_trues = values.set_bits() as IdxSize;
310
1 + IdxSize::from(num_trues != length && num_trues != 0)
311
} else {
312
let num_trues = values.num_intersections_with(validity);
313
2 + IdxSize::from(num_trues != num_valid && num_trues != 0)
314
}
315
} else {
316
let values = values.values();
317
if values.set_bits() == 0 || values.unset_bits() == 0 {
318
return 1;
319
}
320
321
let values = BitMask::from_bitmap(values);
322
let values = values.sliced(start as usize, length as usize);
323
let num_trues = values.set_bits();
324
1 + IdxSize::from(num_trues != 0 && num_trues != values.len())
325
}
326
}
327
}
328
329
impl<T: NativeType + TotalHash + TotalEq> AmortizedUnique for PrimitiveArgUnique<T> {
330
fn new_empty(&self) -> Box<dyn AmortizedUnique> {
331
Box::new(PrimitiveArgUnique::<T>::default())
332
}
333
334
unsafe fn retain_unique(&mut self, values: &dyn Array, idxs: &mut UnitVec<IdxSize>) {
335
if idxs.len() <= 1 {
336
return;
337
}
338
339
let values = values.as_any().downcast_ref::<PrimitiveArray<T>>().unwrap();
340
341
if values.has_nulls() {
342
self.1.clear();
343
idxs.retain(|i| {
344
// SAFETY: function invariant.
345
let value = unsafe { values.get_unchecked(i as usize) };
346
let value = value.map(TotalOrdWrap);
347
self.1.insert(value)
348
});
349
} else {
350
self.0.clear();
351
let values = values.values().as_slice();
352
idxs.retain(|i| {
353
// SAFETY: function invariant.
354
let value = *unsafe { values.get_unchecked(i as usize) };
355
let value = TotalOrdWrap(value);
356
self.0.insert(value)
357
});
358
}
359
}
360
361
fn arg_unique(
362
&mut self,
363
values: &dyn Array,
364
idxs: &mut UnitVec<IdxSize>,
365
start: IdxSize,
366
length: IdxSize,
367
) {
368
if length <= 1 {
369
if length == 1 {
370
idxs.push(start);
371
}
372
return;
373
}
374
375
let values = values.as_any().downcast_ref::<PrimitiveArray<T>>().unwrap();
376
assert!(start.saturating_add(length) as usize <= values.len());
377
378
if values.has_nulls() {
379
self.1.clear();
380
idxs.extend((start..start + length).filter(|i| {
381
// SAFETY: asserted before.
382
let value = unsafe { values.get_unchecked(*i as usize) };
383
let value = value.map(TotalOrdWrap);
384
self.1.insert(value)
385
}));
386
} else {
387
self.0.clear();
388
let values = values.values().as_slice();
389
idxs.extend(
390
values[start as usize..][..length as usize]
391
.iter()
392
.enumerate()
393
.filter_map(|(i, value)| {
394
let value = TotalOrdWrap(*value);
395
self.0.insert(value).then_some(i as IdxSize + start)
396
}),
397
);
398
}
399
}
400
401
unsafe fn n_unique_idx(&mut self, values: &dyn Array, idxs: &[IdxSize]) -> IdxSize {
402
if idxs.len() <= 1 {
403
return idxs.len() as IdxSize;
404
}
405
406
let values = values.as_any().downcast_ref::<PrimitiveArray<T>>().unwrap();
407
408
if values.has_nulls() {
409
self.1.clear();
410
self.1.extend(idxs.iter().map(|&i| {
411
// SAFETY: function invariant.
412
let value = unsafe { values.get_unchecked(i as usize) };
413
value.map(TotalOrdWrap)
414
}));
415
self.1.len() as IdxSize
416
} else {
417
let values = values.values();
418
self.0.clear();
419
self.0.extend(idxs.iter().map(|&i| {
420
// SAFETY: function invariant.
421
let value = *unsafe { values.get_unchecked(i as usize) };
422
TotalOrdWrap(value)
423
}));
424
self.0.len() as IdxSize
425
}
426
}
427
428
fn n_unique_slice(&mut self, values: &dyn Array, start: IdxSize, length: IdxSize) -> IdxSize {
429
if length <= 1 {
430
return length;
431
}
432
433
let values = values.as_any().downcast_ref::<PrimitiveArray<T>>().unwrap();
434
assert!(start.saturating_add(length) as usize <= values.len());
435
436
if values.has_nulls() {
437
self.1.clear();
438
self.1.extend((start..start + length).map(|i| {
439
// SAFETY: asserted before.
440
let value = unsafe { values.get_unchecked(i as usize) };
441
value.map(TotalOrdWrap)
442
}));
443
self.1.len() as IdxSize
444
} else {
445
let values = values.values();
446
self.0.clear();
447
self.0.extend(
448
values[start as usize..][..length as usize]
449
.iter()
450
.map(|&v| TotalOrdWrap(v)),
451
);
452
self.0.len() as IdxSize
453
}
454
}
455
}
456
457
impl AmortizedUnique for BinaryViewUnique {
458
fn new_empty(&self) -> Box<dyn AmortizedUnique> {
459
Box::new(BinaryViewUnique::default())
460
}
461
462
fn arg_unique(
463
&mut self,
464
values: &dyn Array,
465
idxs: &mut UnitVec<IdxSize>,
466
start: IdxSize,
467
length: IdxSize,
468
) {
469
if length <= 1 {
470
if length == 1 {
471
idxs.push(start);
472
}
473
return;
474
}
475
476
let values = values.as_any().downcast_ref::<BinaryViewArray>().unwrap();
477
assert!(start.saturating_add(length) as usize <= values.len());
478
479
if values.has_nulls() {
480
self.1.reserve(length as usize);
481
idxs.extend((start..start + length).filter(|i| {
482
// SAFETY: asserted before.
483
let value = unsafe { values.get_unchecked(*i as usize) };
484
// SAFETY: Gets cleared at end of the scope.
485
let value =
486
value.map(|v| unsafe { std::mem::transmute::<&[u8], &'static [u8]>(v) });
487
self.1.insert(value)
488
}));
489
self.1.clear();
490
} else {
491
self.0.reserve(length as usize);
492
if values.total_buffer_len() == 0 {
493
let views = values.views().as_slice();
494
idxs.extend(
495
views[start as usize..][..length as usize]
496
.iter()
497
.enumerate()
498
.filter_map(|(i, value)| {
499
debug_assert!(value.is_inline());
500
501
// SAFETY: buffer length == 0.
502
let value = unsafe { value.get_inlined_slice_unchecked() };
503
// SAFETY: Gets cleared at end of the scope.
504
let value =
505
unsafe { std::mem::transmute::<&[u8], &'static [u8]>(value) };
506
self.0.insert(value).then_some(i as IdxSize + start)
507
}),
508
);
509
} else {
510
idxs.extend((start..start + length).filter(|i| {
511
// SAFETY: asserted before.
512
let value = unsafe { values.value_unchecked(*i as usize) };
513
// SAFETY: Gets cleared at end of the scope.
514
let value = unsafe { std::mem::transmute::<&[u8], &'static [u8]>(value) };
515
self.0.insert(value)
516
}));
517
}
518
self.0.clear();
519
}
520
}
521
522
unsafe fn retain_unique(&mut self, values: &dyn Array, idxs: &mut UnitVec<IdxSize>) {
523
if idxs.len() <= 1 {
524
return;
525
}
526
527
let values = values.as_any().downcast_ref::<BinaryViewArray>().unwrap();
528
if values.has_nulls() {
529
self.1.reserve(idxs.len());
530
idxs.retain(|i| {
531
// SAFETY: asserted before.
532
let value = unsafe { values.get_unchecked(i as usize) };
533
// SAFETY: Gets cleared at end of the scope.
534
let value =
535
value.map(|v| unsafe { std::mem::transmute::<&[u8], &'static [u8]>(v) });
536
self.1.insert(value)
537
});
538
self.1.clear();
539
} else {
540
self.0.reserve(idxs.len());
541
if values.total_buffer_len() == 0 {
542
let views = values.views().as_slice();
543
idxs.retain(|i| {
544
let value = unsafe { views.get_unchecked(i as usize) };
545
debug_assert!(value.is_inline());
546
547
// SAFETY: buffer length == 0.
548
let value = unsafe { value.get_inlined_slice_unchecked() };
549
// SAFETY: Gets cleared at end of the scope.
550
let value = unsafe { std::mem::transmute::<&[u8], &'static [u8]>(value) };
551
self.0.insert(value)
552
});
553
} else {
554
idxs.retain(|i| {
555
// SAFETY: asserted before.
556
let value = unsafe { values.value_unchecked(i as usize) };
557
// SAFETY: Gets cleared at end of the scope.
558
let value = unsafe { std::mem::transmute::<&[u8], &'static [u8]>(value) };
559
self.0.insert(value)
560
});
561
}
562
self.0.clear();
563
}
564
}
565
566
unsafe fn n_unique_idx(&mut self, values: &dyn Array, idxs: &[IdxSize]) -> IdxSize {
567
if idxs.len() <= 1 {
568
return idxs.len() as IdxSize;
569
}
570
571
let values = values.as_any().downcast_ref::<BinaryViewArray>().unwrap();
572
573
if values.has_nulls() {
574
self.1.reserve(idxs.len());
575
self.1.extend(idxs.iter().map(|&i| {
576
// SAFETY: function invariant.
577
let value = unsafe { values.get_unchecked(i as usize) };
578
// SAFETY: Gets cleared at end of the scope.
579
value.map(|v| unsafe { std::mem::transmute::<&[u8], &'static [u8]>(v) })
580
}));
581
let out = self.1.len() as IdxSize;
582
self.1.clear();
583
out
584
} else {
585
self.0.reserve(idxs.len());
586
if values.total_buffer_len() == 0 {
587
let views = values.views().as_slice();
588
self.0.extend(idxs.iter().map(|&i| {
589
let value = unsafe { views.get_unchecked(i as usize) };
590
debug_assert!(value.is_inline());
591
592
// SAFETY: buffer length == 0.
593
let value = unsafe { value.get_inlined_slice_unchecked() };
594
// SAFETY: Gets cleared at end of the scope.
595
unsafe { std::mem::transmute::<&[u8], &'static [u8]>(value) }
596
}));
597
} else {
598
self.0.extend(idxs.iter().map(|&i| {
599
// SAFETY: function invariant.
600
let value = unsafe { values.value_unchecked(i as usize) };
601
// SAFETY: Gets cleared at end of the scope.
602
unsafe { std::mem::transmute::<&[u8], &'static [u8]>(value) }
603
}));
604
}
605
let out = self.0.len() as IdxSize;
606
self.0.clear();
607
out
608
}
609
}
610
611
fn n_unique_slice(&mut self, values: &dyn Array, start: IdxSize, length: IdxSize) -> IdxSize {
612
if length <= 1 {
613
return length;
614
}
615
616
let values = values.as_any().downcast_ref::<BinaryViewArray>().unwrap();
617
assert!(start.saturating_add(length) as usize <= values.len());
618
619
if values.has_nulls() {
620
self.1.reserve(length as usize);
621
self.1.extend((start..start + length).map(|i| {
622
// SAFETY: asserted before.
623
let value = unsafe { values.get_unchecked(i as usize) };
624
// SAFETY: Gets cleared at end of the scope.
625
value.map(|v| unsafe { std::mem::transmute::<&[u8], &'static [u8]>(v) })
626
}));
627
let out = self.1.len() as IdxSize;
628
self.1.clear();
629
out
630
} else {
631
self.0.reserve(length as usize);
632
if values.total_buffer_len() == 0 {
633
let views = values.views().as_slice();
634
self.0.extend(
635
views[start as usize..][..length as usize]
636
.iter()
637
.map(|value| {
638
debug_assert!(value.is_inline());
639
640
// SAFETY: buffer length == 0.
641
let value = unsafe { value.get_inlined_slice_unchecked() };
642
// SAFETY: Gets cleared at end of the scope.
643
unsafe { std::mem::transmute::<&[u8], &'static [u8]>(value) }
644
}),
645
);
646
} else {
647
self.0.extend((start..start + length).map(|i| {
648
// SAFETY: asserted before.
649
let value = unsafe { values.value_unchecked(i as usize) };
650
// SAFETY: Gets cleared at end of the scope.
651
unsafe { std::mem::transmute::<&[u8], &'static [u8]>(value) }
652
}));
653
}
654
let out = self.0.len() as IdxSize;
655
self.0.clear();
656
out
657
}
658
}
659
}
660
661
impl AmortizedUnique for BinaryUnique {
662
fn new_empty(&self) -> Box<dyn AmortizedUnique> {
663
Box::new(BinaryUnique::default())
664
}
665
666
fn arg_unique(
667
&mut self,
668
values: &dyn Array,
669
idxs: &mut UnitVec<IdxSize>,
670
start: IdxSize,
671
length: IdxSize,
672
) {
673
if length <= 1 {
674
if length == 1 {
675
idxs.push(start);
676
}
677
return;
678
}
679
680
let values = values.as_any().downcast_ref::<LargeBinaryArray>().unwrap();
681
assert!(start.saturating_add(length) as usize <= values.len());
682
683
if values.has_nulls() {
684
self.1.reserve(length as usize);
685
idxs.extend((start..start + length).filter(|i| {
686
// SAFETY: asserted before.
687
let value = unsafe { values.get_unchecked(*i as usize) };
688
// SAFETY: Gets cleared at end of the scope.
689
let value =
690
value.map(|v| unsafe { std::mem::transmute::<&[u8], &'static [u8]>(v) });
691
self.1.insert(value)
692
}));
693
self.1.clear();
694
} else {
695
self.0.reserve(length as usize);
696
idxs.extend((start..start + length).filter(|i| {
697
// SAFETY: asserted before.
698
let value = unsafe { values.value_unchecked(*i as usize) };
699
let value = unsafe { std::mem::transmute::<&[u8], &'static [u8]>(value) };
700
self.0.insert(value)
701
}));
702
self.0.clear();
703
}
704
}
705
706
unsafe fn retain_unique(&mut self, values: &dyn Array, idxs: &mut UnitVec<IdxSize>) {
707
if idxs.len() <= 1 {
708
return;
709
}
710
711
let values = values.as_any().downcast_ref::<LargeBinaryArray>().unwrap();
712
713
if values.has_nulls() {
714
self.1.reserve(idxs.len());
715
idxs.retain(|i| {
716
// SAFETY: function invariant.
717
let value = unsafe { values.get_unchecked(i as usize) };
718
// SAFETY: Gets cleared at end of the scope.
719
let value =
720
value.map(|v| unsafe { std::mem::transmute::<&[u8], &'static [u8]>(v) });
721
self.1.insert(value)
722
});
723
self.1.clear();
724
} else {
725
self.0.reserve(idxs.len());
726
idxs.retain(|i| {
727
// SAFETY: function invariant.
728
let value = unsafe { values.value_unchecked(i as usize) };
729
let value = unsafe { std::mem::transmute::<&[u8], &'static [u8]>(value) };
730
self.0.insert(value)
731
});
732
self.0.clear();
733
}
734
}
735
736
unsafe fn n_unique_idx(&mut self, values: &dyn Array, idxs: &[IdxSize]) -> IdxSize {
737
if idxs.len() <= 1 {
738
return idxs.len() as IdxSize;
739
}
740
741
let values = values.as_any().downcast_ref::<LargeBinaryArray>().unwrap();
742
743
if values.has_nulls() {
744
self.1.reserve(idxs.len());
745
self.1.extend(idxs.iter().map(|&i| {
746
// SAFETY: function invariant.
747
let value = unsafe { values.get_unchecked(i as usize) };
748
// SAFETY: Gets cleared at end of the scope.
749
value.map(|v| unsafe { std::mem::transmute::<&[u8], &'static [u8]>(v) })
750
}));
751
let out = self.1.len() as IdxSize;
752
self.1.clear();
753
out
754
} else {
755
self.0.reserve(idxs.len());
756
self.0.extend(idxs.iter().map(|&i| {
757
// SAFETY: function invariant.
758
let value = unsafe { values.value_unchecked(i as usize) };
759
// SAFETY: Gets cleared at end of the scope.
760
unsafe { std::mem::transmute::<&[u8], &'static [u8]>(value) }
761
}));
762
let out = self.0.len() as IdxSize;
763
self.0.clear();
764
out
765
}
766
}
767
768
fn n_unique_slice(&mut self, values: &dyn Array, start: IdxSize, length: IdxSize) -> IdxSize {
769
if length <= 1 {
770
return length;
771
}
772
773
let values = values.as_any().downcast_ref::<LargeBinaryArray>().unwrap();
774
assert!(start.saturating_add(length) as usize <= values.len());
775
776
if values.has_nulls() {
777
self.1.reserve(length as usize);
778
self.1.extend((start..start + length).map(|i| {
779
// SAFETY: asserted before.
780
let value = unsafe { values.get_unchecked(i as usize) };
781
// SAFETY: Gets cleared at end of the scope.
782
value.map(|v| unsafe { std::mem::transmute::<&[u8], &'static [u8]>(v) })
783
}));
784
let out = self.1.len() as IdxSize;
785
self.1.clear();
786
out
787
} else {
788
self.0.reserve(length as usize);
789
self.0.extend((start..start + length).map(|i| {
790
// SAFETY: asserted before.
791
let value = unsafe { values.value_unchecked(i as usize) };
792
// SAFETY: Gets cleared at end of the scope.
793
unsafe { std::mem::transmute::<&[u8], &'static [u8]>(value) }
794
}));
795
let out = self.0.len() as IdxSize;
796
self.0.clear();
797
out
798
}
799
}
800
}
801
802