Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/crates/polars-arrow/src/offset.rs
6939 views
1
#![allow(unsafe_op_in_unsafe_fn)]
2
//! Contains the declaration of [`Offset`]
3
use std::hint::unreachable_unchecked;
4
use std::ops::Deref;
5
6
use polars_error::{PolarsError, PolarsResult, polars_bail, polars_err};
7
8
use crate::array::Splitable;
9
use crate::buffer::Buffer;
10
pub use crate::types::Offset;
11
12
/// A wrapper type of [`Vec<O>`] representing the invariants of Arrow's offsets.
13
/// It is guaranteed to (sound to assume that):
14
/// * every element is `>= 0`
15
/// * element at position `i` is >= than element at position `i-1`.
16
#[derive(Debug, Clone, PartialEq, Eq)]
17
pub struct Offsets<O: Offset>(Vec<O>);
18
19
impl<O: Offset> Default for Offsets<O> {
20
#[inline]
21
fn default() -> Self {
22
Self::new()
23
}
24
}
25
26
impl<O: Offset> Deref for Offsets<O> {
27
type Target = [O];
28
29
fn deref(&self) -> &Self::Target {
30
self.as_slice()
31
}
32
}
33
34
impl<O: Offset> TryFrom<Vec<O>> for Offsets<O> {
35
type Error = PolarsError;
36
37
#[inline]
38
fn try_from(offsets: Vec<O>) -> Result<Self, Self::Error> {
39
try_check_offsets(&offsets)?;
40
Ok(Self(offsets))
41
}
42
}
43
44
impl<O: Offset> TryFrom<Buffer<O>> for OffsetsBuffer<O> {
45
type Error = PolarsError;
46
47
#[inline]
48
fn try_from(offsets: Buffer<O>) -> Result<Self, Self::Error> {
49
try_check_offsets(&offsets)?;
50
Ok(Self(offsets))
51
}
52
}
53
54
impl<O: Offset> TryFrom<Vec<O>> for OffsetsBuffer<O> {
55
type Error = PolarsError;
56
57
#[inline]
58
fn try_from(offsets: Vec<O>) -> Result<Self, Self::Error> {
59
try_check_offsets(&offsets)?;
60
Ok(Self(offsets.into()))
61
}
62
}
63
64
impl<O: Offset> From<Offsets<O>> for OffsetsBuffer<O> {
65
#[inline]
66
fn from(offsets: Offsets<O>) -> Self {
67
Self(offsets.0.into())
68
}
69
}
70
71
impl<O: Offset> Offsets<O> {
72
/// Returns an empty [`Offsets`] (i.e. with a single element, the zero)
73
#[inline]
74
pub fn new() -> Self {
75
Self(vec![O::zero()])
76
}
77
78
/// Returns an [`Offsets`] whose all lengths are zero.
79
#[inline]
80
pub fn new_zeroed(length: usize) -> Self {
81
Self(vec![O::zero(); length + 1])
82
}
83
84
/// Creates a new [`Offsets`] from an iterator of lengths
85
#[inline]
86
pub fn try_from_iter<I: IntoIterator<Item = usize>>(iter: I) -> PolarsResult<Self> {
87
let iterator = iter.into_iter();
88
let (lower, _) = iterator.size_hint();
89
let mut offsets = Self::with_capacity(lower);
90
for item in iterator {
91
offsets.try_push(item)?
92
}
93
Ok(offsets)
94
}
95
96
/// Returns a new [`Offsets`] with a capacity, allocating at least `capacity + 1` entries.
97
pub fn with_capacity(capacity: usize) -> Self {
98
let mut offsets = Vec::with_capacity(capacity + 1);
99
offsets.push(O::zero());
100
Self(offsets)
101
}
102
103
/// Returns the capacity of [`Offsets`].
104
pub fn capacity(&self) -> usize {
105
self.0.capacity() - 1
106
}
107
108
/// Reserves `additional` entries.
109
pub fn reserve(&mut self, additional: usize) {
110
self.0.reserve(additional);
111
}
112
113
/// Shrinks the capacity of self to fit.
114
pub fn shrink_to_fit(&mut self) {
115
self.0.shrink_to_fit();
116
}
117
118
/// Pushes a new element with a given length.
119
/// # Error
120
/// This function errors iff the new last item is larger than what `O` supports.
121
/// # Implementation
122
/// This function:
123
/// * checks that this length does not overflow
124
#[inline]
125
pub fn try_push(&mut self, length: usize) -> PolarsResult<()> {
126
if O::IS_LARGE {
127
let length = O::from_as_usize(length);
128
let old_length = self.last();
129
let new_length = *old_length + length;
130
self.0.push(new_length);
131
Ok(())
132
} else {
133
let length =
134
O::from_usize(length).ok_or_else(|| polars_err!(ComputeError: "overflow"))?;
135
136
let old_length = self.last();
137
let new_length = old_length
138
.checked_add(&length)
139
.ok_or_else(|| polars_err!(ComputeError: "overflow"))?;
140
self.0.push(new_length);
141
Ok(())
142
}
143
}
144
145
/// Returns [`Offsets`] assuming that `offsets` fulfills its invariants
146
///
147
/// # Safety
148
/// This is safe iff the invariants of this struct are guaranteed in `offsets`.
149
#[inline]
150
pub unsafe fn new_unchecked(offsets: Vec<O>) -> Self {
151
#[cfg(debug_assertions)]
152
{
153
let mut prev_offset = O::default();
154
let mut is_monotonely_increasing = true;
155
for offset in &offsets {
156
is_monotonely_increasing &= *offset >= prev_offset;
157
prev_offset = *offset;
158
}
159
assert!(
160
is_monotonely_increasing,
161
"Unsafe precondition violated. Invariant of offsets broken."
162
);
163
}
164
165
Self(offsets)
166
}
167
168
/// Returns the last offset of this container.
169
#[inline]
170
pub fn last(&self) -> &O {
171
match self.0.last() {
172
Some(element) => element,
173
None => unsafe { unreachable_unchecked() },
174
}
175
}
176
177
/// Returns a `length` corresponding to the position `index`
178
/// # Panic
179
/// This function panics iff `index >= self.len_proxy()`
180
#[inline]
181
pub fn length_at(&self, index: usize) -> usize {
182
let (start, end) = self.start_end(index);
183
end - start
184
}
185
186
/// Returns a range (start, end) corresponding to the position `index`
187
/// # Panic
188
/// This function panics iff `index >= self.len_proxy()`
189
#[inline]
190
pub fn start_end(&self, index: usize) -> (usize, usize) {
191
// soundness: the invariant of the function
192
assert!(index < self.len_proxy());
193
unsafe { self.start_end_unchecked(index) }
194
}
195
196
/// Returns a range (start, end) corresponding to the position `index`
197
///
198
/// # Safety
199
/// `index` must be `< self.len()`
200
#[inline]
201
pub unsafe fn start_end_unchecked(&self, index: usize) -> (usize, usize) {
202
// soundness: the invariant of the function
203
let start = self.0.get_unchecked(index).to_usize();
204
let end = self.0.get_unchecked(index + 1).to_usize();
205
(start, end)
206
}
207
208
/// Returns the length an array with these offsets would be.
209
#[inline]
210
pub fn len_proxy(&self) -> usize {
211
self.0.len() - 1
212
}
213
214
/// Returns the byte slice stored in this buffer
215
#[inline]
216
pub fn as_slice(&self) -> &[O] {
217
self.0.as_slice()
218
}
219
220
/// Pops the last element
221
#[inline]
222
pub fn pop(&mut self) -> Option<O> {
223
if self.len_proxy() == 0 {
224
None
225
} else {
226
self.0.pop()
227
}
228
}
229
230
/// Extends itself with `additional` elements equal to the last offset.
231
/// This is useful to extend offsets with empty values, e.g. for null slots.
232
#[inline]
233
pub fn extend_constant(&mut self, additional: usize) {
234
let offset = *self.last();
235
if additional == 1 {
236
self.0.push(offset)
237
} else {
238
self.0.resize(self.0.len() + additional, offset)
239
}
240
}
241
242
/// Try to create a new [`Offsets`] from a sequence of `lengths`
243
/// # Errors
244
/// This function errors iff this operation overflows for the maximum value of `O`.
245
#[inline]
246
pub fn try_from_lengths<I: Iterator<Item = usize>>(lengths: I) -> PolarsResult<Self> {
247
let mut self_ = Self::with_capacity(lengths.size_hint().0);
248
self_.try_extend_from_lengths(lengths)?;
249
Ok(self_)
250
}
251
252
/// Try extend from an iterator of lengths
253
/// # Errors
254
/// This function errors iff this operation overflows for the maximum value of `O`.
255
#[inline]
256
pub fn try_extend_from_lengths<I: Iterator<Item = usize>>(
257
&mut self,
258
lengths: I,
259
) -> PolarsResult<()> {
260
let mut total_length = 0;
261
let mut offset = *self.last();
262
let original_offset = offset.to_usize();
263
264
let lengths = lengths.map(|length| {
265
total_length += length;
266
O::from_as_usize(length)
267
});
268
269
let offsets = lengths.map(|length| {
270
offset += length; // this may overflow, checked below
271
offset
272
});
273
self.0.extend(offsets);
274
275
let last_offset = original_offset
276
.checked_add(total_length)
277
.ok_or_else(|| polars_err!(ComputeError: "overflow"))?;
278
O::from_usize(last_offset).ok_or_else(|| polars_err!(ComputeError: "overflow"))?;
279
Ok(())
280
}
281
282
/// Extends itself from another [`Offsets`]
283
/// # Errors
284
/// This function errors iff this operation overflows for the maximum value of `O`.
285
pub fn try_extend_from_self(&mut self, other: &Self) -> PolarsResult<()> {
286
let mut length = *self.last();
287
let other_length = *other.last();
288
// check if the operation would overflow
289
length
290
.checked_add(&other_length)
291
.ok_or_else(|| polars_err!(ComputeError: "overflow"))?;
292
293
let lengths = other.as_slice().windows(2).map(|w| w[1] - w[0]);
294
let offsets = lengths.map(|new_length| {
295
length += new_length;
296
length
297
});
298
self.0.extend(offsets);
299
Ok(())
300
}
301
302
/// Extends itself from another [`Offsets`] sliced by `start, length`
303
/// # Errors
304
/// This function errors iff this operation overflows for the maximum value of `O`.
305
pub fn try_extend_from_slice(
306
&mut self,
307
other: &OffsetsBuffer<O>,
308
start: usize,
309
length: usize,
310
) -> PolarsResult<()> {
311
if length == 0 {
312
return Ok(());
313
}
314
let other = &other.0[start..start + length + 1];
315
let other_length = other.last().expect("Length to be non-zero");
316
let mut length = *self.last();
317
// check if the operation would overflow
318
length
319
.checked_add(other_length)
320
.ok_or_else(|| polars_err!(ComputeError: "overflow"))?;
321
322
let lengths = other.windows(2).map(|w| w[1] - w[0]);
323
let offsets = lengths.map(|new_length| {
324
length += new_length;
325
length
326
});
327
self.0.extend(offsets);
328
Ok(())
329
}
330
331
/// Returns the inner [`Vec`].
332
#[inline]
333
pub fn into_inner(self) -> Vec<O> {
334
self.0
335
}
336
}
337
338
/// Checks that `offsets` is monotonically increasing.
339
fn try_check_offsets<O: Offset>(offsets: &[O]) -> PolarsResult<()> {
340
// this code is carefully constructed to auto-vectorize, don't change naively!
341
match offsets.first() {
342
None => polars_bail!(ComputeError: "offsets must have at least one element"),
343
Some(first) => {
344
if *first < O::zero() {
345
polars_bail!(ComputeError: "offsets must be larger than 0")
346
}
347
let mut previous = *first;
348
let mut any_invalid = false;
349
350
// This loop will auto-vectorize because there is not any break,
351
// an invalid value will be returned once the whole offsets buffer is processed.
352
for offset in offsets {
353
if previous > *offset {
354
any_invalid = true
355
}
356
previous = *offset;
357
}
358
359
if any_invalid {
360
polars_bail!(ComputeError: "offsets must be monotonically increasing")
361
} else {
362
Ok(())
363
}
364
},
365
}
366
}
367
368
/// A wrapper type of [`Buffer<O>`] that is guaranteed to:
369
/// * Always contain an element
370
/// * Every element is `>= 0`
371
/// * element at position `i` is >= than element at position `i-1`.
372
#[derive(Clone, PartialEq, Debug)]
373
pub struct OffsetsBuffer<O: Offset>(Buffer<O>);
374
375
impl<O: Offset> Default for OffsetsBuffer<O> {
376
#[inline]
377
fn default() -> Self {
378
Self(vec![O::zero()].into())
379
}
380
}
381
382
impl<O: Offset> OffsetsBuffer<O> {
383
/// # Safety
384
/// This is safe iff the invariants of this struct are guaranteed in `offsets`.
385
#[inline]
386
pub unsafe fn new_unchecked(offsets: Buffer<O>) -> Self {
387
Self(offsets)
388
}
389
390
/// Returns an empty [`OffsetsBuffer`] (i.e. with a single element, the zero)
391
#[inline]
392
pub fn new() -> Self {
393
Self(vec![O::zero()].into())
394
}
395
396
#[inline]
397
pub fn one_with_length(length: O) -> Self {
398
Self(vec![O::zero(), length].into())
399
}
400
401
/// Copy-on-write API to convert [`OffsetsBuffer`] into [`Offsets`].
402
#[inline]
403
pub fn into_mut(self) -> either::Either<Self, Offsets<O>> {
404
self.0
405
.into_mut()
406
// SAFETY: Offsets and OffsetsBuffer share invariants
407
.map_right(|offsets| unsafe { Offsets::new_unchecked(offsets) })
408
.map_left(Self)
409
}
410
411
/// Returns a reference to its internal [`Buffer`].
412
#[inline]
413
pub fn buffer(&self) -> &Buffer<O> {
414
&self.0
415
}
416
417
/// Returns what the length an array with these offsets would be.
418
#[inline]
419
pub fn len_proxy(&self) -> usize {
420
self.0.len() - 1
421
}
422
423
/// Returns the number of offsets in this container.
424
#[inline]
425
pub fn len(&self) -> usize {
426
self.0.len()
427
}
428
429
/// Returns the byte slice stored in this buffer
430
#[inline]
431
pub fn as_slice(&self) -> &[O] {
432
self.0.as_slice()
433
}
434
435
/// Returns the range of the offsets.
436
#[inline]
437
pub fn range(&self) -> O {
438
*self.last() - *self.first()
439
}
440
441
/// Returns the first offset.
442
#[inline]
443
pub fn first(&self) -> &O {
444
match self.0.first() {
445
Some(element) => element,
446
None => unsafe { unreachable_unchecked() },
447
}
448
}
449
450
/// Returns the last offset.
451
#[inline]
452
pub fn last(&self) -> &O {
453
match self.0.last() {
454
Some(element) => element,
455
None => unsafe { unreachable_unchecked() },
456
}
457
}
458
459
/// Returns a `length` corresponding to the position `index`
460
/// # Panic
461
/// This function panics iff `index >= self.len_proxy()`
462
#[inline]
463
pub fn length_at(&self, index: usize) -> usize {
464
let (start, end) = self.start_end(index);
465
end - start
466
}
467
468
/// Returns a range (start, end) corresponding to the position `index`
469
/// # Panic
470
/// This function panics iff `index >= self.len_proxy()`
471
#[inline]
472
pub fn start_end(&self, index: usize) -> (usize, usize) {
473
// soundness: the invariant of the function
474
assert!(index < self.len_proxy());
475
unsafe { self.start_end_unchecked(index) }
476
}
477
478
/// Returns a range (start, end) corresponding to the position `index`
479
///
480
/// # Safety
481
/// `index` must be `< self.len()`
482
#[inline]
483
pub unsafe fn start_end_unchecked(&self, index: usize) -> (usize, usize) {
484
// soundness: the invariant of the function
485
let start = self.0.get_unchecked(index).to_usize();
486
let end = self.0.get_unchecked(index + 1).to_usize();
487
(start, end)
488
}
489
490
/// Slices this [`OffsetsBuffer`].
491
/// # Panics
492
/// Panics if `offset + length` is larger than `len`
493
/// or `length == 0`.
494
#[inline]
495
pub fn slice(&mut self, offset: usize, length: usize) {
496
assert!(length > 0);
497
self.0.slice(offset, length);
498
}
499
500
/// Slices this [`OffsetsBuffer`] starting at `offset`.
501
///
502
/// # Safety
503
/// The caller must ensure `offset + length <= self.len()`
504
#[inline]
505
pub unsafe fn slice_unchecked(&mut self, offset: usize, length: usize) {
506
self.0.slice_unchecked(offset, length);
507
}
508
509
/// Returns an iterator with the lengths of the offsets
510
#[inline]
511
pub fn lengths(&self) -> impl ExactSizeIterator<Item = usize> + '_ {
512
self.0.windows(2).map(|w| (w[1] - w[0]).to_usize())
513
}
514
515
/// Returns `(offset, len)` pairs.
516
#[inline]
517
pub fn offset_and_length_iter(&self) -> impl ExactSizeIterator<Item = (usize, usize)> + '_ {
518
self.windows(2).map(|x| {
519
let [l, r] = x else { unreachable!() };
520
let l = l.to_usize();
521
let r = r.to_usize();
522
(l, r - l)
523
})
524
}
525
526
/// Offset and length of the primitive (leaf) array for a double+ nested list for every outer
527
/// row.
528
pub fn leaf_ranges_iter(
529
offsets: &[Self],
530
) -> impl Iterator<Item = core::ops::Range<usize>> + '_ {
531
let others = &offsets[1..];
532
533
offsets[0].windows(2).map(move |x| {
534
let [l, r] = x else { unreachable!() };
535
let mut l = l.to_usize();
536
let mut r = r.to_usize();
537
538
for o in others {
539
let slc = o.as_slice();
540
l = slc[l].to_usize();
541
r = slc[r].to_usize();
542
}
543
544
l..r
545
})
546
}
547
548
/// Return the full range of the leaf array used by the list.
549
pub fn leaf_full_start_end(offsets: &[Self]) -> core::ops::Range<usize> {
550
let mut l = offsets[0].first().to_usize();
551
let mut r = offsets[0].last().to_usize();
552
553
for o in &offsets[1..] {
554
let slc = o.as_slice();
555
l = slc[l].to_usize();
556
r = slc[r].to_usize();
557
}
558
559
l..r
560
}
561
562
/// Returns the inner [`Buffer`].
563
#[inline]
564
pub fn into_inner(self) -> Buffer<O> {
565
self.0
566
}
567
568
/// Returns the offset difference between `start` and `end`.
569
#[inline]
570
pub fn delta(&self, start: usize, end: usize) -> usize {
571
assert!(start <= end);
572
573
(self.0[end + 1] - self.0[start]).to_usize()
574
}
575
}
576
577
impl From<&OffsetsBuffer<i32>> for OffsetsBuffer<i64> {
578
fn from(offsets: &OffsetsBuffer<i32>) -> Self {
579
// this conversion is lossless and uphelds all invariants
580
Self(
581
offsets
582
.buffer()
583
.iter()
584
.map(|x| *x as i64)
585
.collect::<Vec<_>>()
586
.into(),
587
)
588
}
589
}
590
591
impl TryFrom<&OffsetsBuffer<i64>> for OffsetsBuffer<i32> {
592
type Error = PolarsError;
593
594
fn try_from(offsets: &OffsetsBuffer<i64>) -> Result<Self, Self::Error> {
595
i32::try_from(*offsets.last()).map_err(|_| polars_err!(ComputeError: "overflow"))?;
596
597
// this conversion is lossless and uphelds all invariants
598
Ok(Self(
599
offsets
600
.buffer()
601
.iter()
602
.map(|x| *x as i32)
603
.collect::<Vec<_>>()
604
.into(),
605
))
606
}
607
}
608
609
impl From<Offsets<i32>> for Offsets<i64> {
610
fn from(offsets: Offsets<i32>) -> Self {
611
// this conversion is lossless and uphelds all invariants
612
Self(
613
offsets
614
.as_slice()
615
.iter()
616
.map(|x| *x as i64)
617
.collect::<Vec<_>>(),
618
)
619
}
620
}
621
622
impl TryFrom<Offsets<i64>> for Offsets<i32> {
623
type Error = PolarsError;
624
625
fn try_from(offsets: Offsets<i64>) -> Result<Self, Self::Error> {
626
i32::try_from(*offsets.last()).map_err(|_| polars_err!(ComputeError: "overflow"))?;
627
628
// this conversion is lossless and uphelds all invariants
629
Ok(Self(
630
offsets
631
.as_slice()
632
.iter()
633
.map(|x| *x as i32)
634
.collect::<Vec<_>>(),
635
))
636
}
637
}
638
639
impl<O: Offset> std::ops::Deref for OffsetsBuffer<O> {
640
type Target = [O];
641
642
#[inline]
643
fn deref(&self) -> &[O] {
644
self.0.as_slice()
645
}
646
}
647
648
impl<O: Offset> Splitable for OffsetsBuffer<O> {
649
fn check_bound(&self, offset: usize) -> bool {
650
offset <= self.len_proxy()
651
}
652
653
unsafe fn _split_at_unchecked(&self, offset: usize) -> (Self, Self) {
654
let mut lhs = self.0.clone();
655
let mut rhs = self.0.clone();
656
657
lhs.slice(0, offset + 1);
658
rhs.slice(offset, self.0.len() - offset);
659
660
(Self(lhs), Self(rhs))
661
}
662
}
663
664