Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/crates/polars-arrow/src/offset.rs
8410 views
1
#![allow(unsafe_op_in_unsafe_fn)]
2
//! Contains the declaration of [`Offset`]
3
use std::hint::unreachable_unchecked;
4
use std::ops::Deref;
5
6
use polars_buffer::Buffer;
7
use polars_error::{PolarsError, PolarsResult, polars_bail, polars_err};
8
9
use crate::array::Splitable;
10
pub use crate::types::Offset;
11
12
/// A wrapper type of [`Vec<O>`] representing the invariants of Arrow's offsets.
13
/// It is guaranteed to (sound to assume that):
14
/// * every element is `>= 0`
15
/// * element at position `i` is >= than element at position `i-1`.
16
#[derive(Debug, Clone, PartialEq, Eq)]
17
pub struct Offsets<O: Offset>(Vec<O>);
18
19
impl<O: Offset> Default for Offsets<O> {
20
#[inline]
21
fn default() -> Self {
22
Self::new()
23
}
24
}
25
26
impl<O: Offset> Deref for Offsets<O> {
27
type Target = [O];
28
29
fn deref(&self) -> &Self::Target {
30
self.as_slice()
31
}
32
}
33
34
impl<O: Offset> TryFrom<Vec<O>> for Offsets<O> {
35
type Error = PolarsError;
36
37
#[inline]
38
fn try_from(offsets: Vec<O>) -> Result<Self, Self::Error> {
39
try_check_offsets(&offsets)?;
40
Ok(Self(offsets))
41
}
42
}
43
44
impl<O: Offset> TryFrom<Buffer<O>> for OffsetsBuffer<O> {
45
type Error = PolarsError;
46
47
#[inline]
48
fn try_from(offsets: Buffer<O>) -> Result<Self, Self::Error> {
49
try_check_offsets(&offsets)?;
50
Ok(Self(offsets))
51
}
52
}
53
54
impl<O: Offset> TryFrom<Vec<O>> for OffsetsBuffer<O> {
55
type Error = PolarsError;
56
57
#[inline]
58
fn try_from(offsets: Vec<O>) -> Result<Self, Self::Error> {
59
try_check_offsets(&offsets)?;
60
Ok(Self(offsets.into()))
61
}
62
}
63
64
impl<O: Offset> From<Offsets<O>> for OffsetsBuffer<O> {
65
#[inline]
66
fn from(offsets: Offsets<O>) -> Self {
67
Self(offsets.0.into())
68
}
69
}
70
71
impl<O: Offset> Offsets<O> {
72
/// Returns an empty [`Offsets`] (i.e. with a single element, the zero)
73
#[inline]
74
pub fn new() -> Self {
75
Self(vec![O::zero()])
76
}
77
78
/// Returns an [`Offsets`] whose all lengths are zero.
79
#[inline]
80
pub fn new_zeroed(length: usize) -> Self {
81
Self(vec![O::zero(); length + 1])
82
}
83
84
/// Creates a new [`Offsets`] from an iterator of lengths
85
#[inline]
86
pub fn try_from_iter<I: IntoIterator<Item = usize>>(iter: I) -> PolarsResult<Self> {
87
let iterator = iter.into_iter();
88
let (lower, _) = iterator.size_hint();
89
let mut offsets = Self::with_capacity(lower);
90
for item in iterator {
91
offsets.try_push(item)?
92
}
93
Ok(offsets)
94
}
95
96
/// Returns a new [`Offsets`] with a capacity, allocating at least `capacity + 1` entries.
97
pub fn with_capacity(capacity: usize) -> Self {
98
let mut offsets = Vec::with_capacity(capacity + 1);
99
offsets.push(O::zero());
100
Self(offsets)
101
}
102
103
/// Returns the capacity of [`Offsets`].
104
pub fn capacity(&self) -> usize {
105
self.0.capacity() - 1
106
}
107
108
/// Reserves `additional` entries.
109
pub fn reserve(&mut self, additional: usize) {
110
self.0.reserve(additional);
111
}
112
113
/// Shrinks the capacity of self to fit.
114
pub fn shrink_to_fit(&mut self) {
115
self.0.shrink_to_fit();
116
}
117
118
/// Pushes a new element with a given length.
119
/// # Error
120
/// This function errors iff the new last item is larger than what `O` supports.
121
/// # Implementation
122
/// This function:
123
/// * checks that this length does not overflow
124
#[inline]
125
pub fn try_push(&mut self, length: usize) -> PolarsResult<()> {
126
if O::IS_LARGE {
127
let length = O::from_as_usize(length);
128
let old_length = self.last();
129
let new_length = *old_length + length;
130
self.0.push(new_length);
131
Ok(())
132
} else {
133
let length =
134
O::from_usize(length).ok_or_else(|| polars_err!(ComputeError: "overflow"))?;
135
136
let old_length = self.last();
137
let new_length = old_length
138
.checked_add(&length)
139
.ok_or_else(|| polars_err!(ComputeError: "overflow"))?;
140
self.0.push(new_length);
141
Ok(())
142
}
143
}
144
145
/// Returns [`Offsets`] assuming that `offsets` fulfills its invariants
146
///
147
/// # Safety
148
/// This is safe iff the invariants of this struct are guaranteed in `offsets`.
149
#[inline]
150
pub unsafe fn new_unchecked(offsets: Vec<O>) -> Self {
151
#[cfg(debug_assertions)]
152
{
153
let mut prev_offset = O::default();
154
let mut is_monotonely_increasing = true;
155
for offset in &offsets {
156
is_monotonely_increasing &= *offset >= prev_offset;
157
prev_offset = *offset;
158
}
159
assert!(
160
is_monotonely_increasing,
161
"Unsafe precondition violated. Invariant of offsets broken."
162
);
163
}
164
165
Self(offsets)
166
}
167
168
/// Returns the last offset of this container.
169
#[inline]
170
pub fn last(&self) -> &O {
171
match self.0.last() {
172
Some(element) => element,
173
None => unsafe { unreachable_unchecked() },
174
}
175
}
176
177
/// Returns a `length` corresponding to the position `index`
178
/// # Panic
179
/// This function panics iff `index >= self.len_proxy()`
180
#[inline]
181
pub fn length_at(&self, index: usize) -> usize {
182
let (start, end) = self.start_end(index);
183
end - start
184
}
185
186
/// Returns a range (start, end) corresponding to the position `index`
187
/// # Panic
188
/// This function panics iff `index >= self.len_proxy()`
189
#[inline]
190
pub fn start_end(&self, index: usize) -> (usize, usize) {
191
// soundness: the invariant of the function
192
assert!(index < self.len_proxy());
193
unsafe { self.start_end_unchecked(index) }
194
}
195
196
/// Returns a range (start, end) corresponding to the position `index`
197
///
198
/// # Safety
199
/// `index` must be `< self.len()`
200
#[inline]
201
pub unsafe fn start_end_unchecked(&self, index: usize) -> (usize, usize) {
202
// soundness: the invariant of the function
203
let start = self.0.get_unchecked(index).to_usize();
204
let end = self.0.get_unchecked(index + 1).to_usize();
205
(start, end)
206
}
207
208
/// Returns the length an array with these offsets would be.
209
#[inline]
210
pub fn len_proxy(&self) -> usize {
211
self.0.len() - 1
212
}
213
214
/// Returns the byte slice stored in this buffer
215
#[inline]
216
pub fn as_slice(&self) -> &[O] {
217
self.0.as_slice()
218
}
219
220
/// Pops the last element
221
#[inline]
222
pub fn pop(&mut self) -> Option<O> {
223
if self.len_proxy() == 0 {
224
None
225
} else {
226
self.0.pop()
227
}
228
}
229
230
/// Extends itself with `additional` elements equal to the last offset.
231
/// This is useful to extend offsets with empty values, e.g. for null slots.
232
#[inline]
233
pub fn extend_constant(&mut self, additional: usize) {
234
let offset = *self.last();
235
if additional == 1 {
236
self.0.push(offset)
237
} else {
238
self.0.resize(self.0.len() + additional, offset)
239
}
240
}
241
242
/// Try to create a new [`Offsets`] from a sequence of `lengths`
243
/// # Errors
244
/// This function errors iff this operation overflows for the maximum value of `O`.
245
#[inline]
246
pub fn try_from_lengths<I: Iterator<Item = usize>>(lengths: I) -> PolarsResult<Self> {
247
let mut self_ = Self::with_capacity(lengths.size_hint().0);
248
self_.try_extend_from_lengths(lengths)?;
249
Ok(self_)
250
}
251
252
/// Try extend from an iterator of lengths
253
/// # Errors
254
/// This function errors iff this operation overflows for the maximum value of `O`.
255
#[inline]
256
pub fn try_extend_from_lengths<I: Iterator<Item = usize>>(
257
&mut self,
258
lengths: I,
259
) -> PolarsResult<()> {
260
let mut total_length = 0;
261
let mut offset = *self.last();
262
let original_offset = offset.to_usize();
263
264
let lengths = lengths.map(|length| {
265
total_length += length;
266
O::from_as_usize(length)
267
});
268
269
let offsets = lengths.map(|length| {
270
offset += length; // this may overflow, checked below
271
offset
272
});
273
self.0.extend(offsets);
274
275
let last_offset = original_offset
276
.checked_add(total_length)
277
.ok_or_else(|| polars_err!(ComputeError: "overflow"))?;
278
O::from_usize(last_offset).ok_or_else(|| polars_err!(ComputeError: "overflow"))?;
279
Ok(())
280
}
281
282
/// Extends itself from another [`Offsets`]
283
/// # Errors
284
/// This function errors iff this operation overflows for the maximum value of `O`.
285
pub fn try_extend_from_self(&mut self, other: &Self) -> PolarsResult<()> {
286
let mut length = *self.last();
287
let other_length = *other.last();
288
// check if the operation would overflow
289
length
290
.checked_add(&other_length)
291
.ok_or_else(|| polars_err!(ComputeError: "overflow"))?;
292
293
let lengths = other.as_slice().windows(2).map(|w| w[1] - w[0]);
294
let offsets = lengths.map(|new_length| {
295
length += new_length;
296
length
297
});
298
self.0.extend(offsets);
299
Ok(())
300
}
301
302
/// Extends itself from another [`Offsets`] sliced by `start, length`
303
/// # Errors
304
/// This function errors iff this operation overflows for the maximum value of `O`.
305
pub fn try_extend_from_slice(
306
&mut self,
307
other: &OffsetsBuffer<O>,
308
start: usize,
309
length: usize,
310
) -> PolarsResult<()> {
311
if length == 0 {
312
return Ok(());
313
}
314
let other = &other.0[start..start + length + 1];
315
let other_length = other.last().expect("Length to be non-zero");
316
let mut length = *self.last();
317
// check if the operation would overflow
318
length
319
.checked_add(other_length)
320
.ok_or_else(|| polars_err!(ComputeError: "overflow"))?;
321
322
let lengths = other.windows(2).map(|w| w[1] - w[0]);
323
let offsets = lengths.map(|new_length| {
324
length += new_length;
325
length
326
});
327
self.0.extend(offsets);
328
Ok(())
329
}
330
331
/// Returns the inner [`Vec`].
332
#[inline]
333
pub fn into_inner(self) -> Vec<O> {
334
self.0
335
}
336
}
337
338
/// Checks that `offsets` is monotonically increasing.
339
fn try_check_offsets<O: Offset>(offsets: &[O]) -> PolarsResult<()> {
340
// this code is carefully constructed to auto-vectorize, don't change naively!
341
match offsets.first() {
342
None => polars_bail!(ComputeError: "offsets must have at least one element"),
343
Some(first) => {
344
if *first < O::zero() {
345
polars_bail!(ComputeError: "offsets must be larger than 0")
346
}
347
let mut previous = *first;
348
let mut any_invalid = false;
349
350
// This loop will auto-vectorize because there is not any break,
351
// an invalid value will be returned once the whole offsets buffer is processed.
352
for offset in offsets {
353
if previous > *offset {
354
any_invalid = true
355
}
356
previous = *offset;
357
}
358
359
if any_invalid {
360
polars_bail!(ComputeError: "offsets must be monotonically increasing")
361
} else {
362
Ok(())
363
}
364
},
365
}
366
}
367
368
/// A wrapper type of [`Buffer<O>`] that is guaranteed to:
369
/// * Always contain an element
370
/// * Every element is `>= 0`
371
/// * element at position `i` is >= than element at position `i-1`.
372
#[derive(Clone, PartialEq, Debug)]
373
pub struct OffsetsBuffer<O: Offset>(Buffer<O>);
374
375
impl<O: Offset> Default for OffsetsBuffer<O> {
376
#[inline]
377
fn default() -> Self {
378
Self(vec![O::zero()].into())
379
}
380
}
381
382
impl<O: Offset> OffsetsBuffer<O> {
383
/// # Safety
384
/// This is safe iff the invariants of this struct are guaranteed in `offsets`.
385
#[inline]
386
pub unsafe fn new_unchecked(offsets: Buffer<O>) -> Self {
387
Self(offsets)
388
}
389
390
/// Returns an empty [`OffsetsBuffer`] (i.e. with a single element, the zero)
391
#[inline]
392
pub fn new() -> Self {
393
Self(vec![O::zero()].into())
394
}
395
396
#[inline]
397
pub fn one_with_length(length: O) -> Self {
398
Self(vec![O::zero(), length].into())
399
}
400
401
/// Copy-on-write API to convert [`OffsetsBuffer`] into [`Offsets`].
402
#[inline]
403
pub fn into_mut(self) -> either::Either<Self, Offsets<O>> {
404
self.0
405
.into_mut()
406
// SAFETY: Offsets and OffsetsBuffer share invariants
407
.map_right(|offsets| unsafe { Offsets::new_unchecked(offsets) })
408
.map_left(Self)
409
}
410
411
/// Returns a reference to its internal [`Buffer`].
412
#[inline]
413
pub fn buffer(&self) -> &Buffer<O> {
414
&self.0
415
}
416
417
/// Returns what the length an array with these offsets would be.
418
#[inline]
419
pub fn len_proxy(&self) -> usize {
420
self.0.len() - 1
421
}
422
423
/// Returns the number of offsets in this container.
424
#[inline]
425
pub fn len(&self) -> usize {
426
self.0.len()
427
}
428
429
/// Returns the byte slice stored in this buffer
430
#[inline]
431
pub fn as_slice(&self) -> &[O] {
432
self.0.as_slice()
433
}
434
435
/// Returns the range of the offsets.
436
#[inline]
437
pub fn range(&self) -> O {
438
*self.last() - *self.first()
439
}
440
441
/// Returns the first offset.
442
#[inline]
443
pub fn first(&self) -> &O {
444
match self.0.first() {
445
Some(element) => element,
446
None => unsafe { unreachable_unchecked() },
447
}
448
}
449
450
/// Returns the last offset.
451
#[inline]
452
pub fn last(&self) -> &O {
453
match self.0.last() {
454
Some(element) => element,
455
None => unsafe { unreachable_unchecked() },
456
}
457
}
458
459
/// Returns a `length` corresponding to the position `index`
460
/// # Panic
461
/// This function panics iff `index >= self.len_proxy()`
462
#[inline]
463
pub fn length_at(&self, index: usize) -> usize {
464
let (start, end) = self.start_end(index);
465
end - start
466
}
467
468
/// Returns a `length` corresponding to the position `index`
469
///
470
/// # Safety
471
/// `index` must be `< self.len()`
472
#[inline]
473
pub unsafe fn length_at_unchecked(&self, index: usize) -> usize {
474
let (start, end) = unsafe { self.start_end_unchecked(index) };
475
end - start
476
}
477
478
/// Returns a range (start, end) corresponding to the position `index`
479
/// # Panic
480
/// This function panics iff `index >= self.len_proxy()`
481
#[inline]
482
pub fn start_end(&self, index: usize) -> (usize, usize) {
483
// soundness: the invariant of the function
484
assert!(index < self.len_proxy());
485
unsafe { self.start_end_unchecked(index) }
486
}
487
488
/// Returns a range (start, end) corresponding to the position `index`
489
///
490
/// # Safety
491
/// `index` must be `< self.len()`
492
#[inline]
493
pub unsafe fn start_end_unchecked(&self, index: usize) -> (usize, usize) {
494
// soundness: the invariant of the function
495
let start = self.0.get_unchecked(index).to_usize();
496
let end = self.0.get_unchecked(index + 1).to_usize();
497
(start, end)
498
}
499
500
/// Slices this [`OffsetsBuffer`].
501
/// # Panics
502
/// Panics if `offset + length` is larger than `len`
503
/// or `length == 0`.
504
#[inline]
505
pub fn slice(&mut self, offset: usize, length: usize) {
506
assert!(length > 0);
507
self.0.slice_in_place(offset..offset + length);
508
}
509
510
/// Slices this [`OffsetsBuffer`] starting at `offset`.
511
///
512
/// # Safety
513
/// The caller must ensure `offset + length <= self.len()`
514
#[inline]
515
pub unsafe fn slice_unchecked(&mut self, offset: usize, length: usize) {
516
self.0.slice_in_place_unchecked(offset..offset + length);
517
}
518
519
/// Returns an iterator with the lengths of the offsets
520
#[inline]
521
pub fn lengths(&self) -> impl ExactSizeIterator<Item = usize> + '_ {
522
self.0.windows(2).map(|w| (w[1] - w[0]).to_usize())
523
}
524
525
/// Returns `(offset, len)` pairs.
526
#[inline]
527
pub fn offset_and_length_iter(&self) -> impl ExactSizeIterator<Item = (usize, usize)> + '_ {
528
self.windows(2).map(|x| {
529
let [l, r] = x else { unreachable!() };
530
let l = l.to_usize();
531
let r = r.to_usize();
532
(l, r - l)
533
})
534
}
535
536
/// Offset and length of the primitive (leaf) array for a double+ nested list for every outer
537
/// row.
538
pub fn leaf_ranges_iter(
539
offsets: &[Self],
540
) -> impl Iterator<Item = core::ops::Range<usize>> + '_ {
541
let others = &offsets[1..];
542
543
offsets[0].windows(2).map(move |x| {
544
let [l, r] = x else { unreachable!() };
545
let mut l = l.to_usize();
546
let mut r = r.to_usize();
547
548
for o in others {
549
let slc = o.as_slice();
550
l = slc[l].to_usize();
551
r = slc[r].to_usize();
552
}
553
554
l..r
555
})
556
}
557
558
/// Return the full range of the leaf array used by the list.
559
pub fn leaf_full_start_end(offsets: &[Self]) -> core::ops::Range<usize> {
560
let mut l = offsets[0].first().to_usize();
561
let mut r = offsets[0].last().to_usize();
562
563
for o in &offsets[1..] {
564
let slc = o.as_slice();
565
l = slc[l].to_usize();
566
r = slc[r].to_usize();
567
}
568
569
l..r
570
}
571
572
/// Returns the inner [`Buffer`].
573
#[inline]
574
pub fn into_inner(self) -> Buffer<O> {
575
self.0
576
}
577
578
/// Returns the offset difference between `start` and `end`.
579
#[inline]
580
pub fn delta(&self, start: usize, end: usize) -> usize {
581
assert!(start <= end);
582
583
(self.0[end + 1] - self.0[start]).to_usize()
584
}
585
}
586
587
impl From<&OffsetsBuffer<i32>> for OffsetsBuffer<i64> {
588
fn from(offsets: &OffsetsBuffer<i32>) -> Self {
589
// this conversion is lossless and uphelds all invariants
590
Self(
591
offsets
592
.buffer()
593
.iter()
594
.map(|x| *x as i64)
595
.collect::<Vec<_>>()
596
.into(),
597
)
598
}
599
}
600
601
impl TryFrom<&OffsetsBuffer<i64>> for OffsetsBuffer<i32> {
602
type Error = PolarsError;
603
604
fn try_from(offsets: &OffsetsBuffer<i64>) -> Result<Self, Self::Error> {
605
i32::try_from(*offsets.last()).map_err(|_| polars_err!(ComputeError: "overflow"))?;
606
607
// this conversion is lossless and uphelds all invariants
608
Ok(Self(
609
offsets
610
.buffer()
611
.iter()
612
.map(|x| *x as i32)
613
.collect::<Vec<_>>()
614
.into(),
615
))
616
}
617
}
618
619
impl From<Offsets<i32>> for Offsets<i64> {
620
fn from(offsets: Offsets<i32>) -> Self {
621
// this conversion is lossless and uphelds all invariants
622
Self(
623
offsets
624
.as_slice()
625
.iter()
626
.map(|x| *x as i64)
627
.collect::<Vec<_>>(),
628
)
629
}
630
}
631
632
impl TryFrom<Offsets<i64>> for Offsets<i32> {
633
type Error = PolarsError;
634
635
fn try_from(offsets: Offsets<i64>) -> Result<Self, Self::Error> {
636
i32::try_from(*offsets.last()).map_err(|_| polars_err!(ComputeError: "overflow"))?;
637
638
// this conversion is lossless and uphelds all invariants
639
Ok(Self(
640
offsets
641
.as_slice()
642
.iter()
643
.map(|x| *x as i32)
644
.collect::<Vec<_>>(),
645
))
646
}
647
}
648
649
impl<O: Offset> std::ops::Deref for OffsetsBuffer<O> {
650
type Target = [O];
651
652
#[inline]
653
fn deref(&self) -> &[O] {
654
self.0.as_slice()
655
}
656
}
657
658
impl<O: Offset> Splitable for OffsetsBuffer<O> {
659
fn check_bound(&self, offset: usize) -> bool {
660
offset <= self.len_proxy()
661
}
662
663
unsafe fn _split_at_unchecked(&self, offset: usize) -> (Self, Self) {
664
let mut lhs = self.0.clone();
665
let mut rhs = self.0.clone();
666
667
lhs.slice_in_place(..=offset);
668
rhs.slice_in_place(offset..);
669
670
(Self(lhs), Self(rhs))
671
}
672
}
673
674