Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/crates/polars-arrow/src/array/binary/mod.rs
6939 views
1
use either::Either;
2
3
use super::specification::try_check_offsets_bounds;
4
use super::{Array, GenericBinaryArray, Splitable};
5
use crate::array::iterator::NonNullValuesIter;
6
use crate::bitmap::Bitmap;
7
use crate::bitmap::utils::{BitmapIter, ZipValidity};
8
use crate::buffer::Buffer;
9
use crate::datatypes::ArrowDataType;
10
use crate::offset::{Offset, Offsets, OffsetsBuffer};
11
use crate::trusted_len::TrustedLen;
12
13
mod builder;
14
pub use builder::*;
15
mod ffi;
16
pub(super) mod fmt;
17
mod iterator;
18
pub use iterator::*;
19
mod from;
20
mod mutable_values;
21
pub use mutable_values::*;
22
mod mutable;
23
pub use mutable::*;
24
use polars_error::{PolarsResult, polars_bail};
25
#[cfg(feature = "proptest")]
26
pub mod proptest;
27
28
/// A [`BinaryArray`] is Arrow's semantically equivalent of an immutable `Vec<Option<Vec<u8>>>`.
29
/// It implements [`Array`].
30
///
31
/// The size of this struct is `O(1)`, as all data is stored behind an [`std::sync::Arc`].
32
/// # Example
33
/// ```
34
/// use polars_arrow::array::BinaryArray;
35
/// use polars_arrow::bitmap::Bitmap;
36
/// use polars_arrow::buffer::Buffer;
37
///
38
/// let array = BinaryArray::<i32>::from([Some([1, 2].as_ref()), None, Some([3].as_ref())]);
39
/// assert_eq!(array.value(0), &[1, 2]);
40
/// assert_eq!(array.iter().collect::<Vec<_>>(), vec![Some([1, 2].as_ref()), None, Some([3].as_ref())]);
41
/// assert_eq!(array.values_iter().collect::<Vec<_>>(), vec![[1, 2].as_ref(), &[], &[3]]);
42
/// // the underlying representation:
43
/// assert_eq!(array.values(), &Buffer::from(vec![1, 2, 3]));
44
/// assert_eq!(array.offsets().buffer(), &Buffer::from(vec![0, 2, 2, 3]));
45
/// assert_eq!(array.validity(), Some(&Bitmap::from([true, false, true])));
46
/// ```
47
///
48
/// # Generic parameter
49
/// The generic parameter [`Offset`] can only be `i32` or `i64` and tradeoffs maximum array length with
50
/// memory usage:
51
/// * the sum of lengths of all elements cannot exceed `Offset::MAX`
52
/// * the total size of the underlying data is `array.len() * size_of::<Offset>() + sum of lengths of all elements`
53
///
54
/// # Safety
55
/// The following invariants hold:
56
/// * Two consecutive `offsets` cast (`as`) to `usize` are valid slices of `values`.
57
/// * `len` is equal to `validity.len()`, when defined.
58
#[derive(Clone)]
59
pub struct BinaryArray<O: Offset> {
60
dtype: ArrowDataType,
61
offsets: OffsetsBuffer<O>,
62
values: Buffer<u8>,
63
validity: Option<Bitmap>,
64
}
65
66
impl<O: Offset> BinaryArray<O> {
67
/// Returns a [`BinaryArray`] created from its internal representation.
68
///
69
/// # Errors
70
/// This function returns an error iff:
71
/// * The last offset is not equal to the values' length.
72
/// * the validity's length is not equal to `offsets.len()`.
73
/// * The `dtype`'s [`crate::datatypes::PhysicalType`] is not equal to either `Binary` or `LargeBinary`.
74
/// # Implementation
75
/// This function is `O(1)`
76
pub fn try_new(
77
dtype: ArrowDataType,
78
offsets: OffsetsBuffer<O>,
79
values: Buffer<u8>,
80
validity: Option<Bitmap>,
81
) -> PolarsResult<Self> {
82
try_check_offsets_bounds(&offsets, values.len())?;
83
84
if validity
85
.as_ref()
86
.is_some_and(|validity| validity.len() != offsets.len_proxy())
87
{
88
polars_bail!(ComputeError: "validity mask length must match the number of values")
89
}
90
91
if dtype.to_physical_type() != Self::default_dtype().to_physical_type() {
92
polars_bail!(ComputeError: "BinaryArray can only be initialized with DataType::Binary or DataType::LargeBinary")
93
}
94
95
Ok(Self {
96
dtype,
97
offsets,
98
values,
99
validity,
100
})
101
}
102
103
/// Creates a new [`BinaryArray`] without checking invariants.
104
///
105
/// # Safety
106
///
107
/// The invariants must be valid (see try_new).
108
pub unsafe fn new_unchecked(
109
dtype: ArrowDataType,
110
offsets: OffsetsBuffer<O>,
111
values: Buffer<u8>,
112
validity: Option<Bitmap>,
113
) -> Self {
114
Self {
115
dtype,
116
offsets,
117
values,
118
validity,
119
}
120
}
121
122
/// Creates a new [`BinaryArray`] from slices of `&[u8]`.
123
pub fn from_slice<T: AsRef<[u8]>, P: AsRef<[T]>>(slice: P) -> Self {
124
Self::from_trusted_len_values_iter(slice.as_ref().iter())
125
}
126
127
/// Creates a new [`BinaryArray`] from a slice of optional `&[u8]`.
128
// Note: this can't be `impl From` because Rust does not allow double `AsRef` on it.
129
pub fn from<T: AsRef<[u8]>, P: AsRef<[Option<T>]>>(slice: P) -> Self {
130
MutableBinaryArray::<O>::from(slice).into()
131
}
132
133
/// Returns an iterator of `Option<&[u8]>` over every element of this array.
134
pub fn iter(&self) -> ZipValidity<&[u8], BinaryValueIter<'_, O>, BitmapIter<'_>> {
135
ZipValidity::new_with_validity(self.values_iter(), self.validity.as_ref())
136
}
137
138
/// Returns an iterator of `&[u8]` over every element of this array, ignoring the validity
139
pub fn values_iter(&self) -> BinaryValueIter<'_, O> {
140
BinaryValueIter::new(self)
141
}
142
143
/// Returns an iterator of the non-null values.
144
#[inline]
145
pub fn non_null_values_iter(&self) -> NonNullValuesIter<'_, BinaryArray<O>> {
146
NonNullValuesIter::new(self, self.validity())
147
}
148
149
/// Returns the length of this array
150
#[inline]
151
pub fn len(&self) -> usize {
152
self.offsets.len_proxy()
153
}
154
155
/// Returns the element at index `i`
156
/// # Panics
157
/// iff `i >= self.len()`
158
#[inline]
159
pub fn value(&self, i: usize) -> &[u8] {
160
assert!(i < self.len());
161
unsafe { self.value_unchecked(i) }
162
}
163
164
/// Returns the element at index `i`
165
///
166
/// # Safety
167
/// Assumes that the `i < self.len`.
168
#[inline]
169
pub unsafe fn value_unchecked(&self, i: usize) -> &[u8] {
170
// soundness: the invariant of the function
171
let (start, end) = self.offsets.start_end_unchecked(i);
172
173
// soundness: the invariant of the struct
174
self.values.get_unchecked(start..end)
175
}
176
177
/// Returns the element at index `i` or `None` if it is null
178
/// # Panics
179
/// iff `i >= self.len()`
180
#[inline]
181
pub fn get(&self, i: usize) -> Option<&[u8]> {
182
if !self.is_null(i) {
183
// soundness: Array::is_null panics if i >= self.len
184
unsafe { Some(self.value_unchecked(i)) }
185
} else {
186
None
187
}
188
}
189
190
/// Returns the [`ArrowDataType`] of this array.
191
#[inline]
192
pub fn dtype(&self) -> &ArrowDataType {
193
&self.dtype
194
}
195
196
/// Returns the values of this [`BinaryArray`].
197
#[inline]
198
pub fn values(&self) -> &Buffer<u8> {
199
&self.values
200
}
201
202
/// Returns the offsets of this [`BinaryArray`].
203
#[inline]
204
pub fn offsets(&self) -> &OffsetsBuffer<O> {
205
&self.offsets
206
}
207
208
/// The optional validity.
209
#[inline]
210
pub fn validity(&self) -> Option<&Bitmap> {
211
self.validity.as_ref()
212
}
213
214
/// Slices this [`BinaryArray`].
215
/// # Implementation
216
/// This function is `O(1)`.
217
/// # Panics
218
/// iff `offset + length > self.len()`.
219
pub fn slice(&mut self, offset: usize, length: usize) {
220
assert!(
221
offset + length <= self.len(),
222
"the offset of the new Buffer cannot exceed the existing length"
223
);
224
unsafe { self.slice_unchecked(offset, length) }
225
}
226
227
/// Slices this [`BinaryArray`].
228
/// # Implementation
229
/// This function is `O(1)`.
230
///
231
/// # Safety
232
/// The caller must ensure that `offset + length <= self.len()`.
233
pub unsafe fn slice_unchecked(&mut self, offset: usize, length: usize) {
234
self.validity = self
235
.validity
236
.take()
237
.map(|bitmap| bitmap.sliced_unchecked(offset, length))
238
.filter(|bitmap| bitmap.unset_bits() > 0);
239
self.offsets.slice_unchecked(offset, length + 1);
240
}
241
242
// Macro-generated inherent methods.
// NOTE(review): the macro definitions are not visible in this file; presumably
// `impl_mut_validity!` provides the by-value `with_validity` used by the `Array`
// impl below — confirm against the macro definitions.
impl_sliced!();
243
impl_mut_validity!();
244
impl_into_array!();
245
246
/// Returns its internal representation
247
#[must_use]
248
pub fn into_inner(self) -> (ArrowDataType, OffsetsBuffer<O>, Buffer<u8>, Option<Bitmap>) {
249
let Self {
250
dtype,
251
offsets,
252
values,
253
validity,
254
} = self;
255
(dtype, offsets, values, validity)
256
}
257
258
/// Try to convert this `BinaryArray` to a `MutableBinaryArray`
259
#[must_use]
260
pub fn into_mut(self) -> Either<Self, MutableBinaryArray<O>> {
261
use Either::*;
262
if let Some(bitmap) = self.validity {
263
match bitmap.into_mut() {
264
// SAFETY: invariants are preserved
265
Left(bitmap) => Left(BinaryArray::new(
266
self.dtype,
267
self.offsets,
268
self.values,
269
Some(bitmap),
270
)),
271
Right(mutable_bitmap) => match (self.values.into_mut(), self.offsets.into_mut()) {
272
(Left(values), Left(offsets)) => Left(BinaryArray::new(
273
self.dtype,
274
offsets,
275
values,
276
Some(mutable_bitmap.into()),
277
)),
278
(Left(values), Right(offsets)) => Left(BinaryArray::new(
279
self.dtype,
280
offsets.into(),
281
values,
282
Some(mutable_bitmap.into()),
283
)),
284
(Right(values), Left(offsets)) => Left(BinaryArray::new(
285
self.dtype,
286
offsets,
287
values.into(),
288
Some(mutable_bitmap.into()),
289
)),
290
(Right(values), Right(offsets)) => Right(
291
MutableBinaryArray::try_new(
292
self.dtype,
293
offsets,
294
values,
295
Some(mutable_bitmap),
296
)
297
.unwrap(),
298
),
299
},
300
}
301
} else {
302
match (self.values.into_mut(), self.offsets.into_mut()) {
303
(Left(values), Left(offsets)) => {
304
Left(BinaryArray::new(self.dtype, offsets, values, None))
305
},
306
(Left(values), Right(offsets)) => {
307
Left(BinaryArray::new(self.dtype, offsets.into(), values, None))
308
},
309
(Right(values), Left(offsets)) => {
310
Left(BinaryArray::new(self.dtype, offsets, values.into(), None))
311
},
312
(Right(values), Right(offsets)) => {
313
Right(MutableBinaryArray::try_new(self.dtype, offsets, values, None).unwrap())
314
},
315
}
316
}
317
}
318
319
/// Creates an empty [`BinaryArray`], i.e. whose `.len` is zero.
320
pub fn new_empty(dtype: ArrowDataType) -> Self {
321
Self::new(dtype, OffsetsBuffer::new(), Buffer::new(), None)
322
}
323
324
/// Creates an null [`BinaryArray`], i.e. whose `.null_count() == .len()`.
325
#[inline]
326
pub fn new_null(dtype: ArrowDataType, length: usize) -> Self {
327
unsafe {
328
Self::new_unchecked(
329
dtype,
330
Offsets::new_zeroed(length).into(),
331
Buffer::new(),
332
Some(Bitmap::new_zeroed(length)),
333
)
334
}
335
}
336
337
/// Returns the default [`ArrowDataType`], `DataType::Binary` or `DataType::LargeBinary`
338
pub fn default_dtype() -> ArrowDataType {
339
if O::IS_LARGE {
340
ArrowDataType::LargeBinary
341
} else {
342
ArrowDataType::Binary
343
}
344
}
345
346
/// Alias for unwrapping [`Self::try_new`]
347
pub fn new(
348
dtype: ArrowDataType,
349
offsets: OffsetsBuffer<O>,
350
values: Buffer<u8>,
351
validity: Option<Bitmap>,
352
) -> Self {
353
Self::try_new(dtype, offsets, values, validity).unwrap()
354
}
355
356
/// Returns a [`BinaryArray`] from an iterator of trusted length.
357
///
358
/// The [`BinaryArray`] is guaranteed to not have a validity
359
#[inline]
360
pub fn from_trusted_len_values_iter<T: AsRef<[u8]>, I: TrustedLen<Item = T>>(
361
iterator: I,
362
) -> Self {
363
MutableBinaryArray::<O>::from_trusted_len_values_iter(iterator).into()
364
}
365
366
/// Returns a new [`BinaryArray`] from a [`Iterator`] of `&[u8]`.
367
///
368
/// The [`BinaryArray`] is guaranteed to not have a validity
369
pub fn from_iter_values<T: AsRef<[u8]>, I: Iterator<Item = T>>(iterator: I) -> Self {
370
MutableBinaryArray::<O>::from_iter_values(iterator).into()
371
}
372
373
/// Creates a [`BinaryArray`] from an iterator of trusted length.
374
///
375
/// # Safety
376
/// The iterator must be [`TrustedLen`](https://doc.rust-lang.org/std/iter/trait.TrustedLen.html).
377
/// I.e. that `size_hint().1` correctly reports its length.
378
#[inline]
379
pub unsafe fn from_trusted_len_iter_unchecked<I, P>(iterator: I) -> Self
380
where
381
P: AsRef<[u8]>,
382
I: Iterator<Item = Option<P>>,
383
{
384
MutableBinaryArray::<O>::from_trusted_len_iter_unchecked(iterator).into()
385
}
386
387
/// Creates a [`BinaryArray`] from a [`TrustedLen`]
388
#[inline]
389
pub fn from_trusted_len_iter<I, P>(iterator: I) -> Self
390
where
391
P: AsRef<[u8]>,
392
I: TrustedLen<Item = Option<P>>,
393
{
394
// soundness: I is `TrustedLen`
395
unsafe { Self::from_trusted_len_iter_unchecked(iterator) }
396
}
397
398
/// Creates a [`BinaryArray`] from an falible iterator of trusted length.
399
///
400
/// # Safety
401
/// The iterator must be [`TrustedLen`](https://doc.rust-lang.org/std/iter/trait.TrustedLen.html).
402
/// I.e. that `size_hint().1` correctly reports its length.
403
#[inline]
404
pub unsafe fn try_from_trusted_len_iter_unchecked<E, I, P>(iterator: I) -> Result<Self, E>
405
where
406
P: AsRef<[u8]>,
407
I: IntoIterator<Item = Result<Option<P>, E>>,
408
{
409
MutableBinaryArray::<O>::try_from_trusted_len_iter_unchecked(iterator).map(|x| x.into())
410
}
411
412
/// Creates a [`BinaryArray`] from an fallible iterator of trusted length.
413
#[inline]
414
pub fn try_from_trusted_len_iter<E, I, P>(iter: I) -> Result<Self, E>
415
where
416
P: AsRef<[u8]>,
417
I: TrustedLen<Item = Result<Option<P>, E>>,
418
{
419
// soundness: I: TrustedLen
420
unsafe { Self::try_from_trusted_len_iter_unchecked(iter) }
421
}
422
}
423
424
impl<O: Offset> Array for BinaryArray<O> {
425
impl_common_array!();
426
427
fn validity(&self) -> Option<&Bitmap> {
428
self.validity.as_ref()
429
}
430
431
#[inline]
432
fn with_validity(&self, validity: Option<Bitmap>) -> Box<dyn Array> {
433
Box::new(self.clone().with_validity(validity))
434
}
435
}
436
437
/// Exposes the values and offsets of a [`BinaryArray`] as plain slices.
unsafe impl<O: Offset> GenericBinaryArray<O> for BinaryArray<O> {
    /// The values buffer viewed as a byte slice.
    #[inline]
    fn values(&self) -> &[u8] {
        &self.values
    }

    /// The offsets buffer viewed as a slice of `O`.
    #[inline]
    fn offsets(&self) -> &[O] {
        self.offsets.buffer()
    }
}
448
449
impl<O: Offset> Splitable for BinaryArray<O> {
450
#[inline(always)]
451
fn check_bound(&self, offset: usize) -> bool {
452
offset <= self.len()
453
}
454
455
unsafe fn _split_at_unchecked(&self, offset: usize) -> (Self, Self) {
456
let (lhs_offsets, rhs_offsets) = unsafe { self.offsets.split_at_unchecked(offset) };
457
let (lhs_validity, rhs_validity) = unsafe { self.validity.split_at_unchecked(offset) };
458
459
(
460
Self {
461
dtype: self.dtype.clone(),
462
offsets: lhs_offsets,
463
values: self.values.clone(),
464
validity: lhs_validity,
465
},
466
Self {
467
dtype: self.dtype.clone(),
468
offsets: rhs_offsets,
469
values: self.values.clone(),
470
validity: rhs_validity,
471
},
472
)
473
}
474
}
475
476