Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/crates/polars-arrow/src/array/utf8/mod.rs
6939 views
1
use either::Either;
2
3
use super::specification::try_check_utf8;
4
use super::{Array, GenericBinaryArray, Splitable};
5
use crate::array::BinaryArray;
6
use crate::array::iterator::NonNullValuesIter;
7
use crate::bitmap::Bitmap;
8
use crate::bitmap::utils::{BitmapIter, ZipValidity};
9
use crate::buffer::Buffer;
10
use crate::datatypes::ArrowDataType;
11
use crate::offset::{Offset, Offsets, OffsetsBuffer};
12
use crate::trusted_len::TrustedLen;
13
14
mod ffi;
15
pub(super) mod fmt;
16
mod from;
17
mod iterator;
18
mod mutable;
19
mod mutable_values;
20
pub use iterator::*;
21
pub use mutable::*;
22
pub use mutable_values::MutableUtf8ValuesArray;
23
use polars_error::*;
24
25
// Auxiliary struct to allow presenting &str as [u8] to a generic function
26
pub(super) struct StrAsBytes<P>(P);
27
impl<T: AsRef<str>> AsRef<[u8]> for StrAsBytes<T> {
28
#[inline(always)]
29
fn as_ref(&self) -> &[u8] {
30
self.0.as_ref().as_bytes()
31
}
32
}
33
34
/// A [`Utf8Array`] is arrow's semantic equivalent of an immutable `Vec<Option<String>>`.
35
/// Cloning and slicing this struct is `O(1)`.
36
/// # Example
37
/// ```
38
/// use polars_arrow::bitmap::Bitmap;
39
/// use polars_arrow::buffer::Buffer;
40
/// use polars_arrow::array::Utf8Array;
41
/// # fn main() {
42
/// let array = Utf8Array::<i32>::from([Some("hi"), None, Some("there")]);
43
/// assert_eq!(array.value(0), "hi");
44
/// assert_eq!(array.iter().collect::<Vec<_>>(), vec![Some("hi"), None, Some("there")]);
45
/// assert_eq!(array.values_iter().collect::<Vec<_>>(), vec!["hi", "", "there"]);
46
/// // the underlying representation
47
/// assert_eq!(array.validity(), Some(&Bitmap::from([true, false, true])));
48
/// assert_eq!(array.values(), &Buffer::from(b"hithere".to_vec()));
49
/// assert_eq!(array.offsets().buffer(), &Buffer::from(vec![0, 2, 2, 2 + 5]));
50
/// # }
51
/// ```
52
///
53
/// # Generic parameter
54
/// The generic parameter [`Offset`] can only be `i32` or `i64` and trades off maximum array length against
55
/// memory usage:
56
/// * the sum of lengths of all elements cannot exceed `Offset::MAX`
57
/// * the offsets and values buffers together occupy `(array.len() + 1) * size_of::<Offset>() + sum of lengths of all elements` bytes
58
///
59
/// # Safety
60
/// The following invariants hold:
61
/// * Two consecutive `offsets` cast (`as`) to `usize` are valid slices of `values`.
62
/// * A slice of `values` taken from two consecutive `offsets` is valid `utf8`.
63
/// * `len` is equal to `validity.len()`, when defined.
64
#[derive(Clone)]
65
pub struct Utf8Array<O: Offset> {
66
// Logical data type; the constructors require its physical type to match `Self::default_dtype()`.
dtype: ArrowDataType,
67
// Element boundaries: element `i` spans the `(start, end)` pair of offsets `i` and `i + 1`.
offsets: OffsetsBuffer<O>,
68
// Concatenated utf8 bytes of all elements.
values: Buffer<u8>,
69
// Optional validity bitmap (a `false` bit marks a null slot); `None` means no mask is stored.
validity: Option<Bitmap>,
70
}
71
72
// constructors
73
impl<O: Offset> Utf8Array<O> {
74
/// Returns a [`Utf8Array`] created from its internal representation.
75
///
76
/// # Errors
77
/// This function returns an error iff:
78
/// * The last offset is greater than the values' length.
79
/// * the validity's length is not equal to `offsets.len_proxy()`.
80
/// * The `dtype`'s [`crate::datatypes::PhysicalType`] is not equal to either `Utf8` or `LargeUtf8`.
81
/// * The `values` between two consecutive `offsets` are not valid utf8
82
/// # Implementation
83
/// This function is `O(N)` - checking utf8 is `O(N)`
84
pub fn try_new(
85
dtype: ArrowDataType,
86
offsets: OffsetsBuffer<O>,
87
values: Buffer<u8>,
88
validity: Option<Bitmap>,
89
) -> PolarsResult<Self> {
90
try_check_utf8(&offsets, &values)?;
91
if validity
92
.as_ref()
93
.is_some_and(|validity| validity.len() != offsets.len_proxy())
94
{
95
polars_bail!(ComputeError: "validity mask length must match the number of values");
96
}
97
98
if dtype.to_physical_type() != Self::default_dtype().to_physical_type() {
99
polars_bail!(ComputeError: "Utf8Array can only be initialized with DataType::Utf8 or DataType::LargeUtf8")
100
}
101
102
Ok(Self {
103
dtype,
104
offsets,
105
values,
106
validity,
107
})
108
}
109
110
/// Returns a [`Utf8Array`] from a slice of `&str`.
111
///
112
/// A convenience method that uses [`Self::from_trusted_len_values_iter`].
113
pub fn from_slice<T: AsRef<str>, P: AsRef<[T]>>(slice: P) -> Self {
114
Self::from_trusted_len_values_iter(slice.as_ref().iter())
115
}
116
117
/// Returns a new [`Utf8Array`] from a slice of `&str`.
118
///
119
/// A convenience method that uses [`Self::from_trusted_len_iter`].
120
// Note: this can't be `impl From` because Rust does not allow double `AsRef` on it.
121
pub fn from<T: AsRef<str>, P: AsRef<[Option<T>]>>(slice: P) -> Self {
122
MutableUtf8Array::<O>::from(slice).into()
123
}
124
125
/// Returns an iterator of `Option<&str>`
126
pub fn iter(&self) -> ZipValidity<&str, Utf8ValuesIter<'_, O>, BitmapIter<'_>> {
127
ZipValidity::new_with_validity(self.values_iter(), self.validity())
128
}
129
130
/// Returns an iterator of `&str`
131
pub fn values_iter(&self) -> Utf8ValuesIter<'_, O> {
132
Utf8ValuesIter::new(self)
133
}
134
135
/// Returns an iterator of the non-null values `&str.
136
#[inline]
137
pub fn non_null_values_iter(&self) -> NonNullValuesIter<'_, Utf8Array<O>> {
138
NonNullValuesIter::new(self, self.validity())
139
}
140
141
/// Returns the length of this array
142
#[inline]
143
pub fn len(&self) -> usize {
144
self.offsets.len_proxy()
145
}
146
147
/// Returns the value of the element at index `i`, ignoring the array's validity.
148
/// # Panic
149
/// This function panics iff `i >= self.len`.
150
#[inline]
151
pub fn value(&self, i: usize) -> &str {
152
assert!(i < self.len());
153
unsafe { self.value_unchecked(i) }
154
}
155
156
/// Returns the value of the element at index `i`, ignoring the array's validity.
157
///
158
/// # Safety
159
/// This function is safe iff `i < self.len`.
160
#[inline]
161
pub unsafe fn value_unchecked(&self, i: usize) -> &str {
162
// soundness: the invariant of the function
163
let (start, end) = self.offsets.start_end_unchecked(i);
164
165
// soundness: the invariant of the struct
166
let slice = self.values.get_unchecked(start..end);
167
168
// soundness: the invariant of the struct
169
std::str::from_utf8_unchecked(slice)
170
}
171
172
/// Returns the element at index `i` or `None` if it is null
173
/// # Panics
174
/// iff `i >= self.len()`
175
#[inline]
176
pub fn get(&self, i: usize) -> Option<&str> {
177
if !self.is_null(i) {
178
// soundness: Array::is_null panics if i >= self.len
179
unsafe { Some(self.value_unchecked(i)) }
180
} else {
181
None
182
}
183
}
184
185
/// Returns the [`ArrowDataType`] of this array.
186
#[inline]
187
pub fn dtype(&self) -> &ArrowDataType {
188
&self.dtype
189
}
190
191
/// Returns the values of this [`Utf8Array`].
192
#[inline]
193
pub fn values(&self) -> &Buffer<u8> {
194
&self.values
195
}
196
197
/// Returns the offsets of this [`Utf8Array`].
198
#[inline]
199
pub fn offsets(&self) -> &OffsetsBuffer<O> {
200
&self.offsets
201
}
202
203
/// The optional validity.
204
#[inline]
205
pub fn validity(&self) -> Option<&Bitmap> {
206
self.validity.as_ref()
207
}
208
209
/// Slices this [`Utf8Array`].
210
/// # Implementation
211
/// This function is `O(1)`.
212
/// # Panics
213
/// iff `offset + length > self.len()`.
214
pub fn slice(&mut self, offset: usize, length: usize) {
215
assert!(
216
offset + length <= self.len(),
217
"the offset of the new array cannot exceed the arrays' length"
218
);
219
unsafe { self.slice_unchecked(offset, length) }
220
}
221
222
/// Slices this [`Utf8Array`].
223
/// # Implementation
224
/// This function is `O(1)`
225
///
226
/// # Safety
227
/// The caller must ensure that `offset + length <= self.len()`.
228
pub unsafe fn slice_unchecked(&mut self, offset: usize, length: usize) {
229
self.validity = self
230
.validity
231
.take()
232
.map(|bitmap| bitmap.sliced_unchecked(offset, length))
233
.filter(|bitmap| bitmap.unset_bits() > 0);
234
self.offsets.slice_unchecked(offset, length + 1);
235
}
236
237
// Macro-generated inherent methods; the macro definitions are not part of this file.
// NOTE(review): `set_validity` and the by-value `with_validity` called elsewhere in this
// file are presumably produced by `impl_mut_validity!` — confirm in the macro definitions.
impl_sliced!();
238
impl_mut_validity!();
239
impl_into_array!();
240
241
/// Returns its internal representation
242
#[must_use]
243
pub fn into_inner(self) -> (ArrowDataType, OffsetsBuffer<O>, Buffer<u8>, Option<Bitmap>) {
244
let Self {
245
dtype,
246
offsets,
247
values,
248
validity,
249
} = self;
250
(dtype, offsets, values, validity)
251
}
252
253
/// Try to convert this `Utf8Array` to a `MutableUtf8Array`
254
#[must_use]
255
pub fn into_mut(self) -> Either<Self, MutableUtf8Array<O>> {
256
use Either::*;
257
if let Some(bitmap) = self.validity {
258
match bitmap.into_mut() {
259
// SAFETY: invariants are preserved
260
Left(bitmap) => Left(unsafe {
261
Utf8Array::new_unchecked(self.dtype, self.offsets, self.values, Some(bitmap))
262
}),
263
Right(mutable_bitmap) => match (self.values.into_mut(), self.offsets.into_mut()) {
264
(Left(values), Left(offsets)) => {
265
// SAFETY: invariants are preserved
266
Left(unsafe {
267
Utf8Array::new_unchecked(
268
self.dtype,
269
offsets,
270
values,
271
Some(mutable_bitmap.into()),
272
)
273
})
274
},
275
(Left(values), Right(offsets)) => {
276
// SAFETY: invariants are preserved
277
Left(unsafe {
278
Utf8Array::new_unchecked(
279
self.dtype,
280
offsets.into(),
281
values,
282
Some(mutable_bitmap.into()),
283
)
284
})
285
},
286
(Right(values), Left(offsets)) => {
287
// SAFETY: invariants are preserved
288
Left(unsafe {
289
Utf8Array::new_unchecked(
290
self.dtype,
291
offsets,
292
values.into(),
293
Some(mutable_bitmap.into()),
294
)
295
})
296
},
297
(Right(values), Right(offsets)) => Right(unsafe {
298
MutableUtf8Array::new_unchecked(
299
self.dtype,
300
offsets,
301
values,
302
Some(mutable_bitmap),
303
)
304
}),
305
},
306
}
307
} else {
308
match (self.values.into_mut(), self.offsets.into_mut()) {
309
(Left(values), Left(offsets)) => {
310
Left(unsafe { Utf8Array::new_unchecked(self.dtype, offsets, values, None) })
311
},
312
(Left(values), Right(offsets)) => Left(unsafe {
313
Utf8Array::new_unchecked(self.dtype, offsets.into(), values, None)
314
}),
315
(Right(values), Left(offsets)) => Left(unsafe {
316
Utf8Array::new_unchecked(self.dtype, offsets, values.into(), None)
317
}),
318
(Right(values), Right(offsets)) => Right(unsafe {
319
MutableUtf8Array::new_unchecked(self.dtype, offsets, values, None)
320
}),
321
}
322
}
323
}
324
325
/// Returns a new empty [`Utf8Array`].
326
///
327
/// The array is guaranteed to have no elements nor validity.
328
#[inline]
329
pub fn new_empty(dtype: ArrowDataType) -> Self {
330
unsafe { Self::new_unchecked(dtype, OffsetsBuffer::new(), Buffer::new(), None) }
331
}
332
333
/// Returns a new [`Utf8Array`] whose all slots are null / `None`.
334
#[inline]
335
pub fn new_null(dtype: ArrowDataType, length: usize) -> Self {
336
Self::new(
337
dtype,
338
Offsets::new_zeroed(length).into(),
339
Buffer::new(),
340
Some(Bitmap::new_zeroed(length)),
341
)
342
}
343
344
/// Returns a default [`ArrowDataType`] of this array, which depends on the generic parameter `O`: `DataType::Utf8` or `DataType::LargeUtf8`
345
pub fn default_dtype() -> ArrowDataType {
346
if O::IS_LARGE {
347
ArrowDataType::LargeUtf8
348
} else {
349
ArrowDataType::Utf8
350
}
351
}
352
353
/// Creates a new [`Utf8Array`] without checking for offsets monotinicity nor utf8-validity
354
///
355
/// # Panic
356
/// This function panics (in debug mode only) iff:
357
/// * The last offset is greater than the values' length.
358
/// * the validity's length is not equal to `offsets.len_proxy()`.
359
/// * The `dtype`'s [`crate::datatypes::PhysicalType`] is not equal to either `Utf8` or `LargeUtf8`.
360
///
361
/// # Safety
362
/// This function is unsound iff:
363
/// * The `values` between two consecutive `offsets` are not valid utf8
364
/// # Implementation
365
/// This function is `O(1)`
366
pub unsafe fn new_unchecked(
367
dtype: ArrowDataType,
368
offsets: OffsetsBuffer<O>,
369
values: Buffer<u8>,
370
validity: Option<Bitmap>,
371
) -> Self {
372
debug_assert!(
373
offsets.last().to_usize() <= values.len(),
374
"offsets must not exceed the values length"
375
);
376
debug_assert!(
377
validity
378
.as_ref()
379
.is_none_or(|validity| validity.len() == offsets.len_proxy()),
380
"validity mask length must match the number of values"
381
);
382
debug_assert!(
383
dtype.to_physical_type() == Self::default_dtype().to_physical_type(),
384
"Utf8Array can only be initialized with DataType::Utf8 or DataType::LargeUtf8"
385
);
386
387
Self {
388
dtype,
389
offsets,
390
values,
391
validity,
392
}
393
}
394
395
/// Creates a new [`Utf8Array`].
396
/// # Panics
397
/// This function panics iff:
398
/// * `offsets.last()` is greater than `values.len()`.
399
/// * the validity's length is not equal to `offsets.len_proxy()`.
400
/// * The `dtype`'s [`crate::datatypes::PhysicalType`] is not equal to either `Utf8` or `LargeUtf8`.
401
/// * The `values` between two consecutive `offsets` are not valid utf8
402
/// # Implementation
403
/// This function is `O(N)` - checking utf8 is `O(N)`
404
pub fn new(
405
dtype: ArrowDataType,
406
offsets: OffsetsBuffer<O>,
407
values: Buffer<u8>,
408
validity: Option<Bitmap>,
409
) -> Self {
410
Self::try_new(dtype, offsets, values, validity).unwrap()
411
}
412
413
/// Returns a (non-null) [`Utf8Array`] created from a [`TrustedLen`] of `&str`.
414
/// # Implementation
415
/// This function is `O(N)`
416
#[inline]
417
pub fn from_trusted_len_values_iter<T: AsRef<str>, I: TrustedLen<Item = T>>(
418
iterator: I,
419
) -> Self {
420
MutableUtf8Array::<O>::from_trusted_len_values_iter(iterator).into()
421
}
422
423
/// Creates a new [`Utf8Array`] from a [`Iterator`] of `&str`.
424
pub fn from_iter_values<T: AsRef<str>, I: Iterator<Item = T>>(iterator: I) -> Self {
425
MutableUtf8Array::<O>::from_iter_values(iterator).into()
426
}
427
428
/// Creates a [`Utf8Array`] from an iterator of trusted length.
429
///
430
/// # Safety
431
/// The iterator must be [`TrustedLen`](https://doc.rust-lang.org/std/iter/trait.TrustedLen.html).
432
/// I.e. that `size_hint().1` correctly reports its length.
433
#[inline]
434
pub unsafe fn from_trusted_len_iter_unchecked<I, P>(iterator: I) -> Self
435
where
436
P: AsRef<str>,
437
I: Iterator<Item = Option<P>>,
438
{
439
MutableUtf8Array::<O>::from_trusted_len_iter_unchecked(iterator).into()
440
}
441
442
/// Creates a [`Utf8Array`] from an iterator of trusted length.
443
#[inline]
444
pub fn from_trusted_len_iter<I, P>(iterator: I) -> Self
445
where
446
P: AsRef<str>,
447
I: TrustedLen<Item = Option<P>>,
448
{
449
MutableUtf8Array::<O>::from_trusted_len_iter(iterator).into()
450
}
451
452
/// Creates a [`Utf8Array`] from an falible iterator of trusted length.
453
///
454
/// # Safety
455
/// The iterator must be [`TrustedLen`](https://doc.rust-lang.org/std/iter/trait.TrustedLen.html).
456
/// I.e. that `size_hint().1` correctly reports its length.
457
#[inline]
458
pub unsafe fn try_from_trusted_len_iter_unchecked<E, I, P>(
459
iterator: I,
460
) -> std::result::Result<Self, E>
461
where
462
P: AsRef<str>,
463
I: IntoIterator<Item = std::result::Result<Option<P>, E>>,
464
{
465
MutableUtf8Array::<O>::try_from_trusted_len_iter_unchecked(iterator).map(|x| x.into())
466
}
467
468
/// Creates a [`Utf8Array`] from an fallible iterator of trusted length.
469
#[inline]
470
pub fn try_from_trusted_len_iter<E, I, P>(iter: I) -> std::result::Result<Self, E>
471
where
472
P: AsRef<str>,
473
I: TrustedLen<Item = std::result::Result<Option<P>, E>>,
474
{
475
MutableUtf8Array::<O>::try_from_trusted_len_iter(iter).map(|x| x.into())
476
}
477
478
/// Applies a function `f` to the validity of this array.
479
///
480
/// This is an API to leverage clone-on-write
481
/// # Panics
482
/// This function panics if the function `f` modifies the length of the [`Bitmap`].
483
pub fn apply_validity<F: FnOnce(Bitmap) -> Bitmap>(&mut self, f: F) {
484
if let Some(validity) = std::mem::take(&mut self.validity) {
485
self.set_validity(Some(f(validity)))
486
}
487
}
488
489
// Convert this [`Utf8Array`] to a [`BinaryArray`].
490
pub fn to_binary(&self) -> BinaryArray<O> {
491
unsafe {
492
BinaryArray::new_unchecked(
493
BinaryArray::<O>::default_dtype(),
494
self.offsets.clone(),
495
self.values.clone(),
496
self.validity.clone(),
497
)
498
}
499
}
500
}
501
502
impl<O: Offset> Splitable for Utf8Array<O> {
503
#[inline(always)]
504
fn check_bound(&self, offset: usize) -> bool {
505
offset <= self.len()
506
}
507
508
unsafe fn _split_at_unchecked(&self, offset: usize) -> (Self, Self) {
509
let (lhs_validity, rhs_validity) = unsafe { self.validity.split_at_unchecked(offset) };
510
let (lhs_offsets, rhs_offsets) = unsafe { self.offsets.split_at_unchecked(offset) };
511
512
(
513
Self {
514
dtype: self.dtype.clone(),
515
offsets: lhs_offsets,
516
values: self.values.clone(),
517
validity: lhs_validity,
518
},
519
Self {
520
dtype: self.dtype.clone(),
521
offsets: rhs_offsets,
522
values: self.values.clone(),
523
validity: rhs_validity,
524
},
525
)
526
}
527
}
528
529
impl<O: Offset> Array for Utf8Array<O> {
530
impl_common_array!();
531
532
fn validity(&self) -> Option<&Bitmap> {
533
self.validity.as_ref()
534
}
535
536
#[inline]
537
fn with_validity(&self, validity: Option<Bitmap>) -> Box<dyn Array> {
538
Box::new(self.clone().with_validity(validity))
539
}
540
}
541
542
unsafe impl<O: Offset> GenericBinaryArray<O> for Utf8Array<O> {
543
#[inline]
544
fn values(&self) -> &[u8] {
545
self.values()
546
}
547
548
#[inline]
549
fn offsets(&self) -> &[O] {
550
self.offsets().buffer()
551
}
552
}
553
554
impl<O: Offset> Default for Utf8Array<O> {
    /// Creates an array from default offsets and values, without validity, typed with
    /// [`Utf8Array::default_dtype`].
    fn default() -> Self {
        Self::new(
            Self::default_dtype(),
            Default::default(),
            Default::default(),
            None,
        )
    }
}
564
565