Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/crates/polars-arrow/src/array/utf8/mutable.rs
6939 views
1
use std::sync::Arc;
2
3
use polars_error::{PolarsResult, polars_bail};
4
5
use super::{MutableUtf8ValuesArray, MutableUtf8ValuesIter, StrAsBytes, Utf8Array};
6
use crate::array::physical_binary::*;
7
use crate::array::{Array, MutableArray, TryExtend, TryExtendFromSelf, TryPush};
8
use crate::bitmap::utils::{BitmapIter, ZipValidity};
9
use crate::bitmap::{Bitmap, MutableBitmap};
10
use crate::datatypes::ArrowDataType;
11
use crate::offset::{Offset, Offsets};
12
use crate::trusted_len::TrustedLen;
13
14
/// A [`MutableArray`] that builds a [`Utf8Array`]. It differs
15
/// from [`MutableUtf8ValuesArray`] in that it can build nullable [`Utf8Array`]s.
16
#[derive(Debug, Clone)]
17
pub struct MutableUtf8Array<O: Offset> {
18
values: MutableUtf8ValuesArray<O>,
19
validity: Option<MutableBitmap>,
20
}
21
22
impl<O: Offset> From<MutableUtf8Array<O>> for Utf8Array<O> {
23
fn from(other: MutableUtf8Array<O>) -> Self {
24
let validity = other.validity.and_then(|x| {
25
let validity: Option<Bitmap> = x.into();
26
validity
27
});
28
let array: Utf8Array<O> = other.values.into();
29
array.with_validity(validity)
30
}
31
}
32
33
impl<O: Offset> Default for MutableUtf8Array<O> {
34
fn default() -> Self {
35
Self::new()
36
}
37
}
38
39
impl<O: Offset> MutableUtf8Array<O> {
40
/// Initializes a new empty [`MutableUtf8Array`].
41
pub fn new() -> Self {
42
Self {
43
values: Default::default(),
44
validity: None,
45
}
46
}
47
48
/// Returns a [`MutableUtf8Array`] created from its internal representation.
49
///
50
/// # Errors
51
/// This function returns an error iff:
52
/// * The last offset is not equal to the values' length.
53
/// * the validity's length is not equal to `offsets.len()`.
54
/// * The `dtype`'s [`crate::datatypes::PhysicalType`] is not equal to either `Utf8` or `LargeUtf8`.
55
/// * The `values` between two consecutive `offsets` are not valid utf8
56
/// # Implementation
57
/// This function is `O(N)` - checking utf8 is `O(N)`
58
pub fn try_new(
59
dtype: ArrowDataType,
60
offsets: Offsets<O>,
61
values: Vec<u8>,
62
validity: Option<MutableBitmap>,
63
) -> PolarsResult<Self> {
64
let values = MutableUtf8ValuesArray::try_new(dtype, offsets, values)?;
65
66
if validity
67
.as_ref()
68
.is_some_and(|validity| validity.len() != values.len())
69
{
70
polars_bail!(ComputeError: "validity's length must be equal to the number of values")
71
}
72
73
Ok(Self { values, validity })
74
}
75
76
/// Create a [`MutableUtf8Array`] out of low-end APIs.
77
///
78
/// # Safety
79
/// The caller must ensure that every value between offsets is a valid utf8.
80
/// # Panics
81
/// This function panics iff:
82
/// * The `offsets` and `values` are inconsistent
83
/// * The validity is not `None` and its length is different from `offsets`'s length minus one.
84
pub unsafe fn new_unchecked(
85
dtype: ArrowDataType,
86
offsets: Offsets<O>,
87
values: Vec<u8>,
88
validity: Option<MutableBitmap>,
89
) -> Self {
90
let values = MutableUtf8ValuesArray::new_unchecked(dtype, offsets, values);
91
if let Some(ref validity) = validity {
92
assert_eq!(values.len(), validity.len());
93
}
94
Self { values, validity }
95
}
96
97
/// Creates a new [`MutableUtf8Array`] from a slice of optional `&[u8]`.
98
// Note: this can't be `impl From` because Rust does not allow double `AsRef` on it.
99
pub fn from<T: AsRef<str>, P: AsRef<[Option<T>]>>(slice: P) -> Self {
100
Self::from_trusted_len_iter(slice.as_ref().iter().map(|x| x.as_ref()))
101
}
102
103
fn default_dtype() -> ArrowDataType {
104
Utf8Array::<O>::default_dtype()
105
}
106
107
/// Initializes a new [`MutableUtf8Array`] with a pre-allocated capacity of slots.
108
pub fn with_capacity(capacity: usize) -> Self {
109
Self::with_capacities(capacity, 0)
110
}
111
112
/// Initializes a new [`MutableUtf8Array`] with a pre-allocated capacity of slots and values.
113
pub fn with_capacities(capacity: usize, values: usize) -> Self {
114
Self {
115
values: MutableUtf8ValuesArray::with_capacities(capacity, values),
116
validity: None,
117
}
118
}
119
120
/// Reserves `additional` elements and `additional_values` on the values buffer.
121
pub fn reserve(&mut self, additional: usize, additional_values: usize) {
122
self.values.reserve(additional, additional_values);
123
if let Some(x) = self.validity.as_mut() {
124
x.reserve(additional)
125
}
126
}
127
128
/// Reserves `additional` elements and `additional_values` on the values buffer.
129
pub fn capacity(&self) -> usize {
130
self.values.capacity()
131
}
132
133
/// Returns the length of this array
134
#[inline]
135
pub fn len(&self) -> usize {
136
self.values.len()
137
}
138
139
/// Pushes a new element to the array.
140
/// # Panic
141
/// This operation panics iff the length of all values (in bytes) exceeds `O` maximum value.
142
#[inline]
143
pub fn push<T: AsRef<str>>(&mut self, value: Option<T>) {
144
self.try_push(value).unwrap()
145
}
146
147
/// Returns the value of the element at index `i`, ignoring the array's validity.
148
#[inline]
149
pub fn value(&self, i: usize) -> &str {
150
self.values.value(i)
151
}
152
153
/// Returns the value of the element at index `i`, ignoring the array's validity.
154
///
155
/// # Safety
156
/// This function is safe iff `i < self.len`.
157
#[inline]
158
pub unsafe fn value_unchecked(&self, i: usize) -> &str {
159
self.values.value_unchecked(i)
160
}
161
162
/// Pop the last entry from [`MutableUtf8Array`].
163
/// This function returns `None` iff this array is empty.
164
pub fn pop(&mut self) -> Option<String> {
165
let value = self.values.pop()?;
166
self.validity
167
.as_mut()
168
.map(|x| x.pop()?.then(|| ()))
169
.unwrap_or_else(|| Some(()))
170
.map(|_| value)
171
}
172
173
fn init_validity(&mut self) {
174
let mut validity = MutableBitmap::with_capacity(self.values.capacity());
175
validity.extend_constant(self.len(), true);
176
validity.set(self.len() - 1, false);
177
self.validity = Some(validity);
178
}
179
180
/// Returns an iterator of `Option<&str>`
181
pub fn iter(&self) -> ZipValidity<&str, MutableUtf8ValuesIter<'_, O>, BitmapIter<'_>> {
182
ZipValidity::new(self.values_iter(), self.validity.as_ref().map(|x| x.iter()))
183
}
184
185
/// Converts itself into an [`Array`].
186
pub fn into_arc(self) -> Arc<dyn Array> {
187
let a: Utf8Array<O> = self.into();
188
Arc::new(a)
189
}
190
191
/// Shrinks the capacity of the [`MutableUtf8Array`] to fit its current length.
192
pub fn shrink_to_fit(&mut self) {
193
self.values.shrink_to_fit();
194
if let Some(validity) = &mut self.validity {
195
validity.shrink_to_fit()
196
}
197
}
198
199
/// Extract the low-end APIs from the [`MutableUtf8Array`].
200
pub fn into_data(self) -> (ArrowDataType, Offsets<O>, Vec<u8>, Option<MutableBitmap>) {
201
let (dtype, offsets, values) = self.values.into_inner();
202
(dtype, offsets, values, self.validity)
203
}
204
205
/// Returns an iterator of `&str`
206
pub fn values_iter(&self) -> MutableUtf8ValuesIter<'_, O> {
207
self.values.iter()
208
}
209
210
/// Sets the validity.
211
/// # Panic
212
/// Panics iff the validity's len is not equal to the existing values' length.
213
pub fn set_validity(&mut self, validity: Option<MutableBitmap>) {
214
if let Some(validity) = &validity {
215
assert_eq!(self.values.len(), validity.len())
216
}
217
self.validity = validity;
218
}
219
220
/// Applies a function `f` to the validity of this array.
221
///
222
/// This is an API to leverage clone-on-write
223
/// # Panics
224
/// This function panics if the function `f` modifies the length of the [`Bitmap`].
225
pub fn apply_validity<F: FnOnce(MutableBitmap) -> MutableBitmap>(&mut self, f: F) {
226
if let Some(validity) = std::mem::take(&mut self.validity) {
227
self.set_validity(Some(f(validity)))
228
}
229
}
230
}
231
232
impl<O: Offset> MutableUtf8Array<O> {
233
/// returns its values.
234
pub fn values(&self) -> &Vec<u8> {
235
self.values.values()
236
}
237
238
/// returns its offsets.
239
pub fn offsets(&self) -> &Offsets<O> {
240
self.values.offsets()
241
}
242
}
243
244
impl<O: Offset> MutableArray for MutableUtf8Array<O> {
245
fn len(&self) -> usize {
246
self.len()
247
}
248
249
fn validity(&self) -> Option<&MutableBitmap> {
250
self.validity.as_ref()
251
}
252
253
fn as_box(&mut self) -> Box<dyn Array> {
254
let array: Utf8Array<O> = std::mem::take(self).into();
255
array.boxed()
256
}
257
258
fn as_arc(&mut self) -> Arc<dyn Array> {
259
let array: Utf8Array<O> = std::mem::take(self).into();
260
array.arced()
261
}
262
263
fn dtype(&self) -> &ArrowDataType {
264
if O::IS_LARGE {
265
&ArrowDataType::LargeUtf8
266
} else {
267
&ArrowDataType::Utf8
268
}
269
}
270
271
fn as_any(&self) -> &dyn std::any::Any {
272
self
273
}
274
275
fn as_mut_any(&mut self) -> &mut dyn std::any::Any {
276
self
277
}
278
279
#[inline]
280
fn push_null(&mut self) {
281
self.push::<&str>(None)
282
}
283
284
fn reserve(&mut self, additional: usize) {
285
self.reserve(additional, 0)
286
}
287
288
fn shrink_to_fit(&mut self) {
289
self.shrink_to_fit()
290
}
291
}
292
293
impl<O: Offset, P: AsRef<str>> FromIterator<Option<P>> for MutableUtf8Array<O> {
    /// Collects optional strings into a new [`MutableUtf8Array`].
    ///
    /// # Panics
    /// Panics when `try_from_iter` returns an error.
    fn from_iter<I: IntoIterator<Item = Option<P>>>(iter: I) -> Self {
        let built = Self::try_from_iter(iter);
        built.unwrap()
    }
}
298
299
impl<O: Offset> MutableUtf8Array<O> {
300
/// Extends the [`MutableUtf8Array`] from an iterator of values of trusted len.
301
/// This differs from `extended_trusted_len` which accepts iterator of optional values.
302
#[inline]
303
pub fn extend_trusted_len_values<I, P>(&mut self, iterator: I)
304
where
305
P: AsRef<str>,
306
I: TrustedLen<Item = P>,
307
{
308
unsafe { self.extend_trusted_len_values_unchecked(iterator) }
309
}
310
311
/// Extends the [`MutableUtf8Array`] from an iterator of values.
312
/// This differs from `extended_trusted_len` which accepts iterator of optional values.
313
#[inline]
314
pub fn extend_values<I, P>(&mut self, iterator: I)
315
where
316
P: AsRef<str>,
317
I: Iterator<Item = P>,
318
{
319
let length = self.values.len();
320
self.values.extend(iterator);
321
let additional = self.values.len() - length;
322
323
if let Some(validity) = self.validity.as_mut() {
324
validity.extend_constant(additional, true);
325
}
326
}
327
328
/// Extends the [`MutableUtf8Array`] from an iterator of values of trusted len.
329
/// This differs from `extended_trusted_len_unchecked` which accepts iterator of optional
330
/// values.
331
///
332
/// # Safety
333
/// The iterator must be trusted len.
334
#[inline]
335
pub unsafe fn extend_trusted_len_values_unchecked<I, P>(&mut self, iterator: I)
336
where
337
P: AsRef<str>,
338
I: Iterator<Item = P>,
339
{
340
let length = self.values.len();
341
self.values.extend_trusted_len_unchecked(iterator);
342
let additional = self.values.len() - length;
343
344
if let Some(validity) = self.validity.as_mut() {
345
validity.extend_constant(additional, true);
346
}
347
}
348
349
/// Extends the [`MutableUtf8Array`] from an iterator of trusted len.
350
#[inline]
351
pub fn extend_trusted_len<I, P>(&mut self, iterator: I)
352
where
353
P: AsRef<str>,
354
I: TrustedLen<Item = Option<P>>,
355
{
356
unsafe { self.extend_trusted_len_unchecked(iterator) }
357
}
358
359
/// Extends [`MutableUtf8Array`] from an iterator of trusted len.
360
///
361
/// # Safety
362
/// The iterator must be trusted len.
363
#[inline]
364
pub unsafe fn extend_trusted_len_unchecked<I, P>(&mut self, iterator: I)
365
where
366
P: AsRef<str>,
367
I: Iterator<Item = Option<P>>,
368
{
369
if self.validity.is_none() {
370
let mut validity = MutableBitmap::new();
371
validity.extend_constant(self.len(), true);
372
self.validity = Some(validity);
373
}
374
375
self.values
376
.extend_from_trusted_len_iter(self.validity.as_mut().unwrap(), iterator);
377
}
378
379
/// Creates a [`MutableUtf8Array`] from an iterator of trusted length.
380
///
381
/// # Safety
382
/// The iterator must be [`TrustedLen`](https://doc.rust-lang.org/std/iter/trait.TrustedLen.html).
383
/// I.e. that `size_hint().1` correctly reports its length.
384
#[inline]
385
pub unsafe fn from_trusted_len_iter_unchecked<I, P>(iterator: I) -> Self
386
where
387
P: AsRef<str>,
388
I: Iterator<Item = Option<P>>,
389
{
390
let iterator = iterator.map(|x| x.map(StrAsBytes));
391
let (validity, offsets, values) = trusted_len_unzip(iterator);
392
393
// soundness: P is `str`
394
Self::new_unchecked(Self::default_dtype(), offsets, values, validity)
395
}
396
397
/// Creates a [`MutableUtf8Array`] from an iterator of trusted length.
398
#[inline]
399
pub fn from_trusted_len_iter<I, P>(iterator: I) -> Self
400
where
401
P: AsRef<str>,
402
I: TrustedLen<Item = Option<P>>,
403
{
404
// soundness: I is `TrustedLen`
405
unsafe { Self::from_trusted_len_iter_unchecked(iterator) }
406
}
407
408
/// Creates a [`MutableUtf8Array`] from an iterator of trusted length of `&str`.
409
///
410
/// # Safety
411
/// The iterator must be [`TrustedLen`](https://doc.rust-lang.org/std/iter/trait.TrustedLen.html).
412
/// I.e. that `size_hint().1` correctly reports its length.
413
#[inline]
414
pub unsafe fn from_trusted_len_values_iter_unchecked<T: AsRef<str>, I: Iterator<Item = T>>(
415
iterator: I,
416
) -> Self {
417
MutableUtf8ValuesArray::from_trusted_len_iter_unchecked(iterator).into()
418
}
419
420
/// Creates a new [`MutableUtf8Array`] from a [`TrustedLen`] of `&str`.
421
#[inline]
422
pub fn from_trusted_len_values_iter<T: AsRef<str>, I: TrustedLen<Item = T>>(
423
iterator: I,
424
) -> Self {
425
// soundness: I is `TrustedLen`
426
unsafe { Self::from_trusted_len_values_iter_unchecked(iterator) }
427
}
428
429
/// Creates a new [`MutableUtf8Array`] from an iterator.
430
/// # Error
431
/// This operation errors iff the total length in bytes on the iterator exceeds `O`'s maximum value.
432
/// (`i32::MAX` or `i64::MAX` respectively).
433
fn try_from_iter<P: AsRef<str>, I: IntoIterator<Item = Option<P>>>(
434
iter: I,
435
) -> PolarsResult<Self> {
436
let iterator = iter.into_iter();
437
let (lower, _) = iterator.size_hint();
438
let mut array = Self::with_capacity(lower);
439
for item in iterator {
440
array.try_push(item)?;
441
}
442
Ok(array)
443
}
444
445
/// Creates a [`MutableUtf8Array`] from an falible iterator of trusted length.
446
///
447
/// # Safety
448
/// The iterator must be [`TrustedLen`](https://doc.rust-lang.org/std/iter/trait.TrustedLen.html).
449
/// I.e. that `size_hint().1` correctly reports its length.
450
#[inline]
451
pub unsafe fn try_from_trusted_len_iter_unchecked<E, I, P>(
452
iterator: I,
453
) -> std::result::Result<Self, E>
454
where
455
P: AsRef<str>,
456
I: IntoIterator<Item = std::result::Result<Option<P>, E>>,
457
{
458
let iterator = iterator.into_iter();
459
460
let iterator = iterator.map(|x| x.map(|x| x.map(StrAsBytes)));
461
let (validity, offsets, values) = try_trusted_len_unzip(iterator)?;
462
463
// soundness: P is `str`
464
Ok(Self::new_unchecked(
465
Self::default_dtype(),
466
offsets,
467
values,
468
validity,
469
))
470
}
471
472
/// Creates a [`MutableUtf8Array`] from an falible iterator of trusted length.
473
#[inline]
474
pub fn try_from_trusted_len_iter<E, I, P>(iterator: I) -> std::result::Result<Self, E>
475
where
476
P: AsRef<str>,
477
I: TrustedLen<Item = std::result::Result<Option<P>, E>>,
478
{
479
// soundness: I: TrustedLen
480
unsafe { Self::try_from_trusted_len_iter_unchecked(iterator) }
481
}
482
483
/// Creates a new [`MutableUtf8Array`] from a [`Iterator`] of `&str`.
484
pub fn from_iter_values<T: AsRef<str>, I: Iterator<Item = T>>(iterator: I) -> Self {
485
MutableUtf8ValuesArray::from_iter(iterator).into()
486
}
487
488
/// Extend with a fallible iterator
489
pub fn extend_fallible<T, I, E>(&mut self, iter: I) -> std::result::Result<(), E>
490
where
491
E: std::error::Error,
492
I: IntoIterator<Item = std::result::Result<Option<T>, E>>,
493
T: AsRef<str>,
494
{
495
let mut iter = iter.into_iter();
496
self.reserve(iter.size_hint().0, 0);
497
iter.try_for_each(|x| {
498
self.push(x?);
499
Ok(())
500
})
501
}
502
}
503
504
impl<O: Offset, T: AsRef<str>> Extend<Option<T>> for MutableUtf8Array<O> {
    /// Appends every item of `iter`.
    ///
    /// # Panics
    /// Panics when [`TryExtend::try_extend`] returns an error.
    fn extend<I: IntoIterator<Item = Option<T>>>(&mut self, iter: I) {
        let outcome = self.try_extend(iter);
        outcome.unwrap();
    }
}
509
510
impl<O: Offset, T: AsRef<str>> TryExtend<Option<T>> for MutableUtf8Array<O> {
511
fn try_extend<I: IntoIterator<Item = Option<T>>>(&mut self, iter: I) -> PolarsResult<()> {
512
let mut iter = iter.into_iter();
513
self.reserve(iter.size_hint().0, 0);
514
iter.try_for_each(|x| self.try_push(x))
515
}
516
}
517
518
impl<O: Offset, T: AsRef<str>> TryPush<Option<T>> for MutableUtf8Array<O> {
519
#[inline]
520
fn try_push(&mut self, value: Option<T>) -> PolarsResult<()> {
521
match value {
522
Some(value) => {
523
self.values.try_push(value.as_ref())?;
524
525
if let Some(validity) = &mut self.validity {
526
validity.push(true)
527
}
528
},
529
None => {
530
self.values.push("");
531
match &mut self.validity {
532
Some(validity) => validity.push(false),
533
None => self.init_validity(),
534
}
535
},
536
}
537
Ok(())
538
}
539
}
540
541
impl<O: Offset> PartialEq for MutableUtf8Array<O> {
    /// Two arrays are equal when their `Option<&str>` sequences (as produced by
    /// `iter`) are equal element by element.
    fn eq(&self, other: &Self) -> bool {
        Iterator::eq(self.iter(), other.iter())
    }
}
546
547
impl<O: Offset> TryExtendFromSelf for MutableUtf8Array<O> {
548
fn try_extend_from_self(&mut self, other: &Self) -> PolarsResult<()> {
549
extend_validity(self.len(), &mut self.validity, &other.validity);
550
551
self.values.try_extend_from_self(&other.values)
552
}
553
}
554
555