Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/crates/polars-arrow/src/array/utf8/mutable_values.rs
6939 views
1
use std::sync::Arc;
2
3
use polars_error::{PolarsResult, polars_bail};
4
5
use super::{MutableUtf8Array, StrAsBytes, Utf8Array};
6
use crate::array::physical_binary::*;
7
use crate::array::specification::{try_check_offsets_bounds, try_check_utf8};
8
use crate::array::{Array, ArrayValuesIter, MutableArray, TryExtend, TryExtendFromSelf, TryPush};
9
use crate::bitmap::MutableBitmap;
10
use crate::datatypes::ArrowDataType;
11
use crate::offset::{Offset, Offsets};
12
use crate::trusted_len::TrustedLen;
13
14
/// A [`MutableArray`] that builds a [`Utf8Array`]. It differs
15
/// from [`MutableUtf8Array`] in that it builds non-null [`Utf8Array`].
16
#[derive(Debug, Clone)]
17
pub struct MutableUtf8ValuesArray<O: Offset> {
18
dtype: ArrowDataType,
19
offsets: Offsets<O>,
20
values: Vec<u8>,
21
}
22
23
impl<O: Offset> From<MutableUtf8ValuesArray<O>> for Utf8Array<O> {
24
fn from(other: MutableUtf8ValuesArray<O>) -> Self {
25
// SAFETY:
26
// `MutableUtf8ValuesArray` has the same invariants as `Utf8Array` and thus
27
// `Utf8Array` can be safely created from `MutableUtf8ValuesArray` without checks.
28
unsafe {
29
Utf8Array::<O>::new_unchecked(
30
other.dtype,
31
other.offsets.into(),
32
other.values.into(),
33
None,
34
)
35
}
36
}
37
}
38
39
impl<O: Offset> From<MutableUtf8ValuesArray<O>> for MutableUtf8Array<O> {
40
fn from(other: MutableUtf8ValuesArray<O>) -> Self {
41
// SAFETY:
42
// `MutableUtf8ValuesArray` has the same invariants as `MutableUtf8Array`
43
unsafe {
44
MutableUtf8Array::<O>::new_unchecked(other.dtype, other.offsets, other.values, None)
45
}
46
}
47
}
48
49
impl<O: Offset> Default for MutableUtf8ValuesArray<O> {
50
fn default() -> Self {
51
Self::new()
52
}
53
}
54
55
impl<O: Offset> MutableUtf8ValuesArray<O> {
56
/// Returns an empty [`MutableUtf8ValuesArray`].
57
pub fn new() -> Self {
58
Self {
59
dtype: Self::default_dtype(),
60
offsets: Offsets::new(),
61
values: Vec::<u8>::new(),
62
}
63
}
64
65
/// Returns a [`MutableUtf8ValuesArray`] created from its internal representation.
66
///
67
/// # Errors
68
/// This function returns an error iff:
69
/// * `offsets.last()` is greater than `values.len()`.
70
/// * The `dtype`'s [`crate::datatypes::PhysicalType`] is not equal to either `Utf8` or `LargeUtf8`.
71
/// * The `values` between two consecutive `offsets` are not valid utf8
72
/// # Implementation
73
/// This function is `O(N)` - checking utf8 is `O(N)`
74
pub fn try_new(
75
dtype: ArrowDataType,
76
offsets: Offsets<O>,
77
values: Vec<u8>,
78
) -> PolarsResult<Self> {
79
try_check_utf8(&offsets, &values)?;
80
if dtype.to_physical_type() != Self::default_dtype().to_physical_type() {
81
polars_bail!(ComputeError: "MutableUtf8ValuesArray can only be initialized with DataType::Utf8 or DataType::LargeUtf8")
82
}
83
84
Ok(Self {
85
dtype,
86
offsets,
87
values,
88
})
89
}
90
91
/// Returns a [`MutableUtf8ValuesArray`] created from its internal representation.
92
///
93
/// # Panic
94
/// This function does not panic iff:
95
/// * `offsets.last()` is greater than `values.len()`
96
/// * The `dtype`'s [`crate::datatypes::PhysicalType`] is equal to either `Utf8` or `LargeUtf8`.
97
///
98
/// # Safety
99
/// This function is safe iff:
100
/// * the offsets are monotonically increasing
101
/// * The `values` between two consecutive `offsets` are not valid utf8
102
/// # Implementation
103
/// This function is `O(1)`
104
pub unsafe fn new_unchecked(
105
dtype: ArrowDataType,
106
offsets: Offsets<O>,
107
values: Vec<u8>,
108
) -> Self {
109
try_check_offsets_bounds(&offsets, values.len())
110
.expect("The length of the values must be equal to the last offset value");
111
112
if dtype.to_physical_type() != Self::default_dtype().to_physical_type() {
113
panic!(
114
"MutableUtf8ValuesArray can only be initialized with DataType::Utf8 or DataType::LargeUtf8"
115
)
116
}
117
118
Self {
119
dtype,
120
offsets,
121
values,
122
}
123
}
124
125
/// Returns the default [`ArrowDataType`] of this container: [`ArrowDataType::Utf8`] or [`ArrowDataType::LargeUtf8`]
126
/// depending on the generic [`Offset`].
127
pub fn default_dtype() -> ArrowDataType {
128
Utf8Array::<O>::default_dtype()
129
}
130
131
/// Initializes a new [`MutableUtf8ValuesArray`] with a pre-allocated capacity of items.
132
pub fn with_capacity(capacity: usize) -> Self {
133
Self::with_capacities(capacity, 0)
134
}
135
136
/// Initializes a new [`MutableUtf8ValuesArray`] with a pre-allocated capacity of items and values.
137
pub fn with_capacities(capacity: usize, values: usize) -> Self {
138
Self {
139
dtype: Self::default_dtype(),
140
offsets: Offsets::<O>::with_capacity(capacity),
141
values: Vec::<u8>::with_capacity(values),
142
}
143
}
144
145
/// returns its values.
146
#[inline]
147
pub fn values(&self) -> &Vec<u8> {
148
&self.values
149
}
150
151
/// returns its offsets.
152
#[inline]
153
pub fn offsets(&self) -> &Offsets<O> {
154
&self.offsets
155
}
156
157
/// Reserves `additional` elements and `additional_values` on the values.
158
#[inline]
159
pub fn reserve(&mut self, additional: usize, additional_values: usize) {
160
self.offsets.reserve(additional + 1);
161
self.values.reserve(additional_values);
162
}
163
164
/// Returns the capacity in number of items
165
pub fn capacity(&self) -> usize {
166
self.offsets.capacity()
167
}
168
169
/// Returns the length of this array
170
#[inline]
171
pub fn len(&self) -> usize {
172
self.offsets.len_proxy()
173
}
174
175
/// Pushes a new item to the array.
176
/// # Panic
177
/// This operation panics iff the length of all values (in bytes) exceeds `O` maximum value.
178
#[inline]
179
pub fn push<T: AsRef<str>>(&mut self, value: T) {
180
self.try_push(value).unwrap()
181
}
182
183
/// Pop the last entry from [`MutableUtf8ValuesArray`].
184
/// This function returns `None` iff this array is empty.
185
pub fn pop(&mut self) -> Option<String> {
186
if self.len() == 0 {
187
return None;
188
}
189
self.offsets.pop()?;
190
let start = self.offsets.last().to_usize();
191
let value = self.values.split_off(start);
192
// SAFETY: utf8 is validated on initialization
193
Some(unsafe { String::from_utf8_unchecked(value) })
194
}
195
196
/// Returns the value of the element at index `i`.
197
/// # Panic
198
/// This function panics iff `i >= self.len`.
199
#[inline]
200
pub fn value(&self, i: usize) -> &str {
201
assert!(i < self.len());
202
unsafe { self.value_unchecked(i) }
203
}
204
205
/// Returns the value of the element at index `i`.
206
///
207
/// # Safety
208
/// This function is safe iff `i < self.len`.
209
#[inline]
210
pub unsafe fn value_unchecked(&self, i: usize) -> &str {
211
// soundness: the invariant of the function
212
let (start, end) = self.offsets.start_end(i);
213
214
// soundness: the invariant of the struct
215
let slice = self.values.get_unchecked(start..end);
216
217
// soundness: the invariant of the struct
218
std::str::from_utf8_unchecked(slice)
219
}
220
221
/// Returns an iterator of `&str`
222
pub fn iter(&self) -> ArrayValuesIter<'_, Self> {
223
ArrayValuesIter::new(self)
224
}
225
226
/// Shrinks the capacity of the [`MutableUtf8ValuesArray`] to fit its current length.
227
pub fn shrink_to_fit(&mut self) {
228
self.values.shrink_to_fit();
229
self.offsets.shrink_to_fit();
230
}
231
232
/// Extract the low-end APIs from the [`MutableUtf8ValuesArray`].
233
pub fn into_inner(self) -> (ArrowDataType, Offsets<O>, Vec<u8>) {
234
(self.dtype, self.offsets, self.values)
235
}
236
}
237
238
impl<O: Offset> MutableArray for MutableUtf8ValuesArray<O> {
239
fn len(&self) -> usize {
240
self.len()
241
}
242
243
fn validity(&self) -> Option<&MutableBitmap> {
244
None
245
}
246
247
fn as_box(&mut self) -> Box<dyn Array> {
248
let array: Utf8Array<O> = std::mem::take(self).into();
249
array.boxed()
250
}
251
252
fn as_arc(&mut self) -> Arc<dyn Array> {
253
let array: Utf8Array<O> = std::mem::take(self).into();
254
array.arced()
255
}
256
257
fn dtype(&self) -> &ArrowDataType {
258
&self.dtype
259
}
260
261
fn as_any(&self) -> &dyn std::any::Any {
262
self
263
}
264
265
fn as_mut_any(&mut self) -> &mut dyn std::any::Any {
266
self
267
}
268
269
#[inline]
270
fn push_null(&mut self) {
271
self.push::<&str>("")
272
}
273
274
fn reserve(&mut self, additional: usize) {
275
self.reserve(additional, 0)
276
}
277
278
fn shrink_to_fit(&mut self) {
279
self.shrink_to_fit()
280
}
281
}
282
283
impl<O: Offset, P: AsRef<str>> FromIterator<P> for MutableUtf8ValuesArray<O> {
284
fn from_iter<I: IntoIterator<Item = P>>(iter: I) -> Self {
285
let (offsets, values) = values_iter(iter.into_iter().map(StrAsBytes));
286
// soundness: T: AsRef<str> and offsets are monotonically increasing
287
unsafe { Self::new_unchecked(Self::default_dtype(), offsets, values) }
288
}
289
}
290
291
impl<O: Offset> MutableUtf8ValuesArray<O> {
292
pub(crate) unsafe fn extend_from_trusted_len_iter<I, P>(
293
&mut self,
294
validity: &mut MutableBitmap,
295
iterator: I,
296
) where
297
P: AsRef<str>,
298
I: Iterator<Item = Option<P>>,
299
{
300
let iterator = iterator.map(|x| x.map(StrAsBytes));
301
extend_from_trusted_len_iter(&mut self.offsets, &mut self.values, validity, iterator);
302
}
303
304
/// Extends the [`MutableUtf8ValuesArray`] from a [`TrustedLen`]
305
#[inline]
306
pub fn extend_trusted_len<I, P>(&mut self, iterator: I)
307
where
308
P: AsRef<str>,
309
I: TrustedLen<Item = P>,
310
{
311
unsafe { self.extend_trusted_len_unchecked(iterator) }
312
}
313
314
/// Extends [`MutableUtf8ValuesArray`] from an iterator of trusted len.
315
///
316
/// # Safety
317
/// The iterator must be trusted len.
318
#[inline]
319
pub unsafe fn extend_trusted_len_unchecked<I, P>(&mut self, iterator: I)
320
where
321
P: AsRef<str>,
322
I: Iterator<Item = P>,
323
{
324
let iterator = iterator.map(StrAsBytes);
325
extend_from_trusted_len_values_iter(&mut self.offsets, &mut self.values, iterator);
326
}
327
328
/// Creates a [`MutableUtf8ValuesArray`] from a [`TrustedLen`]
329
#[inline]
330
pub fn from_trusted_len_iter<I, P>(iterator: I) -> Self
331
where
332
P: AsRef<str>,
333
I: TrustedLen<Item = P>,
334
{
335
// soundness: I is `TrustedLen`
336
unsafe { Self::from_trusted_len_iter_unchecked(iterator) }
337
}
338
339
/// Returns a new [`MutableUtf8ValuesArray`] from an iterator of trusted length.
340
///
341
/// # Safety
342
/// The iterator must be [`TrustedLen`](https://doc.rust-lang.org/std/iter/trait.TrustedLen.html).
343
/// I.e. that `size_hint().1` correctly reports its length.
344
#[inline]
345
pub unsafe fn from_trusted_len_iter_unchecked<I, P>(iterator: I) -> Self
346
where
347
P: AsRef<str>,
348
I: Iterator<Item = P>,
349
{
350
let iterator = iterator.map(StrAsBytes);
351
let (offsets, values) = trusted_len_values_iter(iterator);
352
353
// soundness: P is `str` and offsets are monotonically increasing
354
Self::new_unchecked(Self::default_dtype(), offsets, values)
355
}
356
357
/// Returns a new [`MutableUtf8ValuesArray`] from an iterator.
358
/// # Error
359
/// This operation errors iff the total length in bytes on the iterator exceeds `O`'s maximum value.
360
/// (`i32::MAX` or `i64::MAX` respectively).
361
pub fn try_from_iter<P: AsRef<str>, I: IntoIterator<Item = P>>(iter: I) -> PolarsResult<Self> {
362
let iterator = iter.into_iter();
363
let (lower, _) = iterator.size_hint();
364
let mut array = Self::with_capacity(lower);
365
for item in iterator {
366
array.try_push(item)?;
367
}
368
Ok(array)
369
}
370
371
/// Extend with a fallible iterator
372
pub fn extend_fallible<T, I, E>(&mut self, iter: I) -> std::result::Result<(), E>
373
where
374
E: std::error::Error,
375
I: IntoIterator<Item = std::result::Result<T, E>>,
376
T: AsRef<str>,
377
{
378
let mut iter = iter.into_iter();
379
self.reserve(iter.size_hint().0, 0);
380
iter.try_for_each(|x| {
381
self.push(x?);
382
Ok(())
383
})
384
}
385
}
386
387
impl<O: Offset, T: AsRef<str>> Extend<T> for MutableUtf8ValuesArray<O> {
388
fn extend<I: IntoIterator<Item = T>>(&mut self, iter: I) {
389
extend_from_values_iter(
390
&mut self.offsets,
391
&mut self.values,
392
iter.into_iter().map(StrAsBytes),
393
);
394
}
395
}
396
397
impl<O: Offset, T: AsRef<str>> TryExtend<T> for MutableUtf8ValuesArray<O> {
398
fn try_extend<I: IntoIterator<Item = T>>(&mut self, iter: I) -> PolarsResult<()> {
399
let mut iter = iter.into_iter();
400
self.reserve(iter.size_hint().0, 0);
401
iter.try_for_each(|x| self.try_push(x))
402
}
403
}
404
405
impl<O: Offset, T: AsRef<str>> TryPush<T> for MutableUtf8ValuesArray<O> {
406
#[inline]
407
fn try_push(&mut self, value: T) -> PolarsResult<()> {
408
let bytes = value.as_ref().as_bytes();
409
self.values.extend_from_slice(bytes);
410
self.offsets.try_push(bytes.len())
411
}
412
}
413
414
impl<O: Offset> TryExtendFromSelf for MutableUtf8ValuesArray<O> {
415
fn try_extend_from_self(&mut self, other: &Self) -> PolarsResult<()> {
416
self.values.extend_from_slice(&other.values);
417
self.offsets.try_extend_from_self(&other.offsets)
418
}
419
}
420
421