Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/crates/polars-arrow/src/array/list/mod.rs
8398 views
1
use super::specification::try_check_offsets_bounds;
2
use super::{Array, Splitable, new_empty_array};
3
use crate::bitmap::Bitmap;
4
use crate::datatypes::{ArrowDataType, Field};
5
use crate::offset::{Offset, Offsets, OffsetsBuffer};
6
7
mod builder;
8
pub use builder::*;
9
mod ffi;
10
pub(super) mod fmt;
11
mod iterator;
12
pub use iterator::*;
13
mod mutable;
14
pub use mutable::*;
15
use polars_error::{PolarsResult, polars_bail};
16
use polars_utils::pl_str::PlSmallStr;
17
#[cfg(feature = "proptest")]
18
pub mod proptest;
19
20
/// Name used for the values array within List/FixedSizeList arrays.
///
/// [`ListArray::default_datatype`] gives this name to the inner field it creates.
21
pub const LIST_VALUES_NAME: PlSmallStr = PlSmallStr::from_static("item");
22
23
/// An [`Array`] semantically equivalent to `Vec<Option<Vec<Option<T>>>>` with Arrow's in-memory
/// representation.
///
/// Element `i` is the slice of `values` delimited by two consecutive entries of `offsets`;
/// `validity`, when present, marks which elements are null.
24
#[derive(Clone)]
25
pub struct ListArray<O: Offset> {
26
/// Logical type of the array; `try_new` requires it to resolve to a `List`/`LargeList` whose
/// child type equals the type of `values`.
dtype: ArrowDataType,
27
/// Start/end positions of each element inside `values`; the array length is
/// `offsets.len_proxy()`.
offsets: OffsetsBuffer<O>,
28
/// Child array holding the concatenated items of all elements.
values: Box<dyn Array>,
29
/// Optional null mask; when present its length equals `offsets.len_proxy()`.
validity: Option<Bitmap>,
30
}
31
32
impl<O: Offset> ListArray<O> {
33
/// Creates a new [`ListArray`].
34
///
35
/// # Errors
36
/// This function returns an error iff:
37
/// * `offsets.last()` is greater than `values.len()`.
38
/// * the validity's length is not equal to `offsets.len_proxy()`.
39
/// * The `dtype`'s [`crate::datatypes::PhysicalType`] is not equal to either [`crate::datatypes::PhysicalType::List`] or [`crate::datatypes::PhysicalType::LargeList`].
40
/// * The `dtype`'s inner field's data type is not equal to `values.dtype`.
41
/// # Implementation
42
/// This function is `O(1)`
43
pub fn try_new(
44
dtype: ArrowDataType,
45
offsets: OffsetsBuffer<O>,
46
values: Box<dyn Array>,
47
validity: Option<Bitmap>,
48
) -> PolarsResult<Self> {
49
try_check_offsets_bounds(&offsets, values.len())?;
50
51
if validity
52
.as_ref()
53
.is_some_and(|validity| validity.len() != offsets.len_proxy())
54
{
55
polars_bail!(ComputeError: "validity mask length must match the number of values")
56
}
57
58
let child_dtype = Self::try_get_child(&dtype)?.dtype();
59
let values_dtype = values.dtype();
60
if child_dtype != values_dtype {
61
polars_bail!(ComputeError: "ListArray's child's DataType must match. However, the expected DataType is {child_dtype:?} while it got {values_dtype:?}.");
62
}
63
64
Ok(Self {
65
dtype,
66
offsets,
67
values,
68
validity,
69
})
70
}
71
72
/// Creates a new [`ListArray`].
73
///
74
/// # Panics
75
/// This function panics iff:
76
/// * `offsets.last()` is greater than `values.len()`.
77
/// * the validity's length is not equal to `offsets.len_proxy()`.
78
/// * The `dtype`'s [`crate::datatypes::PhysicalType`] is not equal to either [`crate::datatypes::PhysicalType::List`] or [`crate::datatypes::PhysicalType::LargeList`].
79
/// * The `dtype`'s inner field's data type is not equal to `values.dtype`.
80
/// # Implementation
81
/// This function is `O(1)`
82
pub fn new(
83
dtype: ArrowDataType,
84
offsets: OffsetsBuffer<O>,
85
values: Box<dyn Array>,
86
validity: Option<Bitmap>,
87
) -> Self {
88
Self::try_new(dtype, offsets, values, validity).unwrap()
89
}
90
91
/// Returns a new empty [`ListArray`].
92
pub fn new_empty(dtype: ArrowDataType) -> Self {
93
let values = new_empty_array(Self::get_child_type(&dtype).clone());
94
Self::new(dtype, OffsetsBuffer::default(), values, None)
95
}
96
97
/// Returns a new null [`ListArray`].
98
#[inline]
99
pub fn new_null(dtype: ArrowDataType, length: usize) -> Self {
100
let child = Self::get_child_type(&dtype).clone();
101
Self::new(
102
dtype,
103
Offsets::new_zeroed(length).into(),
104
new_empty_array(child),
105
Some(Bitmap::new_zeroed(length)),
106
)
107
}
108
109
pub fn into_inner(
110
self,
111
) -> (
112
ArrowDataType,
113
Box<dyn Array>,
114
OffsetsBuffer<O>,
115
Option<Bitmap>,
116
) {
117
(self.dtype, self.values, self.offsets, self.validity)
118
}
119
}
120
121
impl<O: Offset> ListArray<O> {
122
/// Slices this [`ListArray`].
123
/// # Panics
124
/// panics iff `offset + length > self.len()`
125
pub fn slice(&mut self, offset: usize, length: usize) {
126
assert!(
127
offset + length <= self.len(),
128
"the offset of the new Buffer cannot exceed the existing length"
129
);
130
unsafe { self.slice_unchecked(offset, length) }
131
}
132
133
/// Slices this [`ListArray`].
134
///
135
/// # Safety
136
/// The caller must ensure that `offset + length < self.len()`.
137
pub unsafe fn slice_unchecked(&mut self, offset: usize, length: usize) {
138
self.validity = self
139
.validity
140
.take()
141
.map(|bitmap| bitmap.sliced_unchecked(offset, length))
142
.filter(|bitmap| bitmap.unset_bits() > 0);
143
self.offsets.slice_unchecked(offset, length + 1);
144
}
145
146
impl_sliced!();
147
impl_mut_validity!();
148
impl_into_array!();
149
}
150
151
// Accessors
152
impl<O: Offset> ListArray<O> {
153
/// Returns the length of this array
154
#[inline]
155
pub fn len(&self) -> usize {
156
self.offsets.len_proxy()
157
}
158
159
/// Returns the element at index `i`
160
/// # Panic
161
/// Panics iff `i >= self.len()`
162
#[inline]
163
pub fn value(&self, i: usize) -> Box<dyn Array> {
164
assert!(i < self.len());
165
// SAFETY: invariant of this function
166
unsafe { self.value_unchecked(i) }
167
}
168
169
/// Returns the element at index `i` as &str
170
///
171
/// # Safety
172
/// Assumes that the `i < self.len`.
173
#[inline]
174
pub unsafe fn value_unchecked(&self, i: usize) -> Box<dyn Array> {
175
// SAFETY: the invariant of the function
176
let (start, end) = self.offsets.start_end_unchecked(i);
177
let length = end - start;
178
179
// SAFETY: the invariant of the struct
180
self.values.sliced_unchecked(start, length)
181
}
182
183
/// The optional validity.
184
#[inline]
185
pub fn validity(&self) -> Option<&Bitmap> {
186
self.validity.as_ref()
187
}
188
189
/// The offsets [`Buffer`].
190
#[inline]
191
pub fn offsets(&self) -> &OffsetsBuffer<O> {
192
&self.offsets
193
}
194
195
/// The values.
196
#[inline]
197
pub fn values(&self) -> &Box<dyn Array> {
198
&self.values
199
}
200
}
201
202
impl<O: Offset> ListArray<O> {
203
/// Returns a default [`ArrowDataType`]: inner field is named "item" and is nullable
204
pub fn default_datatype(dtype: ArrowDataType) -> ArrowDataType {
205
let field = Box::new(Field::new(LIST_VALUES_NAME, dtype, true));
206
if O::IS_LARGE {
207
ArrowDataType::LargeList(field)
208
} else {
209
ArrowDataType::List(field)
210
}
211
}
212
213
/// Returns a the inner [`Field`]
214
/// # Panics
215
/// Panics iff the logical type is not consistent with this struct.
216
pub fn get_child_field(dtype: &ArrowDataType) -> &Field {
217
Self::try_get_child(dtype).unwrap()
218
}
219
220
/// Returns a the inner [`Field`]
221
/// # Errors
222
/// Panics iff the logical type is not consistent with this struct.
223
pub fn try_get_child(dtype: &ArrowDataType) -> PolarsResult<&Field> {
224
if O::IS_LARGE {
225
match dtype.to_storage() {
226
ArrowDataType::LargeList(child) => Ok(child.as_ref()),
227
_ => polars_bail!(ComputeError: "ListArray<i64> expects DataType::LargeList"),
228
}
229
} else {
230
match dtype.to_storage() {
231
ArrowDataType::List(child) => Ok(child.as_ref()),
232
_ => polars_bail!(ComputeError: "ListArray<i32> expects DataType::List"),
233
}
234
}
235
}
236
237
/// Returns a the inner [`ArrowDataType`]
238
/// # Panics
239
/// Panics iff the logical type is not consistent with this struct.
240
pub fn get_child_type(dtype: &ArrowDataType) -> &ArrowDataType {
241
Self::get_child_field(dtype).dtype()
242
}
243
}
244
245
impl<O: Offset> Array for ListArray<O> {
246
impl_common_array!();
247
248
fn validity(&self) -> Option<&Bitmap> {
249
self.validity.as_ref()
250
}
251
252
#[inline]
253
fn with_validity(&self, validity: Option<Bitmap>) -> Box<dyn Array> {
254
Box::new(self.clone().with_validity(validity))
255
}
256
}
257
258
impl<O: Offset> Splitable for ListArray<O> {
259
fn check_bound(&self, offset: usize) -> bool {
260
offset <= self.len()
261
}
262
263
unsafe fn _split_at_unchecked(&self, offset: usize) -> (Self, Self) {
264
let (lhs_offsets, rhs_offsets) = unsafe { self.offsets.split_at_unchecked(offset) };
265
let (lhs_validity, rhs_validity) = unsafe { self.validity.split_at_unchecked(offset) };
266
267
(
268
Self {
269
dtype: self.dtype.clone(),
270
offsets: lhs_offsets,
271
validity: lhs_validity,
272
values: self.values.clone(),
273
},
274
Self {
275
dtype: self.dtype.clone(),
276
offsets: rhs_offsets,
277
validity: rhs_validity,
278
values: self.values.clone(),
279
},
280
)
281
}
282
}
283
284