Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/crates/polars-arrow/src/ffi/array.rs
6939 views
1
//! Contains functionality to load an ArrayData from the C Data Interface
2
use std::sync::Arc;
3
4
use polars_error::{PolarsResult, polars_bail};
5
6
use super::ArrowArray;
7
use crate::array::*;
8
use crate::bitmap::Bitmap;
9
use crate::bitmap::utils::bytes_for;
10
use crate::buffer::Buffer;
11
use crate::datatypes::{ArrowDataType, PhysicalType};
12
use crate::ffi::schema::get_child;
13
use crate::storage::SharedStorage;
14
use crate::types::NativeType;
15
use crate::{ffi, match_integer_type, with_match_primitive_type_full};
16
17
/// Reads a valid `ffi` interface into a `Box<dyn Array>`
18
/// # Errors
19
/// If and only if:
20
/// * the interface is not valid (e.g. a null pointer)
21
pub unsafe fn try_from<A: ArrowArrayRef>(array: A) -> PolarsResult<Box<dyn Array>> {
22
use PhysicalType::*;
23
Ok(match array.dtype().to_physical_type() {
24
Null => Box::new(NullArray::try_from_ffi(array)?),
25
Boolean => Box::new(BooleanArray::try_from_ffi(array)?),
26
Primitive(primitive) => with_match_primitive_type_full!(primitive, |$T| {
27
Box::new(PrimitiveArray::<$T>::try_from_ffi(array)?)
28
}),
29
Utf8 => Box::new(Utf8Array::<i32>::try_from_ffi(array)?),
30
LargeUtf8 => Box::new(Utf8Array::<i64>::try_from_ffi(array)?),
31
Binary => Box::new(BinaryArray::<i32>::try_from_ffi(array)?),
32
LargeBinary => Box::new(BinaryArray::<i64>::try_from_ffi(array)?),
33
FixedSizeBinary => Box::new(FixedSizeBinaryArray::try_from_ffi(array)?),
34
List => Box::new(ListArray::<i32>::try_from_ffi(array)?),
35
LargeList => Box::new(ListArray::<i64>::try_from_ffi(array)?),
36
FixedSizeList => Box::new(FixedSizeListArray::try_from_ffi(array)?),
37
Struct => Box::new(StructArray::try_from_ffi(array)?),
38
Dictionary(key_type) => {
39
match_integer_type!(key_type, |$T| {
40
Box::new(DictionaryArray::<$T>::try_from_ffi(array)?)
41
})
42
},
43
Union => Box::new(UnionArray::try_from_ffi(array)?),
44
Map => Box::new(MapArray::try_from_ffi(array)?),
45
BinaryView => Box::new(BinaryViewArray::try_from_ffi(array)?),
46
Utf8View => Box::new(Utf8ViewArray::try_from_ffi(array)?),
47
})
48
}
49
50
// Sound because the arrow specification does not allow multiple implementations
51
// to change this struct
52
// This is intrinsically impossible to prove because the implementations agree
53
// on this as part of the Arrow specification
54
unsafe impl Send for ArrowArray {}
55
unsafe impl Sync for ArrowArray {}
56
57
impl Drop for ArrowArray {
58
fn drop(&mut self) {
59
match self.release {
60
None => (),
61
Some(release) => unsafe { release(self) },
62
};
63
}
64
}
65
66
// callback used to drop [ArrowArray] when it is exported
67
unsafe extern "C" fn c_release_array(array: *mut ArrowArray) {
68
if array.is_null() {
69
return;
70
}
71
let array = &mut *array;
72
73
// take ownership of `private_data`, therefore dropping it
74
let private = Box::from_raw(array.private_data as *mut PrivateData);
75
for child in private.children_ptr.iter() {
76
let _ = Box::from_raw(*child);
77
}
78
79
if let Some(ptr) = private.dictionary_ptr {
80
let _ = Box::from_raw(ptr);
81
}
82
83
array.release = None;
84
}
85
86
#[allow(dead_code)]
87
struct PrivateData {
88
array: Box<dyn Array>,
89
buffers_ptr: Box<[*const std::os::raw::c_void]>,
90
children_ptr: Box<[*mut ArrowArray]>,
91
dictionary_ptr: Option<*mut ArrowArray>,
92
variadic_buffer_sizes: Box<[i64]>,
93
}
94
95
impl ArrowArray {
96
/// creates a new `ArrowArray` from existing data.
97
///
98
/// # Safety
99
/// This method releases `buffers`. Consumers of this struct *must* call `release` before
100
/// releasing this struct, or contents in `buffers` leak.
101
pub(crate) fn new(array: Box<dyn Array>) -> Self {
102
#[allow(unused_mut)]
103
let (offset, mut buffers, children, dictionary) =
104
offset_buffers_children_dictionary(array.as_ref());
105
106
let variadic_buffer_sizes = match array.dtype() {
107
ArrowDataType::BinaryView => {
108
let arr = array.as_any().downcast_ref::<BinaryViewArray>().unwrap();
109
let boxed = arr.variadic_buffer_lengths().into_boxed_slice();
110
let ptr = boxed.as_ptr().cast::<u8>();
111
buffers.push(Some(ptr));
112
boxed
113
},
114
ArrowDataType::Utf8View => {
115
let arr = array.as_any().downcast_ref::<Utf8ViewArray>().unwrap();
116
let boxed = arr.variadic_buffer_lengths().into_boxed_slice();
117
let ptr = boxed.as_ptr().cast::<u8>();
118
buffers.push(Some(ptr));
119
boxed
120
},
121
_ => Box::new([]),
122
};
123
124
let buffers_ptr = buffers
125
.iter()
126
.map(|maybe_buffer| match maybe_buffer {
127
Some(b) => *b as *const std::os::raw::c_void,
128
None => std::ptr::null(),
129
})
130
.collect::<Box<[_]>>();
131
let n_buffers = buffers.len() as i64;
132
133
let children_ptr = children
134
.into_iter()
135
.map(|child| {
136
Box::into_raw(Box::new(ArrowArray::new(ffi::align_to_c_data_interface(
137
child,
138
))))
139
})
140
.collect::<Box<_>>();
141
let n_children = children_ptr.len() as i64;
142
143
let dictionary_ptr = dictionary.map(|array| {
144
Box::into_raw(Box::new(ArrowArray::new(ffi::align_to_c_data_interface(
145
array,
146
))))
147
});
148
149
let length = array.len() as i64;
150
let null_count = array.null_count() as i64;
151
152
let mut private_data = Box::new(PrivateData {
153
array,
154
buffers_ptr,
155
children_ptr,
156
dictionary_ptr,
157
variadic_buffer_sizes,
158
});
159
160
Self {
161
length,
162
null_count,
163
offset: offset as i64,
164
n_buffers,
165
n_children,
166
buffers: private_data.buffers_ptr.as_mut_ptr(),
167
children: private_data.children_ptr.as_mut_ptr(),
168
dictionary: private_data.dictionary_ptr.unwrap_or(std::ptr::null_mut()),
169
release: Some(c_release_array),
170
private_data: Box::into_raw(private_data) as *mut ::std::os::raw::c_void,
171
}
172
}
173
174
/// creates an empty [`ArrowArray`], which can be used to import data into
175
pub fn empty() -> Self {
176
Self {
177
length: 0,
178
null_count: 0,
179
offset: 0,
180
n_buffers: 0,
181
n_children: 0,
182
buffers: std::ptr::null_mut(),
183
children: std::ptr::null_mut(),
184
dictionary: std::ptr::null_mut(),
185
release: None,
186
private_data: std::ptr::null_mut(),
187
}
188
}
189
190
/// the length of the array
191
pub(crate) fn len(&self) -> usize {
192
self.length as usize
193
}
194
195
/// the offset of the array
196
pub(crate) fn offset(&self) -> usize {
197
self.offset as usize
198
}
199
200
/// the null count of the array
201
pub(crate) fn null_count(&self) -> usize {
202
self.null_count as usize
203
}
204
}
205
206
/// # Safety
207
/// The caller must ensure that the buffer at index `i` is not mutably shared.
208
unsafe fn get_buffer_ptr<T: NativeType>(
209
array: &ArrowArray,
210
dtype: &ArrowDataType,
211
index: usize,
212
) -> PolarsResult<*mut T> {
213
if array.buffers.is_null() {
214
polars_bail!( ComputeError:
215
"an ArrowArray of type {dtype:?} must have non-null buffers"
216
);
217
}
218
219
if array.buffers.align_offset(align_of::<*mut *const u8>()) != 0 {
220
polars_bail!( ComputeError:
221
"an ArrowArray of type {dtype:?}
222
must have buffer {index} aligned to type {}",
223
std::any::type_name::<*mut *const u8>()
224
);
225
}
226
let buffers = array.buffers as *mut *const u8;
227
228
if index >= array.n_buffers as usize {
229
polars_bail!(ComputeError:
230
"An ArrowArray of type {dtype:?}
231
must have buffer {index}."
232
)
233
}
234
235
let ptr = *buffers.add(index);
236
if ptr.is_null() {
237
polars_bail!(ComputeError:
238
"An array of type {dtype:?}
239
must have a non-null buffer {index}"
240
)
241
}
242
243
// note: we can't prove that this pointer is not mutably shared - part of the safety invariant
244
Ok(ptr as *mut T)
245
}
246
247
unsafe fn create_buffer_known_len<T: NativeType>(
248
array: &ArrowArray,
249
dtype: &ArrowDataType,
250
owner: InternalArrowArray,
251
len: usize,
252
index: usize,
253
) -> PolarsResult<Buffer<T>> {
254
if len == 0 {
255
return Ok(Buffer::new());
256
}
257
let ptr: *mut T = get_buffer_ptr(array, dtype, index)?;
258
let storage = SharedStorage::from_internal_arrow_array(ptr, len, owner);
259
Ok(Buffer::from_storage(storage))
260
}
261
262
/// returns the buffer `i` of `array` interpreted as a [`Buffer`].
263
/// # Safety
264
/// This function is safe iff:
265
/// * the buffers up to position `index` are valid for the declared length
266
/// * the buffers' pointers are not mutably shared for the lifetime of `owner`
267
unsafe fn create_buffer<T: NativeType>(
268
array: &ArrowArray,
269
dtype: &ArrowDataType,
270
owner: InternalArrowArray,
271
index: usize,
272
) -> PolarsResult<Buffer<T>> {
273
let len = buffer_len(array, dtype, index)?;
274
275
if len == 0 {
276
return Ok(Buffer::new());
277
}
278
279
let offset = buffer_offset(array, dtype, index);
280
let ptr: *mut T = get_buffer_ptr(array, dtype, index)?;
281
282
// We have to check alignment.
283
// This is the zero-copy path.
284
if ptr.align_offset(align_of::<T>()) == 0 {
285
let storage = SharedStorage::from_internal_arrow_array(ptr, len, owner);
286
Ok(Buffer::from_storage(storage).sliced(offset, len - offset))
287
}
288
// This is the path where alignment isn't correct.
289
// We copy the data to a new vec
290
else {
291
let buf = std::slice::from_raw_parts(ptr, len - offset).to_vec();
292
Ok(Buffer::from(buf))
293
}
294
}
295
296
/// returns the buffer `i` of `array` interpreted as a [`Bitmap`].
297
/// # Safety
298
/// This function is safe iff:
299
/// * the buffer at position `index` is valid for the declared length
300
/// * the buffers' pointer is not mutable for the lifetime of `owner`
301
unsafe fn create_bitmap(
302
array: &ArrowArray,
303
dtype: &ArrowDataType,
304
owner: InternalArrowArray,
305
index: usize,
306
// if this is the validity bitmap
307
// we can use the null count directly
308
is_validity: bool,
309
) -> PolarsResult<Bitmap> {
310
let len: usize = array.length.try_into().expect("length to fit in `usize`");
311
if len == 0 {
312
return Ok(Bitmap::new());
313
}
314
let ptr = get_buffer_ptr(array, dtype, index)?;
315
316
// Pointer of u8 has alignment 1, so we don't have to check alignment.
317
318
let offset: usize = array.offset.try_into().expect("offset to fit in `usize`");
319
let bytes_len = bytes_for(offset + len);
320
let storage = SharedStorage::from_internal_arrow_array(ptr, bytes_len, owner);
321
322
let null_count = if is_validity {
323
Some(array.null_count())
324
} else {
325
None
326
};
327
Ok(Bitmap::from_inner_unchecked(
328
storage, offset, len, null_count,
329
))
330
}
331
332
fn buffer_offset(array: &ArrowArray, dtype: &ArrowDataType, i: usize) -> usize {
333
use PhysicalType::*;
334
match (dtype.to_physical_type(), i) {
335
(LargeUtf8, 2) | (LargeBinary, 2) | (Utf8, 2) | (Binary, 2) => 0,
336
(FixedSizeBinary, 1) => {
337
if let ArrowDataType::FixedSizeBinary(size) = dtype.to_logical_type() {
338
let offset: usize = array.offset.try_into().expect("Offset to fit in `usize`");
339
offset * *size
340
} else {
341
unreachable!()
342
}
343
},
344
_ => array.offset.try_into().expect("Offset to fit in `usize`"),
345
}
346
}
347
348
/// Returns the length, in slots, of the buffer `i` (indexed according to the C data interface)
349
unsafe fn buffer_len(array: &ArrowArray, dtype: &ArrowDataType, i: usize) -> PolarsResult<usize> {
350
Ok(match (dtype.to_physical_type(), i) {
351
(PhysicalType::FixedSizeBinary, 1) => {
352
if let ArrowDataType::FixedSizeBinary(size) = dtype.to_logical_type() {
353
*size * (array.offset as usize + array.length as usize)
354
} else {
355
unreachable!()
356
}
357
},
358
(PhysicalType::FixedSizeList, 1) => {
359
if let ArrowDataType::FixedSizeList(_, size) = dtype.to_logical_type() {
360
*size * (array.offset as usize + array.length as usize)
361
} else {
362
unreachable!()
363
}
364
},
365
(PhysicalType::Utf8, 1)
366
| (PhysicalType::LargeUtf8, 1)
367
| (PhysicalType::Binary, 1)
368
| (PhysicalType::LargeBinary, 1)
369
| (PhysicalType::List, 1)
370
| (PhysicalType::LargeList, 1)
371
| (PhysicalType::Map, 1) => {
372
// the len of the offset buffer (buffer 1) equals length + 1
373
array.offset as usize + array.length as usize + 1
374
},
375
(PhysicalType::BinaryView, 1) | (PhysicalType::Utf8View, 1) => {
376
array.offset as usize + array.length as usize
377
},
378
(PhysicalType::Utf8, 2) | (PhysicalType::Binary, 2) => {
379
// the len of the data buffer (buffer 2) equals the last value of the offset buffer (buffer 1)
380
let len = buffer_len(array, dtype, 1)?;
381
// first buffer is the null buffer => add(1)
382
let offset_buffer = unsafe { *(array.buffers as *mut *const u8).add(1) };
383
// interpret as i32
384
let offset_buffer = offset_buffer as *const i32;
385
// get last offset
386
387
(unsafe { *offset_buffer.add(len - 1) }) as usize
388
},
389
(PhysicalType::LargeUtf8, 2) | (PhysicalType::LargeBinary, 2) => {
390
// the len of the data buffer (buffer 2) equals the last value of the offset buffer (buffer 1)
391
let len = buffer_len(array, dtype, 1)?;
392
// first buffer is the null buffer => add(1)
393
let offset_buffer = unsafe { *(array.buffers as *mut *const u8).add(1) };
394
// interpret as i64
395
let offset_buffer = offset_buffer as *const i64;
396
// get last offset
397
(unsafe { *offset_buffer.add(len - 1) }) as usize
398
},
399
// buffer len of primitive types
400
_ => array.offset as usize + array.length as usize,
401
})
402
}
403
404
/// # Safety
405
///
406
/// This function is safe iff:
407
/// * `array.children` at `index` is valid
408
/// * `array.children` is not mutably shared for the lifetime of `parent`
409
/// * the pointer of `array.children` at `index` is valid
410
/// * the pointer of `array.children` at `index` is not mutably shared for the lifetime of `parent`
411
unsafe fn create_child(
412
array: &ArrowArray,
413
dtype: &ArrowDataType,
414
parent: InternalArrowArray,
415
index: usize,
416
) -> PolarsResult<ArrowArrayChild<'static>> {
417
let dtype = get_child(dtype, index)?;
418
419
// catch what we can
420
if array.children.is_null() {
421
polars_bail!(ComputeError: "an ArrowArray of type {dtype:?} must have non-null children");
422
}
423
424
if index >= array.n_children as usize {
425
polars_bail!(ComputeError:
426
"an ArrowArray of type {dtype:?}
427
must have child {index}."
428
);
429
}
430
431
// SAFETY: part of the invariant
432
let arr_ptr = unsafe { *array.children.add(index) };
433
434
// catch what we can
435
if arr_ptr.is_null() {
436
polars_bail!(ComputeError:
437
"an array of type {dtype:?}
438
must have a non-null child {index}"
439
)
440
}
441
442
// SAFETY: invariant of this function
443
let arr_ptr = unsafe { &*arr_ptr };
444
Ok(ArrowArrayChild::new(arr_ptr, dtype, parent))
445
}
446
447
/// # Safety
448
///
449
/// This function is safe iff:
450
/// * `array.dictionary` is valid
451
/// * `array.dictionary` is not mutably shared for the lifetime of `parent`
452
unsafe fn create_dictionary(
453
array: &ArrowArray,
454
dtype: &ArrowDataType,
455
parent: InternalArrowArray,
456
) -> PolarsResult<Option<ArrowArrayChild<'static>>> {
457
if let ArrowDataType::Dictionary(_, values, _) = dtype {
458
let dtype = values.as_ref().clone();
459
// catch what we can
460
if array.dictionary.is_null() {
461
polars_bail!(ComputeError:
462
"an array of type {dtype:?}
463
must have a non-null dictionary"
464
)
465
}
466
467
// SAFETY: part of the invariant
468
let array = unsafe { &*array.dictionary };
469
Ok(Some(ArrowArrayChild::new(array, dtype, parent)))
470
} else {
471
Ok(None)
472
}
473
}
474
475
pub trait ArrowArrayRef: std::fmt::Debug {
476
fn owner(&self) -> InternalArrowArray {
477
(*self.parent()).clone()
478
}
479
480
/// returns the null bit buffer.
481
/// Rust implementation uses a buffer that is not part of the array of buffers.
482
/// The C Data interface's null buffer is part of the array of buffers.
483
///
484
/// # Safety
485
/// The caller must guarantee that the buffer `index` corresponds to a bitmap.
486
/// This function assumes that the bitmap created from FFI is valid; this is impossible to prove.
487
unsafe fn validity(&self) -> PolarsResult<Option<Bitmap>> {
488
if self.array().null_count() == 0 {
489
Ok(None)
490
} else {
491
create_bitmap(self.array(), self.dtype(), self.owner(), 0, true).map(Some)
492
}
493
}
494
495
/// # Safety
496
/// The caller must guarantee that the buffer `index` corresponds to a buffer.
497
/// This function assumes that the buffer created from FFI is valid; this is impossible to prove.
498
unsafe fn buffer<T: NativeType>(&self, index: usize) -> PolarsResult<Buffer<T>> {
499
create_buffer::<T>(self.array(), self.dtype(), self.owner(), index)
500
}
501
502
/// # Safety
503
/// The caller must guarantee that the buffer `index` corresponds to a buffer.
504
/// This function assumes that the buffer created from FFI is valid; this is impossible to prove.
505
unsafe fn buffer_known_len<T: NativeType>(
506
&self,
507
index: usize,
508
len: usize,
509
) -> PolarsResult<Buffer<T>> {
510
create_buffer_known_len::<T>(self.array(), self.dtype(), self.owner(), len, index)
511
}
512
513
/// # Safety
514
/// This function is safe iff:
515
/// * the buffer at position `index` is valid for the declared length
516
/// * the buffers' pointer is not mutable for the lifetime of `owner`
517
unsafe fn bitmap(&self, index: usize) -> PolarsResult<Bitmap> {
518
create_bitmap(self.array(), self.dtype(), self.owner(), index, false)
519
}
520
521
/// # Safety
522
/// * `array.children` at `index` is valid
523
/// * `array.children` is not mutably shared for the lifetime of `parent`
524
/// * the pointer of `array.children` at `index` is valid
525
/// * the pointer of `array.children` at `index` is not mutably shared for the lifetime of `parent`
526
unsafe fn child(&self, index: usize) -> PolarsResult<ArrowArrayChild<'_>> {
527
create_child(self.array(), self.dtype(), self.parent().clone(), index)
528
}
529
530
unsafe fn dictionary(&self) -> PolarsResult<Option<ArrowArrayChild<'_>>> {
531
create_dictionary(self.array(), self.dtype(), self.parent().clone())
532
}
533
534
fn n_buffers(&self) -> usize;
535
536
fn offset(&self) -> usize;
537
fn length(&self) -> usize;
538
539
fn parent(&self) -> &InternalArrowArray;
540
fn array(&self) -> &ArrowArray;
541
fn dtype(&self) -> &ArrowDataType;
542
}
543
544
/// Struct used to move an Array from and to the C Data Interface.
545
/// Its main responsibility is to expose functionality that requires
546
/// both [ArrowArray] and [ArrowSchema].
547
///
548
/// This struct has two main paths:
549
///
550
/// ## Import from the C Data Interface
551
/// * [InternalArrowArray::empty] to allocate memory to be filled by an external call
552
/// * [InternalArrowArray::try_from_raw] to consume two non-null allocated pointers
553
/// ## Export to the C Data Interface
554
/// * [InternalArrowArray::try_new] to create a new [InternalArrowArray] from Rust-specific information
555
/// * [InternalArrowArray::into_raw] to expose two pointers for [ArrowArray] and [ArrowSchema].
556
///
557
/// # Safety
558
/// Whoever creates this struct is responsible for releasing their resources. Specifically,
559
/// consumers *must* call [InternalArrowArray::into_raw] and take ownership of the individual pointers,
560
/// calling [ArrowArray::release] and [ArrowSchema::release] accordingly.
561
///
562
/// Furthermore, this struct assumes that the incoming data agrees with the C data interface.
563
#[derive(Debug, Clone)]
564
pub struct InternalArrowArray {
565
// Arc is used for sharability since this is immutable
566
array: Arc<ArrowArray>,
567
// Arced to reduce cost of cloning
568
dtype: Arc<ArrowDataType>,
569
}
570
571
impl InternalArrowArray {
572
pub fn new(array: ArrowArray, dtype: ArrowDataType) -> Self {
573
Self {
574
array: Arc::new(array),
575
dtype: Arc::new(dtype),
576
}
577
}
578
}
579
580
impl ArrowArrayRef for InternalArrowArray {
581
/// the dtype as declared in the schema
582
fn dtype(&self) -> &ArrowDataType {
583
&self.dtype
584
}
585
586
fn parent(&self) -> &InternalArrowArray {
587
self
588
}
589
590
fn array(&self) -> &ArrowArray {
591
self.array.as_ref()
592
}
593
594
fn n_buffers(&self) -> usize {
595
self.array.n_buffers as usize
596
}
597
598
fn offset(&self) -> usize {
599
self.array.offset as usize
600
}
601
602
fn length(&self) -> usize {
603
self.array.length as usize
604
}
605
}
606
607
#[derive(Debug)]
608
pub struct ArrowArrayChild<'a> {
609
array: &'a ArrowArray,
610
dtype: ArrowDataType,
611
parent: InternalArrowArray,
612
}
613
614
impl ArrowArrayRef for ArrowArrayChild<'_> {
615
/// the dtype as declared in the schema
616
fn dtype(&self) -> &ArrowDataType {
617
&self.dtype
618
}
619
620
fn parent(&self) -> &InternalArrowArray {
621
&self.parent
622
}
623
624
fn array(&self) -> &ArrowArray {
625
self.array
626
}
627
628
fn n_buffers(&self) -> usize {
629
self.array.n_buffers as usize
630
}
631
632
fn offset(&self) -> usize {
633
self.array.offset as usize
634
}
635
636
fn length(&self) -> usize {
637
self.array.length as usize
638
}
639
}
640
641
impl<'a> ArrowArrayChild<'a> {
642
fn new(array: &'a ArrowArray, dtype: ArrowDataType, parent: InternalArrowArray) -> Self {
643
Self {
644
array,
645
dtype,
646
parent,
647
}
648
}
649
}
650
651