// Source: GitHub repository pola-rs/polars
// Path: crates/polars-python/src/series/buffers.rs (branch `main`)
//! Construct and deconstruct Series based on their underlying buffers.
//!
//! This functionality is mainly intended for use with the Python dataframe
//! interchange protocol.
//!
//! As Polars has no Buffer concept in Python, each buffer is represented as
//! a Series of its physical type.
//!
//! Note that String Series have underlying `Utf8View` buffers, which
//! currently cannot be represented as Series. Since the interchange protocol
//! cannot handle these buffers anyway and expects bytes and offsets buffers,
//! operations on String Series convert from/to such buffers. This
//! conversion requires the data to be copied.
14
15
use arrow::array::{Array, BooleanArray, PrimitiveArray, Utf8Array};
16
use arrow::bitmap::Bitmap;
17
use arrow::buffer::Buffer;
18
use arrow::offset::OffsetsBuffer;
19
use arrow::types::NativeType;
20
use polars::prelude::*;
21
use polars_core::{with_match_physical_numeric_polars_type, with_match_physical_numeric_type};
22
use pyo3::exceptions::PyTypeError;
23
use pyo3::prelude::*;
24
use pyo3::types::PyTuple;
25
26
use super::{PySeries, ToSeries};
27
use crate::conversion::Wrap;
28
use crate::error::PyPolarsErr;
29
use crate::raise_err;
30
use crate::utils::EnterPolarsExt;
31
32
/// Location of one contiguous buffer, described by a start address, an
/// offset, and a length.
///
/// Crosses the Python boundary as a `(pointer, offset, length)` tuple.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct BufferInfo {
    /// Address of the first byte of the buffer, stored as an integer.
    pointer: usize,
    /// Offset of the first value relative to `pointer`: counted in bits for
    /// Boolean buffers and in elements for numeric buffers.
    offset: usize,
    /// Number of values in the buffer.
    length: usize,
}
37
impl<'py> IntoPyObject<'py> for BufferInfo {
38
type Target = PyTuple;
39
type Output = Bound<'py, Self::Target>;
40
type Error = PyErr;
41
42
fn into_pyobject(self, py: Python<'py>) -> Result<Self::Output, Self::Error> {
43
(self.pointer, self.offset, self.length).into_pyobject(py)
44
}
45
}
46
impl<'py> FromPyObject<'py> for BufferInfo {
47
fn extract_bound(ob: &Bound<'py, PyAny>) -> PyResult<Self> {
48
let (pointer, offset, length) = ob.extract()?;
49
Ok(Self {
50
pointer,
51
offset,
52
length,
53
})
54
}
55
}
56
57
#[pymethods]
58
impl PySeries {
59
/// Return pointer, offset, and length information about the underlying buffer.
60
fn _get_buffer_info(&self) -> PyResult<BufferInfo> {
61
let lock = self.series.read();
62
let s = lock.to_physical_repr();
63
let arrays = s.chunks();
64
if arrays.len() != 1 {
65
let msg = "cannot get buffer info for Series consisting of multiple chunks";
66
raise_err!(msg, ComputeError);
67
}
68
match s.dtype() {
69
DataType::Boolean => {
70
let ca = s.bool().unwrap();
71
let arr = ca.downcast_iter().next().unwrap();
72
let (slice, offset, len) = arr.values().as_slice();
73
Ok(BufferInfo {
74
pointer: slice.as_ptr() as usize,
75
offset,
76
length: len,
77
})
78
},
79
dt if dt.is_primitive_numeric() => {
80
Ok(with_match_physical_numeric_polars_type!(dt, |$T| {
81
let ca: &ChunkedArray<$T> = s.as_ref().as_ref().as_ref();
82
BufferInfo { pointer: get_pointer(ca), offset: 0, length: ca.len() }
83
}))
84
},
85
dt => {
86
let msg = format!(
87
"`_get_buffer_info` not implemented for non-physical type {dt}; try to select a buffer first"
88
);
89
Err(PyTypeError::new_err(msg))
90
},
91
}
92
}
93
94
/// Return the underlying values, validity, and offsets buffers as Series.
95
fn _get_buffers(&self, py: Python) -> PyResult<(Self, Option<Self>, Option<Self>)> {
96
let s = &self.series.read();
97
py.enter_polars(|| match s.dtype().to_physical() {
98
dt if dt.is_primitive_numeric() => get_buffers_from_primitive(s),
99
DataType::Boolean => get_buffers_from_primitive(s),
100
DataType::String => get_buffers_from_string(s),
101
dt => {
102
let msg = format!("`_get_buffers` not implemented for `dtype` {dt}");
103
Err(PyTypeError::new_err(msg))
104
},
105
})
106
}
107
}
108
109
/// Return the address of the values buffer of the first chunk of `ca`.
///
/// Panics if `ca` has no chunks.
fn get_pointer<T>(ca: &ChunkedArray<T>) -> usize
where
    T: PolarsNumericType,
{
    let first_chunk = ca.downcast_iter().next().unwrap();
    let values = first_chunk.values();
    values.as_ptr() as usize
}
113
114
fn get_buffers_from_primitive(
115
s: &Series,
116
) -> PyResult<(PySeries, Option<PySeries>, Option<PySeries>)> {
117
let chunks = s
118
.chunks()
119
.iter()
120
.map(|arr| arr.with_validity(None))
121
.collect::<Vec<_>>();
122
let values = Series::try_from((s.name().clone(), chunks))
123
.map_err(PyPolarsErr::from)?
124
.into();
125
126
let validity = get_bitmap(s);
127
let offsets = None;
128
Ok((values, validity, offsets))
129
}
130
131
/// The underlying buffers for `String` Series cannot be represented in this
132
/// format. Instead, the buffers are converted to a values and offsets buffer.
133
/// This copies data.
134
fn get_buffers_from_string(s: &Series) -> PyResult<(PySeries, Option<PySeries>, Option<PySeries>)> {
135
// We cannot do this zero copy anyway, so rechunk first
136
let s = s.rechunk();
137
138
let ca = s.str().map_err(PyPolarsErr::from)?;
139
let arr_binview = ca.downcast_iter().next().unwrap();
140
141
// This is not zero-copy
142
let arr_utf8 = polars_compute::cast::utf8view_to_utf8(arr_binview);
143
144
let values = get_string_bytes(&arr_utf8)?;
145
let validity = get_bitmap(&s);
146
let offsets = get_string_offsets(&arr_utf8)?;
147
148
Ok((values, validity, Some(offsets)))
149
}
150
151
fn get_bitmap(s: &Series) -> Option<PySeries> {
152
if s.null_count() > 0 {
153
Some(s.is_not_null().into_series().into())
154
} else {
155
None
156
}
157
}
158
159
fn get_string_bytes(arr: &Utf8Array<i64>) -> PyResult<PySeries> {
160
let values_buffer = arr.values();
161
let values_arr =
162
PrimitiveArray::<u8>::try_new(ArrowDataType::UInt8, values_buffer.clone(), None)
163
.map_err(PyPolarsErr::from)?;
164
let values = Series::from_arrow(PlSmallStr::EMPTY, values_arr.to_boxed())
165
.map_err(PyPolarsErr::from)?
166
.into();
167
Ok(values)
168
}
169
170
fn get_string_offsets(arr: &Utf8Array<i64>) -> PyResult<PySeries> {
171
let offsets_buffer = arr.offsets().buffer();
172
let offsets_arr =
173
PrimitiveArray::<i64>::try_new(ArrowDataType::Int64, offsets_buffer.clone(), None)
174
.map_err(PyPolarsErr::from)?;
175
let offsets = Series::from_arrow(PlSmallStr::EMPTY, offsets_arr.to_boxed())
176
.map_err(PyPolarsErr::from)?
177
.into();
178
Ok(offsets)
179
}
180
181
#[pymethods]
182
impl PySeries {
183
/// Construct a PySeries from information about its underlying buffer.
184
#[staticmethod]
185
unsafe fn _from_buffer(
186
dtype: Wrap<DataType>,
187
buffer_info: BufferInfo,
188
owner: &Bound<'_, PyAny>,
189
) -> PyResult<Self> {
190
let dtype = dtype.0;
191
let BufferInfo {
192
pointer,
193
offset,
194
length,
195
} = buffer_info;
196
let owner = owner.to_owned().unbind();
197
198
let arr_boxed = match dtype {
199
dt if dt.is_primitive_numeric() => {
200
with_match_physical_numeric_type!(dt, |$T| unsafe {
201
from_buffer_impl::<$T>(pointer, offset, length, owner)
202
})
203
},
204
DataType::Boolean => {
205
unsafe { from_buffer_boolean_impl(pointer, offset, length, owner) }?
206
},
207
dt => {
208
let msg = format!(
209
"`_from_buffer` requires a physical type as input for `dtype`, got {dt}"
210
);
211
return Err(PyTypeError::new_err(msg));
212
},
213
};
214
215
let s = Series::from_arrow(PlSmallStr::EMPTY, arr_boxed)
216
.unwrap()
217
.into();
218
Ok(s)
219
}
220
}
221
222
unsafe fn from_buffer_impl<T: NativeType>(
223
pointer: usize,
224
offset: usize,
225
length: usize,
226
owner: Py<PyAny>,
227
) -> Box<dyn Array> {
228
let pointer = pointer as *const T;
229
let pointer = unsafe { pointer.add(offset) };
230
let slice = unsafe { std::slice::from_raw_parts(pointer, length) };
231
let arr = unsafe { arrow::ffi::mmap::slice_and_owner(slice, owner) };
232
arr.to_boxed()
233
}
234
unsafe fn from_buffer_boolean_impl(
235
pointer: usize,
236
offset: usize,
237
length: usize,
238
owner: Py<PyAny>,
239
) -> PyResult<Box<dyn Array>> {
240
let length_in_bytes = get_boolean_buffer_length_in_bytes(length, offset);
241
242
let pointer = pointer as *const u8;
243
let slice = unsafe { std::slice::from_raw_parts(pointer, length_in_bytes) };
244
let arr_result = unsafe { arrow::ffi::mmap::bitmap_and_owner(slice, offset, length, owner) };
245
let arr = arr_result.map_err(PyPolarsErr::from)?;
246
Ok(arr.to_boxed())
247
}
248
/// Return the number of bytes needed to hold `offset + length` bits, i.e.
/// `ceil((offset + length) / 8)`.
///
/// The whole bytes and the leftover bits of each operand are summed
/// separately so that the computation cannot overflow, even when
/// `offset + length` itself would not fit in a `usize`.
fn get_boolean_buffer_length_in_bytes(length: usize, offset: usize) -> usize {
    let whole_bytes = length / 8 + offset / 8;
    // At most 14 leftover bits, which round up to 0, 1 or 2 extra bytes.
    let leftover_bits = length % 8 + offset % 8;
    whole_bytes + leftover_bits.div_ceil(8)
}
254
255
#[pymethods]
256
impl PySeries {
257
/// Construct a PySeries from information about its underlying buffers.
258
#[staticmethod]
259
#[pyo3(signature = (dtype, data, validity))]
260
unsafe fn _from_buffers(
261
py: Python<'_>,
262
dtype: Wrap<DataType>,
263
data: Vec<PySeries>,
264
validity: Option<PySeries>,
265
) -> PyResult<Self> {
266
let dtype = dtype.0;
267
let mut data = data.to_series();
268
269
match data.len() {
270
0 => {
271
let msg = "`data` input to `_from_buffers` must contain at least one buffer";
272
return Err(PyTypeError::new_err(msg));
273
},
274
1 if validity.is_none() => {
275
let values = data.pop().unwrap();
276
let s = values.strict_cast(&dtype).map_err(PyPolarsErr::from)?;
277
return Ok(s.into());
278
},
279
_ => (),
280
}
281
282
let validity = match validity {
283
Some(ps) => {
284
let s = ps.series.into_inner();
285
let dtype = s.dtype();
286
if !dtype.is_bool() {
287
let msg = format!("validity buffer must have data type Boolean, got {dtype:?}");
288
return Err(PyTypeError::new_err(msg));
289
}
290
Some(series_to_bitmap(s).unwrap())
291
},
292
None => None,
293
};
294
295
let s = match dtype.to_physical() {
296
dt if dt.is_primitive_numeric() => {
297
let values = data.into_iter().next().unwrap();
298
with_match_physical_numeric_polars_type!(dt, |$T| {
299
let values_buffer = series_to_buffer::<$T>(values);
300
from_buffers_num_impl::<<$T as PolarsNumericType>::Native>(values_buffer, validity)?
301
})
302
},
303
DataType::Boolean => {
304
let values = data.into_iter().next().unwrap();
305
let values_buffer = series_to_bitmap(values)?;
306
from_buffers_bool_impl(values_buffer, validity)?
307
},
308
DataType::String => {
309
let mut data_iter = data.into_iter();
310
let values = data_iter.next().unwrap();
311
let offsets = match data_iter.next() {
312
Some(s) => {
313
let dtype = s.dtype();
314
if !matches!(dtype, DataType::Int64) {
315
return Err(PyTypeError::new_err(format!(
316
"offsets buffer must have data type Int64, got {dtype:?}"
317
)));
318
}
319
series_to_offsets(s)
320
},
321
None => {
322
return Err(PyTypeError::new_err(
323
"`_from_buffers` cannot create a String column without an offsets buffer",
324
));
325
},
326
};
327
let values = series_to_buffer::<UInt8Type>(values);
328
py.enter_polars(|| from_buffers_string_impl(values, validity, offsets))?
329
},
330
dt => {
331
let msg = format!("`_from_buffers` not implemented for `dtype` {dt}");
332
return Err(PyTypeError::new_err(msg));
333
},
334
};
335
336
let out = s.strict_cast(&dtype).map_err(PyPolarsErr::from)?;
337
Ok(out.into())
338
}
339
}
340
341
fn series_to_buffer<T>(s: Series) -> Buffer<T::Native>
342
where
343
T: PolarsNumericType,
344
{
345
let ca: &ChunkedArray<T> = s.as_ref().as_ref();
346
let ca = ca.rechunk();
347
ca.downcast_as_array().values().clone()
348
}
349
/// Rechunk a Boolean Series and return a clone of the values bitmap of the
/// resulting single array. Fails when `s` is not a Boolean Series.
fn series_to_bitmap(s: Series) -> PyResult<Bitmap> {
    let contiguous = s.bool().map_err(PyPolarsErr::from)?.rechunk();
    let values = contiguous.downcast_as_array().values();
    Ok(values.clone())
}
354
fn series_to_offsets(s: Series) -> OffsetsBuffer<i64> {
355
let buffer = series_to_buffer::<Int64Type>(s);
356
unsafe { OffsetsBuffer::new_unchecked(buffer) }
357
}
358
359
fn from_buffers_num_impl<T: NativeType>(
360
data: Buffer<T>,
361
validity: Option<Bitmap>,
362
) -> PyResult<Series> {
363
let arr = PrimitiveArray::new(T::PRIMITIVE.into(), data, validity);
364
let s_result = Series::from_arrow(PlSmallStr::EMPTY, arr.to_boxed());
365
let s = s_result.map_err(PyPolarsErr::from)?;
366
Ok(s)
367
}
368
/// Assemble an unnamed Boolean Series from a values bitmap and an optional
/// validity bitmap.
///
/// Uses the fallible array constructor so that inconsistent buffers (for
/// example a validity bitmap whose length differs from the values) are
/// reported as an error instead of a panic.
fn from_buffers_bool_impl(data: Bitmap, validity: Option<Bitmap>) -> PyResult<Series> {
    let arr = BooleanArray::try_new(ArrowDataType::Boolean, data, validity)
        .map_err(PyPolarsErr::from)?;
    let s = Series::from_arrow(PlSmallStr::EMPTY, arr.to_boxed()).map_err(PyPolarsErr::from)?;
    Ok(s)
}
374
/// Constructing a `String` Series requires specifying a values and offsets buffer,
375
/// which does not match the actual underlying buffers. The values and offsets
376
/// buffer are converted into the actual buffers, which copies data.
377
fn from_buffers_string_impl(
378
data: Buffer<u8>,
379
validity: Option<Bitmap>,
380
offsets: OffsetsBuffer<i64>,
381
) -> PyResult<Series> {
382
let arr = Utf8Array::new(ArrowDataType::LargeUtf8, offsets, data, validity);
383
384
// This is not zero-copy
385
let s_result = Series::from_arrow(PlSmallStr::EMPTY, arr.to_boxed());
386
387
let s = s_result.map_err(PyPolarsErr::from)?;
388
Ok(s)
389
}
390
391