Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/crates/polars-python/src/series/buffers.rs
8405 views
1
//! Construct and deconstruct Series based on the underlying buffers.
2
//!
3
//! This functionality is mainly intended for use with the Python dataframe
4
//! interchange protocol.
5
//!
6
//! As Polars has no Buffer concept in Python, each buffer is represented as
7
//! a Series of its physical type.
8
//!
9
//! Note that String Series have underlying `Utf8View` buffers, which
10
//! currently cannot be represented as Series. Since the interchange protocol
11
//! cannot handle these buffers anyway and expects bytes and offsets buffers,
12
//! operations on String Series will convert from/to such buffers. This
13
//! conversion requires data to be copied.
14
15
use arrow::array::{Array, BooleanArray, PrimitiveArray, Utf8Array};
16
use arrow::bitmap::Bitmap;
17
use arrow::offset::OffsetsBuffer;
18
use arrow::types::NativeType;
19
use polars::prelude::*;
20
use polars_buffer::Buffer;
21
use polars_core::{with_match_physical_numeric_polars_type, with_match_physical_numeric_type};
22
use pyo3::exceptions::PyTypeError;
23
use pyo3::prelude::*;
24
use pyo3::types::PyTuple;
25
26
use super::{PySeries, ToSeries};
27
use crate::conversion::Wrap;
28
use crate::error::PyPolarsErr;
29
use crate::raise_err;
30
use crate::utils::EnterPolarsExt;
31
32
/// Location of a single buffer in memory, exchanged with Python as a
/// `(pointer, offset, length)` tuple.
struct BufferInfo {
    /// Address of the start of the buffer, stored as an integer.
    pointer: usize,
    /// Offset into the buffer: counted in elements for numeric buffers and in
    /// bits for boolean buffers.
    offset: usize,
    /// Number of values in the buffer.
    length: usize,
}
37
impl<'py> IntoPyObject<'py> for BufferInfo {
38
type Target = PyTuple;
39
type Output = Bound<'py, Self::Target>;
40
type Error = PyErr;
41
42
fn into_pyobject(self, py: Python<'py>) -> Result<Self::Output, Self::Error> {
43
(self.pointer, self.offset, self.length).into_pyobject(py)
44
}
45
}
46
impl<'a, 'py> FromPyObject<'a, 'py> for BufferInfo {
47
type Error = PyErr;
48
49
fn extract(ob: Borrowed<'a, 'py, PyAny>) -> PyResult<Self> {
50
let (pointer, offset, length) = ob.extract()?;
51
Ok(Self {
52
pointer,
53
offset,
54
length,
55
})
56
}
57
}
58
59
#[pymethods]
60
impl PySeries {
61
/// Return pointer, offset, and length information about the underlying buffer.
62
fn _get_buffer_info(&self) -> PyResult<BufferInfo> {
63
let lock = self.series.read();
64
let s = lock.to_physical_repr();
65
let arrays = s.chunks();
66
if arrays.len() != 1 {
67
let msg = "cannot get buffer info for Series consisting of multiple chunks";
68
raise_err!(msg, ComputeError);
69
}
70
match s.dtype() {
71
DataType::Boolean => {
72
let ca = s.bool().unwrap();
73
let arr = ca.downcast_iter().next().unwrap();
74
let (slice, offset, len) = arr.values().as_slice();
75
Ok(BufferInfo {
76
pointer: slice.as_ptr() as usize,
77
offset,
78
length: len,
79
})
80
},
81
dt if dt.is_primitive_numeric() => {
82
Ok(with_match_physical_numeric_polars_type!(dt, |$T| {
83
let ca: &ChunkedArray<$T> = s.as_ref().as_ref().as_ref();
84
BufferInfo { pointer: get_pointer(ca), offset: 0, length: ca.len() }
85
}))
86
},
87
dt => {
88
let msg = format!(
89
"`_get_buffer_info` not implemented for non-physical type {dt}; try to select a buffer first"
90
);
91
Err(PyTypeError::new_err(msg))
92
},
93
}
94
}
95
96
/// Return the underlying values, validity, and offsets buffers as Series.
97
fn _get_buffers(&self, py: Python) -> PyResult<(Self, Option<Self>, Option<Self>)> {
98
let s = &self.series.read();
99
py.enter_polars(|| match s.dtype().to_physical() {
100
dt if dt.is_primitive_numeric() => get_buffers_from_primitive(s),
101
DataType::Boolean => get_buffers_from_primitive(s),
102
DataType::String => get_buffers_from_string(s),
103
dt => {
104
let msg = format!("`_get_buffers` not implemented for `dtype` {dt}");
105
Err(PyTypeError::new_err(msg))
106
},
107
})
108
}
109
}
110
111
/// Return the address of the values buffer of the first chunk of `ca`.
///
/// Panics if `ca` yields no chunks.
fn get_pointer<T: PolarsNumericType>(ca: &ChunkedArray<T>) -> usize {
    let first_chunk = ca.downcast_iter().next().unwrap();
    let values = first_chunk.values();
    values.as_ptr() as usize
}
115
116
fn get_buffers_from_primitive(
117
s: &Series,
118
) -> PyResult<(PySeries, Option<PySeries>, Option<PySeries>)> {
119
let chunks = s
120
.chunks()
121
.iter()
122
.map(|arr| arr.with_validity(None))
123
.collect::<Vec<_>>();
124
let values = Series::try_from((s.name().clone(), chunks))
125
.map_err(PyPolarsErr::from)?
126
.into();
127
128
let validity = get_bitmap(s);
129
let offsets = None;
130
Ok((values, validity, offsets))
131
}
132
133
/// The underlying buffers for `String` Series cannot be represented in this
134
/// format. Instead, the buffers are converted to a values and offsets buffer.
135
/// This copies data.
136
fn get_buffers_from_string(s: &Series) -> PyResult<(PySeries, Option<PySeries>, Option<PySeries>)> {
137
// We cannot do this zero copy anyway, so rechunk first
138
let s = s.rechunk();
139
140
let ca = s.str().map_err(PyPolarsErr::from)?;
141
let arr_binview = ca.downcast_iter().next().unwrap();
142
143
// This is not zero-copy
144
let arr_utf8 = polars_compute::cast::utf8view_to_utf8(arr_binview);
145
146
let values = get_string_bytes(&arr_utf8)?;
147
let validity = get_bitmap(&s);
148
let offsets = get_string_offsets(&arr_utf8)?;
149
150
Ok((values, validity, Some(offsets)))
151
}
152
153
fn get_bitmap(s: &Series) -> Option<PySeries> {
154
if s.null_count() > 0 {
155
Some(s.is_not_null().into_series().into())
156
} else {
157
None
158
}
159
}
160
161
fn get_string_bytes(arr: &Utf8Array<i64>) -> PyResult<PySeries> {
162
let values_buffer = arr.values();
163
let values_arr =
164
PrimitiveArray::<u8>::try_new(ArrowDataType::UInt8, values_buffer.clone(), None)
165
.map_err(PyPolarsErr::from)?;
166
let values = Series::from_arrow(PlSmallStr::EMPTY, values_arr.to_boxed())
167
.map_err(PyPolarsErr::from)?
168
.into();
169
Ok(values)
170
}
171
172
fn get_string_offsets(arr: &Utf8Array<i64>) -> PyResult<PySeries> {
173
let offsets_buffer = arr.offsets().buffer();
174
let offsets_arr =
175
PrimitiveArray::<i64>::try_new(ArrowDataType::Int64, offsets_buffer.clone(), None)
176
.map_err(PyPolarsErr::from)?;
177
let offsets = Series::from_arrow(PlSmallStr::EMPTY, offsets_arr.to_boxed())
178
.map_err(PyPolarsErr::from)?
179
.into();
180
Ok(offsets)
181
}
182
183
#[pymethods]
184
impl PySeries {
185
/// Construct a PySeries from information about its underlying buffer.
186
#[staticmethod]
187
unsafe fn _from_buffer(
188
dtype: Wrap<DataType>,
189
buffer_info: BufferInfo,
190
owner: &Bound<'_, PyAny>,
191
) -> PyResult<Self> {
192
let dtype = dtype.0;
193
let BufferInfo {
194
pointer,
195
offset,
196
length,
197
} = buffer_info;
198
let owner = owner.to_owned().unbind();
199
200
let arr_boxed = match dtype {
201
dt if dt.is_primitive_numeric() => {
202
with_match_physical_numeric_type!(dt, |$T| unsafe {
203
from_buffer_impl::<$T>(pointer, offset, length, owner)
204
})
205
},
206
DataType::Boolean => {
207
unsafe { from_buffer_boolean_impl(pointer, offset, length, owner) }?
208
},
209
dt => {
210
let msg = format!(
211
"`_from_buffer` requires a physical type as input for `dtype`, got {dt}"
212
);
213
return Err(PyTypeError::new_err(msg));
214
},
215
};
216
217
let s = Series::from_arrow(PlSmallStr::EMPTY, arr_boxed)
218
.unwrap()
219
.into();
220
Ok(s)
221
}
222
}
223
224
unsafe fn from_buffer_impl<T: NativeType>(
225
pointer: usize,
226
offset: usize,
227
length: usize,
228
owner: Py<PyAny>,
229
) -> Box<dyn Array> {
230
let pointer = pointer as *const T;
231
let pointer = unsafe { pointer.add(offset) };
232
let slice = unsafe { std::slice::from_raw_parts(pointer, length) };
233
let arr = unsafe { arrow::ffi::mmap::slice_and_owner(slice, owner) };
234
arr.to_boxed()
235
}
236
unsafe fn from_buffer_boolean_impl(
237
pointer: usize,
238
offset: usize,
239
length: usize,
240
owner: Py<PyAny>,
241
) -> PyResult<Box<dyn Array>> {
242
let length_in_bytes = get_boolean_buffer_length_in_bytes(length, offset);
243
244
let pointer = pointer as *const u8;
245
let slice = unsafe { std::slice::from_raw_parts(pointer, length_in_bytes) };
246
let arr_result = unsafe { arrow::ffi::mmap::bitmap_and_owner(slice, offset, length, owner) };
247
let arr = arr_result.map_err(PyPolarsErr::from)?;
248
Ok(arr.to_boxed())
249
}
250
/// Return the number of bytes needed to hold `length` bits that start at bit
/// `offset` of a bit-packed buffer, i.e. `ceil((offset + length) / 8)`.
///
/// The sum `offset + length` is never formed, so the result is correct for
/// every pair of inputs instead of overflowing on very large values.
fn get_boolean_buffer_length_in_bytes(length: usize, offset: usize) -> usize {
    // Whole bytes contributed by each operand; at most `2 * (usize::MAX / 8)`.
    let whole_bytes = offset / 8 + length / 8;
    // Remaining bits from both operands; at most 14, i.e. up to two more bytes.
    let leftover_bits = offset % 8 + length % 8;
    whole_bytes + leftover_bits.div_ceil(8)
}
256
257
#[pymethods]
258
impl PySeries {
259
/// Construct a PySeries from information about its underlying buffers.
260
#[staticmethod]
261
#[pyo3(signature = (dtype, data, validity))]
262
unsafe fn _from_buffers(
263
py: Python<'_>,
264
dtype: Wrap<DataType>,
265
data: Vec<PySeries>,
266
validity: Option<PySeries>,
267
) -> PyResult<Self> {
268
let dtype = dtype.0;
269
let mut data = data.to_series();
270
271
match data.len() {
272
0 => {
273
let msg = "`data` input to `_from_buffers` must contain at least one buffer";
274
return Err(PyTypeError::new_err(msg));
275
},
276
1 if validity.is_none() => {
277
let values = data.pop().unwrap();
278
let s = values.strict_cast(&dtype).map_err(PyPolarsErr::from)?;
279
return Ok(s.into());
280
},
281
_ => (),
282
}
283
284
let validity = match validity {
285
Some(ps) => {
286
let s = ps.series.into_inner();
287
let dtype = s.dtype();
288
if !dtype.is_bool() {
289
let msg = format!("validity buffer must have data type Boolean, got {dtype:?}");
290
return Err(PyTypeError::new_err(msg));
291
}
292
Some(series_to_bitmap(s).unwrap())
293
},
294
None => None,
295
};
296
297
let s = match dtype.to_physical() {
298
dt if dt.is_primitive_numeric() => {
299
let values = data.into_iter().next().unwrap();
300
with_match_physical_numeric_polars_type!(dt, |$T| {
301
let values_buffer = series_to_buffer::<$T>(values);
302
from_buffers_num_impl::<<$T as PolarsNumericType>::Native>(values_buffer, validity)?
303
})
304
},
305
DataType::Boolean => {
306
let values = data.into_iter().next().unwrap();
307
let values_buffer = series_to_bitmap(values)?;
308
from_buffers_bool_impl(values_buffer, validity)?
309
},
310
DataType::String => {
311
let mut data_iter = data.into_iter();
312
let values = data_iter.next().unwrap();
313
let offsets = match data_iter.next() {
314
Some(s) => {
315
let dtype = s.dtype();
316
if !matches!(dtype, DataType::Int64) {
317
return Err(PyTypeError::new_err(format!(
318
"offsets buffer must have data type Int64, got {dtype:?}"
319
)));
320
}
321
series_to_offsets(s)
322
},
323
None => {
324
return Err(PyTypeError::new_err(
325
"`_from_buffers` cannot create a String column without an offsets buffer",
326
));
327
},
328
};
329
let values = series_to_buffer::<UInt8Type>(values);
330
py.enter_polars(|| from_buffers_string_impl(values, validity, offsets))?
331
},
332
dt => {
333
let msg = format!("`_from_buffers` not implemented for `dtype` {dt}");
334
return Err(PyTypeError::new_err(msg));
335
},
336
};
337
338
let out = s.strict_cast(&dtype).map_err(PyPolarsErr::from)?;
339
Ok(out.into())
340
}
341
}
342
343
fn series_to_buffer<T>(s: Series) -> Buffer<T::Native>
344
where
345
T: PolarsNumericType,
346
{
347
let ca: &ChunkedArray<T> = s.as_ref().as_ref();
348
let ca = ca.rechunk();
349
ca.downcast_as_array().values().clone()
350
}
351
/// Return the values bitmap of a Boolean Series, rechunked into a single
/// array. Fails if `s` is not a Boolean Series.
fn series_to_bitmap(s: Series) -> PyResult<Bitmap> {
    let bool_ca = s.bool().map_err(PyPolarsErr::from)?;
    let contiguous = bool_ca.rechunk();
    let array = contiguous.downcast_as_array();
    Ok(array.values().clone())
}
356
/// Reinterpret an `Int64` Series as an offsets buffer without validating its
/// contents.
fn series_to_offsets(s: Series) -> OffsetsBuffer<i64> {
    let raw_offsets = series_to_buffer::<Int64Type>(s);
    // SAFETY: the values are not checked here; the caller is responsible for
    // passing valid offsets (presumably non-empty and monotonically
    // non-decreasing — confirm against `OffsetsBuffer::new_unchecked`).
    unsafe { OffsetsBuffer::new_unchecked(raw_offsets) }
}
360
361
fn from_buffers_num_impl<T: NativeType>(
362
data: Buffer<T>,
363
validity: Option<Bitmap>,
364
) -> PyResult<Series> {
365
let arr = PrimitiveArray::new(T::PRIMITIVE.into(), data, validity);
366
let s_result = Series::from_arrow(PlSmallStr::EMPTY, arr.to_boxed());
367
let s = s_result.map_err(PyPolarsErr::from)?;
368
Ok(s)
369
}
370
/// Assemble an unnamed Boolean Series from a values bitmap and an optional
/// validity bitmap.
fn from_buffers_bool_impl(data: Bitmap, validity: Option<Bitmap>) -> PyResult<Series> {
    let boxed = BooleanArray::new(ArrowDataType::Boolean, data, validity).to_boxed();
    Ok(Series::from_arrow(PlSmallStr::EMPTY, boxed).map_err(PyPolarsErr::from)?)
}
376
/// Constructing a `String` Series requires specifying a values and offsets buffer,
377
/// which does not match the actual underlying buffers. The values and offsets
378
/// buffer are converted into the actual buffers, which copies data.
379
fn from_buffers_string_impl(
380
data: Buffer<u8>,
381
validity: Option<Bitmap>,
382
offsets: OffsetsBuffer<i64>,
383
) -> PyResult<Series> {
384
let arr = Utf8Array::new(ArrowDataType::LargeUtf8, offsets, data, validity);
385
386
// This is not zero-copy
387
let s_result = Series::from_arrow(PlSmallStr::EMPTY, arr.to_boxed());
388
389
let s = s_result.map_err(PyPolarsErr::from)?;
390
Ok(s)
391
}
392
393