Path: blob/main/crates/polars-python/src/series/buffers.rs
7889 views
//! Construct and deconstruct Series based on the underlying buffers.1//!2//! This functionality is mainly intended for use with the Python dataframe3//! interchange protocol.4//!5//! As Polars has no Buffer concept in Python, each buffer is represented as6//! a Series of its physical type.7//!8//! Note that String Series have underlying `Utf8View` buffers, which9//! currently cannot be represented as Series. Since the interchange protocol10//! cannot handle these buffers anyway and expects bytes and offsets buffers,11//! operations on String Series will convert from/to such buffers. This12//! conversion requires data to be copied.1314use arrow::array::{Array, BooleanArray, PrimitiveArray, Utf8Array};15use arrow::bitmap::Bitmap;16use arrow::buffer::Buffer;17use arrow::offset::OffsetsBuffer;18use arrow::types::NativeType;19use polars::prelude::*;20use polars_core::{with_match_physical_numeric_polars_type, with_match_physical_numeric_type};21use pyo3::exceptions::PyTypeError;22use pyo3::prelude::*;23use pyo3::types::PyTuple;2425use super::{PySeries, ToSeries};26use crate::conversion::Wrap;27use crate::error::PyPolarsErr;28use crate::raise_err;29use crate::utils::EnterPolarsExt;3031struct BufferInfo {32pointer: usize,33offset: usize,34length: usize,35}36impl<'py> IntoPyObject<'py> for BufferInfo {37type Target = PyTuple;38type Output = Bound<'py, Self::Target>;39type Error = PyErr;4041fn into_pyobject(self, py: Python<'py>) -> Result<Self::Output, Self::Error> {42(self.pointer, self.offset, self.length).into_pyobject(py)43}44}45impl<'py> FromPyObject<'py> for BufferInfo {46fn extract_bound(ob: &Bound<'py, PyAny>) -> PyResult<Self> {47let (pointer, offset, length) = ob.extract()?;48Ok(Self {49pointer,50offset,51length,52})53}54}5556#[pymethods]57impl PySeries {58/// Return pointer, offset, and length information about the underlying buffer.59fn _get_buffer_info(&self) -> PyResult<BufferInfo> {60let lock = self.series.read();61let s = lock.to_physical_repr();62let arrays = s.chunks();63if arrays.len() != 1 {64let msg = "cannot get buffer info for Series consisting of multiple chunks";65raise_err!(msg, ComputeError);66}67match s.dtype() {68DataType::Boolean => {69let ca = s.bool().unwrap();70let arr = ca.downcast_iter().next().unwrap();71let (slice, offset, len) = arr.values().as_slice();72Ok(BufferInfo {73pointer: slice.as_ptr() as usize,74offset,75length: len,76})77},78dt if dt.is_primitive_numeric() => {79Ok(with_match_physical_numeric_polars_type!(dt, |$T| {80let ca: &ChunkedArray<$T> = s.as_ref().as_ref().as_ref();81BufferInfo { pointer: get_pointer(ca), offset: 0, length: ca.len() }82}))83},84dt => {85let msg = format!(86"`_get_buffer_info` not implemented for non-physical type {dt}; try to select a buffer first"87);88Err(PyTypeError::new_err(msg))89},90}91}9293/// Return the underlying values, validity, and offsets buffers as Series.94fn _get_buffers(&self, py: Python) -> PyResult<(Self, Option<Self>, Option<Self>)> {95let s = &self.series.read();96py.enter_polars(|| match s.dtype().to_physical() {97dt if dt.is_primitive_numeric() => get_buffers_from_primitive(s),98DataType::Boolean => get_buffers_from_primitive(s),99DataType::String => get_buffers_from_string(s),100dt => {101let msg = format!("`_get_buffers` not implemented for `dtype` {dt}");102Err(PyTypeError::new_err(msg))103},104})105}106}107108fn get_pointer<T: PolarsNumericType>(ca: &ChunkedArray<T>) -> usize {109let arr = ca.downcast_iter().next().unwrap();110arr.values().as_ptr() as usize111}112113fn get_buffers_from_primitive(114s: &Series,115) -> PyResult<(PySeries, Option<PySeries>, Option<PySeries>)> {116let chunks = s117.chunks()118.iter()119.map(|arr| arr.with_validity(None))120.collect::<Vec<_>>();121let values = Series::try_from((s.name().clone(), chunks))122.map_err(PyPolarsErr::from)?123.into();124125let validity = get_bitmap(s);126let offsets = None;127Ok((values, validity, offsets))128}129130/// The underlying buffers for `String` Series cannot be represented in this131/// format. Instead, the buffers are converted to a values and offsets buffer.132/// This copies data.133fn get_buffers_from_string(s: &Series) -> PyResult<(PySeries, Option<PySeries>, Option<PySeries>)> {134// We cannot do this zero copy anyway, so rechunk first135let s = s.rechunk();136137let ca = s.str().map_err(PyPolarsErr::from)?;138let arr_binview = ca.downcast_iter().next().unwrap();139140// This is not zero-copy141let arr_utf8 = polars_compute::cast::utf8view_to_utf8(arr_binview);142143let values = get_string_bytes(&arr_utf8)?;144let validity = get_bitmap(&s);145let offsets = get_string_offsets(&arr_utf8)?;146147Ok((values, validity, Some(offsets)))148}149150fn get_bitmap(s: &Series) -> Option<PySeries> {151if s.null_count() > 0 {152Some(s.is_not_null().into_series().into())153} else {154None155}156}157158fn get_string_bytes(arr: &Utf8Array<i64>) -> PyResult<PySeries> {159let values_buffer = arr.values();160let values_arr =161PrimitiveArray::<u8>::try_new(ArrowDataType::UInt8, values_buffer.clone(), None)162.map_err(PyPolarsErr::from)?;163let values = Series::from_arrow(PlSmallStr::EMPTY, values_arr.to_boxed())164.map_err(PyPolarsErr::from)?165.into();166Ok(values)167}168169fn get_string_offsets(arr: &Utf8Array<i64>) -> PyResult<PySeries> {170let offsets_buffer = arr.offsets().buffer();171let offsets_arr =172PrimitiveArray::<i64>::try_new(ArrowDataType::Int64, offsets_buffer.clone(), None)173.map_err(PyPolarsErr::from)?;174let offsets = Series::from_arrow(PlSmallStr::EMPTY, offsets_arr.to_boxed())175.map_err(PyPolarsErr::from)?176.into();177Ok(offsets)178}179180#[pymethods]181impl PySeries {182/// Construct a PySeries from information about its underlying buffer.183#[staticmethod]184unsafe fn _from_buffer(185dtype: Wrap<DataType>,186buffer_info: BufferInfo,187owner: &Bound<'_, PyAny>,188) -> PyResult<Self> {189let dtype = dtype.0;190let BufferInfo {191pointer,192offset,193length,194} = buffer_info;195let owner = owner.to_owned().unbind();196197let arr_boxed = match dtype {198dt if dt.is_primitive_numeric() => {199with_match_physical_numeric_type!(dt, |$T| unsafe {200from_buffer_impl::<$T>(pointer, offset, length, owner)201})202},203DataType::Boolean => {204unsafe { from_buffer_boolean_impl(pointer, offset, length, owner) }?205},206dt => {207let msg = format!(208"`_from_buffer` requires a physical type as input for `dtype`, got {dt}"209);210return Err(PyTypeError::new_err(msg));211},212};213214let s = Series::from_arrow(PlSmallStr::EMPTY, arr_boxed)215.unwrap()216.into();217Ok(s)218}219}220221unsafe fn from_buffer_impl<T: NativeType>(222pointer: usize,223offset: usize,224length: usize,225owner: Py<PyAny>,226) -> Box<dyn Array> {227let pointer = pointer as *const T;228let pointer = unsafe { pointer.add(offset) };229let slice = unsafe { std::slice::from_raw_parts(pointer, length) };230let arr = unsafe { arrow::ffi::mmap::slice_and_owner(slice, owner) };231arr.to_boxed()232}233unsafe fn from_buffer_boolean_impl(234pointer: usize,235offset: usize,236length: usize,237owner: Py<PyAny>,238) -> PyResult<Box<dyn Array>> {239let length_in_bytes = get_boolean_buffer_length_in_bytes(length, offset);240241let pointer = pointer as *const u8;242let slice = unsafe { std::slice::from_raw_parts(pointer, length_in_bytes) };243let arr_result = unsafe { arrow::ffi::mmap::bitmap_and_owner(slice, offset, length, owner) };244let arr = arr_result.map_err(PyPolarsErr::from)?;245Ok(arr.to_boxed())246}247fn get_boolean_buffer_length_in_bytes(length: usize, offset: usize) -> usize {248let n_bits = offset + length;249let n_bytes = n_bits / 8;250let rest = n_bits % 8;251if rest == 0 { n_bytes } else { n_bytes + 1 }252}253254#[pymethods]255impl PySeries {256/// Construct a PySeries from information about its underlying buffers.257#[staticmethod]258#[pyo3(signature = (dtype, data, validity))]259unsafe fn _from_buffers(260py: Python<'_>,261dtype: Wrap<DataType>,262data: Vec<PySeries>,263validity: Option<PySeries>,264) -> PyResult<Self> {265let dtype = dtype.0;266let mut data = data.to_series();267268match data.len() {2690 => {270let msg = "`data` input to `_from_buffers` must contain at least one buffer";271return Err(PyTypeError::new_err(msg));272},2731 if validity.is_none() => {274let values = data.pop().unwrap();275let s = values.strict_cast(&dtype).map_err(PyPolarsErr::from)?;276return Ok(s.into());277},278_ => (),279}280281let validity = match validity {282Some(ps) => {283let s = ps.series.into_inner();284let dtype = s.dtype();285if !dtype.is_bool() {286let msg = format!("validity buffer must have data type Boolean, got {dtype:?}");287return Err(PyTypeError::new_err(msg));288}289Some(series_to_bitmap(s).unwrap())290},291None => None,292};293294let s = match dtype.to_physical() {295dt if dt.is_primitive_numeric() => {296let values = data.into_iter().next().unwrap();297with_match_physical_numeric_polars_type!(dt, |$T| {298let values_buffer = series_to_buffer::<$T>(values);299from_buffers_num_impl::<<$T as PolarsNumericType>::Native>(values_buffer, validity)?300})301},302DataType::Boolean => {303let values = data.into_iter().next().unwrap();304let values_buffer = series_to_bitmap(values)?;305from_buffers_bool_impl(values_buffer, validity)?306},307DataType::String => {308let mut data_iter = data.into_iter();309let values = data_iter.next().unwrap();310let offsets = match data_iter.next() {311Some(s) => {312let dtype = s.dtype();313if !matches!(dtype, DataType::Int64) {314return Err(PyTypeError::new_err(format!(315"offsets buffer must have data type Int64, got {dtype:?}"316)));317}318series_to_offsets(s)319},320None => {321return Err(PyTypeError::new_err(322"`_from_buffers` cannot create a String column without an offsets buffer",323));324},325};326let values = series_to_buffer::<UInt8Type>(values);327py.enter_polars(|| from_buffers_string_impl(values, validity, offsets))?328},329dt => {330let msg = format!("`_from_buffers` not implemented for `dtype` {dt}");331return Err(PyTypeError::new_err(msg));332},333};334335let out = s.strict_cast(&dtype).map_err(PyPolarsErr::from)?;336Ok(out.into())337}338}339340fn series_to_buffer<T>(s: Series) -> Buffer<T::Native>341where342T: PolarsNumericType,343{344let ca: &ChunkedArray<T> = s.as_ref().as_ref();345let ca = ca.rechunk();346ca.downcast_as_array().values().clone()347}348fn series_to_bitmap(s: Series) -> PyResult<Bitmap> {349let ca_result = s.bool();350let ca = ca_result.map_err(PyPolarsErr::from)?.rechunk();351Ok(ca.downcast_as_array().values().clone())352}353fn series_to_offsets(s: Series) -> OffsetsBuffer<i64> {354let buffer = series_to_buffer::<Int64Type>(s);355unsafe { OffsetsBuffer::new_unchecked(buffer) }356}357358fn from_buffers_num_impl<T: NativeType>(359data: Buffer<T>,360validity: Option<Bitmap>,361) -> PyResult<Series> {362let arr = PrimitiveArray::new(T::PRIMITIVE.into(), data, validity);363let s_result = Series::from_arrow(PlSmallStr::EMPTY, arr.to_boxed());364let s = s_result.map_err(PyPolarsErr::from)?;365Ok(s)366}367fn from_buffers_bool_impl(data: Bitmap, validity: Option<Bitmap>) -> PyResult<Series> {368let arr = BooleanArray::new(ArrowDataType::Boolean, data, validity);369let s_result = Series::from_arrow(PlSmallStr::EMPTY, arr.to_boxed());370let s = s_result.map_err(PyPolarsErr::from)?;371Ok(s)372}373/// Constructing a `String` Series requires specifying a values and offsets buffer,374/// which does not match the actual underlying buffers. The values and offsets375/// buffer are converted into the actual buffers, which copies data.376fn from_buffers_string_impl(377data: Buffer<u8>,378validity: Option<Bitmap>,379offsets: OffsetsBuffer<i64>,380) -> PyResult<Series> {381let arr = Utf8Array::new(ArrowDataType::LargeUtf8, offsets, data, validity);382383// This is not zero-copy384let s_result = Series::from_arrow(PlSmallStr::EMPTY, arr.to_boxed());385386let s = s_result.map_err(PyPolarsErr::from)?;387Ok(s)388}389390391