Path: blob/main/crates/polars-python/src/series/buffers.rs
8405 views
//! Construct and deconstruct Series based on the underlying buffers.1//!2//! This functionality is mainly intended for use with the Python dataframe3//! interchange protocol.4//!5//! As Polars has no Buffer concept in Python, each buffer is represented as6//! a Series of its physical type.7//!8//! Note that String Series have underlying `Utf8View` buffers, which9//! currently cannot be represented as Series. Since the interchange protocol10//! cannot handle these buffers anyway and expects bytes and offsets buffers,11//! operations on String Series will convert from/to such buffers. This12//! conversion requires data to be copied.1314use arrow::array::{Array, BooleanArray, PrimitiveArray, Utf8Array};15use arrow::bitmap::Bitmap;16use arrow::offset::OffsetsBuffer;17use arrow::types::NativeType;18use polars::prelude::*;19use polars_buffer::Buffer;20use polars_core::{with_match_physical_numeric_polars_type, with_match_physical_numeric_type};21use pyo3::exceptions::PyTypeError;22use pyo3::prelude::*;23use pyo3::types::PyTuple;2425use super::{PySeries, ToSeries};26use crate::conversion::Wrap;27use crate::error::PyPolarsErr;28use crate::raise_err;29use crate::utils::EnterPolarsExt;3031struct BufferInfo {32pointer: usize,33offset: usize,34length: usize,35}36impl<'py> IntoPyObject<'py> for BufferInfo {37type Target = PyTuple;38type Output = Bound<'py, Self::Target>;39type Error = PyErr;4041fn into_pyobject(self, py: Python<'py>) -> Result<Self::Output, Self::Error> {42(self.pointer, self.offset, self.length).into_pyobject(py)43}44}45impl<'a, 'py> FromPyObject<'a, 'py> for BufferInfo {46type Error = PyErr;4748fn extract(ob: Borrowed<'a, 'py, PyAny>) -> PyResult<Self> {49let (pointer, offset, length) = ob.extract()?;50Ok(Self {51pointer,52offset,53length,54})55}56}5758#[pymethods]59impl PySeries {60/// Return pointer, offset, and length information about the underlying buffer.61fn _get_buffer_info(&self) -> PyResult<BufferInfo> {62let lock = self.series.read();63let s = lock.to_physical_repr();64let arrays = s.chunks();65if arrays.len() != 1 {66let msg = "cannot get buffer info for Series consisting of multiple chunks";67raise_err!(msg, ComputeError);68}69match s.dtype() {70DataType::Boolean => {71let ca = s.bool().unwrap();72let arr = ca.downcast_iter().next().unwrap();73let (slice, offset, len) = arr.values().as_slice();74Ok(BufferInfo {75pointer: slice.as_ptr() as usize,76offset,77length: len,78})79},80dt if dt.is_primitive_numeric() => {81Ok(with_match_physical_numeric_polars_type!(dt, |$T| {82let ca: &ChunkedArray<$T> = s.as_ref().as_ref().as_ref();83BufferInfo { pointer: get_pointer(ca), offset: 0, length: ca.len() }84}))85},86dt => {87let msg = format!(88"`_get_buffer_info` not implemented for non-physical type {dt}; try to select a buffer first"89);90Err(PyTypeError::new_err(msg))91},92}93}9495/// Return the underlying values, validity, and offsets buffers as Series.96fn _get_buffers(&self, py: Python) -> PyResult<(Self, Option<Self>, Option<Self>)> {97let s = &self.series.read();98py.enter_polars(|| match s.dtype().to_physical() {99dt if dt.is_primitive_numeric() => get_buffers_from_primitive(s),100DataType::Boolean => get_buffers_from_primitive(s),101DataType::String => get_buffers_from_string(s),102dt => {103let msg = format!("`_get_buffers` not implemented for `dtype` {dt}");104Err(PyTypeError::new_err(msg))105},106})107}108}109110fn get_pointer<T: PolarsNumericType>(ca: &ChunkedArray<T>) -> usize {111let arr = ca.downcast_iter().next().unwrap();112arr.values().as_ptr() as usize113}114115fn get_buffers_from_primitive(116s: &Series,117) -> PyResult<(PySeries, Option<PySeries>, Option<PySeries>)> {118let chunks = s119.chunks()120.iter()121.map(|arr| arr.with_validity(None))122.collect::<Vec<_>>();123let values = Series::try_from((s.name().clone(), chunks))124.map_err(PyPolarsErr::from)?125.into();126127let validity = get_bitmap(s);128let offsets = None;129Ok((values, validity, offsets))130}131132/// The underlying buffers for `String` Series cannot be represented in this133/// format. Instead, the buffers are converted to a values and offsets buffer.134/// This copies data.135fn get_buffers_from_string(s: &Series) -> PyResult<(PySeries, Option<PySeries>, Option<PySeries>)> {136// We cannot do this zero copy anyway, so rechunk first137let s = s.rechunk();138139let ca = s.str().map_err(PyPolarsErr::from)?;140let arr_binview = ca.downcast_iter().next().unwrap();141142// This is not zero-copy143let arr_utf8 = polars_compute::cast::utf8view_to_utf8(arr_binview);144145let values = get_string_bytes(&arr_utf8)?;146let validity = get_bitmap(&s);147let offsets = get_string_offsets(&arr_utf8)?;148149Ok((values, validity, Some(offsets)))150}151152fn get_bitmap(s: &Series) -> Option<PySeries> {153if s.null_count() > 0 {154Some(s.is_not_null().into_series().into())155} else {156None157}158}159160fn get_string_bytes(arr: &Utf8Array<i64>) -> PyResult<PySeries> {161let values_buffer = arr.values();162let values_arr =163PrimitiveArray::<u8>::try_new(ArrowDataType::UInt8, values_buffer.clone(), None)164.map_err(PyPolarsErr::from)?;165let values = Series::from_arrow(PlSmallStr::EMPTY, values_arr.to_boxed())166.map_err(PyPolarsErr::from)?167.into();168Ok(values)169}170171fn get_string_offsets(arr: &Utf8Array<i64>) -> PyResult<PySeries> {172let offsets_buffer = arr.offsets().buffer();173let offsets_arr =174PrimitiveArray::<i64>::try_new(ArrowDataType::Int64, offsets_buffer.clone(), None)175.map_err(PyPolarsErr::from)?;176let offsets = Series::from_arrow(PlSmallStr::EMPTY, offsets_arr.to_boxed())177.map_err(PyPolarsErr::from)?178.into();179Ok(offsets)180}181182#[pymethods]183impl PySeries {184/// Construct a PySeries from information about its underlying buffer.185#[staticmethod]186unsafe fn _from_buffer(187dtype: Wrap<DataType>,188buffer_info: BufferInfo,189owner: &Bound<'_, PyAny>,190) -> PyResult<Self> {191let dtype = dtype.0;192let BufferInfo {193pointer,194offset,195length,196} = buffer_info;197let owner = owner.to_owned().unbind();198199let arr_boxed = match dtype {200dt if dt.is_primitive_numeric() => {201with_match_physical_numeric_type!(dt, |$T| unsafe {202from_buffer_impl::<$T>(pointer, offset, length, owner)203})204},205DataType::Boolean => {206unsafe { from_buffer_boolean_impl(pointer, offset, length, owner) }?207},208dt => {209let msg = format!(210"`_from_buffer` requires a physical type as input for `dtype`, got {dt}"211);212return Err(PyTypeError::new_err(msg));213},214};215216let s = Series::from_arrow(PlSmallStr::EMPTY, arr_boxed)217.unwrap()218.into();219Ok(s)220}221}222223unsafe fn from_buffer_impl<T: NativeType>(224pointer: usize,225offset: usize,226length: usize,227owner: Py<PyAny>,228) -> Box<dyn Array> {229let pointer = pointer as *const T;230let pointer = unsafe { pointer.add(offset) };231let slice = unsafe { std::slice::from_raw_parts(pointer, length) };232let arr = unsafe { arrow::ffi::mmap::slice_and_owner(slice, owner) };233arr.to_boxed()234}235unsafe fn from_buffer_boolean_impl(236pointer: usize,237offset: usize,238length: usize,239owner: Py<PyAny>,240) -> PyResult<Box<dyn Array>> {241let length_in_bytes = get_boolean_buffer_length_in_bytes(length, offset);242243let pointer = pointer as *const u8;244let slice = unsafe { std::slice::from_raw_parts(pointer, length_in_bytes) };245let arr_result = unsafe { arrow::ffi::mmap::bitmap_and_owner(slice, offset, length, owner) };246let arr = arr_result.map_err(PyPolarsErr::from)?;247Ok(arr.to_boxed())248}249fn get_boolean_buffer_length_in_bytes(length: usize, offset: usize) -> usize {250let n_bits = offset + length;251let n_bytes = n_bits / 8;252let rest = n_bits % 8;253if rest == 0 { n_bytes } else { n_bytes + 1 }254}255256#[pymethods]257impl PySeries {258/// Construct a PySeries from information about its underlying buffers.259#[staticmethod]260#[pyo3(signature = (dtype, data, validity))]261unsafe fn _from_buffers(262py: Python<'_>,263dtype: Wrap<DataType>,264data: Vec<PySeries>,265validity: Option<PySeries>,266) -> PyResult<Self> {267let dtype = dtype.0;268let mut data = data.to_series();269270match data.len() {2710 => {272let msg = "`data` input to `_from_buffers` must contain at least one buffer";273return Err(PyTypeError::new_err(msg));274},2751 if validity.is_none() => {276let values = data.pop().unwrap();277let s = values.strict_cast(&dtype).map_err(PyPolarsErr::from)?;278return Ok(s.into());279},280_ => (),281}282283let validity = match validity {284Some(ps) => {285let s = ps.series.into_inner();286let dtype = s.dtype();287if !dtype.is_bool() {288let msg = format!("validity buffer must have data type Boolean, got {dtype:?}");289return Err(PyTypeError::new_err(msg));290}291Some(series_to_bitmap(s).unwrap())292},293None => None,294};295296let s = match dtype.to_physical() {297dt if dt.is_primitive_numeric() => {298let values = data.into_iter().next().unwrap();299with_match_physical_numeric_polars_type!(dt, |$T| {300let values_buffer = series_to_buffer::<$T>(values);301from_buffers_num_impl::<<$T as PolarsNumericType>::Native>(values_buffer, validity)?302})303},304DataType::Boolean => {305let values = data.into_iter().next().unwrap();306let values_buffer = series_to_bitmap(values)?;307from_buffers_bool_impl(values_buffer, validity)?308},309DataType::String => {310let mut data_iter = data.into_iter();311let values = data_iter.next().unwrap();312let offsets = match data_iter.next() {313Some(s) => {314let dtype = s.dtype();315if !matches!(dtype, DataType::Int64) {316return Err(PyTypeError::new_err(format!(317"offsets buffer must have data type Int64, got {dtype:?}"318)));319}320series_to_offsets(s)321},322None => {323return Err(PyTypeError::new_err(324"`_from_buffers` cannot create a String column without an offsets buffer",325));326},327};328let values = series_to_buffer::<UInt8Type>(values);329py.enter_polars(|| from_buffers_string_impl(values, validity, offsets))?330},331dt => {332let msg = format!("`_from_buffers` not implemented for `dtype` {dt}");333return Err(PyTypeError::new_err(msg));334},335};336337let out = s.strict_cast(&dtype).map_err(PyPolarsErr::from)?;338Ok(out.into())339}340}341342fn series_to_buffer<T>(s: Series) -> Buffer<T::Native>343where344T: PolarsNumericType,345{346let ca: &ChunkedArray<T> = s.as_ref().as_ref();347let ca = ca.rechunk();348ca.downcast_as_array().values().clone()349}350fn series_to_bitmap(s: Series) -> PyResult<Bitmap> {351let ca_result = s.bool();352let ca = ca_result.map_err(PyPolarsErr::from)?.rechunk();353Ok(ca.downcast_as_array().values().clone())354}355fn series_to_offsets(s: Series) -> OffsetsBuffer<i64> {356let buffer = series_to_buffer::<Int64Type>(s);357unsafe { OffsetsBuffer::new_unchecked(buffer) }358}359360fn from_buffers_num_impl<T: NativeType>(361data: Buffer<T>,362validity: Option<Bitmap>,363) -> PyResult<Series> {364let arr = PrimitiveArray::new(T::PRIMITIVE.into(), data, validity);365let s_result = Series::from_arrow(PlSmallStr::EMPTY, arr.to_boxed());366let s = s_result.map_err(PyPolarsErr::from)?;367Ok(s)368}369fn from_buffers_bool_impl(data: Bitmap, validity: Option<Bitmap>) -> PyResult<Series> {370let arr = BooleanArray::new(ArrowDataType::Boolean, data, validity);371let s_result = Series::from_arrow(PlSmallStr::EMPTY, arr.to_boxed());372let s = s_result.map_err(PyPolarsErr::from)?;373Ok(s)374}375/// Constructing a `String` Series requires specifying a values and offsets buffer,376/// which does not match the actual underlying buffers. The values and offsets377/// buffer are converted into the actual buffers, which copies data.378fn from_buffers_string_impl(379data: Buffer<u8>,380validity: Option<Bitmap>,381offsets: OffsetsBuffer<i64>,382) -> PyResult<Series> {383let arr = Utf8Array::new(ArrowDataType::LargeUtf8, offsets, data, validity);384385// This is not zero-copy386let s_result = Series::from_arrow(PlSmallStr::EMPTY, arr.to_boxed());387388let s = s_result.map_err(PyPolarsErr::from)?;389Ok(s)390}391392393