// Path: blob/main/crates/polars-python/src/interop/numpy/to_numpy_df.rs
// 8353 views
use ndarray::IntoDimension;1use numpy::npyffi::flags;2use numpy::{Element, IntoPyArray, PyArray1};3use polars_core::prelude::*;4use polars_core::utils::dtypes_to_supertype;5use polars_core::with_match_physical_numeric_polars_type;6use pyo3::exceptions::PyRuntimeError;7use pyo3::prelude::*;8use pyo3::types::{PyList, PyTuple};9use pyo3::{IntoPyObjectExt, intern};1011use super::to_numpy_series::series_to_numpy;12use super::utils::{13create_borrowed_np_array, dtype_supports_view, polars_dtype_to_np_temporal_dtype,14};15use crate::conversion::Wrap;16use crate::dataframe::PyDataFrame;1718#[pymethods]19impl PyDataFrame {20/// Convert this DataFrame to a NumPy ndarray.21fn to_numpy(22&self,23py: Python<'_>,24order: Wrap<IndexOrder>,25writable: bool,26allow_copy: bool,27) -> PyResult<Py<PyAny>> {28df_to_numpy(py, &self.df.read(), order.0, writable, allow_copy)29}30}3132pub(super) fn df_to_numpy(33py: Python<'_>,34df: &DataFrame,35order: IndexOrder,36writable: bool,37allow_copy: bool,38) -> PyResult<Py<PyAny>> {39if df.shape_has_zero() {40if df.width() == 0 {41let shape = PyTuple::new(py, [df.height(), df.width()])?;42let numpy = super::utils::get_numpy_module(py)?;4344return Ok(numpy45.call_method1(46intern!(py, "zeros"),47(shape, numpy.getattr(intern!(py, "int8"))?),48)?49.unbind());50}51// Take this path to ensure a writable array.52// This does not actually copy data for an empty DataFrame.53return df_to_numpy_with_copy(py, df, order, true);54}5556if matches!(order, IndexOrder::Fortran) {57if let Some(mut arr) = try_df_to_numpy_view(py, df, false) {58if writable {59if !allow_copy {60return Err(PyRuntimeError::new_err(61"copy not allowed: cannot create a writable array without copying data",62));63}64arr = arr.call_method0(py, intern!(py, "copy"))?;65}66return Ok(arr);67}68}6970if !allow_copy {71return Err(PyRuntimeError::new_err(72"copy not allowed: cannot convert to a NumPy array without copying data",73));74}7576df_to_numpy_with_copy(py, df, order, writable)77}7879/// 
Create a NumPy view of the given DataFrame.80fn try_df_to_numpy_view(py: Python<'_>, df: &DataFrame, allow_nulls: bool) -> Option<Py<PyAny>> {81let first_dtype = check_df_dtypes_support_view(df)?;8283// TODO: Check for nested nulls using `series_contains_null` util when we support Array types.84if !allow_nulls && df.columns().iter().any(|s| s.null_count() > 0) {85return None;86}87if !check_df_columns_contiguous(df) {88return None;89}9091let owner = PyDataFrame::from(df.clone()).into_py_any(py).ok()?; // Keep the DataFrame memory alive.9293let arr = match first_dtype {94dt if dt.is_primitive_numeric() => {95with_match_physical_numpy_polars_type!(first_dtype, |$T| {96numeric_df_to_numpy_view::<$T>(py, df, owner)97})98},99DataType::Datetime(_, _) | DataType::Duration(_) => {100temporal_df_to_numpy_view(py, df, owner)101},102_ => unreachable!(),103};104Some(arr)105}106/// Check whether the data types of the DataFrame allow for creating a NumPy view.107///108/// Returns the common data type if it is supported, otherwise returns `None`.109fn check_df_dtypes_support_view(df: &DataFrame) -> Option<&DataType> {110let columns = df.columns();111let first_dtype = columns.first()?.dtype();112113// TODO: Support viewing Array types114if first_dtype.is_array() || !dtype_supports_view(first_dtype) {115return None;116}117if columns.iter().any(|s| s.dtype() != first_dtype) {118return None;119}120Some(first_dtype)121}122/// Returns whether all columns of the dataframe are contiguous in memory.123fn check_df_columns_contiguous(df: &DataFrame) -> bool {124let columns = df.columns();125126if columns127.iter()128.any(|s| s.as_materialized_series().n_chunks() > 1)129{130return false;131}132if columns.len() <= 1 {133return true;134}135136match columns.first().unwrap().dtype() {137dt if dt.is_primitive_numeric() => {138with_match_physical_numeric_polars_type!(dt, |$T| {139let slices = columns140.iter()141.map(|s| {142let ca: &ChunkedArray<$T> = 
s.as_materialized_series().unpack().unwrap();143ca.data_views().next().unwrap()144})145.collect::<Vec<_>>();146147check_slices_contiguous::<$T>(slices)148})149},150DataType::Datetime(_, _) | DataType::Duration(_) => {151let phys: Vec<_> = columns.iter().map(|s| s.to_physical_repr()).collect();152let slices = phys153.iter()154.map(|s| {155let ca = s.i64().unwrap();156ca.data_views().next().unwrap()157})158.collect::<Vec<_>>();159160check_slices_contiguous::<Int64Type>(slices)161},162_ => panic!("invalid data type"),163}164}165/// Returns whether the end and start pointers of all consecutive slices match.166fn check_slices_contiguous<T>(slices: Vec<&[T::Native]>) -> bool167where168T: PolarsNumericType,169{170let first_slice = slices.first().unwrap();171172// Check whether all arrays are from the same buffer.173let mut end_ptr = unsafe { first_slice.as_ptr().add(first_slice.len()) };174slices[1..].iter().all(|slice| {175let slice_ptr = slice.as_ptr();176let valid = std::ptr::eq(slice_ptr, end_ptr);177178end_ptr = unsafe { slice_ptr.add(slice.len()) };179180valid181})182}183184/// Create a NumPy view of a numeric DataFrame.185fn numeric_df_to_numpy_view<T>(py: Python<'_>, df: &DataFrame, owner: Py<PyAny>) -> Py<PyAny>186where187T: PolarsNumericType,188T::Native: Element,189{190let ca: &ChunkedArray<T> = df191.columns()192.first()193.unwrap()194.as_materialized_series()195.unpack()196.unwrap();197let first_slice = ca.data_views().next().unwrap();198199let start_ptr = first_slice.as_ptr();200let np_dtype = T::Native::get_dtype(py);201let dims = [first_slice.len(), df.width()].into_dimension();202203unsafe {204create_borrowed_np_array::<_>(205py,206np_dtype,207dims,208flags::NPY_ARRAY_FARRAY_RO,209start_ptr as _,210owner,211)212}213}214/// Create a NumPy view of a Datetime or Duration DataFrame.215fn temporal_df_to_numpy_view(py: Python<'_>, df: &DataFrame, owner: Py<PyAny>) -> Py<PyAny> {216let s = df.columns().first().unwrap();217let phys = s.to_physical_repr();218let 
ca = phys.i64().unwrap();219let first_slice = ca.data_views().next().unwrap();220221let start_ptr = first_slice.as_ptr();222let np_dtype = polars_dtype_to_np_temporal_dtype(py, s.dtype());223let dims = [first_slice.len(), df.width()].into_dimension();224225unsafe {226create_borrowed_np_array::<_>(227py,228np_dtype,229dims,230flags::NPY_ARRAY_FARRAY_RO,231start_ptr as _,232owner,233)234}235}236237fn df_to_numpy_with_copy(238py: Python<'_>,239df: &DataFrame,240order: IndexOrder,241writable: bool,242) -> PyResult<Py<PyAny>> {243if let Some(arr) = try_df_to_numpy_numeric_supertype(py, df, order) {244Ok(arr)245} else {246df_columns_to_numpy(py, df, order, writable)247}248}249fn try_df_to_numpy_numeric_supertype(250py: Python<'_>,251df: &DataFrame,252order: IndexOrder,253) -> Option<Py<PyAny>> {254let st = dtypes_to_supertype(df.columns().iter().map(|s| s.dtype())).ok()?;255256let np_array = match st {257dt if dt.is_primitive_numeric() => with_match_physical_numpy_polars_type!(dt, |$T| {258df.to_ndarray::<$T>(order).ok()?.into_pyarray(py).into_py_any(py).ok()?259}),260_ => return None,261};262Some(np_array)263}264265fn df_columns_to_numpy(266py: Python<'_>,267df: &DataFrame,268order: IndexOrder,269writable: bool,270) -> PyResult<Py<PyAny>> {271let np_arrays = df.columns().iter().map(|c| {272let mut arr = series_to_numpy(py, c.as_materialized_series(), writable, true).unwrap();273274// Convert multidimensional arrays to 1D object arrays.275let shape: Vec<usize> = arr276.getattr(py, intern!(py, "shape"))277.unwrap()278.extract(py)279.unwrap();280if shape.len() > 1 {281// TODO: Downcast the NumPy array to Rust and split without calling into Python.282let subarrays = (0..shape[0]).map(|idx| {283arr.call_method1(py, intern!(py, "__getitem__"), (idx,))284.unwrap()285});286arr = PyArray1::from_iter(py, subarrays).into_py_any(py).unwrap();287}288arr289});290291let numpy = super::utils::get_numpy_module(py)?;292let np_array = match order {293IndexOrder::C => 
numpy294.getattr(intern!(py, "column_stack"))?295.call1((PyList::new(py, np_arrays)?,))?,296IndexOrder::Fortran => numpy297.getattr(intern!(py, "vstack"))?298.call1((PyList::new(py, np_arrays)?,))?299.getattr(intern!(py, "T"))?,300};301302Ok(np_array.into())303}304305306