// Path: crates/polars-python/src/interop/numpy/to_numpy_df.rs
use ndarray::IntoDimension;
use numpy::npyffi::flags;
use numpy::{Element, IntoPyArray, PyArray1};
use polars_core::prelude::*;
use polars_core::utils::dtypes_to_supertype;
use polars_core::with_match_physical_numeric_polars_type;
use pyo3::exceptions::PyRuntimeError;
use pyo3::prelude::*;
use pyo3::types::PyList;
use pyo3::{IntoPyObjectExt, intern};

use super::to_numpy_series::series_to_numpy;
use super::utils::{
    create_borrowed_np_array, dtype_supports_view, polars_dtype_to_np_temporal_dtype,
};
use crate::conversion::Wrap;
use crate::dataframe::PyDataFrame;

#[pymethods]
impl PyDataFrame {
    /// Convert this DataFrame to a NumPy ndarray.
    ///
    /// Thin Python-facing wrapper: unwraps the `IndexOrder` and delegates to
    /// [`df_to_numpy`].
    fn to_numpy(
        &self,
        py: Python<'_>,
        order: Wrap<IndexOrder>,
        writable: bool,
        allow_copy: bool,
    ) -> PyResult<Py<PyAny>> {
        df_to_numpy(py, &self.df.read(), order.0, writable, allow_copy)
    }
}

/// Convert a DataFrame to a NumPy ndarray.
///
/// Strategy, in order of preference:
/// 1. Empty frame: always go through the copy path so the result is writable.
/// 2. Fortran order: try a zero-copy view of the column buffers; if the caller
///    asked for a writable array, copy the view (erroring if `allow_copy` is false).
/// 3. Otherwise fall back to a copying conversion, or error if `allow_copy` is false.
pub(super) fn df_to_numpy(
    py: Python<'_>,
    df: &DataFrame,
    order: IndexOrder,
    writable: bool,
    allow_copy: bool,
) -> PyResult<Py<PyAny>> {
    if df.is_empty() {
        // Take this path to ensure a writable array.
        // This does not actually copy data for an empty DataFrame.
        return df_to_numpy_with_copy(py, df, order, true);
    }

    // A zero-copy view is only possible in Fortran (column-major) order, since
    // the underlying columns are separate contiguous buffers.
    if matches!(order, IndexOrder::Fortran) {
        if let Some(mut arr) = try_df_to_numpy_view(py, df, false) {
            if writable {
                if !allow_copy {
                    return Err(PyRuntimeError::new_err(
                        "copy not allowed: cannot create a writable array without copying data",
                    ));
                }
                // The view is read-only; a writable array requires `ndarray.copy()`.
                arr = arr.call_method0(py, intern!(py, "copy"))?;
            }
            return Ok(arr);
        }
    }

    if !allow_copy {
        return Err(PyRuntimeError::new_err(
            "copy not allowed: cannot convert to a NumPy array without copying data",
        ));
    }

    df_to_numpy_with_copy(py, df, order, writable)
}

/// Create a NumPy view of the given DataFrame.
///
/// Returns `None` when a view is not possible: unsupported/mixed dtypes,
/// nulls present (unless `allow_nulls`), or non-contiguous column buffers.
fn try_df_to_numpy_view(py: Python<'_>, df: &DataFrame, allow_nulls: bool) -> Option<Py<PyAny>> {
    let first_dtype = check_df_dtypes_support_view(df)?;

    // TODO: Check for nested nulls using `series_contains_null` util when we support Array types.
    if !allow_nulls && df.get_columns().iter().any(|s| s.null_count() > 0) {
        return None;
    }
    if !check_df_columns_contiguous(df) {
        return None;
    }

    let owner = PyDataFrame::from(df.clone()).into_py_any(py).ok()?; // Keep the DataFrame memory alive.

    let arr = match first_dtype {
        dt if dt.is_primitive_numeric() => {
            with_match_physical_numpy_polars_type!(first_dtype, |$T| {
                numeric_df_to_numpy_view::<$T>(py, df, owner)
            })
        },
        DataType::Datetime(_, _) | DataType::Duration(_) => {
            temporal_df_to_numpy_view(py, df, owner)
        },
        // `check_df_dtypes_support_view` only lets viewable dtypes through.
        _ => unreachable!(),
    };
    Some(arr)
}
/// Check whether the data types of the DataFrame allow for creating a NumPy view.
///
/// Returns the common data type if it is supported, otherwise returns `None`.
/// `None` is also returned for an empty DataFrame (no first column) and when
/// the columns do not all share the same dtype.
fn check_df_dtypes_support_view(df: &DataFrame) -> Option<&DataType> {
    let columns = df.get_columns();
    let first_dtype = columns.first()?.dtype();

    // TODO: Support viewing Array types
    if first_dtype.is_array() || !dtype_supports_view(first_dtype) {
        return None;
    }
    if columns.iter().any(|s| s.dtype() != first_dtype) {
        return None;
    }
    Some(first_dtype)
}
/// Returns whether all columns of the dataframe are contiguous in memory.
///
/// "Contiguous" here means: every column is a single chunk, and the columns'
/// buffers are laid out back-to-back, so the whole frame can be exposed as one
/// Fortran-ordered 2D array.
fn check_df_columns_contiguous(df: &DataFrame) -> bool {
    let columns = df.get_columns();

    // A multi-chunk column can never be a single contiguous buffer.
    if columns
        .iter()
        .any(|s| s.as_materialized_series().n_chunks() > 1)
    {
        return false;
    }
    // Zero or one single-chunk column is trivially contiguous.
    if columns.len() <= 1 {
        return true;
    }

    match columns.first().unwrap().dtype() {
        dt if dt.is_primitive_numeric() => {
            with_match_physical_numeric_polars_type!(dt, |$T| {
                let slices = columns
                    .iter()
                    .map(|s| {
                        let ca: &ChunkedArray<$T> = s.as_materialized_series().unpack().unwrap();
                        // Single chunk guaranteed above, so the first view is the whole column.
                        ca.data_views().next().unwrap()
                    })
                    .collect::<Vec<_>>();

                check_slices_contiguous::<$T>(slices)
            })
        },
        DataType::Datetime(_, _) | DataType::Duration(_) => {
            // Temporal types view their physical i64 representation.
            let phys: Vec<_> = columns.iter().map(|s| s.to_physical_repr()).collect();
            let slices = phys
                .iter()
                .map(|s| {
                    let ca = s.i64().unwrap();
                    ca.data_views().next().unwrap()
                })
                .collect::<Vec<_>>();

            check_slices_contiguous::<Int64Type>(slices)
        },
        // Callers are expected to pre-filter dtypes via `check_df_dtypes_support_view`.
        _ => panic!("invalid data type"),
    }
}
/// Returns whether the end and start pointers of all consecutive slices match.
///
/// Callers must pass at least one slice (`first()` is unwrapped); in this file
/// it is only called with two or more columns.
fn check_slices_contiguous<T>(slices: Vec<&[T::Native]>) -> bool
where
    T: PolarsNumericType,
{
    let first_slice = slices.first().unwrap();

    // Check whether all arrays are from the same buffer.
    // SAFETY: `add(len)` produces the one-past-the-end pointer of the slice,
    // which is valid to compute (but not dereference).
    let mut end_ptr = unsafe { first_slice.as_ptr().add(first_slice.len()) };
    slices[1..].iter().all(|slice| {
        let slice_ptr = slice.as_ptr();
        // Contiguous iff this slice starts exactly where the previous one ended.
        let valid = std::ptr::eq(slice_ptr, end_ptr);

        // SAFETY: one-past-the-end pointer of `slice`, never dereferenced.
        end_ptr = unsafe { slice_ptr.add(slice.len()) };

        valid
    })
}

/// Create a NumPy view of a numeric DataFrame.
///
/// `owner` is the Python object that keeps the backing memory alive; it is
/// attached to the resulting borrowed array.
fn numeric_df_to_numpy_view<T>(py: Python<'_>, df: &DataFrame, owner: Py<PyAny>) -> Py<PyAny>
where
    T: PolarsNumericType,
    T::Native: Element,
{
    let ca: &ChunkedArray<T> = df
        .get_columns()
        .first()
        .unwrap()
        .as_materialized_series()
        .unpack()
        .unwrap();
    let first_slice = ca.data_views().next().unwrap();

    // The first column's buffer starts the combined (rows x width) region;
    // callers have verified all column buffers are contiguous behind it.
    let start_ptr = first_slice.as_ptr();
    let np_dtype = T::Native::get_dtype(py);
    let dims = [first_slice.len(), df.width()].into_dimension();

    // SAFETY: the buffer spans `rows * width` contiguous elements (checked by
    // the caller via `check_df_columns_contiguous`), the view is created
    // read-only (NPY_ARRAY_FARRAY_RO), and `owner` keeps the memory alive.
    unsafe {
        create_borrowed_np_array::<_>(
            py,
            np_dtype,
            dims,
            flags::NPY_ARRAY_FARRAY_RO,
            start_ptr as _,
            owner,
        )
    }
}
/// Create a NumPy view of a Datetime or Duration DataFrame.
///
/// Views the physical i64 representation with the matching NumPy temporal dtype.
fn temporal_df_to_numpy_view(py: Python<'_>, df: &DataFrame, owner: Py<PyAny>) -> Py<PyAny> {
    let s = df.get_columns().first().unwrap();
    let phys = s.to_physical_repr();
    let ca = phys.i64().unwrap();
    let first_slice = ca.data_views().next().unwrap();

    let start_ptr = first_slice.as_ptr();
    // Map the logical Datetime/Duration dtype to the NumPy datetime64/timedelta64 equivalent.
    let np_dtype = polars_dtype_to_np_temporal_dtype(py, s.dtype());
    let dims = [first_slice.len(), df.width()].into_dimension();

    // SAFETY: same invariants as `numeric_df_to_numpy_view` — contiguity checked
    // by the caller, read-only flags, and `owner` keeps the memory alive.
    unsafe {
        create_borrowed_np_array::<_>(
            py,
            np_dtype,
            dims,
            flags::NPY_ARRAY_FARRAY_RO,
            start_ptr as _,
            owner,
        )
    }
}

/// Convert the DataFrame to a NumPy array by copying the data.
///
/// Prefers a single homogeneous array via a numeric supertype; otherwise
/// converts column by column and stacks the results.
fn df_to_numpy_with_copy(
    py: Python<'_>,
    df: &DataFrame,
    order: IndexOrder,
    writable: bool,
) -> PyResult<Py<PyAny>> {
    if let Some(arr) = try_df_to_numpy_numeric_supertype(py, df, order) {
        Ok(arr)
    } else {
        df_columns_to_numpy(py, df, order, writable)
    }
}
/// Try to convert the whole DataFrame to a single numeric ndarray by casting
/// all columns to their common numeric supertype.
///
/// Returns `None` when no supertype exists or it is not a primitive numeric type.
fn try_df_to_numpy_numeric_supertype(
    py: Python<'_>,
    df: &DataFrame,
    order: IndexOrder,
) -> Option<Py<PyAny>> {
    let st = dtypes_to_supertype(df.iter().map(|s| s.dtype())).ok()?;

    let np_array = match st {
        dt if dt.is_primitive_numeric() => with_match_physical_numpy_polars_type!(dt, |$T| {
            df.to_ndarray::<$T>(order).ok()?.into_pyarray(py).into_py_any(py).ok()?
        }),
        _ => return None,
    };
    Some(np_array)
}

/// Convert each column to a NumPy array separately, then stack them into a 2D
/// array with `numpy.column_stack` (C order) or `numpy.vstack(...).T` (Fortran order).
fn df_columns_to_numpy(
    py: Python<'_>,
    df: &DataFrame,
    order: IndexOrder,
    writable: bool,
) -> PyResult<Py<PyAny>> {
    let np_arrays = df.iter().map(|s| {
        let mut arr = series_to_numpy(py, s, writable, true).unwrap();

        // Convert multidimensional arrays to 1D object arrays.
        let shape: Vec<usize> = arr
            .getattr(py, intern!(py, "shape"))
            .unwrap()
            .extract(py)
            .unwrap();
        if shape.len() > 1 {
            // TODO: Downcast the NumPy array to Rust and split without calling into Python.
            let subarrays = (0..shape[0]).map(|idx| {
                arr.call_method1(py, intern!(py, "__getitem__"), (idx,))
                    .unwrap()
            });
            arr = PyArray1::from_iter(py, subarrays).into_py_any(py).unwrap();
        }
        arr
    });

    let numpy = PyModule::import(py, intern!(py, "numpy"))?;
    let np_array = match order {
        IndexOrder::C => numpy
            .getattr(intern!(py, "column_stack"))?
            .call1((PyList::new(py, np_arrays)?,))?,
        IndexOrder::Fortran => numpy
            .getattr(intern!(py, "vstack"))?
            .call1((PyList::new(py, np_arrays)?,))?
            // Transpose of the row-stacked array yields Fortran-ordered columns.
            .getattr(intern!(py, "T"))?,
    };

    Ok(np_array.into())
}