Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/crates/polars-python/src/interop/numpy/to_numpy_df.rs
7892 views
1
use ndarray::IntoDimension;
2
use numpy::npyffi::flags;
3
use numpy::{Element, IntoPyArray, PyArray1};
4
use polars_core::prelude::*;
5
use polars_core::utils::dtypes_to_supertype;
6
use polars_core::with_match_physical_numeric_polars_type;
7
use pyo3::exceptions::PyRuntimeError;
8
use pyo3::prelude::*;
9
use pyo3::types::PyList;
10
use pyo3::{IntoPyObjectExt, intern};
11
12
use super::to_numpy_series::series_to_numpy;
13
use super::utils::{
14
create_borrowed_np_array, dtype_supports_view, polars_dtype_to_np_temporal_dtype,
15
};
16
use crate::conversion::Wrap;
17
use crate::dataframe::PyDataFrame;
18
19
#[pymethods]
20
impl PyDataFrame {
21
/// Convert this DataFrame to a NumPy ndarray.
22
fn to_numpy(
23
&self,
24
py: Python<'_>,
25
order: Wrap<IndexOrder>,
26
writable: bool,
27
allow_copy: bool,
28
) -> PyResult<Py<PyAny>> {
29
df_to_numpy(py, &self.df.read(), order.0, writable, allow_copy)
30
}
31
}
32
33
pub(super) fn df_to_numpy(
34
py: Python<'_>,
35
df: &DataFrame,
36
order: IndexOrder,
37
writable: bool,
38
allow_copy: bool,
39
) -> PyResult<Py<PyAny>> {
40
if df.is_empty() {
41
// Take this path to ensure a writable array.
42
// This does not actually copy data for an empty DataFrame.
43
return df_to_numpy_with_copy(py, df, order, true);
44
}
45
46
if matches!(order, IndexOrder::Fortran) {
47
if let Some(mut arr) = try_df_to_numpy_view(py, df, false) {
48
if writable {
49
if !allow_copy {
50
return Err(PyRuntimeError::new_err(
51
"copy not allowed: cannot create a writable array without copying data",
52
));
53
}
54
arr = arr.call_method0(py, intern!(py, "copy"))?;
55
}
56
return Ok(arr);
57
}
58
}
59
60
if !allow_copy {
61
return Err(PyRuntimeError::new_err(
62
"copy not allowed: cannot convert to a NumPy array without copying data",
63
));
64
}
65
66
df_to_numpy_with_copy(py, df, order, writable)
67
}
68
69
/// Create a NumPy view of the given DataFrame.
70
fn try_df_to_numpy_view(py: Python<'_>, df: &DataFrame, allow_nulls: bool) -> Option<Py<PyAny>> {
71
let first_dtype = check_df_dtypes_support_view(df)?;
72
73
// TODO: Check for nested nulls using `series_contains_null` util when we support Array types.
74
if !allow_nulls && df.get_columns().iter().any(|s| s.null_count() > 0) {
75
return None;
76
}
77
if !check_df_columns_contiguous(df) {
78
return None;
79
}
80
81
let owner = PyDataFrame::from(df.clone()).into_py_any(py).ok()?; // Keep the DataFrame memory alive.
82
83
let arr = match first_dtype {
84
dt if dt.is_primitive_numeric() => {
85
with_match_physical_numpy_polars_type!(first_dtype, |$T| {
86
numeric_df_to_numpy_view::<$T>(py, df, owner)
87
})
88
},
89
DataType::Datetime(_, _) | DataType::Duration(_) => {
90
temporal_df_to_numpy_view(py, df, owner)
91
},
92
_ => unreachable!(),
93
};
94
Some(arr)
95
}
96
/// Check whether the data types of the DataFrame allow for creating a NumPy view.
97
///
98
/// Returns the common data type if it is supported, otherwise returns `None`.
99
fn check_df_dtypes_support_view(df: &DataFrame) -> Option<&DataType> {
100
let columns = df.get_columns();
101
let first_dtype = columns.first()?.dtype();
102
103
// TODO: Support viewing Array types
104
if first_dtype.is_array() || !dtype_supports_view(first_dtype) {
105
return None;
106
}
107
if columns.iter().any(|s| s.dtype() != first_dtype) {
108
return None;
109
}
110
Some(first_dtype)
111
}
112
/// Returns whether all columns of the dataframe are contiguous in memory.
113
fn check_df_columns_contiguous(df: &DataFrame) -> bool {
114
let columns = df.get_columns();
115
116
if columns
117
.iter()
118
.any(|s| s.as_materialized_series().n_chunks() > 1)
119
{
120
return false;
121
}
122
if columns.len() <= 1 {
123
return true;
124
}
125
126
match columns.first().unwrap().dtype() {
127
dt if dt.is_primitive_numeric() => {
128
with_match_physical_numeric_polars_type!(dt, |$T| {
129
let slices = columns
130
.iter()
131
.map(|s| {
132
let ca: &ChunkedArray<$T> = s.as_materialized_series().unpack().unwrap();
133
ca.data_views().next().unwrap()
134
})
135
.collect::<Vec<_>>();
136
137
check_slices_contiguous::<$T>(slices)
138
})
139
},
140
DataType::Datetime(_, _) | DataType::Duration(_) => {
141
let phys: Vec<_> = columns.iter().map(|s| s.to_physical_repr()).collect();
142
let slices = phys
143
.iter()
144
.map(|s| {
145
let ca = s.i64().unwrap();
146
ca.data_views().next().unwrap()
147
})
148
.collect::<Vec<_>>();
149
150
check_slices_contiguous::<Int64Type>(slices)
151
},
152
_ => panic!("invalid data type"),
153
}
154
}
155
/// Returns whether the end and start pointers of all consecutive slices match.
156
fn check_slices_contiguous<T>(slices: Vec<&[T::Native]>) -> bool
157
where
158
T: PolarsNumericType,
159
{
160
let first_slice = slices.first().unwrap();
161
162
// Check whether all arrays are from the same buffer.
163
let mut end_ptr = unsafe { first_slice.as_ptr().add(first_slice.len()) };
164
slices[1..].iter().all(|slice| {
165
let slice_ptr = slice.as_ptr();
166
let valid = std::ptr::eq(slice_ptr, end_ptr);
167
168
end_ptr = unsafe { slice_ptr.add(slice.len()) };
169
170
valid
171
})
172
}
173
174
/// Create a NumPy view of a numeric DataFrame.
175
fn numeric_df_to_numpy_view<T>(py: Python<'_>, df: &DataFrame, owner: Py<PyAny>) -> Py<PyAny>
176
where
177
T: PolarsNumericType,
178
T::Native: Element,
179
{
180
let ca: &ChunkedArray<T> = df
181
.get_columns()
182
.first()
183
.unwrap()
184
.as_materialized_series()
185
.unpack()
186
.unwrap();
187
let first_slice = ca.data_views().next().unwrap();
188
189
let start_ptr = first_slice.as_ptr();
190
let np_dtype = T::Native::get_dtype(py);
191
let dims = [first_slice.len(), df.width()].into_dimension();
192
193
unsafe {
194
create_borrowed_np_array::<_>(
195
py,
196
np_dtype,
197
dims,
198
flags::NPY_ARRAY_FARRAY_RO,
199
start_ptr as _,
200
owner,
201
)
202
}
203
}
204
/// Create a NumPy view of a Datetime or Duration DataFrame.
205
fn temporal_df_to_numpy_view(py: Python<'_>, df: &DataFrame, owner: Py<PyAny>) -> Py<PyAny> {
206
let s = df.get_columns().first().unwrap();
207
let phys = s.to_physical_repr();
208
let ca = phys.i64().unwrap();
209
let first_slice = ca.data_views().next().unwrap();
210
211
let start_ptr = first_slice.as_ptr();
212
let np_dtype = polars_dtype_to_np_temporal_dtype(py, s.dtype());
213
let dims = [first_slice.len(), df.width()].into_dimension();
214
215
unsafe {
216
create_borrowed_np_array::<_>(
217
py,
218
np_dtype,
219
dims,
220
flags::NPY_ARRAY_FARRAY_RO,
221
start_ptr as _,
222
owner,
223
)
224
}
225
}
226
227
fn df_to_numpy_with_copy(
228
py: Python<'_>,
229
df: &DataFrame,
230
order: IndexOrder,
231
writable: bool,
232
) -> PyResult<Py<PyAny>> {
233
if let Some(arr) = try_df_to_numpy_numeric_supertype(py, df, order) {
234
Ok(arr)
235
} else {
236
df_columns_to_numpy(py, df, order, writable)
237
}
238
}
239
fn try_df_to_numpy_numeric_supertype(
240
py: Python<'_>,
241
df: &DataFrame,
242
order: IndexOrder,
243
) -> Option<Py<PyAny>> {
244
let st = dtypes_to_supertype(df.iter().map(|s| s.dtype())).ok()?;
245
246
let np_array = match st {
247
dt if dt.is_primitive_numeric() => with_match_physical_numpy_polars_type!(dt, |$T| {
248
df.to_ndarray::<$T>(order).ok()?.into_pyarray(py).into_py_any(py).ok()?
249
}),
250
_ => return None,
251
};
252
Some(np_array)
253
}
254
255
fn df_columns_to_numpy(
256
py: Python<'_>,
257
df: &DataFrame,
258
order: IndexOrder,
259
writable: bool,
260
) -> PyResult<Py<PyAny>> {
261
let np_arrays = df.iter().map(|s| {
262
let mut arr = series_to_numpy(py, s, writable, true).unwrap();
263
264
// Convert multidimensional arrays to 1D object arrays.
265
let shape: Vec<usize> = arr
266
.getattr(py, intern!(py, "shape"))
267
.unwrap()
268
.extract(py)
269
.unwrap();
270
if shape.len() > 1 {
271
// TODO: Downcast the NumPy array to Rust and split without calling into Python.
272
let subarrays = (0..shape[0]).map(|idx| {
273
arr.call_method1(py, intern!(py, "__getitem__"), (idx,))
274
.unwrap()
275
});
276
arr = PyArray1::from_iter(py, subarrays).into_py_any(py).unwrap();
277
}
278
arr
279
});
280
281
let numpy = PyModule::import(py, intern!(py, "numpy"))?;
282
let np_array = match order {
283
IndexOrder::C => numpy
284
.getattr(intern!(py, "column_stack"))?
285
.call1((PyList::new(py, np_arrays)?,))?,
286
IndexOrder::Fortran => numpy
287
.getattr(intern!(py, "vstack"))?
288
.call1((PyList::new(py, np_arrays)?,))?
289
.getattr(intern!(py, "T"))?,
290
};
291
292
Ok(np_array.into())
293
}
294
295