Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/crates/polars-python/src/interop/numpy/to_numpy_df.rs
8353 views
1
use ndarray::IntoDimension;
2
use numpy::npyffi::flags;
3
use numpy::{Element, IntoPyArray, PyArray1};
4
use polars_core::prelude::*;
5
use polars_core::utils::dtypes_to_supertype;
6
use polars_core::with_match_physical_numeric_polars_type;
7
use pyo3::exceptions::PyRuntimeError;
8
use pyo3::prelude::*;
9
use pyo3::types::{PyList, PyTuple};
10
use pyo3::{IntoPyObjectExt, intern};
11
12
use super::to_numpy_series::series_to_numpy;
13
use super::utils::{
14
create_borrowed_np_array, dtype_supports_view, polars_dtype_to_np_temporal_dtype,
15
};
16
use crate::conversion::Wrap;
17
use crate::dataframe::PyDataFrame;
18
19
#[pymethods]
20
impl PyDataFrame {
21
/// Convert this DataFrame to a NumPy ndarray.
22
fn to_numpy(
23
&self,
24
py: Python<'_>,
25
order: Wrap<IndexOrder>,
26
writable: bool,
27
allow_copy: bool,
28
) -> PyResult<Py<PyAny>> {
29
df_to_numpy(py, &self.df.read(), order.0, writable, allow_copy)
30
}
31
}
32
33
pub(super) fn df_to_numpy(
34
py: Python<'_>,
35
df: &DataFrame,
36
order: IndexOrder,
37
writable: bool,
38
allow_copy: bool,
39
) -> PyResult<Py<PyAny>> {
40
if df.shape_has_zero() {
41
if df.width() == 0 {
42
let shape = PyTuple::new(py, [df.height(), df.width()])?;
43
let numpy = super::utils::get_numpy_module(py)?;
44
45
return Ok(numpy
46
.call_method1(
47
intern!(py, "zeros"),
48
(shape, numpy.getattr(intern!(py, "int8"))?),
49
)?
50
.unbind());
51
}
52
// Take this path to ensure a writable array.
53
// This does not actually copy data for an empty DataFrame.
54
return df_to_numpy_with_copy(py, df, order, true);
55
}
56
57
if matches!(order, IndexOrder::Fortran) {
58
if let Some(mut arr) = try_df_to_numpy_view(py, df, false) {
59
if writable {
60
if !allow_copy {
61
return Err(PyRuntimeError::new_err(
62
"copy not allowed: cannot create a writable array without copying data",
63
));
64
}
65
arr = arr.call_method0(py, intern!(py, "copy"))?;
66
}
67
return Ok(arr);
68
}
69
}
70
71
if !allow_copy {
72
return Err(PyRuntimeError::new_err(
73
"copy not allowed: cannot convert to a NumPy array without copying data",
74
));
75
}
76
77
df_to_numpy_with_copy(py, df, order, writable)
78
}
79
80
/// Create a NumPy view of the given DataFrame.
81
fn try_df_to_numpy_view(py: Python<'_>, df: &DataFrame, allow_nulls: bool) -> Option<Py<PyAny>> {
82
let first_dtype = check_df_dtypes_support_view(df)?;
83
84
// TODO: Check for nested nulls using `series_contains_null` util when we support Array types.
85
if !allow_nulls && df.columns().iter().any(|s| s.null_count() > 0) {
86
return None;
87
}
88
if !check_df_columns_contiguous(df) {
89
return None;
90
}
91
92
let owner = PyDataFrame::from(df.clone()).into_py_any(py).ok()?; // Keep the DataFrame memory alive.
93
94
let arr = match first_dtype {
95
dt if dt.is_primitive_numeric() => {
96
with_match_physical_numpy_polars_type!(first_dtype, |$T| {
97
numeric_df_to_numpy_view::<$T>(py, df, owner)
98
})
99
},
100
DataType::Datetime(_, _) | DataType::Duration(_) => {
101
temporal_df_to_numpy_view(py, df, owner)
102
},
103
_ => unreachable!(),
104
};
105
Some(arr)
106
}
107
/// Check whether the data types of the DataFrame allow for creating a NumPy view.
108
///
109
/// Returns the common data type if it is supported, otherwise returns `None`.
110
fn check_df_dtypes_support_view(df: &DataFrame) -> Option<&DataType> {
111
let columns = df.columns();
112
let first_dtype = columns.first()?.dtype();
113
114
// TODO: Support viewing Array types
115
if first_dtype.is_array() || !dtype_supports_view(first_dtype) {
116
return None;
117
}
118
if columns.iter().any(|s| s.dtype() != first_dtype) {
119
return None;
120
}
121
Some(first_dtype)
122
}
123
/// Returns whether all columns of the dataframe are contiguous in memory.
124
fn check_df_columns_contiguous(df: &DataFrame) -> bool {
125
let columns = df.columns();
126
127
if columns
128
.iter()
129
.any(|s| s.as_materialized_series().n_chunks() > 1)
130
{
131
return false;
132
}
133
if columns.len() <= 1 {
134
return true;
135
}
136
137
match columns.first().unwrap().dtype() {
138
dt if dt.is_primitive_numeric() => {
139
with_match_physical_numeric_polars_type!(dt, |$T| {
140
let slices = columns
141
.iter()
142
.map(|s| {
143
let ca: &ChunkedArray<$T> = s.as_materialized_series().unpack().unwrap();
144
ca.data_views().next().unwrap()
145
})
146
.collect::<Vec<_>>();
147
148
check_slices_contiguous::<$T>(slices)
149
})
150
},
151
DataType::Datetime(_, _) | DataType::Duration(_) => {
152
let phys: Vec<_> = columns.iter().map(|s| s.to_physical_repr()).collect();
153
let slices = phys
154
.iter()
155
.map(|s| {
156
let ca = s.i64().unwrap();
157
ca.data_views().next().unwrap()
158
})
159
.collect::<Vec<_>>();
160
161
check_slices_contiguous::<Int64Type>(slices)
162
},
163
_ => panic!("invalid data type"),
164
}
165
}
166
/// Returns whether the end and start pointers of all consecutive slices match.
167
fn check_slices_contiguous<T>(slices: Vec<&[T::Native]>) -> bool
168
where
169
T: PolarsNumericType,
170
{
171
let first_slice = slices.first().unwrap();
172
173
// Check whether all arrays are from the same buffer.
174
let mut end_ptr = unsafe { first_slice.as_ptr().add(first_slice.len()) };
175
slices[1..].iter().all(|slice| {
176
let slice_ptr = slice.as_ptr();
177
let valid = std::ptr::eq(slice_ptr, end_ptr);
178
179
end_ptr = unsafe { slice_ptr.add(slice.len()) };
180
181
valid
182
})
183
}
184
185
/// Create a NumPy view of a numeric DataFrame.
186
fn numeric_df_to_numpy_view<T>(py: Python<'_>, df: &DataFrame, owner: Py<PyAny>) -> Py<PyAny>
187
where
188
T: PolarsNumericType,
189
T::Native: Element,
190
{
191
let ca: &ChunkedArray<T> = df
192
.columns()
193
.first()
194
.unwrap()
195
.as_materialized_series()
196
.unpack()
197
.unwrap();
198
let first_slice = ca.data_views().next().unwrap();
199
200
let start_ptr = first_slice.as_ptr();
201
let np_dtype = T::Native::get_dtype(py);
202
let dims = [first_slice.len(), df.width()].into_dimension();
203
204
unsafe {
205
create_borrowed_np_array::<_>(
206
py,
207
np_dtype,
208
dims,
209
flags::NPY_ARRAY_FARRAY_RO,
210
start_ptr as _,
211
owner,
212
)
213
}
214
}
215
/// Create a NumPy view of a Datetime or Duration DataFrame.
216
fn temporal_df_to_numpy_view(py: Python<'_>, df: &DataFrame, owner: Py<PyAny>) -> Py<PyAny> {
217
let s = df.columns().first().unwrap();
218
let phys = s.to_physical_repr();
219
let ca = phys.i64().unwrap();
220
let first_slice = ca.data_views().next().unwrap();
221
222
let start_ptr = first_slice.as_ptr();
223
let np_dtype = polars_dtype_to_np_temporal_dtype(py, s.dtype());
224
let dims = [first_slice.len(), df.width()].into_dimension();
225
226
unsafe {
227
create_borrowed_np_array::<_>(
228
py,
229
np_dtype,
230
dims,
231
flags::NPY_ARRAY_FARRAY_RO,
232
start_ptr as _,
233
owner,
234
)
235
}
236
}
237
238
fn df_to_numpy_with_copy(
239
py: Python<'_>,
240
df: &DataFrame,
241
order: IndexOrder,
242
writable: bool,
243
) -> PyResult<Py<PyAny>> {
244
if let Some(arr) = try_df_to_numpy_numeric_supertype(py, df, order) {
245
Ok(arr)
246
} else {
247
df_columns_to_numpy(py, df, order, writable)
248
}
249
}
250
fn try_df_to_numpy_numeric_supertype(
251
py: Python<'_>,
252
df: &DataFrame,
253
order: IndexOrder,
254
) -> Option<Py<PyAny>> {
255
let st = dtypes_to_supertype(df.columns().iter().map(|s| s.dtype())).ok()?;
256
257
let np_array = match st {
258
dt if dt.is_primitive_numeric() => with_match_physical_numpy_polars_type!(dt, |$T| {
259
df.to_ndarray::<$T>(order).ok()?.into_pyarray(py).into_py_any(py).ok()?
260
}),
261
_ => return None,
262
};
263
Some(np_array)
264
}
265
266
fn df_columns_to_numpy(
267
py: Python<'_>,
268
df: &DataFrame,
269
order: IndexOrder,
270
writable: bool,
271
) -> PyResult<Py<PyAny>> {
272
let np_arrays = df.columns().iter().map(|c| {
273
let mut arr = series_to_numpy(py, c.as_materialized_series(), writable, true).unwrap();
274
275
// Convert multidimensional arrays to 1D object arrays.
276
let shape: Vec<usize> = arr
277
.getattr(py, intern!(py, "shape"))
278
.unwrap()
279
.extract(py)
280
.unwrap();
281
if shape.len() > 1 {
282
// TODO: Downcast the NumPy array to Rust and split without calling into Python.
283
let subarrays = (0..shape[0]).map(|idx| {
284
arr.call_method1(py, intern!(py, "__getitem__"), (idx,))
285
.unwrap()
286
});
287
arr = PyArray1::from_iter(py, subarrays).into_py_any(py).unwrap();
288
}
289
arr
290
});
291
292
let numpy = super::utils::get_numpy_module(py)?;
293
let np_array = match order {
294
IndexOrder::C => numpy
295
.getattr(intern!(py, "column_stack"))?
296
.call1((PyList::new(py, np_arrays)?,))?,
297
IndexOrder::Fortran => numpy
298
.getattr(intern!(py, "vstack"))?
299
.call1((PyList::new(py, np_arrays)?,))?
300
.getattr(intern!(py, "T"))?,
301
};
302
303
Ok(np_array.into())
304
}
305
306