Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/crates/polars-python/src/dataframe/general.rs
8353 views
1
use std::hash::BuildHasher;
2
3
use arrow::bitmap::MutableBitmap;
4
use either::Either;
5
use parking_lot::RwLock;
6
use polars::prelude::*;
7
use polars_ffi::version_0::SeriesExport;
8
use pyo3::exceptions::PyIndexError;
9
use pyo3::prelude::*;
10
use pyo3::pybacked::PyBackedStr;
11
use pyo3::types::{PyList, PyType};
12
13
use self::row_encode::{_get_rows_encoded_ca, _get_rows_encoded_ca_unordered};
14
use super::PyDataFrame;
15
use crate::PyLazyFrame;
16
use crate::conversion::Wrap;
17
use crate::error::PyPolarsErr;
18
use crate::prelude::strings_to_pl_smallstr;
19
use crate::py_modules::polars;
20
use crate::series::{PySeries, ToPySeries, ToSeries};
21
use crate::utils::{EnterPolarsExt, to_py_err};
22
23
#[pymethods]
24
impl PyDataFrame {
25
#[new]
26
pub fn __init__(columns: Vec<PySeries>) -> PyResult<Self> {
27
let columns = columns.to_series();
28
// @scalar-opt
29
let columns = columns.into_iter().map(|s| s.into()).collect();
30
let df = DataFrame::new_infer_height(columns).map_err(PyPolarsErr::from)?;
31
Ok(PyDataFrame::new(df))
32
}
33
34
#[staticmethod]
35
pub fn empty_with_height(height: u64) -> PyResult<Self> {
36
Ok(PyDataFrame::new(DataFrame::empty_with_height(
37
IdxSize::try_from(height)
38
.map_err(|_| polars_err!(bigidx, ctx = "DataFrame(height = _)", size = height))
39
.map_err(to_py_err)? as usize,
40
)))
41
}
42
43
pub fn estimated_size(&self) -> usize {
44
self.df.read().estimated_size()
45
}
46
47
pub fn dtype_strings(&self) -> Vec<String> {
48
self.df
49
.read()
50
.columns()
51
.iter()
52
.map(|s| format!("{}", s.dtype()))
53
.collect()
54
}
55
56
/// Apply `+` between the wrapped `DataFrame` and the series `s`.
///
/// Both read locks are taken inside the closure run by `enter_polars_df`.
pub fn add(&self, py: Python<'_>, s: &PySeries) -> PyResult<Self> {
    py.enter_polars_df(|| {
        let frame = self.df.read();
        let rhs = s.series.read();
        &*frame + &*rhs
    })
}
59
60
/// Apply `-` between the wrapped `DataFrame` and the series `s`.
///
/// Both read locks are taken inside the closure run by `enter_polars_df`.
pub fn sub(&self, py: Python<'_>, s: &PySeries) -> PyResult<Self> {
    py.enter_polars_df(|| {
        let frame = self.df.read();
        let rhs = s.series.read();
        &*frame - &*rhs
    })
}
63
64
/// Apply `*` between the wrapped `DataFrame` and the series `s`.
///
/// Both read locks are taken inside the closure run by `enter_polars_df`.
pub fn mul(&self, py: Python<'_>, s: &PySeries) -> PyResult<Self> {
    py.enter_polars_df(|| {
        let frame = self.df.read();
        let rhs = s.series.read();
        &*frame * &*rhs
    })
}
67
68
/// Apply `/` between the wrapped `DataFrame` and the series `s`.
///
/// Both read locks are taken inside the closure run by `enter_polars_df`.
pub fn div(&self, py: Python<'_>, s: &PySeries) -> PyResult<Self> {
    py.enter_polars_df(|| {
        let frame = self.df.read();
        let rhs = s.series.read();
        &*frame / &*rhs
    })
}
71
72
/// Apply `%` between the wrapped `DataFrame` and the series `s`.
///
/// Both read locks are taken inside the closure run by `enter_polars_df`.
pub fn rem(&self, py: Python<'_>, s: &PySeries) -> PyResult<Self> {
    py.enter_polars_df(|| {
        let frame = self.df.read();
        let rhs = s.series.read();
        &*frame % &*rhs
    })
}
75
76
/// Apply `+` between the wrapped `DataFrame` and the frame wrapped by `s`.
///
/// Both read locks are taken inside the closure run by `enter_polars_df`.
pub fn add_df(&self, py: Python<'_>, s: &Self) -> PyResult<Self> {
    py.enter_polars_df(|| {
        let lhs = self.df.read();
        let rhs = s.df.read();
        &*lhs + &*rhs
    })
}
79
80
/// Apply `-` between the wrapped `DataFrame` and the frame wrapped by `s`.
///
/// Both read locks are taken inside the closure run by `enter_polars_df`.
pub fn sub_df(&self, py: Python<'_>, s: &Self) -> PyResult<Self> {
    py.enter_polars_df(|| {
        let lhs = self.df.read();
        let rhs = s.df.read();
        &*lhs - &*rhs
    })
}
83
84
/// Apply `*` between the wrapped `DataFrame` and the frame wrapped by `s`.
///
/// Both read locks are taken inside the closure run by `enter_polars_df`.
pub fn mul_df(&self, py: Python<'_>, s: &Self) -> PyResult<Self> {
    py.enter_polars_df(|| {
        let lhs = self.df.read();
        let rhs = s.df.read();
        &*lhs * &*rhs
    })
}
87
88
/// Apply `/` between the wrapped `DataFrame` and the frame wrapped by `s`.
///
/// Both read locks are taken inside the closure run by `enter_polars_df`.
pub fn div_df(&self, py: Python<'_>, s: &Self) -> PyResult<Self> {
    py.enter_polars_df(|| {
        let lhs = self.df.read();
        let rhs = s.df.read();
        &*lhs / &*rhs
    })
}
91
92
/// Apply `%` between the wrapped `DataFrame` and the frame wrapped by `s`.
///
/// Both read locks are taken inside the closure run by `enter_polars_df`.
pub fn rem_df(&self, py: Python<'_>, s: &Self) -> PyResult<Self> {
    py.enter_polars_df(|| {
        let lhs = self.df.read();
        let rhs = s.df.read();
        &*lhs % &*rhs
    })
}
95
96
#[pyo3(signature = (n, with_replacement, shuffle, seed=None))]
97
pub fn sample_n(
98
&self,
99
py: Python<'_>,
100
n: &PySeries,
101
with_replacement: bool,
102
shuffle: bool,
103
seed: Option<u64>,
104
) -> PyResult<Self> {
105
py.enter_polars_df(|| {
106
self.df
107
.read()
108
.sample_n(&n.series.read(), with_replacement, shuffle, seed)
109
})
110
}
111
112
#[pyo3(signature = (frac, with_replacement, shuffle, seed=None))]
113
pub fn sample_frac(
114
&self,
115
py: Python<'_>,
116
frac: &PySeries,
117
with_replacement: bool,
118
shuffle: bool,
119
seed: Option<u64>,
120
) -> PyResult<Self> {
121
py.enter_polars_df(|| {
122
self.df
123
.read()
124
.sample_frac(&frac.series.read(), with_replacement, shuffle, seed)
125
})
126
}
127
128
/// Return a rechunked copy of the frame.
///
/// The wrapped frame is cloned first and `rechunk_mut_par` runs on the clone,
/// so `self` is left untouched.
pub fn rechunk(&self, py: Python) -> PyResult<Self> {
    py.enter_polars_df(|| {
        let mut rechunked = self.df.read().clone();
        rechunked.rechunk_mut_par();
        Ok(rechunked)
    })
}
135
136
/// Format `DataFrame` as String
137
pub fn as_str(&self) -> String {
138
format!("{:?}", self.df.read())
139
}
140
141
/// Return all columns as `PySeries`, in column order.
pub fn get_columns(&self) -> Vec<PySeries> {
    // Copy the columns out inside a block so the read guard is released before
    // the conversion runs.
    let owned_columns = {
        let frame = self.df.read();
        frame.columns().to_vec()
    };
    owned_columns.to_pyseries()
}
145
146
/// Get column names
147
pub fn columns(&self) -> Vec<String> {
148
self.df
149
.read()
150
.columns()
151
.iter()
152
.map(|s| s.name().to_string())
153
.collect()
154
}
155
156
/// set column names
157
pub fn set_column_names(&self, names: Vec<PyBackedStr>) -> PyResult<()> {
158
self.df
159
.write()
160
.set_column_names(&names)
161
.map_err(PyPolarsErr::from)?;
162
Ok(())
163
}
164
165
/// Get datatypes
166
pub fn dtypes<'py>(&self, py: Python<'py>) -> PyResult<Bound<'py, PyList>> {
167
let df = self.df.read();
168
let iter = df
169
.columns()
170
.iter()
171
.map(|s| Wrap(s.dtype().clone()).into_pyobject(py).unwrap());
172
PyList::new(py, iter)
173
}
174
175
pub fn n_chunks(&self) -> usize {
176
self.df.read().first_col_n_chunks()
177
}
178
179
pub fn shape(&self) -> (usize, usize) {
180
self.df.read().shape()
181
}
182
183
pub fn height(&self) -> usize {
184
self.df.read().height()
185
}
186
187
pub fn width(&self) -> usize {
188
self.df.read().width()
189
}
190
191
pub fn is_empty(&self) -> bool {
192
self.df.read().shape_has_zero()
193
}
194
195
/// Return a new frame produced by `DataFrame::hstack` with `columns`.
///
/// The series are converted to columns before entering polars; `self` is only
/// read.
pub fn hstack(&self, py: Python<'_>, columns: Vec<PySeries>) -> PyResult<Self> {
    // @scalar-opt
    let extra: Vec<_> = columns.to_series().into_iter().map(Into::into).collect();
    py.enter_polars_df(|| {
        let frame = self.df.read();
        frame.hstack(&extra)
    })
}
201
202
/// Append `columns` to the wrapped frame in place via `DataFrame::hstack_mut`.
///
/// The write lock is taken inside the closure run by `enter_polars`.
pub fn hstack_mut(&self, py: Python<'_>, columns: Vec<PySeries>) -> PyResult<()> {
    // @scalar-opt
    let extra: Vec<_> = columns.to_series().into_iter().map(Into::into).collect();
    // `hstack_mut` hands back a reference to the frame; only success matters here.
    py.enter_polars(|| self.df.write().hstack_mut(&extra).map(|_| ()))?;
    Ok(())
}
209
210
/// Return a new frame produced by `DataFrame::vstack` with `other`.
///
/// Both frames are only read.
pub fn vstack(&self, py: Python<'_>, other: &PyDataFrame) -> PyResult<Self> {
    py.enter_polars_df(|| {
        let top = self.df.read();
        let bottom = other.df.read();
        top.vstack(&bottom)
    })
}
213
214
/// Stack `other` onto the wrapped frame in place via `vstack_mut_owned`.
pub fn vstack_mut(&self, py: Python<'_>, other: &PyDataFrame) -> PyResult<()> {
    py.enter_polars(|| {
        // Prevent self-vstack deadlocks: clone `other` so its read guard is
        // released before the write lock on `self` is requested.
        let snapshot = other.df.read().clone();
        self.df.write().vstack_mut_owned(snapshot).map(|_| ())
    })?;
    Ok(())
}
223
224
/// Extend the wrapped frame in place with `other` via `DataFrame::extend`.
pub fn extend(&self, py: Python<'_>, other: &PyDataFrame) -> PyResult<()> {
    py.enter_polars(|| {
        // Prevent self-extend deadlocks: clone `other` so its read guard is
        // released before the write lock on `self` is requested.
        let snapshot = other.df.read().clone();
        let mut frame = self.df.write();
        frame.extend(&snapshot)
    })?;
    Ok(())
}
232
233
pub fn drop_in_place(&self, name: &str) -> PyResult<PySeries> {
234
let s = self
235
.df
236
.write()
237
.drop_in_place(name)
238
.map_err(PyPolarsErr::from)?;
239
let s = s.take_materialized_series();
240
Ok(PySeries::from(s))
241
}
242
243
pub fn to_series(&self, index: isize) -> PyResult<PySeries> {
244
let df = &self.df.read();
245
246
let index_adjusted = if index < 0 {
247
df.width().checked_sub(index.unsigned_abs())
248
} else {
249
Some(usize::try_from(index).unwrap())
250
};
251
252
let s = index_adjusted.and_then(|i| df.select_at_idx(i));
253
match s {
254
Some(s) => Ok(PySeries::new(s.as_materialized_series().clone())),
255
None => Err(PyIndexError::new_err(
256
polars_err!(oob = index, df.width()).to_string(),
257
)),
258
}
259
}
260
261
pub fn get_column_index(&self, name: &str) -> PyResult<usize> {
262
Ok(self
263
.df
264
.read()
265
.try_get_column_index(name)
266
.map_err(PyPolarsErr::from)?)
267
}
268
269
pub fn get_column(&self, name: &str) -> PyResult<PySeries> {
270
let series = self
271
.df
272
.read()
273
.column(name)
274
.map(|s| PySeries::new(s.as_materialized_series().clone()))
275
.map_err(PyPolarsErr::from)?;
276
Ok(series)
277
}
278
279
/// Return a new frame produced by `DataFrame::select` with the given names.
pub fn select(&self, py: Python<'_>, columns: Vec<PyBackedStr>) -> PyResult<Self> {
    py.enter_polars_df(|| {
        // Borrow every name as `&str` without copying the Python-backed strings.
        let names = columns.iter().map(|name| &**name);
        self.df.read().select(names)
    })
}
282
283
/// Return a new frame produced by `DataFrame::take` with `indices`.
///
/// The indices are packed into an unnamed `IdxCa` before entering polars.
pub fn gather(&self, py: Python<'_>, indices: Wrap<Vec<IdxSize>>) -> PyResult<Self> {
    let idx = IdxCa::from_vec("".into(), indices.0);
    py.enter_polars_df(|| {
        let frame = self.df.read();
        frame.take(&idx)
    })
}
288
289
/// Return a new frame produced by `DataFrame::take` with a series of indices.
///
/// `indices` is accessed through `Series::idx`; if that fails the polars error
/// is raised in Python via `PyPolarsErr`. The read guard on `indices` is held
/// for the whole call.
pub fn gather_with_series(&self, py: Python<'_>, indices: &PySeries) -> PyResult<Self> {
    let index_series = indices.series.read();
    let idx = index_series.idx().map_err(PyPolarsErr::from)?;
    py.enter_polars_df(|| {
        let frame = self.df.read();
        frame.take(idx)
    })
}
294
295
/// Replace the column named `column` with `new_col`, in place.
///
/// The error from `DataFrame::replace` is raised in Python via `PyPolarsErr`.
pub fn replace(&self, column: &str, new_col: PySeries) -> PyResult<()> {
    let mut frame = self.df.write();
    frame
        .replace(column, new_col.series.into_inner().into_column())
        .map_err(PyPolarsErr::from)?;
    Ok(())
}
302
303
/// Replace the column at position `index` with `new_column`, in place.
///
/// The error from `DataFrame::replace_column` is raised in Python via
/// `PyPolarsErr`.
pub fn replace_column(&self, index: usize, new_column: PySeries) -> PyResult<()> {
    let mut frame = self.df.write();
    frame
        .replace_column(index, new_column.series.into_inner().into_column())
        .map_err(PyPolarsErr::from)?;
    Ok(())
}
310
311
/// Insert `column` at position `index`, in place.
///
/// The error from `DataFrame::insert_column` is raised in Python via
/// `PyPolarsErr`.
pub fn insert_column(&self, index: usize, column: PySeries) -> PyResult<()> {
    let mut frame = self.df.write();
    frame
        .insert_column(index, column.series.into_inner().into_column())
        .map_err(PyPolarsErr::from)?;
    Ok(())
}
318
319
#[pyo3(signature = (offset, length))]
320
pub fn slice(&self, py: Python<'_>, offset: i64, length: Option<usize>) -> PyResult<Self> {
321
py.enter_polars_df(|| {
322
let df = self.df.read();
323
let len = length.unwrap_or(usize::MAX);
324
Ok(df.slice(offset, len))
325
})
326
}
327
328
/// Return the result of `DataFrame::head(Some(n))`.
pub fn head(&self, py: Python<'_>, n: usize) -> PyResult<Self> {
    py.enter_polars_df(|| {
        let frame = self.df.read();
        Ok(frame.head(Some(n)))
    })
}
331
332
/// Return the result of `DataFrame::tail(Some(n))`.
pub fn tail(&self, py: Python<'_>, n: usize) -> PyResult<Self> {
    py.enter_polars_df(|| {
        let frame = self.df.read();
        Ok(frame.tail(Some(n)))
    })
}
335
336
/// Return the result of `DataFrame::is_unique` as a series.
pub fn is_unique(&self, py: Python) -> PyResult<PySeries> {
    py.enter_polars_series(|| {
        let frame = self.df.read();
        frame.is_unique()
    })
}
339
340
/// Return the result of `DataFrame::is_duplicated` as a series.
pub fn is_duplicated(&self, py: Python) -> PyResult<PySeries> {
    py.enter_polars_series(|| {
        let frame = self.df.read();
        frame.is_duplicated()
    })
}
343
344
pub fn equals(&self, py: Python<'_>, other: &PyDataFrame, null_equal: bool) -> PyResult<bool> {
345
if null_equal {
346
py.enter_polars_ok(|| self.df.read().equals_missing(&other.df.read()))
347
} else {
348
py.enter_polars_ok(|| self.df.read().equals(&other.df.read()))
349
}
350
}
351
352
#[pyo3(signature = (name, offset=None))]
353
pub fn with_row_index(
354
&self,
355
py: Python<'_>,
356
name: &str,
357
offset: Option<IdxSize>,
358
) -> PyResult<Self> {
359
py.enter_polars_df(|| self.df.read().with_row_index(name.into(), offset))
360
}
361
362
pub fn _to_metadata(&self) -> Self {
363
Self {
364
df: RwLock::new(self.df.read()._to_metadata()),
365
}
366
}
367
368
pub fn group_by_map_groups(
369
&self,
370
py: Python<'_>,
371
by: Vec<PyBackedStr>,
372
lambda: Py<PyAny>,
373
maintain_order: bool,
374
) -> PyResult<Self> {
375
py.enter_polars_df(|| {
376
let df = self.df.read().clone(); // Clone so we can't deadlock on re-entrance from lambda.
377
let gb = if maintain_order {
378
df.group_by_stable(by.iter().map(|x| &**x))
379
} else {
380
df.group_by(by.iter().map(|x| &**x))
381
}?;
382
383
let function = move |df: DataFrame| {
384
Python::attach(|py| {
385
let pypolars = polars(py).bind(py);
386
let pydf = PyDataFrame::new(df);
387
let python_df_wrapper =
388
pypolars.getattr("wrap_df").unwrap().call1((pydf,)).unwrap();
389
390
// Call the lambda and get a python-side DataFrame wrapper.
391
let result_df_wrapper = match lambda.call1(py, (python_df_wrapper,)) {
392
Ok(pyobj) => pyobj,
393
Err(e) => panic!("UDF failed: {}", e.value(py)),
394
};
395
let py_pydf = result_df_wrapper.getattr(py, "_df").expect(
396
"Could not get DataFrame attribute '_df'. Make sure that you return a DataFrame object.",
397
);
398
399
let pydf = py_pydf.extract::<PyDataFrame>(py).unwrap();
400
Ok(pydf.df.into_inner())
401
})
402
};
403
404
gb.apply(function)
405
})
406
}
407
408
#[allow(clippy::should_implement_trait)]
409
pub fn clone(&self) -> Self {
410
Clone::clone(self)
411
}
412
413
/// Return a new frame produced by `unpivot2` (only built with the `pivot`
/// feature).
///
/// The arguments are assembled into `UnpivotArgsIR` together with the frame's
/// current column names. `value_name` and `variable_name` default to `None` on
/// the Python side.
#[cfg(feature = "pivot")]
#[pyo3(signature = (on, index, value_name=None, variable_name=None))]
pub fn unpivot(
    &self,
    py: Python<'_>,
    on: Option<Vec<PyBackedStr>>,
    index: Vec<PyBackedStr>,
    value_name: Option<&str>,
    variable_name: Option<&str>,
) -> PyResult<Self> {
    use polars_ops::unpivot::UnpivotDF;

    let all_columns = self.df.read().get_column_names_owned();
    let args = UnpivotArgsIR::new(
        all_columns,
        on.map(strings_to_pl_smallstr),
        strings_to_pl_smallstr(index),
        value_name.map(|name| name.into()),
        variable_name.map(|name| name.into()),
    );

    py.enter_polars_df(|| {
        let frame = self.df.read();
        frame.unpivot2(args)
    })
}
434
435
pub fn partition_by(
436
&self,
437
py: Python<'_>,
438
by: Vec<String>,
439
maintain_order: bool,
440
include_key: bool,
441
) -> PyResult<Vec<Self>> {
442
let out = py.enter_polars(|| {
443
if maintain_order {
444
self.df.read().partition_by_stable(by, include_key)
445
} else {
446
self.df.read().partition_by(by, include_key)
447
}
448
})?;
449
450
Ok(out.into_iter().map(PyDataFrame::from).collect())
451
}
452
453
/// Clone the wrapped frame and turn the clone into a `PyLazyFrame`.
pub fn lazy(&self) -> PyLazyFrame {
    let snapshot = self.df.read().clone();
    snapshot.lazy().into()
}
456
457
#[pyo3(signature = (columns, separator, drop_first, drop_nulls))]
458
pub fn to_dummies(
459
&self,
460
py: Python<'_>,
461
columns: Option<Vec<String>>,
462
separator: Option<&str>,
463
drop_first: bool,
464
drop_nulls: bool,
465
) -> PyResult<Self> {
466
py.enter_polars_df(|| match columns {
467
Some(cols) => self.df.read().columns_to_dummies(
468
cols.iter().map(|x| x as &str).collect(),
469
separator,
470
drop_first,
471
drop_nulls,
472
),
473
None => self.df.read().to_dummies(separator, drop_first, drop_nulls),
474
})
475
}
476
477
/// Return the frame produced by `DataFrame::null_count`.
pub fn null_count(&self, py: Python) -> PyResult<Self> {
    py.enter_polars_df(|| {
        let frame = self.df.read();
        Ok(frame.null_count())
    })
}
480
481
/// Call `DataFrame::shrink_to_fit` on the wrapped frame, in place.
pub fn shrink_to_fit(&self, py: Python) -> PyResult<()> {
    py.enter_polars_ok(|| {
        let mut frame = self.df.write();
        frame.shrink_to_fit()
    })
}
484
485
pub fn hash_rows(
486
&self,
487
py: Python<'_>,
488
k0: u64,
489
k1: u64,
490
k2: u64,
491
k3: u64,
492
) -> PyResult<PySeries> {
493
// TODO: don't expose all these seeds.
494
let seed = PlFixedStateQuality::default().hash_one((k0, k1, k2, k3));
495
let hb = PlSeedableRandomStateQuality::seed_from_u64(seed);
496
py.enter_polars_series(|| self.df.write().hash_rows(Some(hb)))
497
}
498
499
#[pyo3(signature = (keep_names_as, column_names))]
500
pub fn transpose(
501
&self,
502
py: Python<'_>,
503
keep_names_as: Option<&str>,
504
column_names: &Bound<PyAny>,
505
) -> PyResult<Self> {
506
let new_col_names = if let Ok(name) = column_names.extract::<Vec<String>>() {
507
Some(Either::Right(name))
508
} else if let Ok(name) = column_names.extract::<String>() {
509
Some(Either::Left(name))
510
} else {
511
None
512
};
513
py.enter_polars_df(|| self.df.write().transpose(keep_names_as, new_col_names))
514
}
515
516
pub fn upsample(
517
&self,
518
py: Python<'_>,
519
by: Vec<String>,
520
index_column: &str,
521
every: &str,
522
stable: bool,
523
) -> PyResult<Self> {
524
let every = Duration::try_parse(every).map_err(PyPolarsErr::from)?;
525
py.enter_polars_df(|| {
526
if stable {
527
self.df.read().upsample_stable(by, index_column, every)
528
} else {
529
self.df.read().upsample(by, index_column, every)
530
}
531
})
532
}
533
534
pub fn to_struct(
535
&self,
536
py: Python<'_>,
537
name: &str,
538
invalid_indices: Vec<usize>,
539
) -> PyResult<PySeries> {
540
py.enter_polars_series(|| {
541
let mut ca = self.df.read().clone().into_struct(name.into());
542
543
if !invalid_indices.is_empty() {
544
let mut validity = MutableBitmap::with_capacity(ca.len());
545
validity.extend_constant(ca.len(), true);
546
for i in invalid_indices {
547
validity.set(i, false);
548
}
549
ca.rechunk_mut();
550
Ok(ca.with_outer_validity(Some(validity.freeze())))
551
} else {
552
Ok(ca)
553
}
554
})
555
}
556
557
/// Return the frame produced by `DataFrame::clear`; `self` is only read.
pub fn clear(&self, py: Python) -> PyResult<Self> {
    py.enter_polars_df(|| {
        let frame = self.df.read();
        Ok(frame.clear())
    })
}
560
561
/// Export the columns via polars-ffi
562
/// # Safety
563
/// Needs a preallocated *mut SeriesExport that has allocated space for n_columns.
564
pub unsafe fn _export_columns(&self, location: usize) {
565
use polars_ffi::version_0::export_column;
566
567
let df = self.df.read();
568
let cols = df.columns();
569
570
let location = location as *mut SeriesExport;
571
572
for (i, col) in cols.iter().enumerate() {
573
let e = export_column(col);
574
// SAFETY:
575
// Caller should ensure address is allocated.
576
// Be careful not to drop `e` here as that should be dropped by the ffi consumer
577
unsafe { core::ptr::write(location.add(i), e) };
578
}
579
}
580
581
/// Import [`Self`] via polars-ffi
582
/// # Safety
583
/// [`location`] should be an address that contains [`width`] properly initialized
584
/// [`SeriesExport`]s
585
#[classmethod]
586
pub unsafe fn _import_columns(
587
_cls: &Bound<PyType>,
588
location: usize,
589
width: usize,
590
) -> PyResult<Self> {
591
use polars_ffi::version_0::import_df;
592
593
let location = location as *mut SeriesExport;
594
595
let df = unsafe { import_df(location, width) }.map_err(PyPolarsErr::from)?;
596
Ok(PyDataFrame::from(df))
597
}
598
599
/// Internal utility function to allow direct access to the row encoding from python.
600
#[pyo3(signature = (opts))]
601
fn _row_encode(&self, py: Python<'_>, opts: Vec<(bool, bool, bool)>) -> PyResult<PySeries> {
602
py.enter_polars_series(|| {
603
let name = PlSmallStr::from_static("row_enc");
604
let is_unordered = opts.first().is_some_and(|(_, _, v)| *v);
605
606
let ca = if is_unordered {
607
_get_rows_encoded_ca_unordered(name, self.df.read().columns())
608
} else {
609
let descending = opts.iter().map(|(v, _, _)| *v).collect::<Vec<_>>();
610
let nulls_last = opts.iter().map(|(_, v, _)| *v).collect::<Vec<_>>();
611
612
_get_rows_encoded_ca(
613
name,
614
self.df.read().columns(),
615
descending.as_slice(),
616
nulls_last.as_slice(),
617
false,
618
)
619
}?;
620
621
Ok(ca)
622
})
623
}
624
}
625
626