// Path: blob/main/crates/polars-python/src/dataframe/general.rs
// 8353 views
use std::hash::BuildHasher;12use arrow::bitmap::MutableBitmap;3use either::Either;4use parking_lot::RwLock;5use polars::prelude::*;6use polars_ffi::version_0::SeriesExport;7use pyo3::exceptions::PyIndexError;8use pyo3::prelude::*;9use pyo3::pybacked::PyBackedStr;10use pyo3::types::{PyList, PyType};1112use self::row_encode::{_get_rows_encoded_ca, _get_rows_encoded_ca_unordered};13use super::PyDataFrame;14use crate::PyLazyFrame;15use crate::conversion::Wrap;16use crate::error::PyPolarsErr;17use crate::prelude::strings_to_pl_smallstr;18use crate::py_modules::polars;19use crate::series::{PySeries, ToPySeries, ToSeries};20use crate::utils::{EnterPolarsExt, to_py_err};2122#[pymethods]23impl PyDataFrame {24#[new]25pub fn __init__(columns: Vec<PySeries>) -> PyResult<Self> {26let columns = columns.to_series();27// @scalar-opt28let columns = columns.into_iter().map(|s| s.into()).collect();29let df = DataFrame::new_infer_height(columns).map_err(PyPolarsErr::from)?;30Ok(PyDataFrame::new(df))31}3233#[staticmethod]34pub fn empty_with_height(height: u64) -> PyResult<Self> {35Ok(PyDataFrame::new(DataFrame::empty_with_height(36IdxSize::try_from(height)37.map_err(|_| polars_err!(bigidx, ctx = "DataFrame(height = _)", size = height))38.map_err(to_py_err)? 
as usize,39)))40}4142pub fn estimated_size(&self) -> usize {43self.df.read().estimated_size()44}4546pub fn dtype_strings(&self) -> Vec<String> {47self.df48.read()49.columns()50.iter()51.map(|s| format!("{}", s.dtype()))52.collect()53}5455pub fn add(&self, py: Python<'_>, s: &PySeries) -> PyResult<Self> {56py.enter_polars_df(|| &*self.df.read() + &*s.series.read())57}5859pub fn sub(&self, py: Python<'_>, s: &PySeries) -> PyResult<Self> {60py.enter_polars_df(|| &*self.df.read() - &*s.series.read())61}6263pub fn mul(&self, py: Python<'_>, s: &PySeries) -> PyResult<Self> {64py.enter_polars_df(|| &*self.df.read() * &*s.series.read())65}6667pub fn div(&self, py: Python<'_>, s: &PySeries) -> PyResult<Self> {68py.enter_polars_df(|| &*self.df.read() / &*s.series.read())69}7071pub fn rem(&self, py: Python<'_>, s: &PySeries) -> PyResult<Self> {72py.enter_polars_df(|| &*self.df.read() % &*s.series.read())73}7475pub fn add_df(&self, py: Python<'_>, s: &Self) -> PyResult<Self> {76py.enter_polars_df(|| &*self.df.read() + &*s.df.read())77}7879pub fn sub_df(&self, py: Python<'_>, s: &Self) -> PyResult<Self> {80py.enter_polars_df(|| &*self.df.read() - &*s.df.read())81}8283pub fn mul_df(&self, py: Python<'_>, s: &Self) -> PyResult<Self> {84py.enter_polars_df(|| &*self.df.read() * &*s.df.read())85}8687pub fn div_df(&self, py: Python<'_>, s: &Self) -> PyResult<Self> {88py.enter_polars_df(|| &*self.df.read() / &*s.df.read())89}9091pub fn rem_df(&self, py: Python<'_>, s: &Self) -> PyResult<Self> {92py.enter_polars_df(|| &*self.df.read() % &*s.df.read())93}9495#[pyo3(signature = (n, with_replacement, shuffle, seed=None))]96pub fn sample_n(97&self,98py: Python<'_>,99n: &PySeries,100with_replacement: bool,101shuffle: bool,102seed: Option<u64>,103) -> PyResult<Self> {104py.enter_polars_df(|| {105self.df106.read()107.sample_n(&n.series.read(), with_replacement, shuffle, seed)108})109}110111#[pyo3(signature = (frac, with_replacement, shuffle, seed=None))]112pub fn sample_frac(113&self,114py: 
Python<'_>,115frac: &PySeries,116with_replacement: bool,117shuffle: bool,118seed: Option<u64>,119) -> PyResult<Self> {120py.enter_polars_df(|| {121self.df122.read()123.sample_frac(&frac.series.read(), with_replacement, shuffle, seed)124})125}126127pub fn rechunk(&self, py: Python) -> PyResult<Self> {128py.enter_polars_df(|| {129let mut df = self.df.read().clone();130df.rechunk_mut_par();131Ok(df)132})133}134135/// Format `DataFrame` as String136pub fn as_str(&self) -> String {137format!("{:?}", self.df.read())138}139140pub fn get_columns(&self) -> Vec<PySeries> {141let cols = self.df.read().columns().to_vec();142cols.to_pyseries()143}144145/// Get column names146pub fn columns(&self) -> Vec<String> {147self.df148.read()149.columns()150.iter()151.map(|s| s.name().to_string())152.collect()153}154155/// set column names156pub fn set_column_names(&self, names: Vec<PyBackedStr>) -> PyResult<()> {157self.df158.write()159.set_column_names(&names)160.map_err(PyPolarsErr::from)?;161Ok(())162}163164/// Get datatypes165pub fn dtypes<'py>(&self, py: Python<'py>) -> PyResult<Bound<'py, PyList>> {166let df = self.df.read();167let iter = df168.columns()169.iter()170.map(|s| Wrap(s.dtype().clone()).into_pyobject(py).unwrap());171PyList::new(py, iter)172}173174pub fn n_chunks(&self) -> usize {175self.df.read().first_col_n_chunks()176}177178pub fn shape(&self) -> (usize, usize) {179self.df.read().shape()180}181182pub fn height(&self) -> usize {183self.df.read().height()184}185186pub fn width(&self) -> usize {187self.df.read().width()188}189190pub fn is_empty(&self) -> bool {191self.df.read().shape_has_zero()192}193194pub fn hstack(&self, py: Python<'_>, columns: Vec<PySeries>) -> PyResult<Self> {195let columns = columns.to_series();196// @scalar-opt197let columns = columns.into_iter().map(Into::into).collect::<Vec<_>>();198py.enter_polars_df(|| self.df.read().hstack(&columns))199}200201pub fn hstack_mut(&self, py: Python<'_>, columns: Vec<PySeries>) -> PyResult<()> {202let columns = 
columns.to_series();203// @scalar-opt204let columns = columns.into_iter().map(Into::into).collect::<Vec<_>>();205py.enter_polars(|| self.df.write().hstack_mut(&columns).map(drop))?;206Ok(())207}208209pub fn vstack(&self, py: Python<'_>, other: &PyDataFrame) -> PyResult<Self> {210py.enter_polars_df(|| self.df.read().vstack(&other.df.read()))211}212213pub fn vstack_mut(&self, py: Python<'_>, other: &PyDataFrame) -> PyResult<()> {214py.enter_polars(|| {215// Prevent self-vstack deadlocks.216let other = other.df.read().clone();217self.df.write().vstack_mut_owned(other)?;218PolarsResult::Ok(())219})?;220Ok(())221}222223pub fn extend(&self, py: Python<'_>, other: &PyDataFrame) -> PyResult<()> {224py.enter_polars(|| {225// Prevent self-extend deadlocks.226let other = other.df.read().clone();227self.df.write().extend(&other)228})?;229Ok(())230}231232pub fn drop_in_place(&self, name: &str) -> PyResult<PySeries> {233let s = self234.df235.write()236.drop_in_place(name)237.map_err(PyPolarsErr::from)?;238let s = s.take_materialized_series();239Ok(PySeries::from(s))240}241242pub fn to_series(&self, index: isize) -> PyResult<PySeries> {243let df = &self.df.read();244245let index_adjusted = if index < 0 {246df.width().checked_sub(index.unsigned_abs())247} else {248Some(usize::try_from(index).unwrap())249};250251let s = index_adjusted.and_then(|i| df.select_at_idx(i));252match s {253Some(s) => Ok(PySeries::new(s.as_materialized_series().clone())),254None => Err(PyIndexError::new_err(255polars_err!(oob = index, df.width()).to_string(),256)),257}258}259260pub fn get_column_index(&self, name: &str) -> PyResult<usize> {261Ok(self262.df263.read()264.try_get_column_index(name)265.map_err(PyPolarsErr::from)?)266}267268pub fn get_column(&self, name: &str) -> PyResult<PySeries> {269let series = self270.df271.read()272.column(name)273.map(|s| PySeries::new(s.as_materialized_series().clone()))274.map_err(PyPolarsErr::from)?;275Ok(series)276}277278pub fn select(&self, py: Python<'_>, columns: 
Vec<PyBackedStr>) -> PyResult<Self> {279py.enter_polars_df(|| self.df.read().select(columns.iter().map(|x| &**x)))280}281282pub fn gather(&self, py: Python<'_>, indices: Wrap<Vec<IdxSize>>) -> PyResult<Self> {283let indices = indices.0;284let indices = IdxCa::from_vec("".into(), indices);285py.enter_polars_df(|| self.df.read().take(&indices))286}287288pub fn gather_with_series(&self, py: Python<'_>, indices: &PySeries) -> PyResult<Self> {289let idx_s = indices.series.read();290let indices = idx_s.idx().map_err(PyPolarsErr::from)?;291py.enter_polars_df(|| self.df.read().take(indices))292}293294pub fn replace(&self, column: &str, new_col: PySeries) -> PyResult<()> {295self.df296.write()297.replace(column, new_col.series.into_inner().into_column())298.map_err(PyPolarsErr::from)?;299Ok(())300}301302pub fn replace_column(&self, index: usize, new_column: PySeries) -> PyResult<()> {303self.df304.write()305.replace_column(index, new_column.series.into_inner().into_column())306.map_err(PyPolarsErr::from)?;307Ok(())308}309310pub fn insert_column(&self, index: usize, column: PySeries) -> PyResult<()> {311self.df312.write()313.insert_column(index, column.series.into_inner().into_column())314.map_err(PyPolarsErr::from)?;315Ok(())316}317318#[pyo3(signature = (offset, length))]319pub fn slice(&self, py: Python<'_>, offset: i64, length: Option<usize>) -> PyResult<Self> {320py.enter_polars_df(|| {321let df = self.df.read();322let len = length.unwrap_or(usize::MAX);323Ok(df.slice(offset, len))324})325}326327pub fn head(&self, py: Python<'_>, n: usize) -> PyResult<Self> {328py.enter_polars_df(|| Ok(self.df.read().head(Some(n))))329}330331pub fn tail(&self, py: Python<'_>, n: usize) -> PyResult<Self> {332py.enter_polars_df(|| Ok(self.df.read().tail(Some(n))))333}334335pub fn is_unique(&self, py: Python) -> PyResult<PySeries> {336py.enter_polars_series(|| self.df.read().is_unique())337}338339pub fn is_duplicated(&self, py: Python) -> PyResult<PySeries> {340py.enter_polars_series(|| 
self.df.read().is_duplicated())341}342343pub fn equals(&self, py: Python<'_>, other: &PyDataFrame, null_equal: bool) -> PyResult<bool> {344if null_equal {345py.enter_polars_ok(|| self.df.read().equals_missing(&other.df.read()))346} else {347py.enter_polars_ok(|| self.df.read().equals(&other.df.read()))348}349}350351#[pyo3(signature = (name, offset=None))]352pub fn with_row_index(353&self,354py: Python<'_>,355name: &str,356offset: Option<IdxSize>,357) -> PyResult<Self> {358py.enter_polars_df(|| self.df.read().with_row_index(name.into(), offset))359}360361pub fn _to_metadata(&self) -> Self {362Self {363df: RwLock::new(self.df.read()._to_metadata()),364}365}366367pub fn group_by_map_groups(368&self,369py: Python<'_>,370by: Vec<PyBackedStr>,371lambda: Py<PyAny>,372maintain_order: bool,373) -> PyResult<Self> {374py.enter_polars_df(|| {375let df = self.df.read().clone(); // Clone so we can't deadlock on re-entrance from lambda.376let gb = if maintain_order {377df.group_by_stable(by.iter().map(|x| &**x))378} else {379df.group_by(by.iter().map(|x| &**x))380}?;381382let function = move |df: DataFrame| {383Python::attach(|py| {384let pypolars = polars(py).bind(py);385let pydf = PyDataFrame::new(df);386let python_df_wrapper =387pypolars.getattr("wrap_df").unwrap().call1((pydf,)).unwrap();388389// Call the lambda and get a python-side DataFrame wrapper.390let result_df_wrapper = match lambda.call1(py, (python_df_wrapper,)) {391Ok(pyobj) => pyobj,392Err(e) => panic!("UDF failed: {}", e.value(py)),393};394let py_pydf = result_df_wrapper.getattr(py, "_df").expect(395"Could not get DataFrame attribute '_df'. 
Make sure that you return a DataFrame object.",396);397398let pydf = py_pydf.extract::<PyDataFrame>(py).unwrap();399Ok(pydf.df.into_inner())400})401};402403gb.apply(function)404})405}406407#[allow(clippy::should_implement_trait)]408pub fn clone(&self) -> Self {409Clone::clone(self)410}411412#[cfg(feature = "pivot")]413#[pyo3(signature = (on, index, value_name=None, variable_name=None))]414pub fn unpivot(415&self,416py: Python<'_>,417on: Option<Vec<PyBackedStr>>,418index: Vec<PyBackedStr>,419value_name: Option<&str>,420variable_name: Option<&str>,421) -> PyResult<Self> {422use polars_ops::unpivot::UnpivotDF;423let args = UnpivotArgsIR::new(424self.df.read().get_column_names_owned(),425on.map(strings_to_pl_smallstr),426strings_to_pl_smallstr(index),427value_name.map(|s| s.into()),428variable_name.map(|s| s.into()),429);430431py.enter_polars_df(|| self.df.read().unpivot2(args))432}433434pub fn partition_by(435&self,436py: Python<'_>,437by: Vec<String>,438maintain_order: bool,439include_key: bool,440) -> PyResult<Vec<Self>> {441let out = py.enter_polars(|| {442if maintain_order {443self.df.read().partition_by_stable(by, include_key)444} else {445self.df.read().partition_by(by, include_key)446}447})?;448449Ok(out.into_iter().map(PyDataFrame::from).collect())450}451452pub fn lazy(&self) -> PyLazyFrame {453self.df.read().clone().lazy().into()454}455456#[pyo3(signature = (columns, separator, drop_first, drop_nulls))]457pub fn to_dummies(458&self,459py: Python<'_>,460columns: Option<Vec<String>>,461separator: Option<&str>,462drop_first: bool,463drop_nulls: bool,464) -> PyResult<Self> {465py.enter_polars_df(|| match columns {466Some(cols) => self.df.read().columns_to_dummies(467cols.iter().map(|x| x as &str).collect(),468separator,469drop_first,470drop_nulls,471),472None => self.df.read().to_dummies(separator, drop_first, drop_nulls),473})474}475476pub fn null_count(&self, py: Python) -> PyResult<Self> {477py.enter_polars_df(|| Ok(self.df.read().null_count()))478}479480pub 
fn shrink_to_fit(&self, py: Python) -> PyResult<()> {481py.enter_polars_ok(|| self.df.write().shrink_to_fit())482}483484pub fn hash_rows(485&self,486py: Python<'_>,487k0: u64,488k1: u64,489k2: u64,490k3: u64,491) -> PyResult<PySeries> {492// TODO: don't expose all these seeds.493let seed = PlFixedStateQuality::default().hash_one((k0, k1, k2, k3));494let hb = PlSeedableRandomStateQuality::seed_from_u64(seed);495py.enter_polars_series(|| self.df.write().hash_rows(Some(hb)))496}497498#[pyo3(signature = (keep_names_as, column_names))]499pub fn transpose(500&self,501py: Python<'_>,502keep_names_as: Option<&str>,503column_names: &Bound<PyAny>,504) -> PyResult<Self> {505let new_col_names = if let Ok(name) = column_names.extract::<Vec<String>>() {506Some(Either::Right(name))507} else if let Ok(name) = column_names.extract::<String>() {508Some(Either::Left(name))509} else {510None511};512py.enter_polars_df(|| self.df.write().transpose(keep_names_as, new_col_names))513}514515pub fn upsample(516&self,517py: Python<'_>,518by: Vec<String>,519index_column: &str,520every: &str,521stable: bool,522) -> PyResult<Self> {523let every = Duration::try_parse(every).map_err(PyPolarsErr::from)?;524py.enter_polars_df(|| {525if stable {526self.df.read().upsample_stable(by, index_column, every)527} else {528self.df.read().upsample(by, index_column, every)529}530})531}532533pub fn to_struct(534&self,535py: Python<'_>,536name: &str,537invalid_indices: Vec<usize>,538) -> PyResult<PySeries> {539py.enter_polars_series(|| {540let mut ca = self.df.read().clone().into_struct(name.into());541542if !invalid_indices.is_empty() {543let mut validity = MutableBitmap::with_capacity(ca.len());544validity.extend_constant(ca.len(), true);545for i in invalid_indices {546validity.set(i, false);547}548ca.rechunk_mut();549Ok(ca.with_outer_validity(Some(validity.freeze())))550} else {551Ok(ca)552}553})554}555556pub fn clear(&self, py: Python) -> PyResult<Self> {557py.enter_polars_df(|| 
Ok(self.df.read().clear()))558}559560/// Export the columns via polars-ffi561/// # Safety562/// Needs a preallocated *mut SeriesExport that has allocated space for n_columns.563pub unsafe fn _export_columns(&self, location: usize) {564use polars_ffi::version_0::export_column;565566let df = self.df.read();567let cols = df.columns();568569let location = location as *mut SeriesExport;570571for (i, col) in cols.iter().enumerate() {572let e = export_column(col);573// SAFETY:574// Caller should ensure address is allocated.575// Be careful not to drop `e` here as that should be dropped by the ffi consumer576unsafe { core::ptr::write(location.add(i), e) };577}578}579580/// Import [`Self`] via polars-ffi581/// # Safety582/// [`location`] should be an address that contains [`width`] properly initialized583/// [`SeriesExport`]s584#[classmethod]585pub unsafe fn _import_columns(586_cls: &Bound<PyType>,587location: usize,588width: usize,589) -> PyResult<Self> {590use polars_ffi::version_0::import_df;591592let location = location as *mut SeriesExport;593594let df = unsafe { import_df(location, width) }.map_err(PyPolarsErr::from)?;595Ok(PyDataFrame::from(df))596}597598/// Internal utility function to allow direct access to the row encoding from python.599#[pyo3(signature = (opts))]600fn _row_encode(&self, py: Python<'_>, opts: Vec<(bool, bool, bool)>) -> PyResult<PySeries> {601py.enter_polars_series(|| {602let name = PlSmallStr::from_static("row_enc");603let is_unordered = opts.first().is_some_and(|(_, _, v)| *v);604605let ca = if is_unordered {606_get_rows_encoded_ca_unordered(name, self.df.read().columns())607} else {608let descending = opts.iter().map(|(v, _, _)| *v).collect::<Vec<_>>();609let nulls_last = opts.iter().map(|(_, v, _)| *v).collect::<Vec<_>>();610611_get_rows_encoded_ca(612name,613self.df.read().columns(),614descending.as_slice(),615nulls_last.as_slice(),616false,617)618}?;619620Ok(ca)621})622}623}624625626