Path: blob/main/crates/polars-python/src/dataframe/general.rs
7889 views
use std::hash::BuildHasher;12use arrow::bitmap::MutableBitmap;3use either::Either;4use parking_lot::RwLock;5use polars::prelude::*;6use polars_ffi::version_0::SeriesExport;7use pyo3::exceptions::PyIndexError;8use pyo3::prelude::*;9use pyo3::pybacked::PyBackedStr;10use pyo3::types::{PyList, PyType};1112use self::row_encode::{_get_rows_encoded_ca, _get_rows_encoded_ca_unordered};13use super::PyDataFrame;14use crate::PyLazyFrame;15use crate::conversion::Wrap;16use crate::error::PyPolarsErr;17use crate::prelude::strings_to_pl_smallstr;18use crate::py_modules::polars;19use crate::series::{PySeries, ToPySeries, ToSeries};20use crate::utils::EnterPolarsExt;2122#[pymethods]23impl PyDataFrame {24#[new]25pub fn __init__(columns: Vec<PySeries>) -> PyResult<Self> {26let columns = columns.to_series();27// @scalar-opt28let columns = columns.into_iter().map(|s| s.into()).collect();29let df = DataFrame::new(columns).map_err(PyPolarsErr::from)?;30Ok(PyDataFrame::new(df))31}3233pub fn estimated_size(&self) -> usize {34self.df.read().estimated_size()35}3637pub fn dtype_strings(&self) -> Vec<String> {38self.df39.read()40.get_columns()41.iter()42.map(|s| format!("{}", s.dtype()))43.collect()44}4546pub fn add(&self, py: Python<'_>, s: &PySeries) -> PyResult<Self> {47py.enter_polars_df(|| &*self.df.read() + &*s.series.read())48}4950pub fn sub(&self, py: Python<'_>, s: &PySeries) -> PyResult<Self> {51py.enter_polars_df(|| &*self.df.read() - &*s.series.read())52}5354pub fn mul(&self, py: Python<'_>, s: &PySeries) -> PyResult<Self> {55py.enter_polars_df(|| &*self.df.read() * &*s.series.read())56}5758pub fn div(&self, py: Python<'_>, s: &PySeries) -> PyResult<Self> {59py.enter_polars_df(|| &*self.df.read() / &*s.series.read())60}6162pub fn rem(&self, py: Python<'_>, s: &PySeries) -> PyResult<Self> {63py.enter_polars_df(|| &*self.df.read() % &*s.series.read())64}6566pub fn add_df(&self, py: Python<'_>, s: &Self) -> PyResult<Self> {67py.enter_polars_df(|| &*self.df.read() + &*s.df.read())68}6970pub fn sub_df(&self, py: Python<'_>, s: &Self) -> PyResult<Self> {71py.enter_polars_df(|| &*self.df.read() - &*s.df.read())72}7374pub fn mul_df(&self, py: Python<'_>, s: &Self) -> PyResult<Self> {75py.enter_polars_df(|| &*self.df.read() * &*s.df.read())76}7778pub fn div_df(&self, py: Python<'_>, s: &Self) -> PyResult<Self> {79py.enter_polars_df(|| &*self.df.read() / &*s.df.read())80}8182pub fn rem_df(&self, py: Python<'_>, s: &Self) -> PyResult<Self> {83py.enter_polars_df(|| &*self.df.read() % &*s.df.read())84}8586#[pyo3(signature = (n, with_replacement, shuffle, seed=None))]87pub fn sample_n(88&self,89py: Python<'_>,90n: &PySeries,91with_replacement: bool,92shuffle: bool,93seed: Option<u64>,94) -> PyResult<Self> {95py.enter_polars_df(|| {96self.df97.read()98.sample_n(&n.series.read(), with_replacement, shuffle, seed)99})100}101102#[pyo3(signature = (frac, with_replacement, shuffle, seed=None))]103pub fn sample_frac(104&self,105py: Python<'_>,106frac: &PySeries,107with_replacement: bool,108shuffle: bool,109seed: Option<u64>,110) -> PyResult<Self> {111py.enter_polars_df(|| {112self.df113.read()114.sample_frac(&frac.series.read(), with_replacement, shuffle, seed)115})116}117118pub fn rechunk(&self, py: Python) -> PyResult<Self> {119py.enter_polars_df(|| {120let mut df = self.df.read().clone();121df.as_single_chunk_par();122Ok(df)123})124}125126/// Format `DataFrame` as String127pub fn as_str(&self) -> String {128format!("{:?}", self.df.read())129}130131pub fn get_columns(&self) -> Vec<PySeries> {132let cols = self.df.read().get_columns().to_vec();133cols.to_pyseries()134}135136/// Get column names137pub fn columns(&self) -> Vec<String> {138self.df139.read()140.get_columns()141.iter()142.map(|s| s.name().to_string())143.collect()144}145146/// set column names147pub fn set_column_names(&self, names: Vec<PyBackedStr>) -> PyResult<()> {148self.df149.write()150.set_column_names(names.iter().map(|x| &**x))151.map_err(PyPolarsErr::from)?;152Ok(())153}154155/// Get datatypes156pub fn dtypes<'py>(&self, py: Python<'py>) -> PyResult<Bound<'py, PyList>> {157let df = self.df.read();158let iter = df159.iter()160.map(|s| Wrap(s.dtype().clone()).into_pyobject(py).unwrap());161PyList::new(py, iter)162}163164pub fn n_chunks(&self) -> usize {165self.df.read().first_col_n_chunks()166}167168pub fn shape(&self) -> (usize, usize) {169self.df.read().shape()170}171172pub fn height(&self) -> usize {173self.df.read().height()174}175176pub fn width(&self) -> usize {177self.df.read().width()178}179180pub fn is_empty(&self) -> bool {181self.df.read().is_empty()182}183184pub fn hstack(&self, py: Python<'_>, columns: Vec<PySeries>) -> PyResult<Self> {185let columns = columns.to_series();186// @scalar-opt187let columns = columns.into_iter().map(Into::into).collect::<Vec<_>>();188py.enter_polars_df(|| self.df.read().hstack(&columns))189}190191pub fn hstack_mut(&self, py: Python<'_>, columns: Vec<PySeries>) -> PyResult<()> {192let columns = columns.to_series();193// @scalar-opt194let columns = columns.into_iter().map(Into::into).collect::<Vec<_>>();195py.enter_polars(|| self.df.write().hstack_mut(&columns).map(drop))?;196Ok(())197}198199pub fn vstack(&self, py: Python<'_>, other: &PyDataFrame) -> PyResult<Self> {200py.enter_polars_df(|| self.df.read().vstack(&other.df.read()))201}202203pub fn vstack_mut(&self, py: Python<'_>, other: &PyDataFrame) -> PyResult<()> {204py.enter_polars(|| {205// Prevent self-vstack deadlocks.206let other = other.df.read().clone();207self.df.write().vstack_mut(&other)?;208PolarsResult::Ok(())209})?;210Ok(())211}212213pub fn extend(&self, py: Python<'_>, other: &PyDataFrame) -> PyResult<()> {214py.enter_polars(|| {215// Prevent self-extend deadlocks.216let other = other.df.read().clone();217self.df.write().extend(&other)218})?;219Ok(())220}221222pub fn drop_in_place(&self, name: &str) -> PyResult<PySeries> {223let s = self224.df225.write()226.drop_in_place(name)227.map_err(PyPolarsErr::from)?;228let s = s.take_materialized_series();229Ok(PySeries::from(s))230}231232pub fn to_series(&self, index: isize) -> PyResult<PySeries> {233let df = &self.df.read();234235let index_adjusted = if index < 0 {236df.width().checked_sub(index.unsigned_abs())237} else {238Some(usize::try_from(index).unwrap())239};240241let s = index_adjusted.and_then(|i| df.select_at_idx(i));242match s {243Some(s) => Ok(PySeries::new(s.as_materialized_series().clone())),244None => Err(PyIndexError::new_err(245polars_err!(oob = index, df.width()).to_string(),246)),247}248}249250pub fn get_column_index(&self, name: &str) -> PyResult<usize> {251Ok(self252.df253.read()254.try_get_column_index(name)255.map_err(PyPolarsErr::from)?)256}257258pub fn get_column(&self, name: &str) -> PyResult<PySeries> {259let series = self260.df261.read()262.column(name)263.map(|s| PySeries::new(s.as_materialized_series().clone()))264.map_err(PyPolarsErr::from)?;265Ok(series)266}267268pub fn select(&self, py: Python<'_>, columns: Vec<PyBackedStr>) -> PyResult<Self> {269py.enter_polars_df(|| self.df.read().select(columns.iter().map(|x| &**x)))270}271272pub fn gather(&self, py: Python<'_>, indices: Wrap<Vec<IdxSize>>) -> PyResult<Self> {273let indices = indices.0;274let indices = IdxCa::from_vec("".into(), indices);275py.enter_polars_df(|| self.df.read().take(&indices))276}277278pub fn gather_with_series(&self, py: Python<'_>, indices: &PySeries) -> PyResult<Self> {279let idx_s = indices.series.read();280let indices = idx_s.idx().map_err(PyPolarsErr::from)?;281py.enter_polars_df(|| self.df.read().take(indices))282}283284pub fn replace(&self, column: &str, new_col: PySeries) -> PyResult<()> {285self.df286.write()287.replace(column, new_col.series.into_inner())288.map_err(PyPolarsErr::from)?;289Ok(())290}291292pub fn replace_column(&self, index: usize, new_column: PySeries) -> PyResult<()> {293self.df294.write()295.replace_column(index, new_column.series.into_inner())296.map_err(PyPolarsErr::from)?;297Ok(())298}299300pub fn insert_column(&self, index: usize, column: PySeries) -> PyResult<()> {301self.df302.write()303.insert_column(index, column.series.into_inner())304.map_err(PyPolarsErr::from)?;305Ok(())306}307308#[pyo3(signature = (offset, length))]309pub fn slice(&self, py: Python<'_>, offset: i64, length: Option<usize>) -> PyResult<Self> {310py.enter_polars_df(|| {311let df = self.df.read();312Ok(df.slice(offset, length.unwrap_or_else(|| df.height())))313})314}315316pub fn head(&self, py: Python<'_>, n: usize) -> PyResult<Self> {317py.enter_polars_df(|| Ok(self.df.read().head(Some(n))))318}319320pub fn tail(&self, py: Python<'_>, n: usize) -> PyResult<Self> {321py.enter_polars_df(|| Ok(self.df.read().tail(Some(n))))322}323324pub fn is_unique(&self, py: Python) -> PyResult<PySeries> {325py.enter_polars_series(|| self.df.read().is_unique())326}327328pub fn is_duplicated(&self, py: Python) -> PyResult<PySeries> {329py.enter_polars_series(|| self.df.read().is_duplicated())330}331332pub fn equals(&self, py: Python<'_>, other: &PyDataFrame, null_equal: bool) -> PyResult<bool> {333if null_equal {334py.enter_polars_ok(|| self.df.read().equals_missing(&other.df.read()))335} else {336py.enter_polars_ok(|| self.df.read().equals(&other.df.read()))337}338}339340#[pyo3(signature = (name, offset=None))]341pub fn with_row_index(342&self,343py: Python<'_>,344name: &str,345offset: Option<IdxSize>,346) -> PyResult<Self> {347py.enter_polars_df(|| self.df.read().with_row_index(name.into(), offset))348}349350pub fn _to_metadata(&self) -> Self {351Self {352df: RwLock::new(self.df.read()._to_metadata()),353}354}355356pub fn group_by_map_groups(357&self,358py: Python<'_>,359by: Vec<PyBackedStr>,360lambda: Py<PyAny>,361maintain_order: bool,362) -> PyResult<Self> {363py.enter_polars_df(|| {364let df = self.df.read().clone(); // Clone so we can't deadlock on re-entrance from lambda.365let gb = if maintain_order {366df.group_by_stable(by.iter().map(|x| &**x))367} else {368df.group_by(by.iter().map(|x| &**x))369}?;370371let function = move |df: DataFrame| {372Python::attach(|py| {373let pypolars = polars(py).bind(py);374let pydf = PyDataFrame::new(df);375let python_df_wrapper =376pypolars.getattr("wrap_df").unwrap().call1((pydf,)).unwrap();377378// Call the lambda and get a python-side DataFrame wrapper.379let result_df_wrapper = match lambda.call1(py, (python_df_wrapper,)) {380Ok(pyobj) => pyobj,381Err(e) => panic!("UDF failed: {}", e.value(py)),382};383let py_pydf = result_df_wrapper.getattr(py, "_df").expect(384"Could not get DataFrame attribute '_df'. Make sure that you return a DataFrame object.",385);386387let pydf = py_pydf.extract::<PyDataFrame>(py).unwrap();388Ok(pydf.df.into_inner())389})390};391392gb.apply(function)393})394}395396#[allow(clippy::should_implement_trait)]397pub fn clone(&self) -> Self {398Clone::clone(self)399}400401#[cfg(feature = "pivot")]402#[pyo3(signature = (on, index, value_name=None, variable_name=None))]403pub fn unpivot(404&self,405py: Python<'_>,406on: Option<Vec<PyBackedStr>>,407index: Vec<PyBackedStr>,408value_name: Option<&str>,409variable_name: Option<&str>,410) -> PyResult<Self> {411use polars_ops::unpivot::UnpivotDF;412let args = UnpivotArgsIR::new(413self.df.read().get_column_names_owned(),414on.map(strings_to_pl_smallstr),415strings_to_pl_smallstr(index),416value_name.map(|s| s.into()),417variable_name.map(|s| s.into()),418);419420py.enter_polars_df(|| self.df.read().unpivot2(args))421}422423pub fn partition_by(424&self,425py: Python<'_>,426by: Vec<String>,427maintain_order: bool,428include_key: bool,429) -> PyResult<Vec<Self>> {430let out = py.enter_polars(|| {431if maintain_order {432self.df.read().partition_by_stable(by, include_key)433} else {434self.df.read().partition_by(by, include_key)435}436})?;437438Ok(out.into_iter().map(PyDataFrame::from).collect())439}440441pub fn lazy(&self) -> PyLazyFrame {442self.df.read().clone().lazy().into()443}444445#[pyo3(signature = (columns, separator, drop_first, drop_nulls))]446pub fn to_dummies(447&self,448py: Python<'_>,449columns: Option<Vec<String>>,450separator: Option<&str>,451drop_first: bool,452drop_nulls: bool,453) -> PyResult<Self> {454py.enter_polars_df(|| match columns {455Some(cols) => self.df.read().columns_to_dummies(456cols.iter().map(|x| x as &str).collect(),457separator,458drop_first,459drop_nulls,460),461None => self.df.read().to_dummies(separator, drop_first, drop_nulls),462})463}464465pub fn null_count(&self, py: Python) -> PyResult<Self> {466py.enter_polars_df(|| Ok(self.df.read().null_count()))467}468469pub fn shrink_to_fit(&self, py: Python) -> PyResult<()> {470py.enter_polars_ok(|| self.df.write().shrink_to_fit())471}472473pub fn hash_rows(474&self,475py: Python<'_>,476k0: u64,477k1: u64,478k2: u64,479k3: u64,480) -> PyResult<PySeries> {481// TODO: don't expose all these seeds.482let seed = PlFixedStateQuality::default().hash_one((k0, k1, k2, k3));483let hb = PlSeedableRandomStateQuality::seed_from_u64(seed);484py.enter_polars_series(|| self.df.write().hash_rows(Some(hb)))485}486487#[pyo3(signature = (keep_names_as, column_names))]488pub fn transpose(489&self,490py: Python<'_>,491keep_names_as: Option<&str>,492column_names: &Bound<PyAny>,493) -> PyResult<Self> {494let new_col_names = if let Ok(name) = column_names.extract::<Vec<String>>() {495Some(Either::Right(name))496} else if let Ok(name) = column_names.extract::<String>() {497Some(Either::Left(name))498} else {499None500};501py.enter_polars_df(|| self.df.write().transpose(keep_names_as, new_col_names))502}503504pub fn upsample(505&self,506py: Python<'_>,507by: Vec<String>,508index_column: &str,509every: &str,510stable: bool,511) -> PyResult<Self> {512let every = Duration::try_parse(every).map_err(PyPolarsErr::from)?;513py.enter_polars_df(|| {514if stable {515self.df.read().upsample_stable(by, index_column, every)516} else {517self.df.read().upsample(by, index_column, every)518}519})520}521522pub fn to_struct(523&self,524py: Python<'_>,525name: &str,526invalid_indices: Vec<usize>,527) -> PyResult<PySeries> {528py.enter_polars_series(|| {529let mut ca = self.df.read().clone().into_struct(name.into());530531if !invalid_indices.is_empty() {532let mut validity = MutableBitmap::with_capacity(ca.len());533validity.extend_constant(ca.len(), true);534for i in invalid_indices {535validity.set(i, false);536}537ca.rechunk_mut();538Ok(ca.with_outer_validity(Some(validity.freeze())))539} else {540Ok(ca)541}542})543}544545pub fn clear(&self, py: Python) -> PyResult<Self> {546py.enter_polars_df(|| Ok(self.df.read().clear()))547}548549/// Export the columns via polars-ffi550/// # Safety551/// Needs a preallocated *mut SeriesExport that has allocated space for n_columns.552pub unsafe fn _export_columns(&self, location: usize) {553use polars_ffi::version_0::export_column;554555let df = self.df.read();556let cols = df.get_columns();557558let location = location as *mut SeriesExport;559560for (i, col) in cols.iter().enumerate() {561let e = export_column(col);562// SAFETY:563// Caller should ensure address is allocated.564// Be careful not to drop `e` here as that should be dropped by the ffi consumer565unsafe { core::ptr::write(location.add(i), e) };566}567}568569/// Import [`Self`] via polars-ffi570/// # Safety571/// [`location`] should be an address that contains [`width`] properly initialized572/// [`SeriesExport`]s573#[classmethod]574pub unsafe fn _import_columns(575_cls: &Bound<PyType>,576location: usize,577width: usize,578) -> PyResult<Self> {579use polars_ffi::version_0::import_df;580581let location = location as *mut SeriesExport;582583let df = unsafe { import_df(location, width) }.map_err(PyPolarsErr::from)?;584Ok(PyDataFrame::from(df))585}586587/// Internal utility function to allow direct access to the row encoding from python.588#[pyo3(signature = (opts))]589fn _row_encode(&self, py: Python<'_>, opts: Vec<(bool, bool, bool)>) -> PyResult<PySeries> {590py.enter_polars_series(|| {591let name = PlSmallStr::from_static("row_enc");592let is_unordered = opts.first().is_some_and(|(_, _, v)| *v);593594let ca = if is_unordered {595_get_rows_encoded_ca_unordered(name, self.df.read().get_columns())596} else {597let descending = opts.iter().map(|(v, _, _)| *v).collect::<Vec<_>>();598let nulls_last = opts.iter().map(|(_, v, _)| *v).collect::<Vec<_>>();599600_get_rows_encoded_ca(601name,602self.df.read().get_columns(),603descending.as_slice(),604nulls_last.as_slice(),605)606}?;607608Ok(ca)609})610}611}612613614