#![allow(unsafe_op_in_unsafe_fn)]1//! DataFrame module.2use std::sync::OnceLock;3use std::{mem, ops};45use arrow::datatypes::ArrowSchemaRef;6use polars_row::ArrayRef;7use polars_schema::schema::ensure_matching_schema_names;8use polars_utils::itertools::Itertools;9use rayon::prelude::*;1011use crate::chunked_array::flags::StatisticsFlags;12#[cfg(feature = "algorithm_group_by")]13use crate::chunked_array::ops::unique::is_unique_helper;14use crate::prelude::*;15#[cfg(feature = "row_hash")]16use crate::utils::split_df;17use crate::utils::{Container, NoNull, slice_offsets, try_get_supertype};18use crate::{HEAD_DEFAULT_LENGTH, TAIL_DEFAULT_LENGTH};1920#[cfg(feature = "dataframe_arithmetic")]21mod arithmetic;22pub mod builder;23mod chunks;24pub use chunks::chunk_df_for_writing;25pub mod column;26pub mod explode;27mod from;28#[cfg(feature = "algorithm_group_by")]29pub mod group_by;30pub(crate) mod horizontal;31#[cfg(any(feature = "rows", feature = "object"))]32pub mod row;33mod top_k;34mod upstream_traits;35mod validation;3637use arrow::record_batch::{RecordBatch, RecordBatchT};38use polars_utils::pl_str::PlSmallStr;39#[cfg(feature = "serde")]40use serde::{Deserialize, Serialize};41use strum_macros::IntoStaticStr;4243use crate::POOL;44#[cfg(feature = "row_hash")]45use crate::hashing::_df_rows_to_hashes_threaded_vertical;46use crate::prelude::sort::{argsort_multiple_row_fmt, prepare_arg_sort};47use crate::series::IsSorted;4849#[derive(Copy, Clone, Debug, PartialEq, Eq, Default, Hash, IntoStaticStr)]50#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]51#[cfg_attr(feature = "dsl-schema", derive(schemars::JsonSchema))]52#[strum(serialize_all = "snake_case")]53pub enum UniqueKeepStrategy {54/// Keep the first unique row.55First,56/// Keep the last unique row.57Last,58/// Keep None of the unique rows.59None,60/// Keep any of the unique rows61/// This allows more optimizations62#[default]63Any,64}6566fn ensure_names_unique<T, F>(items: &[T], mut get_name: F) -> 
PolarsResult<()>67where68F: for<'a> FnMut(&'a T) -> &'a str,69{70// Always unique.71if items.len() <= 1 {72return Ok(());73}7475if items.len() <= 4 {76// Too small to be worth spawning a hashmap for, this is at most 6 comparisons.77for i in 0..items.len() - 1 {78let name = get_name(&items[i]);79for other in items.iter().skip(i + 1) {80if name == get_name(other) {81polars_bail!(duplicate = name);82}83}84}85} else {86let mut names = PlHashSet::with_capacity(items.len());87for item in items {88let name = get_name(item);89if !names.insert(name) {90polars_bail!(duplicate = name);91}92}93}94Ok(())95}9697/// A contiguous growable collection of `Series` that have the same length.98///99/// ## Use declarations100///101/// All the common tools can be found in [`crate::prelude`] (or in `polars::prelude`).102///103/// ```rust104/// use polars_core::prelude::*; // if the crate polars-core is used directly105/// // use polars::prelude::*; if the crate polars is used106/// ```107///108/// # Initialization109/// ## Default110///111/// A `DataFrame` can be initialized empty:112///113/// ```rust114/// # use polars_core::prelude::*;115/// let df = DataFrame::default();116/// assert!(df.is_empty());117/// ```118///119/// ## Wrapping a `Vec<Series>`120///121/// A `DataFrame` is built upon a `Vec<Series>` where the `Series` have the same length.122///123/// ```rust124/// # use polars_core::prelude::*;125/// let s1 = Column::new("Fruit".into(), ["Apple", "Apple", "Pear"]);126/// let s2 = Column::new("Color".into(), ["Red", "Yellow", "Green"]);127///128/// let df: PolarsResult<DataFrame> = DataFrame::new(vec![s1, s2]);129/// ```130///131/// ## Using a macro132///133/// The [`df!`] macro is a convenient method:134///135/// ```rust136/// # use polars_core::prelude::*;137/// let df: PolarsResult<DataFrame> = df!("Fruit" => ["Apple", "Apple", "Pear"],138/// "Color" => ["Red", "Yellow", "Green"]);139/// ```140///141/// ## Using a CSV file142///143/// See the 
`polars_io::csv::CsvReader`.144///145/// # Indexing146/// ## By a number147///148/// The `Index<usize>` is implemented for the `DataFrame`.149///150/// ```rust151/// # use polars_core::prelude::*;152/// let df = df!("Fruit" => ["Apple", "Apple", "Pear"],153/// "Color" => ["Red", "Yellow", "Green"])?;154///155/// assert_eq!(df[0], Column::new("Fruit".into(), &["Apple", "Apple", "Pear"]));156/// assert_eq!(df[1], Column::new("Color".into(), &["Red", "Yellow", "Green"]));157/// # Ok::<(), PolarsError>(())158/// ```159///160/// ## By a `Series` name161///162/// ```rust163/// # use polars_core::prelude::*;164/// let df = df!("Fruit" => ["Apple", "Apple", "Pear"],165/// "Color" => ["Red", "Yellow", "Green"])?;166///167/// assert_eq!(df["Fruit"], Column::new("Fruit".into(), &["Apple", "Apple", "Pear"]));168/// assert_eq!(df["Color"], Column::new("Color".into(), &["Red", "Yellow", "Green"]));169/// # Ok::<(), PolarsError>(())170/// ```171#[derive(Clone)]172pub struct DataFrame {173height: usize,174// invariant: columns[i].len() == height for each 0 >= i > columns.len()175pub(crate) columns: Vec<Column>,176177/// A cached schema. 
This might not give correct results if the DataFrame was modified in place178/// between schema and reading.179cached_schema: OnceLock<SchemaRef>,180}181182impl DataFrame {183pub fn clear_schema(&mut self) {184self.cached_schema = OnceLock::new();185}186187#[inline]188pub fn column_iter(&self) -> impl ExactSizeIterator<Item = &Column> {189self.columns.iter()190}191192#[inline]193pub fn materialized_column_iter(&self) -> impl ExactSizeIterator<Item = &Series> {194self.columns.iter().map(Column::as_materialized_series)195}196197#[inline]198pub fn par_materialized_column_iter(&self) -> impl ParallelIterator<Item = &Series> {199self.columns.par_iter().map(Column::as_materialized_series)200}201202/// Returns an estimation of the total (heap) allocated size of the `DataFrame` in bytes.203///204/// # Implementation205/// This estimation is the sum of the size of its buffers, validity, including nested arrays.206/// Multiple arrays may share buffers and bitmaps. Therefore, the size of 2 arrays is not the207/// sum of the sizes computed from this function. In particular, [`StructArray`]'s size is an upper bound.208///209/// When an array is sliced, its allocated size remains constant because the buffer unchanged.210/// However, this function will yield a smaller number. 
This is because this function returns211/// the visible size of the buffer, not its total capacity.212///213/// FFI buffers are included in this estimation.214pub fn estimated_size(&self) -> usize {215self.columns.iter().map(Column::estimated_size).sum()216}217218// Reduce monomorphization.219fn try_apply_columns(220&self,221func: &(dyn Fn(&Column) -> PolarsResult<Column> + Send + Sync),222) -> PolarsResult<Vec<Column>> {223self.columns.iter().map(func).collect()224}225// Reduce monomorphization.226pub fn _apply_columns(&self, func: &dyn Fn(&Column) -> Column) -> Vec<Column> {227self.columns.iter().map(func).collect()228}229// Reduce monomorphization.230fn try_apply_columns_par(231&self,232func: &(dyn Fn(&Column) -> PolarsResult<Column> + Send + Sync),233) -> PolarsResult<Vec<Column>> {234POOL.install(|| self.columns.par_iter().map(func).collect())235}236// Reduce monomorphization.237pub fn _apply_columns_par(238&self,239func: &(dyn Fn(&Column) -> Column + Send + Sync),240) -> Vec<Column> {241POOL.install(|| self.columns.par_iter().map(func).collect())242}243244/// Get the index of the column.245fn check_name_to_idx(&self, name: &str) -> PolarsResult<usize> {246self.get_column_index(name)247.ok_or_else(|| polars_err!(col_not_found = name))248}249250fn check_already_present(&self, name: &str) -> PolarsResult<()> {251polars_ensure!(252self.columns.iter().all(|s| s.name().as_str() != name),253Duplicate: "column with name {:?} is already present in the DataFrame", name254);255Ok(())256}257258/// Reserve additional slots into the chunks of the series.259pub(crate) fn reserve_chunks(&mut self, additional: usize) {260for s in &mut self.columns {261if let Column::Series(s) = s {262// SAFETY:263// do not modify the data, simply resize.264unsafe { s.chunks_mut().reserve(additional) }265}266}267}268269/// Create a DataFrame from a Vector of Series.270///271/// Errors if a column names are not unique, or if heights are not all equal.272///273/// # Example274///275/// ```276/// 
# use polars_core::prelude::*;277/// let s0 = Column::new("days".into(), [0, 1, 2].as_ref());278/// let s1 = Column::new("temp".into(), [22.1, 19.9, 7.].as_ref());279///280/// let df = DataFrame::new(vec![s0, s1])?;281/// # Ok::<(), PolarsError>(())282/// ```283pub fn new(columns: Vec<Column>) -> PolarsResult<Self> {284DataFrame::validate_columns_slice(&columns)285.map_err(|e| e.wrap_msg(|e| format!("could not create a new DataFrame: {e}")))?;286Ok(unsafe { Self::new_no_checks_height_from_first(columns) })287}288289pub fn new_with_height(height: usize, columns: Vec<Column>) -> PolarsResult<Self> {290for col in &columns {291polars_ensure!(292col.len() == height,293ShapeMismatch: "could not create a new DataFrame: series {:?} has length {} while series {:?} has length {}",294columns[0].name(), height, col.name(), col.len()295);296}297298Ok(DataFrame {299height,300columns,301cached_schema: OnceLock::new(),302})303}304305/// Converts a sequence of columns into a DataFrame, broadcasting length-1306/// columns to match the other columns.307pub fn new_with_broadcast(columns: Vec<Column>) -> PolarsResult<Self> {308// The length of the longest non-unit length column determines the309// broadcast length. 
If all columns are unit-length the broadcast length310// is one.311let broadcast_len = columns312.iter()313.map(|s| s.len())314.filter(|l| *l != 1)315.max()316.unwrap_or(1);317Self::new_with_broadcast_len(columns, broadcast_len)318}319320/// Converts a sequence of columns into a DataFrame, broadcasting length-1321/// columns to broadcast_len.322pub fn new_with_broadcast_len(323columns: Vec<Column>,324broadcast_len: usize,325) -> PolarsResult<Self> {326ensure_names_unique(&columns, |s| s.name().as_str())?;327unsafe { Self::new_with_broadcast_no_namecheck(columns, broadcast_len) }328}329330/// Converts a sequence of columns into a DataFrame, broadcasting length-1331/// columns to match the other columns.332///333/// # Safety334/// Does not check that the column names are unique (which they must be).335pub unsafe fn new_with_broadcast_no_namecheck(336mut columns: Vec<Column>,337broadcast_len: usize,338) -> PolarsResult<Self> {339for col in &mut columns {340// Length not equal to the broadcast len, needs broadcast or is an error.341let len = col.len();342if len != broadcast_len {343if len != 1 {344let name = col.name().to_owned();345let extra_info =346if let Some(c) = columns.iter().find(|c| c.len() == broadcast_len) {347format!(" (matching column '{}')", c.name())348} else {349String::new()350};351polars_bail!(352ShapeMismatch: "could not create a new DataFrame: series {name:?} has length {len} while trying to broadcast to length {broadcast_len}{extra_info}",353);354}355*col = col.new_from_index(0, broadcast_len);356}357}358359let length = if columns.is_empty() { 0 } else { broadcast_len };360361Ok(unsafe { DataFrame::new_no_checks(length, columns) })362}363364pub fn new_from_index(&self, index: usize, height: usize) -> Self {365let cols = self.columns.iter().map(|c| c.new_from_index(index, height));366unsafe { Self::new_no_checks(height, cols.collect()) }367}368369/// Creates an empty `DataFrame` usable in a compile time context (such as static 
initializers).370///371/// # Example372///373/// ```rust374/// use polars_core::prelude::DataFrame;375/// static EMPTY: DataFrame = DataFrame::empty();376/// ```377pub const fn empty() -> Self {378Self::empty_with_height(0)379}380381/// Creates an empty `DataFrame` with a specific `height`.382pub const fn empty_with_height(height: usize) -> Self {383DataFrame {384height,385columns: vec![],386cached_schema: OnceLock::new(),387}388}389390/// Create an empty `DataFrame` with empty columns as per the `schema`.391pub fn empty_with_schema(schema: &Schema) -> Self {392let cols = schema393.iter()394.map(|(name, dtype)| Column::from(Series::new_empty(name.clone(), dtype)))395.collect();396unsafe { DataFrame::new_no_checks(0, cols) }397}398399/// Create an empty `DataFrame` with empty columns as per the `schema`.400pub fn empty_with_arrow_schema(schema: &ArrowSchema) -> Self {401let cols = schema402.iter_values()403.map(|fld| {404Column::from(Series::new_empty(405fld.name.clone(),406&(DataType::from_arrow_field(fld)),407))408})409.collect();410unsafe { DataFrame::new_no_checks(0, cols) }411}412413/// Create a new `DataFrame` with the given schema, only containing nulls.414pub fn full_null(schema: &Schema, height: usize) -> Self {415let columns = schema416.iter_fields()417.map(|f| Column::full_null(f.name.clone(), height, f.dtype()))418.collect();419unsafe { DataFrame::new_no_checks(height, columns) }420}421422/// Removes the last `Series` from the `DataFrame` and returns it, or [`None`] if it is empty.423///424/// # Example425///426/// ```rust427/// # use polars_core::prelude::*;428/// let s1 = Column::new("Ocean".into(), ["Atlantic", "Indian"]);429/// let s2 = Column::new("Area (km²)".into(), [106_460_000, 70_560_000]);430/// let mut df = DataFrame::new(vec![s1.clone(), s2.clone()])?;431///432/// assert_eq!(df.pop(), Some(s2));433/// assert_eq!(df.pop(), Some(s1));434/// assert_eq!(df.pop(), None);435/// assert!(df.is_empty());436/// # Ok::<(), PolarsError>(())437/// 
```438pub fn pop(&mut self) -> Option<Column> {439self.clear_schema();440441self.columns.pop()442}443444/// Add a new column at index 0 that counts the rows.445///446/// # Example447///448/// ```449/// # use polars_core::prelude::*;450/// let df1: DataFrame = df!("Name" => ["James", "Mary", "John", "Patricia"])?;451/// assert_eq!(df1.shape(), (4, 1));452///453/// let df2: DataFrame = df1.with_row_index("Id".into(), None)?;454/// assert_eq!(df2.shape(), (4, 2));455/// println!("{}", df2);456///457/// # Ok::<(), PolarsError>(())458/// ```459///460/// Output:461///462/// ```text463/// shape: (4, 2)464/// +-----+----------+465/// | Id | Name |466/// | --- | --- |467/// | u32 | str |468/// +=====+==========+469/// | 0 | James |470/// +-----+----------+471/// | 1 | Mary |472/// +-----+----------+473/// | 2 | John |474/// +-----+----------+475/// | 3 | Patricia |476/// +-----+----------+477/// ```478pub fn with_row_index(&self, name: PlSmallStr, offset: Option<IdxSize>) -> PolarsResult<Self> {479let mut columns = Vec::with_capacity(self.columns.len() + 1);480let offset = offset.unwrap_or(0);481482let col = Column::new_row_index(name, offset, self.height())?;483columns.push(col);484columns.extend_from_slice(&self.columns);485DataFrame::new(columns)486}487488/// Add a row index column in place.489///490/// # Safety491/// The caller should ensure the DataFrame does not already contain a column with the given name.492///493/// # Panics494/// Panics if the resulting column would reach or overflow IdxSize::MAX.495pub unsafe fn with_row_index_mut(496&mut self,497name: PlSmallStr,498offset: Option<IdxSize>,499) -> &mut Self {500// TODO: Make this function unsafe501debug_assert!(502self.columns.iter().all(|c| c.name() != &name),503"with_row_index_mut(): column with name {} already exists",504&name505);506507let offset = offset.unwrap_or(0);508let col = Column::new_row_index(name, offset, self.height()).unwrap();509510self.clear_schema();511self.columns.insert(0, 
col);512self513}514515/// Create a new `DataFrame` but does not check the length or duplicate occurrence of the516/// `Series`.517///518/// Calculates the height from the first column or `0` if no columns are given.519///520/// # Safety521///522/// It is the callers responsibility to uphold the contract of all `Series`523/// having an equal length and a unique name, if not this may panic down the line.524pub unsafe fn new_no_checks_height_from_first(columns: Vec<Column>) -> DataFrame {525let height = columns.first().map_or(0, Column::len);526unsafe { Self::new_no_checks(height, columns) }527}528529/// Create a new `DataFrame` but does not check the length or duplicate occurrence of the530/// `Series`.531///532/// It is advised to use [DataFrame::new] in favor of this method.533///534/// # Safety535///536/// It is the callers responsibility to uphold the contract of all `Series`537/// having an equal length and a unique name, if not this may panic down the line.538pub unsafe fn new_no_checks(height: usize, columns: Vec<Column>) -> DataFrame {539if cfg!(debug_assertions) {540DataFrame::validate_columns_slice(&columns).unwrap();541}542543unsafe { Self::_new_no_checks_impl(height, columns) }544}545546/// This will not panic even in debug mode - there are some (rare) use cases where a DataFrame547/// is temporarily constructed containing duplicates for dispatching to functions. A DataFrame548/// constructed with this method is generally highly unsafe and should not be long-lived.549#[allow(clippy::missing_safety_doc)]550pub const unsafe fn _new_no_checks_impl(height: usize, columns: Vec<Column>) -> DataFrame {551DataFrame {552height,553columns,554cached_schema: OnceLock::new(),555}556}557558/// Shrink the capacity of this DataFrame to fit its length.559pub fn shrink_to_fit(&mut self) {560// Don't parallelize this. 
Memory overhead561for s in &mut self.columns {562s.shrink_to_fit();563}564}565566/// Aggregate all the chunks in the DataFrame to a single chunk.567pub fn as_single_chunk(&mut self) -> &mut Self {568// Don't parallelize this. Memory overhead569for s in &mut self.columns {570*s = s.rechunk();571}572self573}574575/// Aggregate all the chunks in the DataFrame to a single chunk in parallel.576/// This may lead to more peak memory consumption.577pub fn as_single_chunk_par(&mut self) -> &mut Self {578if self.columns.iter().any(|c| c.n_chunks() > 1) {579self.columns = self._apply_columns_par(&|s| s.rechunk());580}581self582}583584/// Rechunks all columns to only have a single chunk.585pub fn rechunk_mut(&mut self) {586// SAFETY: We never adjust the length or names of the columns.587let columns = unsafe { self.get_columns_mut() };588589for col in columns.iter_mut().filter(|c| c.n_chunks() > 1) {590*col = col.rechunk();591}592}593594pub fn _deshare_views_mut(&mut self) {595// SAFETY: We never adjust the length or names of the columns.596unsafe {597let columns = self.get_columns_mut();598for col in columns {599let Column::Series(s) = col else { continue };600601if let Ok(ca) = s.binary() {602let gc_ca = ca.apply_kernel(&|a| a.deshare().into_boxed());603*col = Column::from(gc_ca.into_series());604} else if let Ok(ca) = s.str() {605let gc_ca = ca.apply_kernel(&|a| a.deshare().into_boxed());606*col = Column::from(gc_ca.into_series());607}608}609}610}611612/// Rechunks all columns to only have a single chunk and turns it into a [`RecordBatchT`].613pub fn rechunk_to_record_batch(614self,615compat_level: CompatLevel,616) -> RecordBatchT<Box<dyn Array>> {617let height = self.height();618619let (schema, arrays) = self620.columns621.into_iter()622.map(|col| {623let mut series = col.take_materialized_series();624// Rechunk to one chunk if necessary625if series.n_chunks() > 1 {626series = series.rechunk();627}628(629series.field().to_arrow(compat_level),630series.to_arrow(0, 
compat_level),631)632})633.collect();634635RecordBatchT::new(height, Arc::new(schema), arrays)636}637638/// Returns true if the chunks of the columns do not align and re-chunking should be done639pub fn should_rechunk(&self) -> bool {640// Fast check. It is also needed for correctness, as code below doesn't check if the number641// of chunks is equal.642if !self643.get_columns()644.iter()645.filter_map(|c| c.as_series().map(|s| s.n_chunks()))646.all_equal()647{648return true;649}650651// From here we check chunk lengths.652let mut chunk_lengths = self.materialized_column_iter().map(|s| s.chunk_lengths());653match chunk_lengths.next() {654None => false,655Some(first_column_chunk_lengths) => {656// Fast Path for single Chunk Series657if first_column_chunk_lengths.size_hint().0 == 1 {658return chunk_lengths.any(|cl| cl.size_hint().0 != 1);659}660// Always rechunk if we have more chunks than rows.661// except when we have an empty df containing a single chunk662let height = self.height();663let n_chunks = first_column_chunk_lengths.size_hint().0;664if n_chunks > height && !(height == 0 && n_chunks == 1) {665return true;666}667// Slow Path for multi Chunk series668let v: Vec<_> = first_column_chunk_lengths.collect();669for cl in chunk_lengths {670if cl.enumerate().any(|(idx, el)| Some(&el) != v.get(idx)) {671return true;672}673}674false675},676}677}678679/// Ensure all the chunks in the [`DataFrame`] are aligned.680pub fn align_chunks_par(&mut self) -> &mut Self {681if self.should_rechunk() {682self.as_single_chunk_par()683} else {684self685}686}687688pub fn align_chunks(&mut self) -> &mut Self {689if self.should_rechunk() {690self.as_single_chunk()691} else {692self693}694}695696/// Get the [`DataFrame`] schema.697///698/// # Example699///700/// ```rust701/// # use polars_core::prelude::*;702/// let df: DataFrame = df!("Thing" => ["Observable universe", "Human stupidity"],703/// "Diameter (m)" => [8.8e26, f64::INFINITY])?;704///705/// let f1: Field = 
Field::new("Thing".into(), DataType::String);706/// let f2: Field = Field::new("Diameter (m)".into(), DataType::Float64);707/// let sc: Schema = Schema::from_iter(vec![f1, f2]);708///709/// assert_eq!(&**df.schema(), &sc);710/// # Ok::<(), PolarsError>(())711/// ```712pub fn schema(&self) -> &SchemaRef {713let out = self.cached_schema.get_or_init(|| {714Arc::new(715self.columns716.iter()717.map(|x| (x.name().clone(), x.dtype().clone()))718.collect(),719)720});721722debug_assert_eq!(out.len(), self.width());723724out725}726727/// Get a reference to the [`DataFrame`] columns.728///729/// # Example730///731/// ```rust732/// # use polars_core::prelude::*;733/// let df: DataFrame = df!("Name" => ["Adenine", "Cytosine", "Guanine", "Thymine"],734/// "Symbol" => ["A", "C", "G", "T"])?;735/// let columns: &[Column] = df.get_columns();736///737/// assert_eq!(columns[0].name(), "Name");738/// assert_eq!(columns[1].name(), "Symbol");739/// # Ok::<(), PolarsError>(())740/// ```741#[inline]742pub fn get_columns(&self) -> &[Column] {743&self.columns744}745746#[inline]747/// Get mutable access to the underlying columns.748///749/// # Safety750///751/// The caller must ensure the length of all [`Series`] remains equal to `height` or752/// [`DataFrame::set_height`] is called afterwards with the appropriate `height`.753/// The caller must ensure that the cached schema is cleared if it modifies the schema by754/// calling [`DataFrame::clear_schema`].755pub unsafe fn get_columns_mut(&mut self) -> &mut Vec<Column> {756&mut self.columns757}758759#[inline]760/// Remove all the columns in the [`DataFrame`] but keep the `height`.761pub fn clear_columns(&mut self) {762unsafe { self.get_columns_mut() }.clear();763self.clear_schema();764}765766#[inline]767/// Extend the columns without checking for name collisions or height.768///769/// # Safety770///771/// The caller needs to ensure that:772/// - Column names are unique within the resulting [`DataFrame`].773/// - The length of each appended 
column matches the height of the [`DataFrame`]. For774/// `DataFrame`]s with no columns (ZCDFs), it is important that the height is set afterwards775/// with [`DataFrame::set_height`].776pub unsafe fn column_extend_unchecked(&mut self, iter: impl IntoIterator<Item = Column>) {777unsafe { self.get_columns_mut() }.extend(iter);778self.clear_schema();779}780781/// Take ownership of the underlying columns vec.782pub fn take_columns(self) -> Vec<Column> {783self.columns784}785786/// Iterator over the columns as [`Series`].787///788/// # Example789///790/// ```rust791/// # use polars_core::prelude::*;792/// let s1 = Column::new("Name".into(), ["Pythagoras' theorem", "Shannon entropy"]);793/// let s2 = Column::new("Formula".into(), ["a²+b²=c²", "H=-Σ[P(x)log|P(x)|]"]);794/// let df: DataFrame = DataFrame::new(vec![s1.clone(), s2.clone()])?;795///796/// let mut iterator = df.iter();797///798/// assert_eq!(iterator.next(), Some(s1.as_materialized_series()));799/// assert_eq!(iterator.next(), Some(s2.as_materialized_series()));800/// assert_eq!(iterator.next(), None);801/// # Ok::<(), PolarsError>(())802/// ```803pub fn iter(&self) -> impl ExactSizeIterator<Item = &Series> {804self.materialized_column_iter()805}806807/// # Example808///809/// ```rust810/// # use polars_core::prelude::*;811/// let df: DataFrame = df!("Language" => ["Rust", "Python"],812/// "Designer" => ["Graydon Hoare", "Guido van Rossum"])?;813///814/// assert_eq!(df.get_column_names(), &["Language", "Designer"]);815/// # Ok::<(), PolarsError>(())816/// ```817pub fn get_column_names(&self) -> Vec<&PlSmallStr> {818self.columns.iter().map(|s| s.name()).collect()819}820821/// Get the [`Vec<PlSmallStr>`] representing the column names.822pub fn get_column_names_owned(&self) -> Vec<PlSmallStr> {823self.columns.iter().map(|s| s.name().clone()).collect()824}825826pub fn get_column_names_str(&self) -> Vec<&str> {827self.columns.iter().map(|s| s.name().as_str()).collect()828}829830/// Set the column names.831/// # 
Example832///833/// ```rust834/// # use polars_core::prelude::*;835/// let mut df: DataFrame = df!("Mathematical set" => ["ℕ", "ℤ", "𝔻", "ℚ", "ℝ", "ℂ"])?;836/// df.set_column_names(["Set"])?;837///838/// assert_eq!(df.get_column_names(), &["Set"]);839/// # Ok::<(), PolarsError>(())840/// ```841pub fn set_column_names<I, S>(&mut self, names: I) -> PolarsResult<()>842where843I: IntoIterator<Item = S>,844S: Into<PlSmallStr>,845{846let names = names.into_iter().map(Into::into).collect::<Vec<_>>();847self._set_column_names_impl(names.as_slice())848}849850fn _set_column_names_impl(&mut self, names: &[PlSmallStr]) -> PolarsResult<()> {851polars_ensure!(852names.len() == self.width(),853ShapeMismatch: "{} column names provided for a DataFrame of width {}",854names.len(), self.width()855);856ensure_names_unique(names, |s| s.as_str())?;857858let columns = mem::take(&mut self.columns);859self.columns = columns860.into_iter()861.zip(names)862.map(|(s, name)| {863let mut s = s;864s.rename(name.clone());865s866})867.collect();868self.clear_schema();869Ok(())870}871872/// Get the data types of the columns in the [`DataFrame`].873///874/// # Example875///876/// ```rust877/// # use polars_core::prelude::*;878/// let venus_air: DataFrame = df!("Element" => ["Carbon dioxide", "Nitrogen"],879/// "Fraction" => [0.965, 0.035])?;880///881/// assert_eq!(venus_air.dtypes(), &[DataType::String, DataType::Float64]);882/// # Ok::<(), PolarsError>(())883/// ```884pub fn dtypes(&self) -> Vec<DataType> {885self.columns.iter().map(|s| s.dtype().clone()).collect()886}887888pub(crate) fn first_series_column(&self) -> Option<&Series> {889self.columns.iter().find_map(|col| col.as_series())890}891892/// The number of chunks for the first column.893pub fn first_col_n_chunks(&self) -> usize {894match self.first_series_column() {895None if self.columns.is_empty() => 0,896None => 1,897Some(s) => s.n_chunks(),898}899}900901/// The highest number of chunks for any column.902pub fn max_n_chunks(&self) -> 
usize {903self.columns904.iter()905.map(|s| s.as_series().map(|s| s.n_chunks()).unwrap_or(1))906.max()907.unwrap_or(0)908}909910/// Get a reference to the schema fields of the [`DataFrame`].911///912/// # Example913///914/// ```rust915/// # use polars_core::prelude::*;916/// let earth: DataFrame = df!("Surface type" => ["Water", "Land"],917/// "Fraction" => [0.708, 0.292])?;918///919/// let f1: Field = Field::new("Surface type".into(), DataType::String);920/// let f2: Field = Field::new("Fraction".into(), DataType::Float64);921///922/// assert_eq!(earth.fields(), &[f1, f2]);923/// # Ok::<(), PolarsError>(())924/// ```925pub fn fields(&self) -> Vec<Field> {926self.columns927.iter()928.map(|s| s.field().into_owned())929.collect()930}931932/// Get (height, width) of the [`DataFrame`].933///934/// # Example935///936/// ```rust937/// # use polars_core::prelude::*;938/// let df0: DataFrame = DataFrame::default();939/// let df1: DataFrame = df!("1" => [1, 2, 3, 4, 5])?;940/// let df2: DataFrame = df!("1" => [1, 2, 3, 4, 5],941/// "2" => [1, 2, 3, 4, 5])?;942///943/// assert_eq!(df0.shape(), (0 ,0));944/// assert_eq!(df1.shape(), (5, 1));945/// assert_eq!(df2.shape(), (5, 2));946/// # Ok::<(), PolarsError>(())947/// ```948pub fn shape(&self) -> (usize, usize) {949(self.height, self.columns.len())950}951952/// Get the width of the [`DataFrame`] which is the number of columns.953///954/// # Example955///956/// ```rust957/// # use polars_core::prelude::*;958/// let df0: DataFrame = DataFrame::default();959/// let df1: DataFrame = df!("Series 1" => [0; 0])?;960/// let df2: DataFrame = df!("Series 1" => [0; 0],961/// "Series 2" => [0; 0])?;962///963/// assert_eq!(df0.width(), 0);964/// assert_eq!(df1.width(), 1);965/// assert_eq!(df2.width(), 2);966/// # Ok::<(), PolarsError>(())967/// ```968pub fn width(&self) -> usize {969self.columns.len()970}971972/// Get the height of the [`DataFrame`] which is the number of rows.973///974/// # Example975///976/// ```rust977/// # use 
polars_core::prelude::*;978/// let df0: DataFrame = DataFrame::default();979/// let df1: DataFrame = df!("Currency" => ["€", "$"])?;980/// let df2: DataFrame = df!("Currency" => ["€", "$", "¥", "£", "₿"])?;981///982/// assert_eq!(df0.height(), 0);983/// assert_eq!(df1.height(), 2);984/// assert_eq!(df2.height(), 5);985/// # Ok::<(), PolarsError>(())986/// ```987pub fn height(&self) -> usize {988self.height989}990991/// Returns the size as number of rows * number of columns992pub fn size(&self) -> usize {993let s = self.shape();994s.0 * s.1995}996997/// Returns `true` if the [`DataFrame`] contains no rows.998///999/// # Example1000///1001/// ```rust1002/// # use polars_core::prelude::*;1003/// let df1: DataFrame = DataFrame::default();1004/// assert!(df1.is_empty());1005///1006/// let df2: DataFrame = df!("First name" => ["Forever"],1007/// "Last name" => ["Alone"])?;1008/// assert!(!df2.is_empty());1009/// # Ok::<(), PolarsError>(())1010/// ```1011pub fn is_empty(&self) -> bool {1012matches!(self.shape(), (0, _) | (_, 0))1013}10141015/// Set the height (i.e. 
number of rows) of this [`DataFrame`].1016///1017/// # Safety1018///1019/// This needs to be equal to the length of all the columns.1020pub unsafe fn set_height(&mut self, height: usize) {1021self.height = height;1022}10231024/// Add multiple [`Series`] to a [`DataFrame`].1025/// The added `Series` are required to have the same length.1026///1027/// # Example1028///1029/// ```rust1030/// # use polars_core::prelude::*;1031/// let df1: DataFrame = df!("Element" => ["Copper", "Silver", "Gold"])?;1032/// let s1 = Column::new("Proton".into(), [29, 47, 79]);1033/// let s2 = Column::new("Electron".into(), [29, 47, 79]);1034///1035/// let df2: DataFrame = df1.hstack(&[s1, s2])?;1036/// assert_eq!(df2.shape(), (3, 3));1037/// println!("{}", df2);1038/// # Ok::<(), PolarsError>(())1039/// ```1040///1041/// Output:1042///1043/// ```text1044/// shape: (3, 3)1045/// +---------+--------+----------+1046/// | Element | Proton | Electron |1047/// | --- | --- | --- |1048/// | str | i32 | i32 |1049/// +=========+========+==========+1050/// | Copper | 29 | 29 |1051/// +---------+--------+----------+1052/// | Silver | 47 | 47 |1053/// +---------+--------+----------+1054/// | Gold | 79 | 79 |1055/// +---------+--------+----------+1056/// ```1057pub fn hstack(&self, columns: &[Column]) -> PolarsResult<Self> {1058let mut new_cols = self.columns.clone();1059new_cols.extend_from_slice(columns);1060DataFrame::new(new_cols)1061}10621063/// Concatenate a [`DataFrame`] to this [`DataFrame`] and return as newly allocated [`DataFrame`].1064///1065/// If many `vstack` operations are done, it is recommended to call [`DataFrame::align_chunks_par`].1066///1067/// # Example1068///1069/// ```rust1070/// # use polars_core::prelude::*;1071/// let df1: DataFrame = df!("Element" => ["Copper", "Silver", "Gold"],1072/// "Melting Point (K)" => [1357.77, 1234.93, 1337.33])?;1073/// let df2: DataFrame = df!("Element" => ["Platinum", "Palladium"],1074/// "Melting Point (K)" => [2041.4, 1828.05])?;1075///1076/// 
let df3: DataFrame = df1.vstack(&df2)?;1077///1078/// assert_eq!(df3.shape(), (5, 2));1079/// println!("{}", df3);1080/// # Ok::<(), PolarsError>(())1081/// ```1082///1083/// Output:1084///1085/// ```text1086/// shape: (5, 2)1087/// +-----------+-------------------+1088/// | Element | Melting Point (K) |1089/// | --- | --- |1090/// | str | f64 |1091/// +===========+===================+1092/// | Copper | 1357.77 |1093/// +-----------+-------------------+1094/// | Silver | 1234.93 |1095/// +-----------+-------------------+1096/// | Gold | 1337.33 |1097/// +-----------+-------------------+1098/// | Platinum | 2041.4 |1099/// +-----------+-------------------+1100/// | Palladium | 1828.05 |1101/// +-----------+-------------------+1102/// ```1103pub fn vstack(&self, other: &DataFrame) -> PolarsResult<Self> {1104let mut df = self.clone();1105df.vstack_mut(other)?;1106Ok(df)1107}11081109/// Concatenate a [`DataFrame`] to this [`DataFrame`]1110///1111/// If many `vstack` operations are done, it is recommended to call [`DataFrame::align_chunks_par`].1112///1113/// # Example1114///1115/// ```rust1116/// # use polars_core::prelude::*;1117/// let mut df1: DataFrame = df!("Element" => ["Copper", "Silver", "Gold"],1118/// "Melting Point (K)" => [1357.77, 1234.93, 1337.33])?;1119/// let df2: DataFrame = df!("Element" => ["Platinum", "Palladium"],1120/// "Melting Point (K)" => [2041.4, 1828.05])?;1121///1122/// df1.vstack_mut(&df2)?;1123///1124/// assert_eq!(df1.shape(), (5, 2));1125/// println!("{}", df1);1126/// # Ok::<(), PolarsError>(())1127/// ```1128///1129/// Output:1130///1131/// ```text1132/// shape: (5, 2)1133/// +-----------+-------------------+1134/// | Element | Melting Point (K) |1135/// | --- | --- |1136/// | str | f64 |1137/// +===========+===================+1138/// | Copper | 1357.77 |1139/// +-----------+-------------------+1140/// | Silver | 1234.93 |1141/// +-----------+-------------------+1142/// | Gold | 1337.33 |1143/// 
+-----------+-------------------+1144/// | Platinum | 2041.4 |1145/// +-----------+-------------------+1146/// | Palladium | 1828.05 |1147/// +-----------+-------------------+1148/// ```1149pub fn vstack_mut(&mut self, other: &DataFrame) -> PolarsResult<&mut Self> {1150if self.width() != other.width() {1151polars_ensure!(1152self.width() == 0,1153ShapeMismatch:1154"unable to append to a DataFrame of width {} with a DataFrame of width {}",1155self.width(), other.width(),1156);1157self.columns.clone_from(&other.columns);1158self.height = other.height;1159return Ok(self);1160}11611162self.columns1163.iter_mut()1164.zip(other.columns.iter())1165.try_for_each::<_, PolarsResult<_>>(|(left, right)| {1166ensure_can_extend(&*left, right)?;1167left.append(right).map_err(|e| {1168e.context(format!("failed to vstack column '{}'", right.name()).into())1169})?;1170Ok(())1171})?;1172self.height += other.height;1173Ok(self)1174}11751176pub fn vstack_mut_owned(&mut self, other: DataFrame) -> PolarsResult<&mut Self> {1177if self.width() != other.width() {1178polars_ensure!(1179self.width() == 0,1180ShapeMismatch:1181"unable to append to a DataFrame of width {} with a DataFrame of width {}",1182self.width(), other.width(),1183);1184self.columns = other.columns;1185self.height = other.height;1186return Ok(self);1187}11881189self.columns1190.iter_mut()1191.zip(other.columns.into_iter())1192.try_for_each::<_, PolarsResult<_>>(|(left, right)| {1193ensure_can_extend(&*left, &right)?;1194let right_name = right.name().clone();1195left.append_owned(right).map_err(|e| {1196e.context(format!("failed to vstack column '{right_name}'").into())1197})?;1198Ok(())1199})?;1200self.height += other.height;1201Ok(self)1202}12031204/// Concatenate a [`DataFrame`] to this [`DataFrame`]1205///1206/// If many `vstack` operations are done, it is recommended to call [`DataFrame::align_chunks_par`].1207///1208/// # Panics1209/// Panics if the schema's don't match.1210pub fn vstack_mut_unchecked(&mut self, 
other: &DataFrame) {1211self.columns1212.iter_mut()1213.zip(other.columns.iter())1214.for_each(|(left, right)| {1215left.append(right)1216.map_err(|e| {1217e.context(format!("failed to vstack column '{}'", right.name()).into())1218})1219.expect("should not fail");1220});1221self.height += other.height;1222}12231224/// Concatenate a [`DataFrame`] to this [`DataFrame`]1225///1226/// If many `vstack` operations are done, it is recommended to call [`DataFrame::align_chunks_par`].1227///1228/// # Panics1229/// Panics if the schema's don't match.1230pub fn vstack_mut_owned_unchecked(&mut self, other: DataFrame) {1231self.columns1232.iter_mut()1233.zip(other.columns)1234.for_each(|(left, right)| {1235left.append_owned(right).expect("should not fail");1236});1237self.height += other.height;1238}12391240/// Extend the memory backed by this [`DataFrame`] with the values from `other`.1241///1242/// Different from [`vstack`](Self::vstack) which adds the chunks from `other` to the chunks of this [`DataFrame`]1243/// `extend` appends the data from `other` to the underlying memory locations and thus may cause a reallocation.1244///1245/// If this does not cause a reallocation, the resulting data structure will not have any extra chunks1246/// and thus will yield faster queries.1247///1248/// Prefer `extend` over `vstack` when you want to do a query after a single append. For instance during1249/// online operations where you add `n` rows and rerun a query.1250///1251/// Prefer `vstack` over `extend` when you want to append many times before doing a query. For instance1252/// when you read in multiple files and when to store them in a single `DataFrame`. 
In the latter case, finish the sequence1253/// of `append` operations with a [`rechunk`](Self::align_chunks_par).1254pub fn extend(&mut self, other: &DataFrame) -> PolarsResult<()> {1255polars_ensure!(1256self.width() == other.width(),1257ShapeMismatch:1258"unable to extend a DataFrame of width {} with a DataFrame of width {}",1259self.width(), other.width(),1260);12611262self.columns1263.iter_mut()1264.zip(other.columns.iter())1265.try_for_each::<_, PolarsResult<_>>(|(left, right)| {1266ensure_can_extend(&*left, right)?;1267left.extend(right).map_err(|e| {1268e.context(format!("failed to extend column '{}'", right.name()).into())1269})?;1270Ok(())1271})?;1272self.height += other.height;1273self.clear_schema();1274Ok(())1275}12761277/// Remove a column by name and return the column removed.1278///1279/// # Example1280///1281/// ```rust1282/// # use polars_core::prelude::*;1283/// let mut df: DataFrame = df!("Animal" => ["Tiger", "Lion", "Great auk"],1284/// "IUCN" => ["Endangered", "Vulnerable", "Extinct"])?;1285///1286/// let s1: PolarsResult<Column> = df.drop_in_place("Average weight");1287/// assert!(s1.is_err());1288///1289/// let s2: Column = df.drop_in_place("Animal")?;1290/// assert_eq!(s2, Column::new("Animal".into(), &["Tiger", "Lion", "Great auk"]));1291/// # Ok::<(), PolarsError>(())1292/// ```1293pub fn drop_in_place(&mut self, name: &str) -> PolarsResult<Column> {1294let idx = self.check_name_to_idx(name)?;1295self.clear_schema();1296Ok(self.columns.remove(idx))1297}12981299/// Return a new [`DataFrame`] where all null values are dropped.1300///1301/// # Example1302///1303/// ```no_run1304/// # use polars_core::prelude::*;1305/// let df1: DataFrame = df!("Country" => ["Malta", "Liechtenstein", "North Korea"],1306/// "Tax revenue (% GDP)" => [Some(32.7), None, None])?;1307/// assert_eq!(df1.shape(), (3, 2));1308///1309/// let df2: DataFrame = df1.drop_nulls::<String>(None)?;1310/// assert_eq!(df2.shape(), (1, 2));1311/// println!("{}", df2);1312/// # 
Ok::<(), PolarsError>(())1313/// ```1314///1315/// Output:1316///1317/// ```text1318/// shape: (1, 2)1319/// +---------+---------------------+1320/// | Country | Tax revenue (% GDP) |1321/// | --- | --- |1322/// | str | f64 |1323/// +=========+=====================+1324/// | Malta | 32.7 |1325/// +---------+---------------------+1326/// ```1327pub fn drop_nulls<S>(&self, subset: Option<&[S]>) -> PolarsResult<Self>1328where1329for<'a> &'a S: Into<PlSmallStr>,1330{1331if let Some(v) = subset {1332let v = self.select_columns(v)?;1333self._drop_nulls_impl(v.as_slice())1334} else {1335self._drop_nulls_impl(self.columns.as_slice())1336}1337}13381339fn _drop_nulls_impl(&self, subset: &[Column]) -> PolarsResult<Self> {1340// fast path for no nulls in df1341if subset.iter().all(|s| !s.has_nulls()) {1342return Ok(self.clone());1343}13441345let mut iter = subset.iter();13461347let mask = iter1348.next()1349.ok_or_else(|| polars_err!(NoData: "no data to drop nulls from"))?;1350let mut mask = mask.is_not_null();13511352for c in iter {1353mask = mask & c.is_not_null();1354}1355self.filter(&mask)1356}13571358/// Drop a column by name.1359/// This is a pure method and will return a new [`DataFrame`] instead of modifying1360/// the current one in place.1361///1362/// # Example1363///1364/// ```rust1365/// # use polars_core::prelude::*;1366/// let df1: DataFrame = df!("Ray type" => ["α", "β", "X", "γ"])?;1367/// let df2: DataFrame = df1.drop("Ray type")?;1368///1369/// assert!(df2.is_empty());1370/// # Ok::<(), PolarsError>(())1371/// ```1372pub fn drop(&self, name: &str) -> PolarsResult<Self> {1373let idx = self.check_name_to_idx(name)?;1374let mut new_cols = Vec::with_capacity(self.columns.len() - 1);13751376self.columns.iter().enumerate().for_each(|(i, s)| {1377if i != idx {1378new_cols.push(s.clone())1379}1380});13811382Ok(unsafe { DataFrame::new_no_checks(self.height(), new_cols) })1383}13841385/// Drop columns that are in `names`.1386pub fn drop_many<I, S>(&self, names: I) -> 
Self1387where1388I: IntoIterator<Item = S>,1389S: Into<PlSmallStr>,1390{1391let names: PlHashSet<PlSmallStr> = names.into_iter().map(|s| s.into()).collect();1392self.drop_many_amortized(&names)1393}13941395/// Drop columns that are in `names` without allocating a [`HashSet`](std::collections::HashSet).1396pub fn drop_many_amortized(&self, names: &PlHashSet<PlSmallStr>) -> DataFrame {1397if names.is_empty() {1398return self.clone();1399}1400let mut new_cols = Vec::with_capacity(self.columns.len().saturating_sub(names.len()));1401self.columns.iter().for_each(|s| {1402if !names.contains(s.name()) {1403new_cols.push(s.clone())1404}1405});14061407unsafe { DataFrame::new_no_checks(self.height(), new_cols) }1408}14091410/// Insert a new column at a given index without checking for duplicates.1411/// This can leave the [`DataFrame`] at an invalid state1412fn insert_column_no_name_check(1413&mut self,1414index: usize,1415column: Column,1416) -> PolarsResult<&mut Self> {1417polars_ensure!(1418self.width() == 0 || column.len() == self.height(),1419ShapeMismatch: "unable to add a column of length {} to a DataFrame of height {}",1420column.len(), self.height(),1421);14221423if self.width() == 0 {1424self.height = column.len();1425}14261427self.columns.insert(index, column);1428self.clear_schema();1429Ok(self)1430}14311432/// Insert a new column at a given index.1433pub fn insert_column<S: IntoColumn>(1434&mut self,1435index: usize,1436column: S,1437) -> PolarsResult<&mut Self> {1438let column = column.into_column();1439self.check_already_present(column.name().as_str())?;1440self.insert_column_no_name_check(index, column)1441}14421443fn add_column_by_search(&mut self, column: Column) -> PolarsResult<()> {1444if let Some(idx) = self.get_column_index(column.name().as_str()) {1445self.replace_column(idx, column)?;1446} else {1447if self.width() == 0 {1448self.height = column.len();1449}14501451self.columns.push(column);1452self.clear_schema();1453}1454Ok(())1455}14561457/// Add a 
new column to this [`DataFrame`] or replace an existing one.1458pub fn with_column<C: IntoColumn>(&mut self, column: C) -> PolarsResult<&mut Self> {1459fn inner(df: &mut DataFrame, mut column: Column) -> PolarsResult<&mut DataFrame> {1460let height = df.height();1461if column.len() == 1 && height > 1 {1462column = column.new_from_index(0, height);1463}14641465if column.len() == height || df.get_columns().is_empty() {1466df.add_column_by_search(column)?;1467Ok(df)1468}1469// special case for literals1470else if height == 0 && column.len() == 1 {1471let s = column.clear();1472df.add_column_by_search(s)?;1473Ok(df)1474} else {1475polars_bail!(1476ShapeMismatch: "unable to add a column of length {} to a DataFrame of height {}",1477column.len(), height,1478);1479}1480}1481let column = column.into_column();1482inner(self, column)1483}14841485/// Adds a column to the [`DataFrame`] without doing any checks1486/// on length or duplicates.1487///1488/// # Safety1489/// The caller must ensure `self.width() == 0 || column.len() == self.height()` .1490pub unsafe fn with_column_unchecked(&mut self, column: Column) -> &mut Self {1491debug_assert!(self.width() == 0 || self.height() == column.len());1492debug_assert!(self.get_column_index(column.name().as_str()).is_none());14931494// SAFETY: Invariant of function guarantees for case `width` > 0. 
We set the height1495// properly for `width` == 0.1496if self.width() == 0 {1497unsafe { self.set_height(column.len()) };1498}1499unsafe { self.get_columns_mut() }.push(column);1500self.clear_schema();15011502self1503}15041505// Note: Schema can be both input or output_schema1506fn add_column_by_schema(&mut self, c: Column, schema: &Schema) -> PolarsResult<()> {1507let name = c.name();1508if let Some((idx, _, _)) = schema.get_full(name.as_str()) {1509if self.columns.get(idx).map(|s| s.name()) != Some(name) {1510// Given schema is output_schema and we can push.1511if idx == self.columns.len() {1512if self.width() == 0 {1513self.height = c.len();1514}15151516self.columns.push(c);1517self.clear_schema();1518}1519// Schema is incorrect fallback to search1520else {1521debug_assert!(false);1522self.add_column_by_search(c)?;1523}1524} else {1525self.replace_column(idx, c)?;1526}1527} else {1528if self.width() == 0 {1529self.height = c.len();1530}15311532self.columns.push(c);1533self.clear_schema();1534}15351536Ok(())1537}15381539// Note: Schema can be both input or output_schema1540pub fn _add_series(&mut self, series: Vec<Series>, schema: &Schema) -> PolarsResult<()> {1541for (i, s) in series.into_iter().enumerate() {1542// we need to branch here1543// because users can add multiple columns with the same name1544if i == 0 || schema.get(s.name().as_str()).is_some() {1545self.with_column_and_schema(s.into_column(), schema)?;1546} else {1547self.with_column(s.clone().into_column())?;1548}1549}1550Ok(())1551}15521553pub fn _add_columns(&mut self, columns: Vec<Column>, schema: &Schema) -> PolarsResult<()> {1554for (i, s) in columns.into_iter().enumerate() {1555// we need to branch here1556// because users can add multiple columns with the same name1557if i == 0 || schema.get(s.name().as_str()).is_some() {1558self.with_column_and_schema(s, schema)?;1559} else {1560self.with_column(s.clone())?;1561}1562}15631564Ok(())1565}15661567/// Add a new column to this [`DataFrame`] or 
replace an existing one.1568/// Uses an existing schema to amortize lookups.1569/// If the schema is incorrect, we will fallback to linear search.1570///1571/// Note: Schema can be both input or output_schema1572pub fn with_column_and_schema<C: IntoColumn>(1573&mut self,1574column: C,1575schema: &Schema,1576) -> PolarsResult<&mut Self> {1577let mut column = column.into_column();15781579let height = self.height();1580if column.len() == 1 && height > 1 {1581column = column.new_from_index(0, height);1582}15831584if column.len() == height || self.columns.is_empty() {1585self.add_column_by_schema(column, schema)?;1586Ok(self)1587}1588// special case for literals1589else if height == 0 && column.len() == 1 {1590let s = column.clear();1591self.add_column_by_schema(s, schema)?;1592Ok(self)1593} else {1594polars_bail!(1595ShapeMismatch: "unable to add a column of length {} to a DataFrame of height {}",1596column.len(), height,1597);1598}1599}16001601/// Get a row in the [`DataFrame`]. Beware this is slow.1602///1603/// # Example1604///1605/// ```1606/// # use polars_core::prelude::*;1607/// fn example(df: &mut DataFrame, idx: usize) -> Option<Vec<AnyValue>> {1608/// df.get(idx)1609/// }1610/// ```1611pub fn get(&self, idx: usize) -> Option<Vec<AnyValue<'_>>> {1612match self.columns.first() {1613Some(s) => {1614if s.len() <= idx {1615return None;1616}1617},1618None => return None,1619}1620// SAFETY: we just checked bounds1621unsafe { Some(self.columns.iter().map(|c| c.get_unchecked(idx)).collect()) }1622}16231624/// Select a [`Series`] by index.1625///1626/// # Example1627///1628/// ```rust1629/// # use polars_core::prelude::*;1630/// let df: DataFrame = df!("Star" => ["Sun", "Betelgeuse", "Sirius A", "Sirius B"],1631/// "Absolute magnitude" => [4.83, -5.85, 1.42, 11.18])?;1632///1633/// let s1: Option<&Column> = df.select_at_idx(0);1634/// let s2 = Column::new("Star".into(), ["Sun", "Betelgeuse", "Sirius A", "Sirius B"]);1635///1636/// assert_eq!(s1, Some(&s2));1637/// # 
Ok::<(), PolarsError>(())1638/// ```1639pub fn select_at_idx(&self, idx: usize) -> Option<&Column> {1640self.columns.get(idx)1641}16421643/// Select column(s) from this [`DataFrame`] by range and return a new [`DataFrame`]1644///1645/// # Examples1646///1647/// ```rust1648/// # use polars_core::prelude::*;1649/// let df = df! {1650/// "0" => [0, 0, 0],1651/// "1" => [1, 1, 1],1652/// "2" => [2, 2, 2]1653/// }?;1654///1655/// assert!(df.select(["0", "1"])?.equals(&df.select_by_range(0..=1)?));1656/// assert!(df.equals(&df.select_by_range(..)?));1657/// # Ok::<(), PolarsError>(())1658/// ```1659pub fn select_by_range<R>(&self, range: R) -> PolarsResult<Self>1660where1661R: ops::RangeBounds<usize>,1662{1663// This function is copied from std::slice::range (https://doc.rust-lang.org/std/slice/fn.range.html)1664// because it is the nightly feature. We should change here if this function were stable.1665fn get_range<R>(range: R, bounds: ops::RangeTo<usize>) -> ops::Range<usize>1666where1667R: ops::RangeBounds<usize>,1668{1669let len = bounds.end;16701671let start: ops::Bound<&usize> = range.start_bound();1672let start = match start {1673ops::Bound::Included(&start) => start,1674ops::Bound::Excluded(start) => start.checked_add(1).unwrap_or_else(|| {1675panic!("attempted to index slice from after maximum usize");1676}),1677ops::Bound::Unbounded => 0,1678};16791680let end: ops::Bound<&usize> = range.end_bound();1681let end = match end {1682ops::Bound::Included(end) => end.checked_add(1).unwrap_or_else(|| {1683panic!("attempted to index slice up to maximum usize");1684}),1685ops::Bound::Excluded(&end) => end,1686ops::Bound::Unbounded => len,1687};16881689if start > end {1690panic!("slice index starts at {start} but ends at {end}");1691}1692if end > len {1693panic!("range end index {end} out of range for slice of length {len}",);1694}16951696ops::Range { start, end }1697}16981699let colnames = self.get_column_names_owned();1700let range = get_range(range, 
..colnames.len());17011702self._select_impl(&colnames[range])1703}17041705/// Get column index of a [`Series`] by name.1706/// # Example1707///1708/// ```rust1709/// # use polars_core::prelude::*;1710/// let df: DataFrame = df!("Name" => ["Player 1", "Player 2", "Player 3"],1711/// "Health" => [100, 200, 500],1712/// "Mana" => [250, 100, 0],1713/// "Strength" => [30, 150, 300])?;1714///1715/// assert_eq!(df.get_column_index("Name"), Some(0));1716/// assert_eq!(df.get_column_index("Health"), Some(1));1717/// assert_eq!(df.get_column_index("Mana"), Some(2));1718/// assert_eq!(df.get_column_index("Strength"), Some(3));1719/// assert_eq!(df.get_column_index("Haste"), None);1720/// # Ok::<(), PolarsError>(())1721/// ```1722pub fn get_column_index(&self, name: &str) -> Option<usize> {1723let schema = self.schema();1724if let Some(idx) = schema.index_of(name) {1725if self1726.get_columns()1727.get(idx)1728.is_some_and(|c| c.name() == name)1729{1730return Some(idx);1731}1732}17331734self.columns.iter().position(|s| s.name().as_str() == name)1735}17361737/// Get column index of a [`Series`] by name.1738pub fn try_get_column_index(&self, name: &str) -> PolarsResult<usize> {1739self.get_column_index(name)1740.ok_or_else(|| polars_err!(col_not_found = name))1741}17421743/// Select a single column by name.1744///1745/// # Example1746///1747/// ```rust1748/// # use polars_core::prelude::*;1749/// let s1 = Column::new("Password".into(), ["123456", "[]B$u$g$s$B#u#n#n#y[]{}"]);1750/// let s2 = Column::new("Robustness".into(), ["Weak", "Strong"]);1751/// let df: DataFrame = DataFrame::new(vec![s1.clone(), s2])?;1752///1753/// assert_eq!(df.column("Password")?, &s1);1754/// # Ok::<(), PolarsError>(())1755/// ```1756pub fn column(&self, name: &str) -> PolarsResult<&Column> {1757let idx = self.try_get_column_index(name)?;1758Ok(self.select_at_idx(idx).unwrap())1759}17601761/// Selected multiple columns by name.1762///1763/// # Example1764///1765/// ```rust1766/// # use 
polars_core::prelude::*;1767/// let df: DataFrame = df!("Latin name" => ["Oncorhynchus kisutch", "Salmo salar"],1768/// "Max weight (kg)" => [16.0, 35.89])?;1769/// let sv: Vec<&Column> = df.columns(["Latin name", "Max weight (kg)"])?;1770///1771/// assert_eq!(&df[0], sv[0]);1772/// assert_eq!(&df[1], sv[1]);1773/// # Ok::<(), PolarsError>(())1774/// ```1775pub fn columns<I, S>(&self, names: I) -> PolarsResult<Vec<&Column>>1776where1777I: IntoIterator<Item = S>,1778S: AsRef<str>,1779{1780names1781.into_iter()1782.map(|name| self.column(name.as_ref()))1783.collect()1784}17851786/// Select column(s) from this [`DataFrame`] and return a new [`DataFrame`].1787///1788/// # Examples1789///1790/// ```1791/// # use polars_core::prelude::*;1792/// fn example(df: &DataFrame) -> PolarsResult<DataFrame> {1793/// df.select(["foo", "bar"])1794/// }1795/// ```1796pub fn select<I, S>(&self, selection: I) -> PolarsResult<Self>1797where1798I: IntoIterator<Item = S>,1799S: Into<PlSmallStr>,1800{1801let cols = selection.into_iter().map(|s| s.into()).collect::<Vec<_>>();1802self._select_impl(cols.as_slice())1803}18041805pub fn _select_impl(&self, cols: &[PlSmallStr]) -> PolarsResult<Self> {1806ensure_names_unique(cols, |s| s.as_str())?;1807self._select_impl_unchecked(cols)1808}18091810pub fn _select_impl_unchecked(&self, cols: &[PlSmallStr]) -> PolarsResult<Self> {1811let selected = self.select_columns_impl(cols)?;1812Ok(unsafe { DataFrame::new_no_checks(self.height(), selected) })1813}18141815/// Select with a known schema. 
The schema names must match the column names of this DataFrame.1816pub fn select_with_schema<I, S>(&self, selection: I, schema: &SchemaRef) -> PolarsResult<Self>1817where1818I: IntoIterator<Item = S>,1819S: Into<PlSmallStr>,1820{1821let cols = selection.into_iter().map(|s| s.into()).collect::<Vec<_>>();1822self._select_with_schema_impl(&cols, schema, true)1823}18241825/// Select with a known schema without checking for duplicates in `selection`.1826/// The schema names must match the column names of this DataFrame.1827pub fn select_with_schema_unchecked<I, S>(1828&self,1829selection: I,1830schema: &Schema,1831) -> PolarsResult<Self>1832where1833I: IntoIterator<Item = S>,1834S: Into<PlSmallStr>,1835{1836let cols = selection.into_iter().map(|s| s.into()).collect::<Vec<_>>();1837self._select_with_schema_impl(&cols, schema, false)1838}18391840/// * The schema names must match the column names of this DataFrame.1841pub fn _select_with_schema_impl(1842&self,1843cols: &[PlSmallStr],1844schema: &Schema,1845check_duplicates: bool,1846) -> PolarsResult<Self> {1847if check_duplicates {1848ensure_names_unique(cols, |s| s.as_str())?;1849}18501851let selected = self.select_columns_impl_with_schema(cols, schema)?;1852Ok(unsafe { DataFrame::new_no_checks(self.height(), selected) })1853}18541855/// A non generic implementation to reduce compiler bloat.1856fn select_columns_impl_with_schema(1857&self,1858cols: &[PlSmallStr],1859schema: &Schema,1860) -> PolarsResult<Vec<Column>> {1861if cfg!(debug_assertions) {1862ensure_matching_schema_names(schema, self.schema())?;1863}18641865cols.iter()1866.map(|name| {1867let index = schema.try_get_full(name.as_str())?.0;1868Ok(self.columns[index].clone())1869})1870.collect()1871}18721873pub fn select_physical<I, S>(&self, selection: I) -> PolarsResult<Self>1874where1875I: IntoIterator<Item = S>,1876S: Into<PlSmallStr>,1877{1878let cols = selection.into_iter().map(|s| 
s.into()).collect::<Vec<_>>();1879self.select_physical_impl(&cols)1880}18811882fn select_physical_impl(&self, cols: &[PlSmallStr]) -> PolarsResult<Self> {1883ensure_names_unique(cols, |s| s.as_str())?;1884let selected = self.select_columns_physical_impl(cols)?;1885Ok(unsafe { DataFrame::new_no_checks(self.height(), selected) })1886}18871888pub fn project(&self, to: SchemaRef) -> PolarsResult<Self> {1889let from = self.schema();1890let columns = to1891.iter_names()1892.map(|name| Ok(self.columns[from.try_index_of(name.as_str())?].clone()))1893.collect::<PolarsResult<Vec<_>>>()?;1894let mut df = unsafe { Self::new_no_checks(self.height(), columns) };1895df.cached_schema = to.into();1896Ok(df)1897}18981899/// Select column(s) from this [`DataFrame`] and return them into a [`Vec`].1900///1901/// # Example1902///1903/// ```rust1904/// # use polars_core::prelude::*;1905/// let df: DataFrame = df!("Name" => ["Methane", "Ethane", "Propane"],1906/// "Carbon" => [1, 2, 3],1907/// "Hydrogen" => [4, 6, 8])?;1908/// let sv: Vec<Column> = df.select_columns(["Carbon", "Hydrogen"])?;1909///1910/// assert_eq!(df["Carbon"], sv[0]);1911/// assert_eq!(df["Hydrogen"], sv[1]);1912/// # Ok::<(), PolarsError>(())1913/// ```1914pub fn select_columns(&self, selection: impl IntoVec<PlSmallStr>) -> PolarsResult<Vec<Column>> {1915let cols = selection.into_vec();1916self.select_columns_impl(&cols)1917}19181919fn _names_to_idx_map(&self) -> PlHashMap<&str, usize> {1920self.columns1921.iter()1922.enumerate()1923.map(|(i, s)| (s.name().as_str(), i))1924.collect()1925}19261927/// A non generic implementation to reduce compiler bloat.1928fn select_columns_physical_impl(&self, cols: &[PlSmallStr]) -> PolarsResult<Vec<Column>> {1929let selected = if cols.len() > 1 && self.columns.len() > 10 {1930let name_to_idx = self._names_to_idx_map();1931cols.iter()1932.map(|name| {1933let idx = *name_to_idx1934.get(name.as_str())1935.ok_or_else(|| polars_err!(col_not_found = 
name))?;1936Ok(self.select_at_idx(idx).unwrap().to_physical_repr())1937})1938.collect::<PolarsResult<Vec<_>>>()?1939} else {1940cols.iter()1941.map(|c| self.column(c.as_str()).map(|s| s.to_physical_repr()))1942.collect::<PolarsResult<Vec<_>>>()?1943};19441945Ok(selected)1946}19471948/// A non generic implementation to reduce compiler bloat.1949fn select_columns_impl(&self, cols: &[PlSmallStr]) -> PolarsResult<Vec<Column>> {1950let selected = if cols.len() > 1 && self.columns.len() > 10 {1951// we hash, because there are user that having millions of columns.1952// # https://github.com/pola-rs/polars/issues/10231953let name_to_idx = self._names_to_idx_map();19541955cols.iter()1956.map(|name| {1957let idx = *name_to_idx1958.get(name.as_str())1959.ok_or_else(|| polars_err!(col_not_found = name))?;1960Ok(self.select_at_idx(idx).unwrap().clone())1961})1962.collect::<PolarsResult<Vec<_>>>()?1963} else {1964cols.iter()1965.map(|c| self.column(c.as_str()).cloned())1966.collect::<PolarsResult<Vec<_>>>()?1967};19681969Ok(selected)1970}19711972fn filter_height(&self, filtered: &[Column], mask: &BooleanChunked) -> usize {1973// If there is a filtered column just see how many columns there are left.1974if let Some(fst) = filtered.first() {1975return fst.len();1976}19771978// Otherwise, count the number of values that would be filtered and return that height.1979let num_trues = mask.num_trues();1980if mask.len() == self.height() {1981num_trues1982} else {1983// This is for broadcasting masks1984debug_assert!(num_trues == 0 || num_trues == 1);1985self.height() * num_trues1986}1987}19881989/// Take the [`DataFrame`] rows by a boolean mask.1990///1991/// # Example1992///1993/// ```1994/// # use polars_core::prelude::*;1995/// fn example(df: &DataFrame) -> PolarsResult<DataFrame> {1996/// let mask = df.column("sepal_width")?.is_not_null();1997/// df.filter(&mask)1998/// }1999/// ```2000pub fn filter(&self, mask: &BooleanChunked) -> PolarsResult<Self> {2001let new_col = 
self.try_apply_columns_par(&|s| s.filter(mask))?;
        let height = self.filter_height(&new_col, mask);

        Ok(unsafe { DataFrame::new_no_checks(height, new_col) })
    }

    /// Same as `filter`, but the columns are processed without parallelization.
    pub fn _filter_seq(&self, mask: &BooleanChunked) -> PolarsResult<Self> {
        let filtered = self.try_apply_columns(&|col| col.filter(mask))?;
        let n_rows = self.filter_height(&filtered, mask);

        let out = unsafe { DataFrame::new_no_checks(n_rows, filtered) };
        Ok(out)
    }

    /// Take [`DataFrame`] rows by index values.
    ///
    /// The gather runs per column on the thread pool; an error from any column is propagated.
    /// The resulting frame has `indices.len()` rows.
    ///
    /// # Example
    ///
    /// ```
    /// # use polars_core::prelude::*;
    /// fn example(df: &DataFrame) -> PolarsResult<DataFrame> {
    ///     let idx = IdxCa::new("idx".into(), [0, 1, 9]);
    ///     df.take(&idx)
    /// }
    /// ```
    pub fn take(&self, indices: &IdxCa) -> PolarsResult<Self> {
        let gathered = POOL.install(|| self.try_apply_columns_par(&|col| col.take(indices)))?;
        let n_rows = indices.len();

        Ok(unsafe { DataFrame::new_no_checks(n_rows, gathered) })
    }

    /// Take rows by index without bounds checking, allowing multithreading.
    ///
    /// # Safety
    /// The indices must be in-bounds.
    pub unsafe fn take_unchecked(&self, idx: &IdxCa) -> Self {
        let allow_threads = true;
        self.take_unchecked_impl(idx, allow_threads)
    }

    /// Take rows by index without bounds checking; `allow_threads` selects between the
    /// parallel and the sequential per-column gather.
    ///
    /// # Safety
    /// The indices must be in-bounds.
    pub unsafe fn take_unchecked_impl(&self, idx: &IdxCa, allow_threads: bool) -> Self {
        let gathered = match allow_threads {
            true => POOL.install(|| self._apply_columns_par(&|col| col.take_unchecked(idx))),
            false => self._apply_columns(&|col| col.take_unchecked(idx)),
        };
        unsafe { DataFrame::new_no_checks(idx.len(), gathered) }
    }

    /// Take rows by a slice of indices without bounds checking, allowing multithreading.
    ///
    /// # Safety
    /// The indices must be in-bounds.
    pub unsafe fn take_slice_unchecked(&self, idx: &[IdxSize]) -> Self {
        let allow_threads = true;
        self.take_slice_unchecked_impl(idx, allow_threads)
    }

    /// Take rows by a slice of indices without bounds checking; `allow_threads` selects
    /// between the parallel and the sequential per-column gather.
    ///
    /// # Safety
    /// The indices must be in-bounds.
    pub unsafe fn take_slice_unchecked_impl(&self, idx: &[IdxSize], allow_threads: bool) -> Self {
        let gathered = match allow_threads {
            true => POOL.install(|| self._apply_columns_par(&|col| col.take_slice_unchecked(idx))),
            false => self._apply_columns(&|col| col.take_slice_unchecked(idx)),
        };
        unsafe { DataFrame::new_no_checks(idx.len(), gathered) }
    }

    /// Rename a column in the [`DataFrame`].
    ///
    /// Renaming a column to its current name is a no-op. Renaming to a name that is already
    /// present in the schema is rejected with a `Duplicate` error, and an unknown `column`
    /// yields a column-not-found error.
    ///
    /// # Example
    ///
    /// ```
    /// # use polars_core::prelude::*;
    /// fn example(df: &mut DataFrame) -> PolarsResult<&mut DataFrame> {
    ///     let original_name = "foo";
    ///     let new_name = "bar";
    ///     df.rename(original_name, new_name.into())
    /// }
    /// ```
    pub fn rename(&mut self, column: &str, name: PlSmallStr) -> PolarsResult<&mut Self> {
        if column == name.as_str() {
            return Ok(self);
        }
        polars_ensure!(
            !self.schema().contains(&name),
            Duplicate: "column rename attempted with already existing name \"{name}\""
        );

        let Some(position) = self.get_column_index(column) else {
            polars_bail!(col_not_found = column);
        };
        let Some(target) = self.columns.get_mut(position) else {
            polars_bail!(col_not_found = column);
        };
        target.rename(name);
        // The column names changed, so the cached schema is stale.
        self.clear_schema();

        Ok(self)
    }

    /// Sort [`DataFrame`] in place.
    ///
    /// See [`DataFrame::sort`] for more instruction.
    pub fn sort_in_place(
        &mut self,
        by: impl IntoVec<PlSmallStr>,
        sort_options: SortMultipleOptions,
    ) -> PolarsResult<&mut Self> {
        let keys = self.select_columns(by)?;
        let sorted = self.sort_impl(keys, sort_options, None)?;
        self.columns = sorted.columns;
        Ok(self)
    }

    #[doc(hidden)]
    /// This is the dispatch of Self::sort, and exists to reduce compile bloat by monomorphization.
    ///
    /// `by_column` holds the sort keys and `slice` is an optional `(offset, length)` window
    /// that is applied to the sorted result.
    pub fn sort_impl(
        &self,
        by_column: Vec<Column>,
        mut sort_options: SortMultipleOptions,
        slice: Option<(i64, usize)>,
    ) -> PolarsResult<Self> {
        // Result for the cases in which the current row order is already a valid answer.
        let keep_current_order = || match slice {
            Some((offset, len)) => self.slice(offset, len),
            None => self.clone(),
        };

        if by_column.is_empty() {
            // If no columns selected, any order (including original order) is correct.
            return Ok(keep_current_order());
        }

        // Note that the `by_column` argument also contains evaluated expressions from
        // polars-lazy that may not even be present in this dataframe. Therefore, when we
        // try to set the first column as sorted, we ignore the error: such expressions are
        // not present (they are renamed to _POLARS_SORT_COLUMN_i).
        let first_descending = sort_options.descending[0];
        let first_key_name = by_column[0].name().to_string();

        let set_sorted = |df: &mut DataFrame| {
            // Mark the first sort column as sorted; if the column does not exist it
            // is ok, because we sorted by an expression not present in the dataframe.
            let _ = df.apply(&first_key_name, |col| {
                let mut flagged = col.clone();
                let flag = if first_descending {
                    IsSorted::Descending
                } else {
                    IsSorted::Ascending
                };
                flagged.set_sorted_flag(flag);
                flagged
            });
        };
        if self.is_empty() {
            let mut out = self.clone();
            set_sorted(&mut out);
            return Ok(out);
        }

        // A slice starting at row 0 that is shorter than the frame is delegated to
        // `bottom_k_impl`.
        match slice {
            Some((0, k)) if k < self.len() => {
                return self.bottom_k_impl(k, by_column, sort_options);
            },
            _ => {},
        }

        // Check if the required column is already sorted; if so we can exit early.
        // We can do so when there is only one column to sort by; for multiple columns
        // it would be complicated to do so.
        #[cfg(feature = "dtype-categorical")]
        let is_not_categorical_enum =
            !(matches!(by_column[0].dtype(), DataType::Categorical(_, _))
                || matches!(by_column[0].dtype(), DataType::Enum(_, _)));

        #[cfg(not(feature = "dtype-categorical"))]
        #[allow(non_upper_case_globals)]
        const is_not_categorical_enum: bool = true;

        if by_column.len() == 1 && is_not_categorical_enum {
            let key = &by_column[0];
            let required_sorting = if sort_options.descending[0] {
                IsSorted::Descending
            } else {
                IsSorted::Ascending
            };
            // If the null count is 0 then `nulls_last` doesn't matter.
            // Safe to get the value at the last position since the dataframe is not empty
            // (taken care of above).
            let no_sorting_required = (key.is_sorted_flag() == required_sorting)
                && ((key.null_count() == 0)
                    || key.get(key.len() - 1).unwrap().is_null() == sort_options.nulls_last[0]);

            if no_sorting_required {
                return Ok(keep_current_order());
            }
        }

        let has_nested = by_column.iter().any(|col| col.dtype().is_nested());

        // a lot of indirection in both sorting and take
        let mut rechunked = self.clone();
        let df = rechunked.as_single_chunk_par();
        let indices = match (by_column.len(), has_nested) {
            (1, false) => {
                let key = &by_column[0];
                let options = SortOptions {
                    descending: sort_options.descending[0],
                    nulls_last: sort_options.nulls_last[0],
                    multithreaded: sort_options.multithreaded,
                    maintain_order: sort_options.maintain_order,
                    limit: sort_options.limit,
                };
                // Fast path for a frame with a single series:
                // no need to compute the sort indices and then take by these indices,
                // simply sort and return as frame.
                if df.width() == 1 && df.check_name_to_idx(key.name().as_str()).is_ok() {
                    let sorted = key.sort_with(options)?;
                    let sorted = match slice {
                        Some((offset, len)) => sorted.slice(offset, len),
                        None => sorted,
                    };
                    return Ok(sorted.into_frame());
                }
                key.arg_sort(options)
            },
            _ => {
                if sort_options.nulls_last.iter().all(|&x| x)
                    || has_nested
                    || std::env::var("POLARS_ROW_FMT_SORT").is_ok()
                {
                    argsort_multiple_row_fmt(
                        &by_column,
                        sort_options.descending,
                        sort_options.nulls_last,
                        sort_options.multithreaded,
                    )?
                } else {
                    let (first, other) = prepare_arg_sort(by_column, &mut sort_options)?;
                    first
                        .as_materialized_series()
                        .arg_sort_multiple(&other, &sort_options)?
                }
            },
        };

        let indices = match slice {
            Some((offset, len)) => indices.slice(offset, len),
            None => indices,
        };

        // SAFETY:
        // the created indices are in bounds
        let mut sorted = unsafe { df.take_unchecked_impl(&indices, sort_options.multithreaded) };
        set_sorted(&mut sorted);
        Ok(sorted)
    }

    /// Create a `DataFrame` that has fields for all the known runtime metadata for each column.
    ///
    /// This
dataframe does not necessarily have a specified schema and may be changed at any2252/// point. It is primarily used for debugging.2253pub fn _to_metadata(&self) -> DataFrame {2254let num_columns = self.columns.len();22552256let mut column_names =2257StringChunkedBuilder::new(PlSmallStr::from_static("column_name"), num_columns);2258let mut repr_ca = StringChunkedBuilder::new(PlSmallStr::from_static("repr"), num_columns);2259let mut sorted_asc_ca =2260BooleanChunkedBuilder::new(PlSmallStr::from_static("sorted_asc"), num_columns);2261let mut sorted_dsc_ca =2262BooleanChunkedBuilder::new(PlSmallStr::from_static("sorted_dsc"), num_columns);2263let mut fast_explode_list_ca =2264BooleanChunkedBuilder::new(PlSmallStr::from_static("fast_explode_list"), num_columns);2265let mut materialized_at_ca =2266StringChunkedBuilder::new(PlSmallStr::from_static("materialized_at"), num_columns);22672268for col in &self.columns {2269let flags = col.get_flags();22702271let (repr, materialized_at) = match col {2272Column::Series(s) => ("series", s.materialized_at()),2273Column::Partitioned(_) => ("partitioned", None),2274Column::Scalar(_) => ("scalar", None),2275};2276let sorted_asc = flags.contains(StatisticsFlags::IS_SORTED_ASC);2277let sorted_dsc = flags.contains(StatisticsFlags::IS_SORTED_DSC);2278let fast_explode_list = flags.contains(StatisticsFlags::CAN_FAST_EXPLODE_LIST);22792280column_names.append_value(col.name().clone());2281repr_ca.append_value(repr);2282sorted_asc_ca.append_value(sorted_asc);2283sorted_dsc_ca.append_value(sorted_dsc);2284fast_explode_list_ca.append_value(fast_explode_list);2285materialized_at_ca.append_option(materialized_at.map(|v| format!("{v:#?}")));2286}22872288unsafe 
{2289DataFrame::new_no_checks(2290self.width(),2291vec![2292column_names.finish().into_column(),2293repr_ca.finish().into_column(),2294sorted_asc_ca.finish().into_column(),2295sorted_dsc_ca.finish().into_column(),2296fast_explode_list_ca.finish().into_column(),2297materialized_at_ca.finish().into_column(),2298],2299)2300}2301}23022303/// Return a sorted clone of this [`DataFrame`].2304///2305/// In many cases the output chunks will be continuous in memory but this is not guaranteed2306/// # Example2307///2308/// Sort by a single column with default options:2309/// ```2310/// # use polars_core::prelude::*;2311/// fn sort_by_sepal_width(df: &DataFrame) -> PolarsResult<DataFrame> {2312/// df.sort(["sepal_width"], Default::default())2313/// }2314/// ```2315/// Sort by a single column with specific order:2316/// ```2317/// # use polars_core::prelude::*;2318/// fn sort_with_specific_order(df: &DataFrame, descending: bool) -> PolarsResult<DataFrame> {2319/// df.sort(2320/// ["sepal_width"],2321/// SortMultipleOptions::new()2322/// .with_order_descending(descending)2323/// )2324/// }2325/// ```2326/// Sort by multiple columns with specifying order for each column:2327/// ```2328/// # use polars_core::prelude::*;2329/// fn sort_by_multiple_columns_with_specific_order(df: &DataFrame) -> PolarsResult<DataFrame> {2330/// df.sort(2331/// ["sepal_width", "sepal_length"],2332/// SortMultipleOptions::new()2333/// .with_order_descending_multi([false, true])2334/// )2335/// }2336/// ```2337/// See [`SortMultipleOptions`] for more options.2338///2339/// Also see [`DataFrame::sort_in_place`].2340pub fn sort(2341&self,2342by: impl IntoVec<PlSmallStr>,2343sort_options: SortMultipleOptions,2344) -> PolarsResult<Self> {2345let mut df = self.clone();2346df.sort_in_place(by, sort_options)?;2347Ok(df)2348}23492350/// Replace a column with a [`Series`].2351///2352/// # Example2353///2354/// ```rust2355/// # use polars_core::prelude::*;2356/// let mut df: DataFrame = df!("Country" => ["United 
States", "China"],2357/// "Area (km²)" => [9_833_520, 9_596_961])?;2358/// let s: Series = Series::new("Country".into(), ["USA", "PRC"]);2359///2360/// assert!(df.replace("Nation", s.clone()).is_err());2361/// assert!(df.replace("Country", s).is_ok());2362/// # Ok::<(), PolarsError>(())2363/// ```2364pub fn replace<S: IntoSeries>(&mut self, column: &str, new_col: S) -> PolarsResult<&mut Self> {2365self.apply(column, |_| new_col.into_series())2366}23672368/// Replace or update a column. The difference between this method and [DataFrame::with_column]2369/// is that now the value of `column: &str` determines the name of the column and not the name2370/// of the `Series` passed to this method.2371pub fn replace_or_add<S: IntoSeries>(2372&mut self,2373column: PlSmallStr,2374new_col: S,2375) -> PolarsResult<&mut Self> {2376let mut new_col = new_col.into_series();2377new_col.rename(column);2378self.with_column(new_col)2379}23802381/// Replace column at index `idx` with a [`Series`].2382///2383/// # Example2384///2385/// ```ignored2386/// # use polars_core::prelude::*;2387/// let s0 = Series::new("foo".into(), ["ham", "spam", "egg"]);2388/// let s1 = Series::new("ascii".into(), [70, 79, 79]);2389/// let mut df = DataFrame::new(vec![s0, s1])?;2390///2391/// // Add 32 to get lowercase ascii values2392/// df.replace_column(1, df.select_at_idx(1).unwrap() + 32);2393/// # Ok::<(), PolarsError>(())2394/// ```2395pub fn replace_column<C: IntoColumn>(2396&mut self,2397index: usize,2398new_column: C,2399) -> PolarsResult<&mut Self> {2400polars_ensure!(2401index < self.width(),2402ShapeMismatch:2403"unable to replace at index {}, the DataFrame has only {} columns",2404index, self.width(),2405);2406let mut new_column = new_column.into_column();2407polars_ensure!(2408new_column.len() == self.height(),2409ShapeMismatch:2410"unable to replace a column, series length {} doesn't match the DataFrame height {}",2411new_column.len(), self.height(),2412);2413let old_col = &mut 
self.columns[index];2414mem::swap(old_col, &mut new_column);2415self.clear_schema();2416Ok(self)2417}24182419/// Apply a closure to a column. This is the recommended way to do in place modification.2420///2421/// # Example2422///2423/// ```rust2424/// # use polars_core::prelude::*;2425/// let s0 = Column::new("foo".into(), ["ham", "spam", "egg"]);2426/// let s1 = Column::new("names".into(), ["Jean", "Claude", "van"]);2427/// let mut df = DataFrame::new(vec![s0, s1])?;2428///2429/// fn str_to_len(str_val: &Column) -> Column {2430/// str_val.str()2431/// .unwrap()2432/// .into_iter()2433/// .map(|opt_name: Option<&str>| {2434/// opt_name.map(|name: &str| name.len() as u32)2435/// })2436/// .collect::<UInt32Chunked>()2437/// .into_column()2438/// }2439///2440/// // Replace the names column by the length of the names.2441/// df.apply("names", str_to_len);2442/// # Ok::<(), PolarsError>(())2443/// ```2444/// Results in:2445///2446/// ```text2447/// +--------+-------+2448/// | foo | |2449/// | --- | names |2450/// | str | u32 |2451/// +========+=======+2452/// | "ham" | 4 |2453/// +--------+-------+2454/// | "spam" | 6 |2455/// +--------+-------+2456/// | "egg" | 3 |2457/// +--------+-------+2458/// ```2459pub fn apply<F, C>(&mut self, name: &str, f: F) -> PolarsResult<&mut Self>2460where2461F: FnOnce(&Column) -> C,2462C: IntoColumn,2463{2464let idx = self.check_name_to_idx(name)?;2465self.apply_at_idx(idx, f)?;2466Ok(self)2467}24682469/// Apply a closure to a column at index `idx`. 
This is the recommended way to do in place2470/// modification.2471///2472/// # Example2473///2474/// ```rust2475/// # use polars_core::prelude::*;2476/// let s0 = Column::new("foo".into(), ["ham", "spam", "egg"]);2477/// let s1 = Column::new("ascii".into(), [70, 79, 79]);2478/// let mut df = DataFrame::new(vec![s0, s1])?;2479///2480/// // Add 32 to get lowercase ascii values2481/// df.apply_at_idx(1, |s| s + 32);2482/// # Ok::<(), PolarsError>(())2483/// ```2484/// Results in:2485///2486/// ```text2487/// +--------+-------+2488/// | foo | ascii |2489/// | --- | --- |2490/// | str | i32 |2491/// +========+=======+2492/// | "ham" | 102 |2493/// +--------+-------+2494/// | "spam" | 111 |2495/// +--------+-------+2496/// | "egg" | 111 |2497/// +--------+-------+2498/// ```2499pub fn apply_at_idx<F, C>(&mut self, idx: usize, f: F) -> PolarsResult<&mut Self>2500where2501F: FnOnce(&Column) -> C,2502C: IntoColumn,2503{2504let df_height = self.height();2505let width = self.width();2506let col = self.columns.get_mut(idx).ok_or_else(|| {2507polars_err!(2508ComputeError: "invalid column index: {} for a DataFrame with {} columns",2509idx, width2510)2511})?;2512let name = col.name().clone();2513let dtype_before = col.dtype().clone();2514let new_col = f(col).into_column();2515match new_col.len() {25161 => {2517let new_col = new_col.new_from_index(0, df_height);2518let _ = mem::replace(col, new_col);2519},2520len if (len == df_height) => {2521let _ = mem::replace(col, new_col);2522},2523len => polars_bail!(2524ShapeMismatch:2525"resulting Series has length {} while the DataFrame has height {}",2526len, df_height2527),2528}25292530// make sure the name remains the same after applying the closure2531unsafe {2532let col = self.columns.get_unchecked_mut(idx);2533col.rename(name);25342535if col.dtype() != &dtype_before {2536self.clear_schema();2537}2538}2539Ok(self)2540}25412542/// Apply a closure that may fail to a column at index `idx`. 
This is the recommended way to do in place2543/// modification.2544///2545/// # Example2546///2547/// This is the idiomatic way to replace some values a column of a `DataFrame` given range of indexes.2548///2549/// ```rust2550/// # use polars_core::prelude::*;2551/// let s0 = Column::new("foo".into(), ["ham", "spam", "egg", "bacon", "quack"]);2552/// let s1 = Column::new("values".into(), [1, 2, 3, 4, 5]);2553/// let mut df = DataFrame::new(vec![s0, s1])?;2554///2555/// let idx = vec![0, 1, 4];2556///2557/// df.try_apply("foo", |c| {2558/// c.str()?2559/// .scatter_with(idx, |opt_val| opt_val.map(|string| format!("{}-is-modified", string)))2560/// });2561/// # Ok::<(), PolarsError>(())2562/// ```2563/// Results in:2564///2565/// ```text2566/// +---------------------+--------+2567/// | foo | values |2568/// | --- | --- |2569/// | str | i32 |2570/// +=====================+========+2571/// | "ham-is-modified" | 1 |2572/// +---------------------+--------+2573/// | "spam-is-modified" | 2 |2574/// +---------------------+--------+2575/// | "egg" | 3 |2576/// +---------------------+--------+2577/// | "bacon" | 4 |2578/// +---------------------+--------+2579/// | "quack-is-modified" | 5 |2580/// +---------------------+--------+2581/// ```2582pub fn try_apply_at_idx<F, C>(&mut self, idx: usize, f: F) -> PolarsResult<&mut Self>2583where2584F: FnOnce(&Column) -> PolarsResult<C>,2585C: IntoColumn,2586{2587let width = self.width();2588let col = self.columns.get_mut(idx).ok_or_else(|| {2589polars_err!(2590ComputeError: "invalid column index: {} for a DataFrame with {} columns",2591idx, width2592)2593})?;2594let name = col.name().clone();25952596let _ = mem::replace(col, f(col).map(|c| c.into_column())?);25972598// make sure the name remains the same after applying the closure2599unsafe {2600let col = self.columns.get_unchecked_mut(idx);2601col.rename(name);2602}2603Ok(self)2604}26052606/// Apply a closure that may fail to a column. 
This is the recommended way to do in place2607/// modification.2608///2609/// # Example2610///2611/// This is the idiomatic way to replace some values a column of a `DataFrame` given a boolean mask.2612///2613/// ```rust2614/// # use polars_core::prelude::*;2615/// let s0 = Column::new("foo".into(), ["ham", "spam", "egg", "bacon", "quack"]);2616/// let s1 = Column::new("values".into(), [1, 2, 3, 4, 5]);2617/// let mut df = DataFrame::new(vec![s0, s1])?;2618///2619/// // create a mask2620/// let values = df.column("values")?.as_materialized_series();2621/// let mask = values.lt_eq(1)? | values.gt_eq(5_i32)?;2622///2623/// df.try_apply("foo", |c| {2624/// c.str()?2625/// .set(&mask, Some("not_within_bounds"))2626/// });2627/// # Ok::<(), PolarsError>(())2628/// ```2629/// Results in:2630///2631/// ```text2632/// +---------------------+--------+2633/// | foo | values |2634/// | --- | --- |2635/// | str | i32 |2636/// +=====================+========+2637/// | "not_within_bounds" | 1 |2638/// +---------------------+--------+2639/// | "spam" | 2 |2640/// +---------------------+--------+2641/// | "egg" | 3 |2642/// +---------------------+--------+2643/// | "bacon" | 4 |2644/// +---------------------+--------+2645/// | "not_within_bounds" | 5 |2646/// +---------------------+--------+2647/// ```2648pub fn try_apply<F, C>(&mut self, column: &str, f: F) -> PolarsResult<&mut Self>2649where2650F: FnOnce(&Series) -> PolarsResult<C>,2651C: IntoColumn,2652{2653let idx = self.try_get_column_index(column)?;2654self.try_apply_at_idx(idx, |c| f(c.as_materialized_series()))2655}26562657/// Slice the [`DataFrame`] along the rows.2658///2659/// # Example2660///2661/// ```rust2662/// # use polars_core::prelude::*;2663/// let df: DataFrame = df!("Fruit" => ["Apple", "Grape", "Grape", "Fig", "Fig"],2664/// "Color" => ["Green", "Red", "White", "White", "Red"])?;2665/// let sl: DataFrame = df.slice(2, 3);2666///2667/// assert_eq!(sl.shape(), (3, 2));2668/// println!("{}", sl);2669/// # 
Ok::<(), PolarsError>(())2670/// ```2671/// Output:2672/// ```text2673/// shape: (3, 2)2674/// +-------+-------+2675/// | Fruit | Color |2676/// | --- | --- |2677/// | str | str |2678/// +=======+=======+2679/// | Grape | White |2680/// +-------+-------+2681/// | Fig | White |2682/// +-------+-------+2683/// | Fig | Red |2684/// +-------+-------+2685/// ```2686#[must_use]2687pub fn slice(&self, offset: i64, length: usize) -> Self {2688if offset == 0 && length == self.height() {2689return self.clone();2690}2691if length == 0 {2692return self.clear();2693}2694let col = self2695.columns2696.iter()2697.map(|s| s.slice(offset, length))2698.collect::<Vec<_>>();26992700let height = if let Some(fst) = col.first() {2701fst.len()2702} else {2703let (_, length) = slice_offsets(offset, length, self.height());2704length2705};27062707unsafe { DataFrame::new_no_checks(height, col) }2708}27092710/// Split [`DataFrame`] at the given `offset`.2711pub fn split_at(&self, offset: i64) -> (Self, Self) {2712let (a, b) = self.columns.iter().map(|s| s.split_at(offset)).unzip();27132714let (idx, _) = slice_offsets(offset, 0, self.height());27152716let a = unsafe { DataFrame::new_no_checks(idx, a) };2717let b = unsafe { DataFrame::new_no_checks(self.height() - idx, b) };2718(a, b)2719}27202721pub fn clear(&self) -> Self {2722let col = self.columns.iter().map(|s| s.clear()).collect::<Vec<_>>();2723unsafe { DataFrame::new_no_checks(0, col) }2724}27252726#[must_use]2727pub fn slice_par(&self, offset: i64, length: usize) -> Self {2728if offset == 0 && length == self.height() {2729return self.clone();2730}2731let columns = self._apply_columns_par(&|s| s.slice(offset, length));2732unsafe { DataFrame::new_no_checks(length, columns) }2733}27342735#[must_use]2736pub fn _slice_and_realloc(&self, offset: i64, length: usize) -> Self {2737if offset == 0 && length == self.height() {2738return self.clone();2739}2740// @scalar-opt2741let columns = self._apply_columns(&|s| {2742let mut out = s.slice(offset, 
length);2743out.shrink_to_fit();2744out2745});2746unsafe { DataFrame::new_no_checks(length, columns) }2747}27482749/// Get the head of the [`DataFrame`].2750///2751/// # Example2752///2753/// ```rust2754/// # use polars_core::prelude::*;2755/// let countries: DataFrame =2756/// df!("Rank by GDP (2021)" => [1, 2, 3, 4, 5],2757/// "Continent" => ["North America", "Asia", "Asia", "Europe", "Europe"],2758/// "Country" => ["United States", "China", "Japan", "Germany", "United Kingdom"],2759/// "Capital" => ["Washington", "Beijing", "Tokyo", "Berlin", "London"])?;2760/// assert_eq!(countries.shape(), (5, 4));2761///2762/// println!("{}", countries.head(Some(3)));2763/// # Ok::<(), PolarsError>(())2764/// ```2765///2766/// Output:2767///2768/// ```text2769/// shape: (3, 4)2770/// +--------------------+---------------+---------------+------------+2771/// | Rank by GDP (2021) | Continent | Country | Capital |2772/// | --- | --- | --- | --- |2773/// | i32 | str | str | str |2774/// +====================+===============+===============+============+2775/// | 1 | North America | United States | Washington |2776/// +--------------------+---------------+---------------+------------+2777/// | 2 | Asia | China | Beijing |2778/// +--------------------+---------------+---------------+------------+2779/// | 3 | Asia | Japan | Tokyo |2780/// +--------------------+---------------+---------------+------------+2781/// ```2782#[must_use]2783pub fn head(&self, length: Option<usize>) -> Self {2784let col = self2785.columns2786.iter()2787.map(|c| c.head(length))2788.collect::<Vec<_>>();27892790let height = length.unwrap_or(HEAD_DEFAULT_LENGTH);2791let height = usize::min(height, self.height());2792unsafe { DataFrame::new_no_checks(height, col) }2793}27942795/// Get the tail of the [`DataFrame`].2796///2797/// # Example2798///2799/// ```rust2800/// # use polars_core::prelude::*;2801/// let countries: DataFrame =2802/// df!("Rank (2021)" => [105, 106, 107, 108, 109],2803/// "Apple Price 
(€/kg)" => [0.75, 0.70, 0.70, 0.65, 0.52],2804/// "Country" => ["Kosovo", "Moldova", "North Macedonia", "Syria", "Turkey"])?;2805/// assert_eq!(countries.shape(), (5, 3));2806///2807/// println!("{}", countries.tail(Some(2)));2808/// # Ok::<(), PolarsError>(())2809/// ```2810///2811/// Output:2812///2813/// ```text2814/// shape: (2, 3)2815/// +-------------+--------------------+---------+2816/// | Rank (2021) | Apple Price (€/kg) | Country |2817/// | --- | --- | --- |2818/// | i32 | f64 | str |2819/// +=============+====================+=========+2820/// | 108 | 0.63 | Syria |2821/// +-------------+--------------------+---------+2822/// | 109 | 0.63 | Turkey |2823/// +-------------+--------------------+---------+2824/// ```2825#[must_use]2826pub fn tail(&self, length: Option<usize>) -> Self {2827let col = self2828.columns2829.iter()2830.map(|c| c.tail(length))2831.collect::<Vec<_>>();28322833let height = length.unwrap_or(TAIL_DEFAULT_LENGTH);2834let height = usize::min(height, self.height());2835unsafe { DataFrame::new_no_checks(height, col) }2836}28372838/// Iterator over the rows in this [`DataFrame`] as Arrow RecordBatches.2839///2840/// # Panics2841///2842/// Panics if the [`DataFrame`] that is passed is not rechunked.2843///2844/// This responsibility is left to the caller as we don't want to take mutable references here,2845/// but we also don't want to rechunk here, as this operation is costly and would benefit the caller2846/// as well.2847pub fn iter_chunks(&self, compat_level: CompatLevel, parallel: bool) -> RecordBatchIter<'_> {2848debug_assert!(!self.should_rechunk(), "expected equal chunks");2849// If any of the columns is binview and we don't convert `compat_level` we allow parallelism2850// as we must allocate arrow strings/binaries.2851let must_convert = compat_level.0 == 0;2852let parallel = parallel2853&& must_convert2854&& self.columns.len() > 12855&& self2856.columns2857.iter()2858.any(|s| matches!(s.dtype(), DataType::String | 
DataType::Binary));28592860RecordBatchIter {2861columns: &self.columns,2862schema: Arc::new(2863self.columns2864.iter()2865.map(|c| c.field().to_arrow(compat_level))2866.collect(),2867),2868idx: 0,2869n_chunks: self.first_col_n_chunks(),2870compat_level,2871parallel,2872}2873}28742875/// Iterator over the rows in this [`DataFrame`] as Arrow RecordBatches as physical values.2876///2877/// # Panics2878///2879/// Panics if the [`DataFrame`] that is passed is not rechunked.2880///2881/// This responsibility is left to the caller as we don't want to take mutable references here,2882/// but we also don't want to rechunk here, as this operation is costly and would benefit the caller2883/// as well.2884pub fn iter_chunks_physical(&self) -> PhysRecordBatchIter<'_> {2885debug_assert!(!self.should_rechunk());2886PhysRecordBatchIter {2887schema: Arc::new(2888self.get_columns()2889.iter()2890.map(|c| c.field().to_arrow(CompatLevel::newest()))2891.collect(),2892),2893arr_iters: self2894.materialized_column_iter()2895.map(|s| s.chunks().iter())2896.collect(),2897}2898}28992900/// Get a [`DataFrame`] with all the columns in reversed order.2901#[must_use]2902pub fn reverse(&self) -> Self {2903let col = self.columns.iter().map(|s| s.reverse()).collect::<Vec<_>>();2904unsafe { DataFrame::new_no_checks(self.height(), col) }2905}29062907/// Shift the values by a given period and fill the parts that will be empty due to this operation2908/// with `Nones`.2909///2910/// See the method on [Series](crate::series::SeriesTrait::shift) for more info on the `shift` operation.2911#[must_use]2912pub fn shift(&self, periods: i64) -> Self {2913let col = self._apply_columns_par(&|s| s.shift(periods));2914unsafe { DataFrame::new_no_checks(self.height(), col) }2915}29162917/// Replace None values with one of the following strategies:2918/// * Forward fill (replace None with the previous value)2919/// * Backward fill (replace None with the next value)2920/// * Mean fill (replace None with the mean of 
the whole array)2921/// * Min fill (replace None with the minimum of the whole array)2922/// * Max fill (replace None with the maximum of the whole array)2923///2924/// See the method on [Series](crate::series::Series::fill_null) for more info on the `fill_null` operation.2925pub fn fill_null(&self, strategy: FillNullStrategy) -> PolarsResult<Self> {2926let col = self.try_apply_columns_par(&|s| s.fill_null(strategy))?;29272928Ok(unsafe { DataFrame::new_no_checks(self.height(), col) })2929}29302931/// Pipe different functions/ closure operations that work on a DataFrame together.2932pub fn pipe<F, B>(self, f: F) -> PolarsResult<B>2933where2934F: Fn(DataFrame) -> PolarsResult<B>,2935{2936f(self)2937}29382939/// Pipe different functions/ closure operations that work on a DataFrame together.2940pub fn pipe_mut<F, B>(&mut self, f: F) -> PolarsResult<B>2941where2942F: Fn(&mut DataFrame) -> PolarsResult<B>,2943{2944f(self)2945}29462947/// Pipe different functions/ closure operations that work on a DataFrame together.2948pub fn pipe_with_args<F, B, Args>(self, f: F, args: Args) -> PolarsResult<B>2949where2950F: Fn(DataFrame, Args) -> PolarsResult<B>,2951{2952f(self, args)2953}29542955/// Drop duplicate rows from a [`DataFrame`].2956/// *This fails when there is a column of type List in DataFrame*2957///2958/// Stable means that the order is maintained. This has a higher cost than an unstable distinct.2959///2960/// # Example2961///2962/// ```no_run2963/// # use polars_core::prelude::*;2964/// let df = df! 
{2965/// "flt" => [1., 1., 2., 2., 3., 3.],2966/// "int" => [1, 1, 2, 2, 3, 3, ],2967/// "str" => ["a", "a", "b", "b", "c", "c"]2968/// }?;2969///2970/// println!("{}", df.unique_stable(None, UniqueKeepStrategy::First, None)?);2971/// # Ok::<(), PolarsError>(())2972/// ```2973/// Returns2974///2975/// ```text2976/// +-----+-----+-----+2977/// | flt | int | str |2978/// | --- | --- | --- |2979/// | f64 | i32 | str |2980/// +=====+=====+=====+2981/// | 1 | 1 | "a" |2982/// +-----+-----+-----+2983/// | 2 | 2 | "b" |2984/// +-----+-----+-----+2985/// | 3 | 3 | "c" |2986/// +-----+-----+-----+2987/// ```2988#[cfg(feature = "algorithm_group_by")]2989pub fn unique_stable(2990&self,2991subset: Option<&[String]>,2992keep: UniqueKeepStrategy,2993slice: Option<(i64, usize)>,2994) -> PolarsResult<DataFrame> {2995self.unique_impl(2996true,2997subset.map(|v| v.iter().map(|x| PlSmallStr::from_str(x.as_str())).collect()),2998keep,2999slice,3000)3001}30023003/// Unstable distinct. See [`DataFrame::unique_stable`].3004#[cfg(feature = "algorithm_group_by")]3005pub fn unique<I, S>(3006&self,3007subset: Option<&[String]>,3008keep: UniqueKeepStrategy,3009slice: Option<(i64, usize)>,3010) -> PolarsResult<DataFrame> {3011self.unique_impl(3012false,3013subset.map(|v| v.iter().map(|x| PlSmallStr::from_str(x.as_str())).collect()),3014keep,3015slice,3016)3017}30183019#[cfg(feature = "algorithm_group_by")]3020pub fn unique_impl(3021&self,3022maintain_order: bool,3023subset: Option<Vec<PlSmallStr>>,3024keep: UniqueKeepStrategy,3025slice: Option<(i64, usize)>,3026) -> PolarsResult<Self> {3027let names = subset.unwrap_or_else(|| self.get_column_names_owned());3028let mut df = self.clone();3029// take on multiple chunks is terrible3030df.as_single_chunk_par();30313032let columns = match (keep, maintain_order) {3033(UniqueKeepStrategy::First | UniqueKeepStrategy::Any, true) => {3034let gb = df.group_by_stable(names)?;3035let groups = gb.get_groups();3036let (offset, len) = slice.unwrap_or((0, 
groups.len()));3037let groups = groups.slice(offset, len);3038df._apply_columns_par(&|s| unsafe { s.agg_first(&groups) })3039},3040(UniqueKeepStrategy::Last, true) => {3041// maintain order by last values, so the sorted groups are not correct as they3042// are sorted by the first value3043let gb = df.group_by_stable(names)?;3044let groups = gb.get_groups();30453046let last_idx: NoNull<IdxCa> = groups3047.iter()3048.map(|g| match g {3049GroupsIndicator::Idx((_first, idx)) => idx[idx.len() - 1],3050GroupsIndicator::Slice([first, len]) => first + len - 1,3051})3052.collect();30533054let mut last_idx = last_idx.into_inner().sort(false);30553056if let Some((offset, len)) = slice {3057last_idx = last_idx.slice(offset, len);3058}30593060let last_idx = NoNull::new(last_idx);3061let out = unsafe { df.take_unchecked(&last_idx) };3062return Ok(out);3063},3064(UniqueKeepStrategy::First | UniqueKeepStrategy::Any, false) => {3065let gb = df.group_by(names)?;3066let groups = gb.get_groups();3067let (offset, len) = slice.unwrap_or((0, groups.len()));3068let groups = groups.slice(offset, len);3069df._apply_columns_par(&|s| unsafe { s.agg_first(&groups) })3070},3071(UniqueKeepStrategy::Last, false) => {3072let gb = df.group_by(names)?;3073let groups = gb.get_groups();3074let (offset, len) = slice.unwrap_or((0, groups.len()));3075let groups = groups.slice(offset, len);3076df._apply_columns_par(&|s| unsafe { s.agg_last(&groups) })3077},3078(UniqueKeepStrategy::None, _) => {3079let df_part = df.select(names)?;3080let mask = df_part.is_unique()?;3081let mut filtered = df.filter(&mask)?;30823083if let Some((offset, len)) = slice {3084filtered = filtered.slice(offset, len);3085}3086return Ok(filtered);3087},3088};3089let height = Self::infer_height(&columns);3090Ok(unsafe { DataFrame::new_no_checks(height, columns) })3091}30923093/// Get a mask of all the unique rows in the [`DataFrame`].3094///3095/// # Example3096///3097/// ```no_run3098/// # use polars_core::prelude::*;3099/// let df: 
DataFrame = df!("Company" => ["Apple", "Microsoft"],3100/// "ISIN" => ["US0378331005", "US5949181045"])?;3101/// let ca: ChunkedArray<BooleanType> = df.is_unique()?;3102///3103/// assert!(ca.all());3104/// # Ok::<(), PolarsError>(())3105/// ```3106#[cfg(feature = "algorithm_group_by")]3107pub fn is_unique(&self) -> PolarsResult<BooleanChunked> {3108let gb = self.group_by(self.get_column_names_owned())?;3109let groups = gb.get_groups();3110Ok(is_unique_helper(3111groups,3112self.height() as IdxSize,3113true,3114false,3115))3116}31173118/// Get a mask of all the duplicated rows in the [`DataFrame`].3119///3120/// # Example3121///3122/// ```no_run3123/// # use polars_core::prelude::*;3124/// let df: DataFrame = df!("Company" => ["Alphabet", "Alphabet"],3125/// "ISIN" => ["US02079K3059", "US02079K1079"])?;3126/// let ca: ChunkedArray<BooleanType> = df.is_duplicated()?;3127///3128/// assert!(!ca.all());3129/// # Ok::<(), PolarsError>(())3130/// ```3131#[cfg(feature = "algorithm_group_by")]3132pub fn is_duplicated(&self) -> PolarsResult<BooleanChunked> {3133let gb = self.group_by(self.get_column_names_owned())?;3134let groups = gb.get_groups();3135Ok(is_unique_helper(3136groups,3137self.height() as IdxSize,3138false,3139true,3140))3141}31423143/// Create a new [`DataFrame`] that shows the null counts per column.3144#[must_use]3145pub fn null_count(&self) -> Self {3146let cols = self3147.columns3148.iter()3149.map(|c| Column::new(c.name().clone(), [c.null_count() as IdxSize]))3150.collect();3151unsafe { Self::new_no_checks(1, cols) }3152}31533154/// Hash and combine the row values3155#[cfg(feature = "row_hash")]3156pub fn hash_rows(3157&mut self,3158hasher_builder: Option<PlSeedableRandomStateQuality>,3159) -> PolarsResult<UInt64Chunked> {3160let dfs = split_df(self, POOL.current_num_threads(), false);3161let (cas, _) = _df_rows_to_hashes_threaded_vertical(&dfs, hasher_builder)?;31623163let mut iter = cas.into_iter();3164let mut acc_ca = iter.next().unwrap();3165for ca in 
iter {3166acc_ca.append(&ca)?;3167}3168Ok(acc_ca.rechunk().into_owned())3169}31703171/// Get the supertype of the columns in this DataFrame3172pub fn get_supertype(&self) -> Option<PolarsResult<DataType>> {3173self.columns3174.iter()3175.map(|s| Ok(s.dtype().clone()))3176.reduce(|acc, b| try_get_supertype(&acc?, &b.unwrap()))3177}31783179/// Take by index values given by the slice `idx`.3180/// # Warning3181/// Be careful with allowing threads when calling this in a large hot loop3182/// every thread split may be on rayon stack and lead to SO3183#[doc(hidden)]3184pub unsafe fn _take_unchecked_slice(&self, idx: &[IdxSize], allow_threads: bool) -> Self {3185self._take_unchecked_slice_sorted(idx, allow_threads, IsSorted::Not)3186}31873188/// Take by index values given by the slice `idx`. Use this over `_take_unchecked_slice`3189/// if the index value in `idx` are sorted. This will maintain sorted flags.3190///3191/// # Warning3192/// Be careful with allowing threads when calling this in a large hot loop3193/// every thread split may be on rayon stack and lead to SO3194#[doc(hidden)]3195pub unsafe fn _take_unchecked_slice_sorted(3196&self,3197idx: &[IdxSize],3198allow_threads: bool,3199sorted: IsSorted,3200) -> Self {3201#[cfg(debug_assertions)]3202{3203if idx.len() > 2 {3204match sorted {3205IsSorted::Ascending => {3206assert!(idx[0] <= idx[idx.len() - 1]);3207},3208IsSorted::Descending => {3209assert!(idx[0] >= idx[idx.len() - 1]);3210},3211_ => {},3212}3213}3214}3215let mut ca = IdxCa::mmap_slice(PlSmallStr::EMPTY, idx);3216ca.set_sorted_flag(sorted);3217self.take_unchecked_impl(&ca, allow_threads)3218}32193220#[cfg(all(feature = "partition_by", feature = "algorithm_group_by"))]3221#[doc(hidden)]3222pub fn _partition_by_impl(3223&self,3224cols: &[PlSmallStr],3225stable: bool,3226include_key: bool,3227parallel: bool,3228) -> PolarsResult<Vec<DataFrame>> {3229let selected_keys = self.select_columns(cols.iter().cloned())?;3230let groups = 
self.group_by_with_series(selected_keys, parallel, stable)?;3231let groups = groups.take_groups();32323233// drop key columns prior to calculation if requested3234let df = if include_key {3235self.clone()3236} else {3237self.drop_many(cols.iter().cloned())3238};32393240if parallel {3241// don't parallelize this3242// there is a lot of parallelization in take and this may easily SO3243POOL.install(|| {3244match groups.as_ref() {3245GroupsType::Idx(idx) => {3246// Rechunk as the gather may rechunk for every group #17562.3247let mut df = df.clone();3248df.as_single_chunk_par();3249Ok(idx3250.into_par_iter()3251.map(|(_, group)| {3252// groups are in bounds3253unsafe {3254df._take_unchecked_slice_sorted(3255group,3256false,3257IsSorted::Ascending,3258)3259}3260})3261.collect())3262},3263GroupsType::Slice { groups, .. } => Ok(groups3264.into_par_iter()3265.map(|[first, len]| df.slice(*first as i64, *len as usize))3266.collect()),3267}3268})3269} else {3270match groups.as_ref() {3271GroupsType::Idx(idx) => {3272// Rechunk as the gather may rechunk for every group #17562.3273let mut df = df;3274df.as_single_chunk();3275Ok(idx3276.into_iter()3277.map(|(_, group)| {3278// groups are in bounds3279unsafe {3280df._take_unchecked_slice_sorted(group, false, IsSorted::Ascending)3281}3282})3283.collect())3284},3285GroupsType::Slice { groups, .. 
} => Ok(groups3286.iter()3287.map(|[first, len]| df.slice(*first as i64, *len as usize))3288.collect()),3289}3290}3291}32923293/// Split into multiple DataFrames partitioned by groups3294#[cfg(feature = "partition_by")]3295pub fn partition_by<I, S>(&self, cols: I, include_key: bool) -> PolarsResult<Vec<DataFrame>>3296where3297I: IntoIterator<Item = S>,3298S: Into<PlSmallStr>,3299{3300let cols = cols3301.into_iter()3302.map(Into::into)3303.collect::<Vec<PlSmallStr>>();3304self._partition_by_impl(cols.as_slice(), false, include_key, true)3305}33063307/// Split into multiple DataFrames partitioned by groups3308/// Order of the groups are maintained.3309#[cfg(feature = "partition_by")]3310pub fn partition_by_stable<I, S>(3311&self,3312cols: I,3313include_key: bool,3314) -> PolarsResult<Vec<DataFrame>>3315where3316I: IntoIterator<Item = S>,3317S: Into<PlSmallStr>,3318{3319let cols = cols3320.into_iter()3321.map(Into::into)3322.collect::<Vec<PlSmallStr>>();3323self._partition_by_impl(cols.as_slice(), true, include_key, true)3324}33253326/// Unnest the given `Struct` columns. 
This means that the fields of the `Struct` type will be3327/// inserted as columns.3328#[cfg(feature = "dtype-struct")]3329pub fn unnest<I: IntoVec<PlSmallStr>>(&self, cols: I) -> PolarsResult<DataFrame> {3330let cols = cols.into_vec();3331self.unnest_impl(cols.into_iter().collect())3332}33333334#[cfg(feature = "dtype-struct")]3335fn unnest_impl(&self, cols: PlHashSet<PlSmallStr>) -> PolarsResult<DataFrame> {3336let mut new_cols = Vec::with_capacity(std::cmp::min(self.width() * 2, self.width() + 128));3337let mut count = 0;3338for s in &self.columns {3339if cols.contains(s.name()) {3340let ca = s.struct_()?.clone();3341new_cols.extend(ca.fields_as_series().into_iter().map(Column::from));3342count += 1;3343} else {3344new_cols.push(s.clone())3345}3346}3347if count != cols.len() {3348// one or more columns not found3349// the code below will return an error with the missing name3350let schema = self.schema();3351for col in cols {3352let _ = schema3353.get(col.as_str())3354.ok_or_else(|| polars_err!(col_not_found = col))?;3355}3356}3357DataFrame::new(new_cols)3358}33593360pub(crate) fn infer_height(cols: &[Column]) -> usize {3361cols.first().map_or(0, Column::len)3362}33633364pub fn append_record_batch(&mut self, rb: RecordBatchT<ArrayRef>) -> PolarsResult<()> {3365// @Optimize: this does a lot of unnecessary allocations. We should probably have a3366// append_chunk or something like this. 
It is just quite difficult to make that safe.3367let df = DataFrame::from(rb);3368polars_ensure!(3369self.schema() == df.schema(),3370SchemaMismatch: "cannot append record batch with different schema\n\n3371Got {:?}\nexpected: {:?}", df.schema(), self.schema(),3372);3373self.vstack_mut_owned_unchecked(df);3374Ok(())3375}3376}33773378pub struct RecordBatchIter<'a> {3379columns: &'a Vec<Column>,3380schema: ArrowSchemaRef,3381idx: usize,3382n_chunks: usize,3383compat_level: CompatLevel,3384parallel: bool,3385}33863387impl Iterator for RecordBatchIter<'_> {3388type Item = RecordBatch;33893390fn next(&mut self) -> Option<Self::Item> {3391if self.idx >= self.n_chunks {3392return None;3393}33943395// Create a batch of the columns with the same chunk no.3396let batch_cols: Vec<ArrayRef> = if self.parallel {3397let iter = self3398.columns3399.par_iter()3400.map(Column::as_materialized_series)3401.map(|s| s.to_arrow(self.idx, self.compat_level));3402POOL.install(|| iter.collect())3403} else {3404self.columns3405.iter()3406.map(Column::as_materialized_series)3407.map(|s| s.to_arrow(self.idx, self.compat_level))3408.collect()3409};3410self.idx += 1;34113412let length = batch_cols.first().map_or(0, |arr| arr.len());3413Some(RecordBatch::new(length, self.schema.clone(), batch_cols))3414}34153416fn size_hint(&self) -> (usize, Option<usize>) {3417let n = self.n_chunks - self.idx;3418(n, Some(n))3419}3420}34213422pub struct PhysRecordBatchIter<'a> {3423schema: ArrowSchemaRef,3424arr_iters: Vec<std::slice::Iter<'a, ArrayRef>>,3425}34263427impl Iterator for PhysRecordBatchIter<'_> {3428type Item = RecordBatch;34293430fn next(&mut self) -> Option<Self::Item> {3431let arrs = self3432.arr_iters3433.iter_mut()3434.map(|phys_iter| phys_iter.next().cloned())3435.collect::<Option<Vec<_>>>()?;34363437let length = arrs.first().map_or(0, |arr| arr.len());3438Some(RecordBatch::new(length, self.schema.clone(), arrs))3439}34403441fn size_hint(&self) -> (usize, Option<usize>) {3442if let 
Some(iter) = self.arr_iters.first() {3443iter.size_hint()3444} else {3445(0, None)3446}3447}3448}34493450impl Default for DataFrame {3451fn default() -> Self {3452DataFrame::empty()3453}3454}34553456impl From<DataFrame> for Vec<Column> {3457fn from(df: DataFrame) -> Self {3458df.columns3459}3460}34613462// utility to test if we can vstack/extend the columns3463fn ensure_can_extend(left: &Column, right: &Column) -> PolarsResult<()> {3464polars_ensure!(3465left.name() == right.name(),3466ShapeMismatch: "unable to vstack, column names don't match: {:?} and {:?}",3467left.name(), right.name(),3468);3469Ok(())3470}34713472#[cfg(test)]3473mod test {3474use super::*;34753476fn create_frame() -> DataFrame {3477let s0 = Column::new("days".into(), [0, 1, 2].as_ref());3478let s1 = Column::new("temp".into(), [22.1, 19.9, 7.].as_ref());3479DataFrame::new(vec![s0, s1]).unwrap()3480}34813482#[test]3483#[cfg_attr(miri, ignore)]3484fn test_recordbatch_iterator() {3485let df = df!(3486"foo" => [1, 2, 3, 4, 5]3487)3488.unwrap();3489let mut iter = df.iter_chunks(CompatLevel::newest(), false);3490assert_eq!(5, iter.next().unwrap().len());3491assert!(iter.next().is_none());3492}34933494#[test]3495#[cfg_attr(miri, ignore)]3496fn test_select() {3497let df = create_frame();3498assert_eq!(3499df.column("days")3500.unwrap()3501.as_series()3502.unwrap()3503.equal(1)3504.unwrap()3505.sum(),3506Some(1)3507);3508}35093510#[test]3511#[cfg_attr(miri, ignore)]3512fn test_filter_broadcast_on_string_col() {3513let col_name = "some_col";3514let v = vec!["test".to_string()];3515let s0 = Column::new(PlSmallStr::from_str(col_name), v);3516let mut df = DataFrame::new(vec![s0]).unwrap();35173518df = df3519.filter(3520&df.column(col_name)3521.unwrap()3522.as_materialized_series()3523.equal("")3524.unwrap(),3525)3526.unwrap();3527assert_eq!(3528df.column(col_name)3529.unwrap()3530.as_materialized_series()3531.n_chunks(),353213533);3534}35353536#[test]3537#[cfg_attr(miri, ignore)]3538fn 
test_filter_broadcast_on_list_col() {3539let s1 = Series::new(PlSmallStr::EMPTY, [true, false, true]);3540let ll: ListChunked = [&s1].iter().copied().collect();35413542let mask = BooleanChunked::from_slice(PlSmallStr::EMPTY, &[false]);3543let new = ll.filter(&mask).unwrap();35443545assert_eq!(new.chunks.len(), 1);3546assert_eq!(new.len(), 0);3547}35483549#[test]3550fn slice() {3551let df = create_frame();3552let sliced_df = df.slice(0, 2);3553assert_eq!(sliced_df.shape(), (2, 2));3554}35553556#[test]3557fn rechunk_false() {3558let df = create_frame();3559assert!(!df.should_rechunk())3560}35613562#[test]3563fn rechunk_true() -> PolarsResult<()> {3564let mut base = df!(3565"a" => [1, 2, 3],3566"b" => [1, 2, 3]3567)?;35683569// Create a series with multiple chunks3570let mut s = Series::new("foo".into(), 0..2);3571let s2 = Series::new("bar".into(), 0..1);3572s.append(&s2)?;35733574// Append series to frame3575let out = base.with_column(s)?;35763577// Now we should rechunk3578assert!(out.should_rechunk());3579Ok(())3580}35813582#[test]3583fn test_duplicate_column() {3584let mut df = df! {3585"foo" => [1, 2, 3]3586}3587.unwrap();3588// check if column is replaced3589assert!(3590df.with_column(Series::new("foo".into(), &[1, 2, 3]))3591.is_ok()3592);3593assert!(3594df.with_column(Series::new("bar".into(), &[1, 2, 3]))3595.is_ok()3596);3597assert!(df.column("bar").is_ok())3598}35993600#[test]3601#[cfg_attr(miri, ignore)]3602fn distinct() {3603let df = df! {3604"flt" => [1., 1., 2., 2., 3., 3.],3605"int" => [1, 1, 2, 2, 3, 3, ],3606"str" => ["a", "a", "b", "b", "c", "c"]3607}3608.unwrap();3609let df = df3610.unique_stable(None, UniqueKeepStrategy::First, None)3611.unwrap()3612.sort(["flt"], SortMultipleOptions::default())3613.unwrap();3614let valid = df! 
{3615"flt" => [1., 2., 3.],3616"int" => [1, 2, 3],3617"str" => ["a", "b", "c"]3618}3619.unwrap();3620assert!(df.equals(&valid));3621}36223623#[test]3624fn test_vstack() {3625// check that it does not accidentally rechunks3626let mut df = df! {3627"flt" => [1., 1., 2., 2., 3., 3.],3628"int" => [1, 1, 2, 2, 3, 3, ],3629"str" => ["a", "a", "b", "b", "c", "c"]3630}3631.unwrap();36323633df.vstack_mut(&df.slice(0, 3)).unwrap();3634assert_eq!(df.first_col_n_chunks(), 2)3635}36363637#[test]3638fn test_vstack_on_empty_dataframe() {3639let mut df = DataFrame::empty();36403641let df_data = df! {3642"flt" => [1., 1., 2., 2., 3., 3.],3643"int" => [1, 1, 2, 2, 3, 3, ],3644"str" => ["a", "a", "b", "b", "c", "c"]3645}3646.unwrap();36473648df.vstack_mut(&df_data).unwrap();3649assert_eq!(df.height, 6)3650}36513652#[test]3653fn test_replace_or_add() -> PolarsResult<()> {3654let mut df = df!(3655"a" => [1, 2, 3],3656"b" => [1, 2, 3]3657)?;36583659// check that the new column is "c" and not "bar".3660df.replace_or_add("c".into(), Series::new("bar".into(), [1, 2, 3]))?;36613662assert_eq!(df.get_column_names(), &["a", "b", "c"]);3663Ok(())3664}36653666#[test]3667fn test_unique_keep_none_with_slice() {3668let df = df! {3669"x" => [1, 2, 3, 2, 1]3670}3671.unwrap();3672let out = df3673.unique_stable(3674Some(&["x".to_string()][..]),3675UniqueKeepStrategy::None,3676Some((0, 2)),3677)3678.unwrap();3679let expected = df! {3680"x" => [3]3681}3682.unwrap();3683assert!(out.equals(&expected));3684}36853686#[test]3687#[cfg(feature = "dtype-i8")]3688fn test_apply_result_schema() {3689let mut df = df! {3690"x" => [1, 2, 3, 2, 1]3691}3692.unwrap();36933694let schema_before = df.schema().clone();3695df.apply("x", |f| f.cast(&DataType::Int8).unwrap()).unwrap();3696assert_ne!(&schema_before, df.schema());3697}3698}369937003701