Path: blob/main/crates/polars-core/src/frame/dataframe.rs
8420 views
use std::sync::{Arc, OnceLock};12use polars_error::PolarsResult;34use super::broadcast::{broadcast_columns, infer_broadcast_height};5use super::validation::validate_columns_slice;6use crate::frame::column::Column;7use crate::schema::{Schema, SchemaRef};89/// A contiguous growable collection of [`Column`]s that have the same length.10///11/// ## Use declarations12///13/// All the common tools can be found in [`crate::prelude`] (or in `polars::prelude`).14///15/// ```rust16/// use polars_core::prelude::*; // if the crate polars-core is used directly17/// // use polars::prelude::*; if the crate polars is used18/// ```19///20/// # Initialization21/// ## Default22///23/// A `DataFrame` can be initialized empty:24///25/// ```rust26/// # use polars_core::prelude::*;27/// let df = DataFrame::empty();28/// assert_eq!(df.shape(), (0, 0));29/// ```30///31/// ## Wrapping a `Vec<Series>`32///33/// A `DataFrame` is built upon a `Vec<Series>` where the `Series` have the same length.34///35/// ```rust36/// # use polars_core::prelude::*;37/// let s1 = Column::new("Fruit".into(), ["Apple", "Apple", "Pear"]);38/// let s2 = Column::new("Color".into(), ["Red", "Yellow", "Green"]);39///40/// let df: PolarsResult<DataFrame> = DataFrame::new_infer_height(vec![s1, s2]);41/// ```42///43/// ## Using a macro44///45/// The [`df!`] macro is a convenient method:46///47/// ```rust48/// # use polars_core::prelude::*;49/// let df: PolarsResult<DataFrame> = df!("Fruit" => ["Apple", "Apple", "Pear"],50/// "Color" => ["Red", "Yellow", "Green"]);51/// ```52///53/// ## Using a CSV file54///55/// See the `polars_io::csv::CsvReader`.56///57/// # Indexing58/// ## By a number59///60/// The `Index<usize>` is implemented for the `DataFrame`.61///62/// ```rust63/// # use polars_core::prelude::*;64/// let df = df!("Fruit" => ["Apple", "Apple", "Pear"],65/// "Color" => ["Red", "Yellow", "Green"])?;66///67/// assert_eq!(df[0], Column::new("Fruit".into(), &["Apple", "Apple", "Pear"]));68/// assert_eq!(df[1], Column::new("Color".into(), &["Red", "Yellow", "Green"]));69/// # Ok::<(), PolarsError>(())70/// ```71///72/// ## By a `Series` name73///74/// ```rust75/// # use polars_core::prelude::*;76/// let df = df!("Fruit" => ["Apple", "Apple", "Pear"],77/// "Color" => ["Red", "Yellow", "Green"])?;78///79/// assert_eq!(df["Fruit"], Column::new("Fruit".into(), &["Apple", "Apple", "Pear"]));80/// assert_eq!(df["Color"], Column::new("Color".into(), &["Red", "Yellow", "Green"]));81/// # Ok::<(), PolarsError>(())82/// ```83#[derive(Clone)]84pub struct DataFrame {85height: usize,86/// All columns must have length equal to `self.height`.87columns: Vec<Column>,88/// Cached schema. Must be cleared if column names / dtypes in `self.columns` change.89cached_schema: OnceLock<SchemaRef>,90}9192impl Default for DataFrame {93fn default() -> Self {94DataFrame::empty()95}96}9798impl DataFrame {99/// Creates an empty `DataFrame` usable in a compile time context (such as static initializers).100///101/// # Example102///103/// ```rust104/// use polars_core::prelude::DataFrame;105/// static EMPTY: DataFrame = DataFrame::empty();106/// ```107pub const fn empty() -> Self {108DataFrame::empty_with_height(0)109}110111pub const fn empty_with_height(height: usize) -> Self {112DataFrame {113height,114columns: vec![],115cached_schema: OnceLock::new(),116}117}118119pub fn new(height: usize, columns: Vec<Column>) -> PolarsResult<Self> {120validate_columns_slice(height, &columns)121.map_err(|e| e.wrap_msg(|e| format!("could not create a new DataFrame: {e}")))?;122123Ok(unsafe { DataFrame::_new_unchecked_impl(height, columns) })124}125126/// Height is sourced from first column.127pub fn new_infer_height(columns: Vec<Column>) -> PolarsResult<Self> {128DataFrame::new(columns.first().map_or(0, |c| c.len()), columns)129}130131/// Create a new `DataFrame` but does not check the length or duplicate occurrence of the132/// [`Column`]s.133///134/// # Safety135/// [`Column`]s must have unique names and matching lengths.136pub unsafe fn new_unchecked(height: usize, columns: Vec<Column>) -> DataFrame {137if cfg!(debug_assertions) {138validate_columns_slice(height, &columns).unwrap();139}140141unsafe { DataFrame::_new_unchecked_impl(height, columns) }142}143144/// Height is sourced from first column. Does not check for matching height / duplicate names.145///146/// # Safety147/// [`Column`]s must have unique names and matching lengths.148pub unsafe fn new_unchecked_infer_height(columns: Vec<Column>) -> DataFrame {149DataFrame::new_unchecked(columns.first().map_or(0, |c| c.len()), columns)150}151152/// This will not panic even in debug mode - there are some (rare) use cases where a DataFrame153/// is temporarily constructed containing duplicates for dispatching to functions. A DataFrame154/// constructed with this method is generally highly unsafe and should not be long-lived.155#[expect(clippy::missing_safety_doc)]156pub const unsafe fn _new_unchecked_impl(height: usize, columns: Vec<Column>) -> DataFrame {157DataFrame {158height,159columns,160cached_schema: OnceLock::new(),161}162}163164/// Broadcasts unit-length columns to `height`. Errors if a column has height that is non-unit165/// length and not equal to `self.height()`.166pub fn new_with_broadcast(height: usize, mut columns: Vec<Column>) -> PolarsResult<Self> {167broadcast_columns(height, &mut columns)?;168DataFrame::new(height, columns)169}170171/// Infers height as the first non-unit length column or 1 if not found.172pub fn new_infer_broadcast(columns: Vec<Column>) -> PolarsResult<Self> {173DataFrame::new_with_broadcast(infer_broadcast_height(&columns), columns)174}175176/// Broadcasts unit-length columns to `height`. Errors if a column has height that is non-unit177/// length and not equal to `self.height()`.178///179/// # Safety180/// [`Column`]s must have unique names.181pub unsafe fn new_unchecked_with_broadcast(182height: usize,183mut columns: Vec<Column>,184) -> PolarsResult<Self> {185broadcast_columns(height, &mut columns)?;186Ok(unsafe { DataFrame::new_unchecked(height, columns) })187}188189/// # Safety190/// [`Column`]s must have unique names.191pub unsafe fn new_unchecked_infer_broadcast(columns: Vec<Column>) -> PolarsResult<Self> {192DataFrame::new_unchecked_with_broadcast(infer_broadcast_height(&columns), columns)193}194195/// Create a `DataFrame` 0 height and columns as per the `schema`.196pub fn empty_with_schema(schema: &Schema) -> Self {197let cols = schema198.iter()199.map(|(name, dtype)| Column::new_empty(name.clone(), dtype))200.collect();201202unsafe { DataFrame::_new_unchecked_impl(0, cols) }203}204205/// Create an empty `DataFrame` with empty columns as per the `schema`.206pub fn empty_with_arc_schema(schema: SchemaRef) -> Self {207let mut df = DataFrame::empty_with_schema(&schema);208unsafe { df.set_schema(schema) };209df210}211212/// Set the height (i.e. number of rows) of this [`DataFrame`].213///214/// # Safety215///216/// This needs to be equal to the length of all the columns, or `self.width()` must be 0.217#[inline]218pub unsafe fn set_height(&mut self, height: usize) -> &mut Self {219self.height = height;220self221}222223/// Get the height of the [`DataFrame`] which is the number of rows.224#[inline]225pub fn height(&self) -> usize {226self.height227}228229/// Get the number of columns in this [`DataFrame`].230#[inline]231pub fn width(&self) -> usize {232self.columns.len()233}234235/// Get (height, width) of the [`DataFrame`].236///237/// # Example238///239/// ```rust240/// # use polars_core::prelude::*;241/// let df0: DataFrame = DataFrame::empty();242/// let df1: DataFrame = df!("1" => [1, 2, 3, 4, 5])?;243/// let df2: DataFrame = df!("1" => [1, 2, 3, 4, 5],244/// "2" => [1, 2, 3, 4, 5])?;245///246/// assert_eq!(df0.shape(), (0 ,0));247/// assert_eq!(df1.shape(), (5, 1));248/// assert_eq!(df2.shape(), (5, 2));249/// # Ok::<(), PolarsError>(())250/// ```251#[inline]252pub fn shape(&self) -> (usize, usize) {253(self.height(), self.width())254}255256/// 0 width or height.257#[inline]258pub fn shape_has_zero(&self) -> bool {259matches!(self.shape(), (0, _) | (_, 0))260}261262#[inline]263pub fn columns(&self) -> &[Column] {264self.columns.as_slice()265}266267#[inline]268pub fn into_columns(self) -> Vec<Column> {269self.columns270}271272/// # Safety273///274/// The caller must ensure the length of all [`Column`]s remains equal to `self.height`, or275/// that [`DataFrame::set_height`] is called afterwards with the new `height`.276#[inline]277pub unsafe fn columns_mut(&mut self) -> &mut Vec<Column> {278self.clear_schema();279&mut self.columns280}281282/// # Safety283/// Adheres to all safety requirements of [`DataFrame::columns_mut`], and that the list of column284/// names remains unchanged.285#[inline]286pub unsafe fn columns_mut_retain_schema(&mut self) -> &mut Vec<Column> {287&mut self.columns288}289290/// Get the schema of this [`DataFrame`].291///292/// # Panics293/// Panics if there are duplicate column names.294pub fn schema(&self) -> &SchemaRef {295let out = self.cached_schema.get_or_init(|| {296Arc::new(297Schema::from_iter_check_duplicates(298self.columns299.iter()300.map(|x| (x.name().clone(), x.dtype().clone())),301)302.unwrap(),303)304});305306assert_eq!(out.len(), self.width());307308out309}310311#[inline]312pub fn cached_schema(&self) -> Option<&SchemaRef> {313self.cached_schema.get()314}315316/// Set the cached schema317///318/// # Safety319/// Schema must match the columns in `self`.320#[inline]321pub unsafe fn set_schema(&mut self, schema: SchemaRef) -> &mut Self {322self.cached_schema = schema.into();323self324}325326/// Set the cached schema327///328/// # Safety329/// Schema must match the columns in `self`.330#[inline]331pub unsafe fn with_schema(mut self, schema: SchemaRef) -> Self {332self.cached_schema = schema.into();333self334}335336/// Set the cached schema if `schema` is `Some()`.337///338/// # Safety339/// Schema must match the columns in `self`.340#[inline]341pub unsafe fn set_opt_schema(&mut self, schema: Option<SchemaRef>) -> &mut Self {342if let Some(schema) = schema {343unsafe { self.set_schema(schema) };344}345346self347}348349/// Clones the cached schema from `from` to `self.cached_schema` if there is one.350///351/// # Safety352/// Schema must match the columns in `self`.353#[inline]354pub unsafe fn set_schema_from(&mut self, from: &DataFrame) -> &mut Self {355self.set_opt_schema(from.cached_schema().cloned());356self357}358359/// Clones the cached schema from `from` to `self.cached_schema` if there is one.360///361/// # Safety362/// Schema must match the columns in `self`.363#[inline]364pub unsafe fn with_schema_from(mut self, from: &DataFrame) -> Self {365self.set_opt_schema(from.cached_schema().cloned());366self367}368369#[inline]370fn clear_schema(&mut self) -> &mut Self {371self.cached_schema = OnceLock::new();372self373}374}375376377