// Source: GitHub repository pola-rs/polars
// Path: blob/main/crates/polars-core/src/frame/dataframe.rs
use std::sync::{Arc, OnceLock};
2
3
use polars_error::PolarsResult;
4
5
use super::broadcast::{broadcast_columns, infer_broadcast_height};
6
use super::validation::validate_columns_slice;
7
use crate::frame::column::Column;
8
use crate::schema::{Schema, SchemaRef};
9
10
/// A contiguous growable collection of [`Column`]s that have the same length.
11
///
12
/// ## Use declarations
13
///
14
/// All the common tools can be found in [`crate::prelude`] (or in `polars::prelude`).
15
///
16
/// ```rust
17
/// use polars_core::prelude::*; // if the crate polars-core is used directly
18
/// // use polars::prelude::*; if the crate polars is used
19
/// ```
20
///
21
/// # Initialization
22
/// ## Default
23
///
24
/// A `DataFrame` can be initialized empty:
25
///
26
/// ```rust
27
/// # use polars_core::prelude::*;
28
/// let df = DataFrame::empty();
29
/// assert_eq!(df.shape(), (0, 0));
30
/// ```
31
///
32
/// ## Wrapping a `Vec<Series>`
33
///
34
/// A `DataFrame` is built upon a `Vec<Series>` where the `Series` have the same length.
35
///
36
/// ```rust
37
/// # use polars_core::prelude::*;
38
/// let s1 = Column::new("Fruit".into(), ["Apple", "Apple", "Pear"]);
39
/// let s2 = Column::new("Color".into(), ["Red", "Yellow", "Green"]);
40
///
41
/// let df: PolarsResult<DataFrame> = DataFrame::new_infer_height(vec![s1, s2]);
42
/// ```
43
///
44
/// ## Using a macro
45
///
46
/// The [`df!`] macro is a convenient method:
47
///
48
/// ```rust
49
/// # use polars_core::prelude::*;
50
/// let df: PolarsResult<DataFrame> = df!("Fruit" => ["Apple", "Apple", "Pear"],
51
/// "Color" => ["Red", "Yellow", "Green"]);
52
/// ```
53
///
54
/// ## Using a CSV file
55
///
56
/// See the `polars_io::csv::CsvReader`.
57
///
58
/// # Indexing
59
/// ## By a number
60
///
61
/// The `Index<usize>` is implemented for the `DataFrame`.
62
///
63
/// ```rust
64
/// # use polars_core::prelude::*;
65
/// let df = df!("Fruit" => ["Apple", "Apple", "Pear"],
66
/// "Color" => ["Red", "Yellow", "Green"])?;
67
///
68
/// assert_eq!(df[0], Column::new("Fruit".into(), &["Apple", "Apple", "Pear"]));
69
/// assert_eq!(df[1], Column::new("Color".into(), &["Red", "Yellow", "Green"]));
70
/// # Ok::<(), PolarsError>(())
71
/// ```
72
///
73
/// ## By a `Series` name
74
///
75
/// ```rust
76
/// # use polars_core::prelude::*;
77
/// let df = df!("Fruit" => ["Apple", "Apple", "Pear"],
78
/// "Color" => ["Red", "Yellow", "Green"])?;
79
///
80
/// assert_eq!(df["Fruit"], Column::new("Fruit".into(), &["Apple", "Apple", "Pear"]));
81
/// assert_eq!(df["Color"], Column::new("Color".into(), &["Red", "Yellow", "Green"]));
82
/// # Ok::<(), PolarsError>(())
83
/// ```
84
#[derive(Clone)]
85
pub struct DataFrame {
86
height: usize,
87
/// All columns must have length equal to `self.height`.
88
columns: Vec<Column>,
89
/// Cached schema. Must be cleared if column names / dtypes in `self.columns` change.
90
cached_schema: OnceLock<SchemaRef>,
91
}
92
93
impl Default for DataFrame {
94
fn default() -> Self {
95
DataFrame::empty()
96
}
97
}
98
99
impl DataFrame {
100
/// Creates an empty `DataFrame` usable in a compile time context (such as static initializers).
101
///
102
/// # Example
103
///
104
/// ```rust
105
/// use polars_core::prelude::DataFrame;
106
/// static EMPTY: DataFrame = DataFrame::empty();
107
/// ```
108
pub const fn empty() -> Self {
109
DataFrame::empty_with_height(0)
110
}
111
112
pub const fn empty_with_height(height: usize) -> Self {
113
DataFrame {
114
height,
115
columns: vec![],
116
cached_schema: OnceLock::new(),
117
}
118
}
119
120
pub fn new(height: usize, columns: Vec<Column>) -> PolarsResult<Self> {
121
validate_columns_slice(height, &columns)
122
.map_err(|e| e.wrap_msg(|e| format!("could not create a new DataFrame: {e}")))?;
123
124
Ok(unsafe { DataFrame::_new_unchecked_impl(height, columns) })
125
}
126
127
/// Height is sourced from first column.
128
pub fn new_infer_height(columns: Vec<Column>) -> PolarsResult<Self> {
129
DataFrame::new(columns.first().map_or(0, |c| c.len()), columns)
130
}
131
132
/// Create a new `DataFrame` but does not check the length or duplicate occurrence of the
133
/// [`Column`]s.
134
///
135
/// # Safety
136
/// [`Column`]s must have unique names and matching lengths.
137
pub unsafe fn new_unchecked(height: usize, columns: Vec<Column>) -> DataFrame {
138
if cfg!(debug_assertions) {
139
validate_columns_slice(height, &columns).unwrap();
140
}
141
142
unsafe { DataFrame::_new_unchecked_impl(height, columns) }
143
}
144
145
/// Height is sourced from first column. Does not check for matching height / duplicate names.
146
///
147
/// # Safety
148
/// [`Column`]s must have unique names and matching lengths.
149
pub unsafe fn new_unchecked_infer_height(columns: Vec<Column>) -> DataFrame {
150
DataFrame::new_unchecked(columns.first().map_or(0, |c| c.len()), columns)
151
}
152
153
/// This will not panic even in debug mode - there are some (rare) use cases where a DataFrame
154
/// is temporarily constructed containing duplicates for dispatching to functions. A DataFrame
155
/// constructed with this method is generally highly unsafe and should not be long-lived.
156
#[expect(clippy::missing_safety_doc)]
157
pub const unsafe fn _new_unchecked_impl(height: usize, columns: Vec<Column>) -> DataFrame {
158
DataFrame {
159
height,
160
columns,
161
cached_schema: OnceLock::new(),
162
}
163
}
164
165
/// Broadcasts unit-length columns to `height`. Errors if a column has height that is non-unit
166
/// length and not equal to `self.height()`.
167
pub fn new_with_broadcast(height: usize, mut columns: Vec<Column>) -> PolarsResult<Self> {
168
broadcast_columns(height, &mut columns)?;
169
DataFrame::new(height, columns)
170
}
171
172
/// Infers height as the first non-unit length column or 1 if not found.
173
pub fn new_infer_broadcast(columns: Vec<Column>) -> PolarsResult<Self> {
174
DataFrame::new_with_broadcast(infer_broadcast_height(&columns), columns)
175
}
176
177
/// Broadcasts unit-length columns to `height`. Errors if a column has height that is non-unit
178
/// length and not equal to `self.height()`.
179
///
180
/// # Safety
181
/// [`Column`]s must have unique names.
182
pub unsafe fn new_unchecked_with_broadcast(
183
height: usize,
184
mut columns: Vec<Column>,
185
) -> PolarsResult<Self> {
186
broadcast_columns(height, &mut columns)?;
187
Ok(unsafe { DataFrame::new_unchecked(height, columns) })
188
}
189
190
/// # Safety
191
/// [`Column`]s must have unique names.
192
pub unsafe fn new_unchecked_infer_broadcast(columns: Vec<Column>) -> PolarsResult<Self> {
193
DataFrame::new_unchecked_with_broadcast(infer_broadcast_height(&columns), columns)
194
}
195
196
/// Create a `DataFrame` 0 height and columns as per the `schema`.
197
pub fn empty_with_schema(schema: &Schema) -> Self {
198
let cols = schema
199
.iter()
200
.map(|(name, dtype)| Column::new_empty(name.clone(), dtype))
201
.collect();
202
203
unsafe { DataFrame::_new_unchecked_impl(0, cols) }
204
}
205
206
/// Create an empty `DataFrame` with empty columns as per the `schema`.
207
pub fn empty_with_arc_schema(schema: SchemaRef) -> Self {
208
let mut df = DataFrame::empty_with_schema(&schema);
209
unsafe { df.set_schema(schema) };
210
df
211
}
212
213
/// Set the height (i.e. number of rows) of this [`DataFrame`].
214
///
215
/// # Safety
216
///
217
/// This needs to be equal to the length of all the columns, or `self.width()` must be 0.
218
#[inline]
219
pub unsafe fn set_height(&mut self, height: usize) -> &mut Self {
220
self.height = height;
221
self
222
}
223
224
/// Get the height of the [`DataFrame`] which is the number of rows.
225
#[inline]
226
pub fn height(&self) -> usize {
227
self.height
228
}
229
230
/// Get the number of columns in this [`DataFrame`].
231
#[inline]
232
pub fn width(&self) -> usize {
233
self.columns.len()
234
}
235
236
/// Get (height, width) of the [`DataFrame`].
237
///
238
/// # Example
239
///
240
/// ```rust
241
/// # use polars_core::prelude::*;
242
/// let df0: DataFrame = DataFrame::empty();
243
/// let df1: DataFrame = df!("1" => [1, 2, 3, 4, 5])?;
244
/// let df2: DataFrame = df!("1" => [1, 2, 3, 4, 5],
245
/// "2" => [1, 2, 3, 4, 5])?;
246
///
247
/// assert_eq!(df0.shape(), (0 ,0));
248
/// assert_eq!(df1.shape(), (5, 1));
249
/// assert_eq!(df2.shape(), (5, 2));
250
/// # Ok::<(), PolarsError>(())
251
/// ```
252
#[inline]
253
pub fn shape(&self) -> (usize, usize) {
254
(self.height(), self.width())
255
}
256
257
/// 0 width or height.
258
#[inline]
259
pub fn shape_has_zero(&self) -> bool {
260
matches!(self.shape(), (0, _) | (_, 0))
261
}
262
263
#[inline]
264
pub fn columns(&self) -> &[Column] {
265
self.columns.as_slice()
266
}
267
268
#[inline]
269
pub fn into_columns(self) -> Vec<Column> {
270
self.columns
271
}
272
273
/// # Safety
274
///
275
/// The caller must ensure the length of all [`Column`]s remains equal to `self.height`, or
276
/// that [`DataFrame::set_height`] is called afterwards with the new `height`.
277
#[inline]
278
pub unsafe fn columns_mut(&mut self) -> &mut Vec<Column> {
279
self.clear_schema();
280
&mut self.columns
281
}
282
283
/// # Safety
284
/// Adheres to all safety requirements of [`DataFrame::columns_mut`], and that the list of column
285
/// names remains unchanged.
286
#[inline]
287
pub unsafe fn columns_mut_retain_schema(&mut self) -> &mut Vec<Column> {
288
&mut self.columns
289
}
290
291
/// Get the schema of this [`DataFrame`].
292
///
293
/// # Panics
294
/// Panics if there are duplicate column names.
295
pub fn schema(&self) -> &SchemaRef {
296
let out = self.cached_schema.get_or_init(|| {
297
Arc::new(
298
Schema::from_iter_check_duplicates(
299
self.columns
300
.iter()
301
.map(|x| (x.name().clone(), x.dtype().clone())),
302
)
303
.unwrap(),
304
)
305
});
306
307
assert_eq!(out.len(), self.width());
308
309
out
310
}
311
312
#[inline]
313
pub fn cached_schema(&self) -> Option<&SchemaRef> {
314
self.cached_schema.get()
315
}
316
317
/// Set the cached schema
318
///
319
/// # Safety
320
/// Schema must match the columns in `self`.
321
#[inline]
322
pub unsafe fn set_schema(&mut self, schema: SchemaRef) -> &mut Self {
323
self.cached_schema = schema.into();
324
self
325
}
326
327
/// Set the cached schema
328
///
329
/// # Safety
330
/// Schema must match the columns in `self`.
331
#[inline]
332
pub unsafe fn with_schema(mut self, schema: SchemaRef) -> Self {
333
self.cached_schema = schema.into();
334
self
335
}
336
337
/// Set the cached schema if `schema` is `Some()`.
338
///
339
/// # Safety
340
/// Schema must match the columns in `self`.
341
#[inline]
342
pub unsafe fn set_opt_schema(&mut self, schema: Option<SchemaRef>) -> &mut Self {
343
if let Some(schema) = schema {
344
unsafe { self.set_schema(schema) };
345
}
346
347
self
348
}
349
350
/// Clones the cached schema from `from` to `self.cached_schema` if there is one.
351
///
352
/// # Safety
353
/// Schema must match the columns in `self`.
354
#[inline]
355
pub unsafe fn set_schema_from(&mut self, from: &DataFrame) -> &mut Self {
356
self.set_opt_schema(from.cached_schema().cloned());
357
self
358
}
359
360
/// Clones the cached schema from `from` to `self.cached_schema` if there is one.
361
///
362
/// # Safety
363
/// Schema must match the columns in `self`.
364
#[inline]
365
pub unsafe fn with_schema_from(mut self, from: &DataFrame) -> Self {
366
self.set_opt_schema(from.cached_schema().cloned());
367
self
368
}
369
370
#[inline]
371
fn clear_schema(&mut self) -> &mut Self {
372
self.cached_schema = OnceLock::new();
373
self
374
}
375
}
376
377