GitHub Repository: pola-rs/polars
Path: blob/main/py-polars/polars/dataframe/frame.py
1
"""Module containing logic related to eager DataFrames."""
2
3
from __future__ import annotations
4
5
import contextlib
6
import os
7
import random
8
from collections import defaultdict
9
from collections.abc import (
10
Generator,
11
Iterable,
12
Mapping,
13
Sequence,
14
Sized,
15
)
16
from io import BytesIO, StringIO
17
from pathlib import Path
18
from typing import (
19
IO,
20
TYPE_CHECKING,
21
Any,
22
Callable,
23
ClassVar,
24
NoReturn,
25
TypeVar,
26
cast,
27
get_args,
28
overload,
29
)
30
31
import polars._reexport as pl
32
from polars import functions as F
33
from polars._typing import DbWriteMode, JaxExportType, TorchExportType
34
from polars._utils.construction import (
35
arrow_to_pydf,
36
dataframe_to_pydf,
37
dict_to_pydf,
38
iterable_to_pydf,
39
numpy_to_pydf,
40
pandas_to_pydf,
41
sequence_to_pydf,
42
series_to_pydf,
43
)
44
from polars._utils.convert import parse_as_duration_string
45
from polars._utils.deprecation import (
46
deprecate_renamed_parameter,
47
deprecated,
48
issue_deprecation_warning,
49
)
50
from polars._utils.getitem import get_df_item_by_key
51
from polars._utils.parse import parse_into_expression
52
from polars._utils.pycapsule import is_pycapsule, pycapsule_to_frame
53
from polars._utils.serde import serialize_polars_object
54
from polars._utils.unstable import issue_unstable_warning, unstable
55
from polars._utils.various import (
56
is_bool_sequence,
57
no_default,
58
normalize_filepath,
59
parse_version,
60
qualified_type_name,
61
require_same_type,
62
scale_bytes,
63
warn_null_comparison,
64
)
65
from polars._utils.wrap import wrap_expr, wrap_ldf, wrap_s
66
from polars.dataframe._html import NotebookFormatter
67
from polars.dataframe.group_by import DynamicGroupBy, GroupBy, RollingGroupBy
68
from polars.dataframe.plotting import DataFramePlot
69
from polars.datatypes import (
70
N_INFER_DEFAULT,
71
Boolean,
72
Float32,
73
Float64,
74
Int32,
75
Int64,
76
Null,
77
Object,
78
String,
79
Struct,
80
UInt16,
81
UInt32,
82
UInt64,
83
)
84
from polars.datatypes.group import INTEGER_DTYPES
85
from polars.dependencies import (
86
_ALTAIR_AVAILABLE,
87
_GREAT_TABLES_AVAILABLE,
88
_PANDAS_AVAILABLE,
89
_PYARROW_AVAILABLE,
90
_check_for_numpy,
91
_check_for_pandas,
92
_check_for_pyarrow,
93
_check_for_torch,
94
altair,
95
great_tables,
96
import_optional,
97
torch,
98
)
99
from polars.dependencies import numpy as np
100
from polars.dependencies import pandas as pd
101
from polars.dependencies import pyarrow as pa
102
from polars.exceptions import (
103
ColumnNotFoundError,
104
InvalidOperationError,
105
ModuleUpgradeRequiredError,
106
NoRowsReturnedError,
107
TooManyRowsReturnedError,
108
)
109
from polars.functions import col, lit
110
from polars.interchange.protocol import CompatLevel
111
from polars.schema import Schema
112
from polars.selectors import _expand_selector_dicts, _expand_selectors
113
114
with contextlib.suppress(ImportError): # Module not available when building docs
115
from polars._plr import PyDataFrame
116
from polars._plr import dtype_str_repr as _dtype_str_repr
117
from polars._plr import write_clipboard_string as _write_clipboard_string
118
119
if TYPE_CHECKING:
120
import sys
121
from collections.abc import Collection, Iterator, Mapping
122
from datetime import timedelta
123
from io import IOBase
124
from typing import Literal
125
126
import deltalake
127
import jax
128
import numpy.typing as npt
129
import pyiceberg
130
from great_tables import GT
131
from xlsxwriter import Workbook
132
from xlsxwriter.worksheet import Worksheet
133
134
from polars import DataType, Expr, LazyFrame, Series
135
from polars._typing import (
136
AsofJoinStrategy,
137
AvroCompression,
138
ClosedInterval,
139
ColumnFormatDict,
140
ColumnNameOrSelector,
141
ColumnTotalsDefinition,
142
ColumnWidthsDefinition,
143
ComparisonOperator,
144
ConditionalFormatDict,
145
ConnectionOrCursor,
146
CsvQuoteStyle,
147
DbWriteEngine,
148
EngineType,
149
FillNullStrategy,
150
FrameInitTypes,
151
IndexOrder,
152
IntoExpr,
153
IntoExprColumn,
154
IpcCompression,
155
JoinStrategy,
156
JoinValidation,
157
Label,
158
MaintainOrderJoin,
159
MultiColSelector,
160
MultiIndexSelector,
161
OneOrMoreDataTypes,
162
Orientation,
163
ParquetCompression,
164
ParquetMetadata,
165
PartitioningScheme,
166
PivotAgg,
167
PolarsDataType,
168
PythonDataType,
169
QuantileMethod,
170
RowTotalsDefinition,
171
SchemaDefinition,
172
SchemaDict,
173
SelectorType,
174
SerializationFormat,
175
SingleColSelector,
176
SingleIndexSelector,
177
SizeUnit,
178
StartBy,
179
UniqueKeepStrategy,
180
UnstackDirection,
181
)
182
from polars._utils.various import NoDefault
183
from polars.interchange.dataframe import PolarsDataFrame
184
from polars.io.cloud import CredentialProviderFunction
185
from polars.ml.torch import PolarsDataset
186
187
if sys.version_info >= (3, 10):
188
from typing import Concatenate, ParamSpec
189
else:
190
from typing_extensions import Concatenate, ParamSpec
191
192
if sys.version_info >= (3, 13):
193
from warnings import deprecated
194
else:
195
from typing_extensions import deprecated # noqa: TC004
196
197
T = TypeVar("T")
198
P = ParamSpec("P")
199
200
201
class DataFrame:
202
"""
203
Two-dimensional data structure representing data as a table with rows and columns.
204
205
Parameters
206
----------
207
data : dict, Sequence, ndarray, Series, or pandas.DataFrame
208
Two-dimensional data in various forms; dict input must contain Sequences,
209
Generators, or a `range`. Sequence may contain Series or other Sequences.
210
schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict
211
The schema of the resulting DataFrame. The schema may be declared in several
212
ways:
213
214
* As a dict of {name:type} pairs; if type is None, it will be auto-inferred.
215
* As a list of column names; in this case types are automatically inferred.
216
* As a list of (name,type) pairs; this is equivalent to the dictionary form.
217
218
If you supply a list of column names that does not match the names in the
219
underlying data, the names given here will overwrite them. The number
220
of names given in the schema should match the underlying data dimensions.
221
222
If set to `None` (default), the schema is inferred from the data.
223
schema_overrides : dict, default None
224
Support type specification or override of one or more columns; note that
225
any dtypes inferred from the schema param will be overridden.
226
227
The number of entries in the schema should match the underlying data
228
dimensions, unless a sequence of dictionaries is being passed, in which case
229
a *partial* schema can be declared to prevent specific fields from being loaded.
230
strict : bool, default True
231
Throw an error if any `data` value does not exactly match the given or inferred
232
data type for that column. If set to `False`, values that do not match the data
233
type are cast to that data type or, if casting is not possible, set to null
234
instead.
235
orient : {'col', 'row'}, default None
236
Whether to interpret two-dimensional data as columns or as rows. If None,
237
the orientation is inferred by matching the columns and data dimensions. If
238
this does not yield conclusive results, column orientation is used.
239
infer_schema_length : int or None
240
The maximum number of rows to scan for schema inference. If set to `None`, the
241
full data may be scanned *(this can be slow)*. This parameter only applies if
242
the input data is a sequence or generator of rows; other input is read as-is.
243
nan_to_null : bool, default False
244
If the data comes from one or more numpy arrays, optionally convert np.nan
245
values in the input data to null instead. This is a no-op for all other input data.
246
247
Notes
248
-----
249
Polars explicitly does not support subclassing of its core data types. See
250
the following GitHub issue for possible workarounds:
251
https://github.com/pola-rs/polars/issues/2846#issuecomment-1711799869
252
253
Examples
254
--------
255
Constructing a DataFrame from a dictionary:
256
257
>>> data = {"a": [1, 2], "b": [3, 4]}
258
>>> df = pl.DataFrame(data)
259
>>> df
260
shape: (2, 2)
261
┌─────┬─────┐
262
│ a ┆ b │
263
│ --- ┆ --- │
264
│ i64 ┆ i64 │
265
╞═════╪═════╡
266
│ 1 ┆ 3 │
267
│ 2 ┆ 4 │
268
└─────┴─────┘
269
270
Notice that the dtypes are automatically inferred as polars Int64:
271
272
>>> df.dtypes
273
[Int64, Int64]
274
275
To specify a more detailed/specific frame schema you can supply the `schema`
276
parameter with a dictionary of (name,dtype) pairs...
277
278
>>> data = {"col1": [0, 2], "col2": [3, 7]}
279
>>> df2 = pl.DataFrame(data, schema={"col1": pl.Float32, "col2": pl.Int64})
280
>>> df2
281
shape: (2, 2)
282
┌──────┬──────┐
283
│ col1 ┆ col2 │
284
│ --- ┆ --- │
285
│ f32 ┆ i64 │
286
╞══════╪══════╡
287
│ 0.0 ┆ 3 │
288
│ 2.0 ┆ 7 │
289
└──────┴──────┘
290
291
...a sequence of (name,dtype) pairs...
292
293
>>> data = {"col1": [1, 2], "col2": [3, 4]}
294
>>> df3 = pl.DataFrame(data, schema=[("col1", pl.Float32), ("col2", pl.Int64)])
295
>>> df3
296
shape: (2, 2)
297
┌──────┬──────┐
298
│ col1 ┆ col2 │
299
│ --- ┆ --- │
300
│ f32 ┆ i64 │
301
╞══════╪══════╡
302
│ 1.0 ┆ 3 │
303
│ 2.0 ┆ 4 │
304
└──────┴──────┘
305
306
...or a list of typed Series.
307
308
>>> data = [
309
... pl.Series("col1", [1, 2], dtype=pl.Float32),
310
... pl.Series("col2", [3, 4], dtype=pl.Int64),
311
... ]
312
>>> df4 = pl.DataFrame(data)
313
>>> df4
314
shape: (2, 2)
315
┌──────┬──────┐
316
│ col1 ┆ col2 │
317
│ --- ┆ --- │
318
│ f32 ┆ i64 │
319
╞══════╪══════╡
320
│ 1.0 ┆ 3 │
321
│ 2.0 ┆ 4 │
322
└──────┴──────┘
323
324
Constructing a DataFrame from a numpy ndarray, specifying column names:
325
326
>>> import numpy as np
327
>>> data = np.array([(1, 2), (3, 4)], dtype=np.int64)
328
>>> df5 = pl.DataFrame(data, schema=["a", "b"], orient="col")
329
>>> df5
330
shape: (2, 2)
331
┌─────┬─────┐
332
│ a ┆ b │
333
│ --- ┆ --- │
334
│ i64 ┆ i64 │
335
╞═════╪═════╡
336
│ 1 ┆ 3 │
337
│ 2 ┆ 4 │
338
└─────┴─────┘
339
340
Constructing a DataFrame from a list of lists, row orientation specified:
341
342
>>> data = [[1, 2, 3], [4, 5, 6]]
343
>>> df6 = pl.DataFrame(data, schema=["a", "b", "c"], orient="row")
344
>>> df6
345
shape: (2, 3)
346
┌─────┬─────┬─────┐
347
│ a ┆ b ┆ c │
348
│ --- ┆ --- ┆ --- │
349
│ i64 ┆ i64 ┆ i64 │
350
╞═════╪═════╪═════╡
351
│ 1 ┆ 2 ┆ 3 │
352
│ 4 ┆ 5 ┆ 6 │
353
└─────┴─────┴─────┘
354
"""
355
356
_df: PyDataFrame
357
_accessors: ClassVar[set[str]] = {"plot", "style"}
358
359
def __init__(
360
self,
361
data: FrameInitTypes | None = None,
362
schema: SchemaDefinition | None = None,
363
*,
364
schema_overrides: SchemaDict | None = None,
365
strict: bool = True,
366
orient: Orientation | None = None,
367
infer_schema_length: int | None = N_INFER_DEFAULT,
368
nan_to_null: bool = False,
369
) -> None:
370
if data is None:
371
self._df = dict_to_pydf(
372
{}, schema=schema, schema_overrides=schema_overrides
373
)
374
375
elif isinstance(data, dict):
376
self._df = dict_to_pydf(
377
data,
378
schema=schema,
379
schema_overrides=schema_overrides,
380
strict=strict,
381
nan_to_null=nan_to_null,
382
)
383
384
elif isinstance(data, (list, tuple, Sequence)):
385
self._df = sequence_to_pydf(
386
data,
387
schema=schema,
388
schema_overrides=schema_overrides,
389
strict=strict,
390
orient=orient,
391
infer_schema_length=infer_schema_length,
392
nan_to_null=nan_to_null,
393
)
394
395
elif isinstance(data, pl.Series):
396
self._df = series_to_pydf(
397
data, schema=schema, schema_overrides=schema_overrides, strict=strict
398
)
399
400
elif _check_for_numpy(data) and isinstance(data, np.ndarray):
401
self._df = numpy_to_pydf(
402
data,
403
schema=schema,
404
schema_overrides=schema_overrides,
405
strict=strict,
406
orient=orient,
407
nan_to_null=nan_to_null,
408
)
409
410
elif _check_for_pyarrow(data) and isinstance(data, pa.Table):
411
self._df = arrow_to_pydf(
412
data, schema=schema, schema_overrides=schema_overrides, strict=strict
413
)
414
415
elif _check_for_pandas(data) and isinstance(data, pd.DataFrame):
416
self._df = pandas_to_pydf(
417
data, schema=schema, schema_overrides=schema_overrides, strict=strict
418
)
419
420
elif _check_for_torch(data) and isinstance(data, torch.Tensor):
421
self._df = numpy_to_pydf(
422
data.numpy(force=False),
423
schema=schema,
424
schema_overrides=schema_overrides,
425
strict=strict,
426
orient=orient,
427
nan_to_null=nan_to_null,
428
)
429
430
elif (
431
not hasattr(data, "__arrow_c_stream__")
432
and not isinstance(data, Sized)
433
and isinstance(data, (Generator, Iterable))
434
):
435
self._df = iterable_to_pydf(
436
data,
437
schema=schema,
438
schema_overrides=schema_overrides,
439
strict=strict,
440
orient=orient,
441
infer_schema_length=infer_schema_length,
442
)
443
444
elif isinstance(data, pl.DataFrame):
445
self._df = dataframe_to_pydf(
446
data, schema=schema, schema_overrides=schema_overrides, strict=strict
447
)
448
449
elif is_pycapsule(data):
450
self._df = pycapsule_to_frame(
451
data,
452
schema=schema,
453
schema_overrides=schema_overrides,
454
)._df
455
else:
456
msg = (
457
f"DataFrame constructor called with unsupported type {type(data).__name__!r}"
458
" for the `data` parameter"
459
)
460
raise TypeError(msg)
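# A minimal illustrative sketch of the generator/iterable branch above (not part
# of the library source): a non-Sized iterable of rows is consumed via
# `iterable_to_pydf`, with `infer_schema_length` bounding how many rows are
# scanned to infer dtypes.
#
# >>> rows = ((i, float(i) * 2.5) for i in range(3))
# >>> pl.DataFrame(rows, schema=["a", "b"], orient="row")  # doctest: +SKIP
# shape: (3, 2)  # columns: a (i64), b (f64)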
461
462
@classmethod
463
def deserialize(
464
cls, source: str | Path | IOBase, *, format: SerializationFormat = "binary"
465
) -> DataFrame:
466
"""
467
Read a serialized DataFrame from a file.
468
469
Parameters
470
----------
471
source
472
Path to a file or a file-like object (by file-like object, we refer to
473
objects that have a `read()` method, such as a file handle (e.g.
474
one returned by the builtin `open` function) or `BytesIO`).
475
format
476
The format with which the DataFrame was serialized. Options:
477
478
- `"binary"`: Deserialize from binary format (bytes). This is the default.
479
- `"json"`: Deserialize from JSON format (string).
480
481
See Also
482
--------
483
DataFrame.serialize
484
485
Notes
486
-----
487
Serialization is not stable across Polars versions: a DataFrame serialized
488
in one Polars version may not be deserializable in another Polars version.
489
490
Examples
491
--------
492
>>> import io
493
>>> df = pl.DataFrame({"a": [1, 2, 3], "b": [4.0, 5.0, 6.0]})
494
>>> bytes = df.serialize()
495
>>> pl.DataFrame.deserialize(io.BytesIO(bytes))
496
shape: (3, 2)
497
┌─────┬─────┐
498
│ a ┆ b │
499
│ --- ┆ --- │
500
│ i64 ┆ f64 │
501
╞═════╪═════╡
502
│ 1 ┆ 4.0 │
503
│ 2 ┆ 5.0 │
504
│ 3 ┆ 6.0 │
505
└─────┴─────┘
506
"""
507
if isinstance(source, StringIO):
508
source = BytesIO(source.getvalue().encode())
509
elif isinstance(source, (str, Path)):
510
source = normalize_filepath(source)
511
512
if format == "binary":
513
deserializer = PyDataFrame.deserialize_binary
514
elif format == "json":
515
deserializer = PyDataFrame.deserialize_json
516
else:
517
msg = f"`format` must be one of {{'binary', 'json'}}, got {format!r}"
518
raise ValueError(msg)
519
520
return cls._from_pydf(deserializer(source))
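# Illustrative sketch (not part of the library source): the JSON format
# round-trips through a text buffer, which the `StringIO` branch above
# re-encodes to bytes before deserializing.
#
# >>> json_str = df.serialize(format="json")  # doctest: +SKIP
# >>> pl.DataFrame.deserialize(io.StringIO(json_str), format="json")  # doctest: +SKIP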
521
522
@classmethod
523
def _from_pydf(cls, py_df: PyDataFrame) -> DataFrame:
524
"""Construct Polars DataFrame from FFI PyDataFrame object."""
525
df = cls.__new__(cls)
526
df._df = py_df
527
return df
528
529
@classmethod
530
def _from_arrow(
531
cls,
532
data: pa.Table | pa.RecordBatch,
533
schema: SchemaDefinition | None = None,
534
*,
535
schema_overrides: SchemaDict | None = None,
536
rechunk: bool = True,
537
) -> DataFrame:
538
"""
539
Construct a DataFrame from an Arrow table.
540
541
This operation will be zero copy for the most part. Types that are not
542
supported by Polars may be cast to the closest supported type.
543
544
Parameters
545
----------
546
data : arrow Table, RecordBatch, or sequence of sequences
547
Data representing an Arrow Table or RecordBatch.
548
schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict
549
The DataFrame schema may be declared in several ways:
550
551
* As a dict of {name:type} pairs; if type is None, it will be auto-inferred.
552
* As a list of column names; in this case types are automatically inferred.
553
* As a list of (name,type) pairs; this is equivalent to the dictionary form.
554
555
If you supply a list of column names that does not match the names in the
556
underlying data, the names given here will overwrite them. The number
557
of names given in the schema should match the underlying data dimensions.
558
schema_overrides : dict, default None
559
Support type specification or override of one or more columns; note that
560
any dtypes inferred from the columns param will be overridden.
561
rechunk : bool, default True
562
Make sure that all data is in contiguous memory.
563
"""
564
return cls._from_pydf(
565
arrow_to_pydf(
566
data,
567
schema=schema,
568
schema_overrides=schema_overrides,
569
rechunk=rechunk,
570
)
571
)
572
573
@classmethod
574
def _from_pandas(
575
cls,
576
data: pd.DataFrame,
577
schema: SchemaDefinition | None = None,
578
*,
579
schema_overrides: SchemaDict | None = None,
580
rechunk: bool = True,
581
nan_to_null: bool = True,
582
include_index: bool = False,
583
) -> DataFrame:
584
"""
585
Construct a Polars DataFrame from a pandas DataFrame.
586
587
Parameters
588
----------
589
data : pandas DataFrame
590
Two-dimensional data represented as a pandas DataFrame.
591
schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict
592
The DataFrame schema may be declared in several ways:
593
594
* As a dict of {name:type} pairs; if type is None, it will be auto-inferred.
595
* As a list of column names; in this case types are automatically inferred.
596
* As a list of (name,type) pairs; this is equivalent to the dictionary form.
597
598
If you supply a list of column names that does not match the names in the
599
underlying data, the names given here will overwrite them. The number
600
of names given in the schema should match the underlying data dimensions.
601
schema_overrides : dict, default None
602
Support type specification or override of one or more columns; note that
603
any dtypes inferred from the columns param will be overridden.
604
rechunk : bool, default True
605
Make sure that all data is in contiguous memory.
606
nan_to_null : bool, default True
607
If the data contains NaN values, they will be converted to null/None.
608
include_index : bool, default False
609
Load any non-default pandas indexes as columns.
610
"""
611
return cls._from_pydf(
612
pandas_to_pydf(
613
data,
614
schema=schema,
615
schema_overrides=schema_overrides,
616
rechunk=rechunk,
617
nan_to_null=nan_to_null,
618
include_index=include_index,
619
)
620
)
621
622
def _replace(self, column: str, new_column: Series) -> DataFrame:
623
"""Replace a column by a new Series (in place)."""
624
self._df.replace(column, new_column._s)
625
return self
626
627
@classmethod
628
def _import_columns(cls, pointer: int, width: int) -> DataFrame:
629
return cls._from_pydf(PyDataFrame._import_columns(pointer, width))
630
631
@property
632
@unstable()
633
def plot(self) -> DataFramePlot:
634
"""
635
Create a plot namespace.
636
637
.. warning::
638
This functionality is currently considered **unstable**. It may be
639
changed at any point without it being considered a breaking change.
640
641
.. versionchanged:: 1.6.0
642
In prior versions of Polars, HvPlot was the plotting backend. If you would
643
like to restore the previous plotting functionality, all you need to do
644
is add `import hvplot.polars` at the top of your script and replace
645
`df.plot` with `df.hvplot`.
646
647
Polars does not implement plotting logic itself, but instead defers to
648
`Altair <https://altair-viz.github.io/>`_:
649
650
- `df.plot.line(**kwargs)`
651
is shorthand for
652
`alt.Chart(df).mark_line(tooltip=True).encode(**kwargs).interactive()`
653
- `df.plot.point(**kwargs)`
654
is shorthand for
655
`alt.Chart(df).mark_point(tooltip=True).encode(**kwargs).interactive()` (and
656
`plot.scatter` is provided as an alias)
657
- `df.plot.bar(**kwargs)`
658
is shorthand for
659
`alt.Chart(df).mark_bar(tooltip=True).encode(**kwargs).interactive()`
660
- for any other attribute `attr`, `df.plot.attr(**kwargs)`
661
is shorthand for
662
`alt.Chart(df).mark_attr(tooltip=True).encode(**kwargs).interactive()`
663
664
For configuration, we suggest reading
665
`Chart Configuration <https://altair-viz.github.io/altair-tutorial/notebooks/08-Configuration.html>`_.
666
For example, you can:
667
668
- Change the width/height/title with
669
``.properties(width=500, height=350, title="My amazing plot")``.
670
- Change the x-axis label rotation with ``.configure_axisX(labelAngle=30)``.
671
- Change the opacity of the points in your scatter plot with
672
``.configure_point(opacity=.5)``.
673
674
Examples
675
--------
676
Scatter plot:
677
678
>>> df = pl.DataFrame(
679
... {
680
... "length": [1, 4, 6],
681
... "width": [4, 5, 6],
682
... "species": ["setosa", "setosa", "versicolor"],
683
... }
684
... )
685
>>> df.plot.point(x="length", y="width", color="species") # doctest: +SKIP
686
687
Set the x-axis title by using ``altair.X``:
688
689
>>> import altair as alt
690
>>> df.plot.point(
691
... x=alt.X("length", title="Length"), y="width", color="species"
692
... ) # doctest: +SKIP
693
694
Line plot:
695
696
>>> from datetime import date
697
>>> df = pl.DataFrame(
698
... {
699
... "date": [date(2020, 1, 2), date(2020, 1, 3), date(2020, 1, 4)] * 2,
700
... "price": [1, 4, 6, 1, 5, 2],
701
... "stock": ["a", "a", "a", "b", "b", "b"],
702
... }
703
... )
704
>>> df.plot.line(x="date", y="price", color="stock") # doctest: +SKIP
705
706
Bar plot:
707
708
>>> df = pl.DataFrame(
709
... {
710
... "day": ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"] * 2,
711
... "group": ["a"] * 7 + ["b"] * 7,
712
... "value": [1, 3, 2, 4, 5, 6, 1, 1, 3, 2, 4, 5, 1, 2],
713
... }
714
... )
715
>>> df.plot.bar(
716
... x="day", y="value", color="day", column="group"
717
... ) # doctest: +SKIP
718
719
Or, to make a stacked version of the plot above:
720
721
>>> df.plot.bar(x="day", y="value", color="group") # doctest: +SKIP
722
"""
723
if not _ALTAIR_AVAILABLE or parse_version(altair.__version__) < (5, 4, 0):
724
msg = "altair>=5.4.0 is required for `.plot`"
725
raise ModuleUpgradeRequiredError(msg)
726
return DataFramePlot(self)
727
728
@property
729
@unstable()
730
def style(self) -> GT:
731
"""
732
Create a Great Table for styling.
733
734
.. warning::
735
This functionality is currently considered **unstable**. It may be
736
changed at any point without it being considered a breaking change.
737
738
Polars does not implement styling logic itself, but instead defers to
739
the Great Tables package. Please see the `Great Tables reference <https://posit-dev.github.io/great-tables/reference/>`_
740
for more information and documentation.
741
742
Examples
743
--------
744
Import some styling helpers, and create example data:
745
746
>>> import polars.selectors as cs
747
>>> from great_tables import loc, style
748
>>> df = pl.DataFrame(
749
... {
750
... "site_id": [0, 1, 2],
751
... "measure_a": [5, 4, 6],
752
... "measure_b": [7, 3, 3],
753
... }
754
... )
755
756
Emphasize the site_id as row names:
757
758
>>> df.style.tab_stub(rowname_col="site_id") # doctest: +SKIP
759
760
Fill the background for the highest measure_a value row:
761
762
>>> df.style.tab_style(
763
... style.fill("yellow"),
764
... loc.body(rows=pl.col("measure_a") == pl.col("measure_a").max()),
765
... ) # doctest: +SKIP
766
767
Put a spanner (high-level label) over measure columns:
768
769
>>> df.style.tab_spanner(
770
... "Measures", cs.starts_with("measure")
771
... ) # doctest: +SKIP
772
773
Format measure_b values to two decimal places:
774
775
>>> df.style.fmt_number("measure_b", decimals=2) # doctest: +SKIP
776
"""
777
if not _GREAT_TABLES_AVAILABLE:
778
msg = "great_tables is required for `.style`"
779
raise ModuleNotFoundError(msg)
780
781
return great_tables.GT(self)
782
783
@property
784
def shape(self) -> tuple[int, int]:
785
"""
786
Get the shape of the DataFrame.
787
788
Examples
789
--------
790
>>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5]})
791
>>> df.shape
792
(5, 1)
793
"""
794
return self._df.shape()
795
796
@property
797
def height(self) -> int:
798
"""
799
Get the number of rows.
800
801
Returns
802
-------
803
int
804
805
Examples
806
--------
807
>>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5]})
808
>>> df.height
809
5
810
"""
811
return self._df.height()
812
813
@property
814
def width(self) -> int:
815
"""
816
Get the number of columns.
817
818
Returns
819
-------
820
int
821
822
Examples
823
--------
824
>>> df = pl.DataFrame(
825
... {
826
... "foo": [1, 2, 3],
827
... "bar": [4, 5, 6],
828
... }
829
... )
830
>>> df.width
831
2
832
"""
833
return self._df.width()
834
835
@property
836
def columns(self) -> list[str]:
837
"""
838
Get or set column names.
839
840
Returns
841
-------
842
list of str
843
A list containing the name of each column in order.
844
845
Examples
846
--------
847
>>> df = pl.DataFrame(
848
... {
849
... "foo": [1, 2, 3],
850
... "bar": [6, 7, 8],
851
... "ham": ["a", "b", "c"],
852
... }
853
... )
854
>>> df.columns
855
['foo', 'bar', 'ham']
856
857
Set column names:
858
859
>>> df.columns = ["apple", "banana", "orange"]
860
>>> df
861
shape: (3, 3)
862
┌───────┬────────┬────────┐
863
│ apple ┆ banana ┆ orange │
864
│ --- ┆ --- ┆ --- │
865
│ i64 ┆ i64 ┆ str │
866
╞═══════╪════════╪════════╡
867
│ 1 ┆ 6 ┆ a │
868
│ 2 ┆ 7 ┆ b │
869
│ 3 ┆ 8 ┆ c │
870
└───────┴────────┴────────┘
871
"""
872
return self._df.columns()
873
874
@columns.setter
875
def columns(self, names: Sequence[str]) -> None:
876
"""
877
Change the column names of the `DataFrame`.
878
879
Parameters
880
----------
881
names
882
A list with new names for the `DataFrame`.
883
The length of the list should be equal to the width of the `DataFrame`.
884
"""
885
self._df.set_column_names(names)
886
887
@property
888
def dtypes(self) -> list[DataType]:
889
"""
890
Get the column data types.
891
892
The data types can also be found in column headers when printing the DataFrame.
893
894
Returns
895
-------
896
list of DataType
897
A list containing the data type of each column in order.
898
899
See Also
900
--------
901
schema
902
903
Examples
904
--------
905
>>> df = pl.DataFrame(
906
... {
907
... "foo": [1, 2, 3],
908
... "bar": [6.0, 7.0, 8.0],
909
... "ham": ["a", "b", "c"],
910
... }
911
... )
912
>>> df.dtypes
913
[Int64, Float64, String]
914
>>> df
915
shape: (3, 3)
916
┌─────┬─────┬─────┐
917
│ foo ┆ bar ┆ ham │
918
│ --- ┆ --- ┆ --- │
919
│ i64 ┆ f64 ┆ str │
920
╞═════╪═════╪═════╡
921
│ 1 ┆ 6.0 ┆ a │
922
│ 2 ┆ 7.0 ┆ b │
923
│ 3 ┆ 8.0 ┆ c │
924
└─────┴─────┴─────┘
925
"""
926
return self._df.dtypes()
927
928
@property
929
def flags(self) -> dict[str, dict[str, bool]]:
930
"""
931
Get flags that are set on the columns of this DataFrame.
932
933
Returns
934
-------
935
dict
936
Mapping from column names to column flags.
937
"""
938
return {name: self[name].flags for name in self.columns}
939
940
@property
941
def schema(self) -> Schema:
942
"""
943
Get an ordered mapping of column names to their data type.
944
945
Examples
946
--------
947
>>> df = pl.DataFrame(
948
... {
949
... "foo": [1, 2, 3],
950
... "bar": [6.0, 7.0, 8.0],
951
... "ham": ["a", "b", "c"],
952
... }
953
... )
954
>>> df.schema
955
Schema({'foo': Int64, 'bar': Float64, 'ham': String})
956
"""
957
return Schema(zip(self.columns, self.dtypes), check_dtypes=False)
958
959
def __array__(
960
self, dtype: npt.DTypeLike | None = None, copy: bool | None = None
961
) -> np.ndarray[Any, Any]:
962
"""
963
Return a NumPy ndarray with the given data type.
964
965
This method ensures a Polars DataFrame can be treated as a NumPy ndarray.
966
It enables `np.asarray` and NumPy universal functions.
967
968
See the NumPy documentation for more information:
969
https://numpy.org/doc/stable/user/basics.interoperability.html#the-array-method
970
"""
971
if copy is None:
972
writable, allow_copy = False, True
973
elif copy is True:
974
writable, allow_copy = True, True
975
elif copy is False:
976
writable, allow_copy = False, False
977
else:
978
msg = f"invalid input for `copy`: {copy!r}"
979
raise TypeError(msg)
980
981
arr = self.to_numpy(writable=writable, allow_copy=allow_copy)
982
983
if dtype is not None and dtype != arr.dtype:
984
if copy is False:
985
# TODO: Only raise when data must be copied
986
msg = f"copy not allowed: cast from {arr.dtype} to {dtype} prohibited"
987
raise RuntimeError(msg)
988
989
arr = arr.__array__(dtype)
990
991
return arr
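# Illustrative sketch (not part of the library source): `np.asarray` and NumPy
# ufuncs route through `__array__`, honoring the copy semantics handled above.
#
# >>> import numpy as np
# >>> np.asarray(pl.DataFrame({"a": [1, 2]})).shape  # doctest: +SKIP
# (2, 1)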
992
993
def __dataframe__(
994
self,
995
nan_as_null: bool = False, # noqa: FBT001
996
allow_copy: bool = True, # noqa: FBT001
997
) -> PolarsDataFrame:
998
"""
999
Convert to a dataframe object implementing the dataframe interchange protocol.
1000
1001
Parameters
1002
----------
1003
nan_as_null
1004
Overwrite null values in the data with `NaN`.
1005
1006
.. warning::
1007
This functionality has not been implemented and the parameter will be
1008
removed in a future version.
1009
Setting this to `True` will raise a `NotImplementedError`.
1010
allow_copy
1011
Allow memory to be copied to perform the conversion. If set to `False`,
1012
causes conversions that are not zero-copy to fail.
1013
1014
Notes
1015
-----
1016
Details on the Python dataframe interchange protocol:
1017
https://data-apis.org/dataframe-protocol/latest/index.html
1018
1019
Examples
1020
--------
1021
Convert a Polars DataFrame to a generic dataframe object and access some
1022
properties.
1023
1024
>>> df = pl.DataFrame({"a": [1, 2], "b": [3.0, 4.0], "c": ["x", "y"]})
1025
>>> dfi = df.__dataframe__()
1026
>>> dfi.num_rows()
1027
2
1028
>>> dfi.get_column(1).dtype
1029
(<DtypeKind.FLOAT: 2>, 64, 'g', '=')
1030
"""
1031
if nan_as_null:
1032
msg = (
1033
"functionality for `nan_as_null` has not been implemented and the"
1034
" parameter will be removed in a future version"
1035
"\n\nUse the default `nan_as_null=False`."
1036
)
1037
raise NotImplementedError(msg)
1038
1039
from polars.interchange.dataframe import PolarsDataFrame
1040
1041
return PolarsDataFrame(self, allow_copy=allow_copy)
1042
1043
def _comp(self, other: Any, op: ComparisonOperator) -> DataFrame:
1044
"""Compare a DataFrame with another object."""
1045
if isinstance(other, DataFrame):
1046
return self._compare_to_other_df(other, op)
1047
else:
1048
return self._compare_to_non_df(other, op)
1049
1050
def _compare_to_other_df(
1051
self,
1052
other: DataFrame,
1053
op: ComparisonOperator,
1054
) -> DataFrame:
1055
"""Compare a DataFrame with another DataFrame."""
1056
if self.columns != other.columns:
1057
msg = "DataFrame columns do not match"
1058
raise ValueError(msg)
1059
if self.shape != other.shape:
1060
msg = "DataFrame dimensions do not match"
1061
raise ValueError(msg)
1062
1063
suffix = "__POLARS_CMP_OTHER"
1064
other_renamed = other.select(F.all().name.suffix(suffix))
1065
combined = F.concat([self, other_renamed], how="horizontal")
1066
1067
if op == "eq":
1068
expr = [F.col(n) == F.col(f"{n}{suffix}") for n in self.columns]
1069
elif op == "neq":
1070
expr = [F.col(n) != F.col(f"{n}{suffix}") for n in self.columns]
1071
elif op == "gt":
1072
expr = [F.col(n) > F.col(f"{n}{suffix}") for n in self.columns]
1073
elif op == "lt":
1074
expr = [F.col(n) < F.col(f"{n}{suffix}") for n in self.columns]
1075
elif op == "gt_eq":
1076
expr = [F.col(n) >= F.col(f"{n}{suffix}") for n in self.columns]
1077
elif op == "lt_eq":
1078
expr = [F.col(n) <= F.col(f"{n}{suffix}") for n in self.columns]
1079
else:
1080
msg = f"unexpected comparison operator {op!r}"
1081
raise ValueError(msg)
1082
1083
return combined.select(expr)
1084
1085
def _compare_to_non_df(
1086
self,
1087
other: Any,
1088
op: ComparisonOperator,
1089
) -> DataFrame:
1090
"""Compare a DataFrame with a non-DataFrame object."""
1091
warn_null_comparison(other)
1092
if op == "eq":
1093
return self.select(F.all() == other)
1094
elif op == "neq":
1095
return self.select(F.all() != other)
1096
elif op == "gt":
1097
return self.select(F.all() > other)
1098
elif op == "lt":
1099
return self.select(F.all() < other)
1100
elif op == "gt_eq":
1101
return self.select(F.all() >= other)
1102
elif op == "lt_eq":
1103
return self.select(F.all() <= other)
1104
else:
1105
msg = f"unexpected comparison operator {op!r}"
1106
raise ValueError(msg)
1107
1108
def _div(self, other: Any, *, floordiv: bool) -> DataFrame:
1109
if isinstance(other, pl.Series):
1110
if floordiv:
1111
return self.select(F.all() // lit(other))
1112
return self.select(F.all() / lit(other))
1113
1114
elif not isinstance(other, DataFrame):
1115
s = _prepare_other_arg(other, length=self.height)
1116
other = DataFrame([s.alias(f"n{i}") for i in range(self.width)])
1117
1118
orig_dtypes = other.dtypes
1119
# TODO: Dispatch to a native floordiv
1120
other = self._cast_all_from_to(other, INTEGER_DTYPES, Float64)
1121
df = self._from_pydf(self._df.div_df(other._df))
1122
1123
df = (
1124
df
1125
if not floordiv
1126
else df.with_columns([s.floor() for s in df if s.dtype.is_float()])
1127
)
1128
if floordiv:
1129
int_casts = [
1130
col(column).cast(tp)
1131
for i, (column, tp) in enumerate(self.schema.items())
1132
if tp.is_integer()
1133
and (orig_dtypes[i].is_integer() or orig_dtypes[i] == Null)
1134
]
1135
if int_casts:
1136
return df.with_columns(int_casts)
1137
return df
1138
1139
def _cast_all_from_to(
1140
self, df: DataFrame, from_: frozenset[PolarsDataType], to: PolarsDataType
1141
) -> DataFrame:
1142
casts = [s.cast(to).alias(s.name) for s in df if s.dtype in from_]
1143
return df.with_columns(casts) if casts else df
1144
1145
def __floordiv__(self, other: DataFrame | Series | int | float) -> DataFrame:
1146
return self._div(other, floordiv=True)
1147
1148
def __truediv__(self, other: DataFrame | Series | int | float) -> DataFrame:
1149
return self._div(other, floordiv=False)
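# Illustrative sketch of the division operators above (not part of the library
# source): `/` yields float results, while `//` floors and, per the int-cast
# logic in `_div`, restores integer dtypes when both operands were integers.
#
# >>> df = pl.DataFrame({"a": [7, 8, 9]})
# >>> (df / 2).to_series().to_list()  # doctest: +SKIP
# [3.5, 4.0, 4.5]
# >>> (df // 2).to_series().to_list()  # doctest: +SKIP
# [3, 4, 4]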
1150
1151
def __bool__(self) -> NoReturn:
1152
msg = (
1153
"the truth value of a DataFrame is ambiguous"
1154
"\n\nHint: to check if a DataFrame contains any values, use `is_empty()`."
1155
)
1156
raise TypeError(msg)
1157
1158
def __eq__(self, other: object) -> DataFrame: # type: ignore[override]
1159
return self._comp(other, "eq")
1160
1161
def __ne__(self, other: object) -> DataFrame: # type: ignore[override]
1162
return self._comp(other, "neq")
1163
1164
def __gt__(self, other: Any) -> DataFrame:
1165
return self._comp(other, "gt")
1166
1167
def __lt__(self, other: Any) -> DataFrame:
1168
return self._comp(other, "lt")
1169
1170
def __ge__(self, other: Any) -> DataFrame:
1171
return self._comp(other, "gt_eq")
1172
1173
def __le__(self, other: Any) -> DataFrame:
1174
return self._comp(other, "lt_eq")
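# Illustrative sketch of the comparison operators above (not part of the library
# source): comparing against a scalar broadcasts per column via `F.all()`, while
# comparing against another DataFrame requires matching columns and shape.
#
# >>> df = pl.DataFrame({"a": [1, 2, 3]})
# >>> (df > 1).to_series().to_list()  # doctest: +SKIP
# [False, True, True]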
1175
1176
def __getstate__(self) -> bytes:
1177
return self.serialize()
1178
1179
def __setstate__(self, state: bytes) -> None:
1180
self._df = self.deserialize(BytesIO(state))._df
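# Illustrative sketch (not part of the library source): because `__getstate__`
# and `__setstate__` delegate to `serialize`/`deserialize`, a DataFrame
# round-trips through the standard `pickle` module.
#
# >>> import pickle
# >>> df2 = pickle.loads(pickle.dumps(pl.DataFrame({"a": [1, 2]})))  # doctest: +SKIP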
1181
1182
def __mul__(self, other: DataFrame | Series | int | float) -> DataFrame:
1183
if isinstance(other, DataFrame):
1184
return self._from_pydf(self._df.mul_df(other._df))
1185
1186
other = _prepare_other_arg(other)
1187
return self._from_pydf(self._df.mul(other._s))
1188
1189
def __rmul__(self, other: int | float) -> DataFrame:
1190
return self * other
1191
1192
def __add__(
1193
self, other: DataFrame | Series | int | float | bool | str
1194
) -> DataFrame:
1195
if isinstance(other, DataFrame):
1196
return self._from_pydf(self._df.add_df(other._df))
1197
other = _prepare_other_arg(other)
1198
return self._from_pydf(self._df.add(other._s))
1199
1200
def __radd__(
1201
self, other: DataFrame | Series | int | float | bool | str
1202
) -> DataFrame:
1203
if isinstance(other, str):
1204
return self.select((lit(other) + F.col("*")).name.keep())
1205
return self + other
1206
1207
def __sub__(self, other: DataFrame | Series | int | float) -> DataFrame:
1208
if isinstance(other, DataFrame):
1209
return self._from_pydf(self._df.sub_df(other._df))
1210
other = _prepare_other_arg(other)
1211
return self._from_pydf(self._df.sub(other._s))
1212
1213
def __mod__(self, other: DataFrame | Series | int | float) -> DataFrame:
1214
if isinstance(other, DataFrame):
1215
return self._from_pydf(self._df.rem_df(other._df))
1216
other = _prepare_other_arg(other)
1217
return self._from_pydf(self._df.rem(other._s))
1218
1219
def __str__(self) -> str:
1220
return self._df.as_str()
1221
1222
def __repr__(self) -> str:
1223
return self.__str__()
1224
1225
def __contains__(self, key: str) -> bool:
1226
return key in self.columns
1227
1228
def __iter__(self) -> Iterator[Series]:
1229
return self.iter_columns()
1230
1231
def __reversed__(self) -> Iterator[Series]:
1232
return reversed(self.get_columns())
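# Illustrative sketch (not part of the library source): membership tests check
# column names, and iteration yields the columns as Series.
#
# >>> df = pl.DataFrame({"a": [1], "b": [2]})
# >>> "a" in df  # doctest: +SKIP
# True
# >>> [s.name for s in df]  # doctest: +SKIP
# ['a', 'b']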
1233
1234
# `str` overlaps with `Sequence[str]`
1235
# We can ignore this but we must keep this overload ordering
1236
@overload
1237
def __getitem__(
1238
self, key: tuple[SingleIndexSelector, SingleColSelector]
1239
) -> Any: ...
1240
1241
@overload
1242
def __getitem__( # type: ignore[overload-overlap]
1243
self, key: str | tuple[MultiIndexSelector, SingleColSelector]
1244
) -> Series: ...
1245
1246
@overload
1247
def __getitem__(
1248
self,
1249
key: (
1250
SingleIndexSelector
1251
| MultiIndexSelector
1252
| MultiColSelector
1253
| tuple[SingleIndexSelector, MultiColSelector]
1254
| tuple[MultiIndexSelector, MultiColSelector]
1255
),
1256
) -> DataFrame: ...
1257
1258
def __getitem__(
1259
self,
1260
key: (
1261
SingleIndexSelector
1262
| SingleColSelector
1263
| MultiColSelector
1264
| MultiIndexSelector
1265
| tuple[SingleIndexSelector, SingleColSelector]
1266
| tuple[SingleIndexSelector, MultiColSelector]
1267
| tuple[MultiIndexSelector, SingleColSelector]
1268
| tuple[MultiIndexSelector, MultiColSelector]
1269
),
1270
) -> DataFrame | Series | Any:
1271
"""
1272
Get part of the DataFrame as a new DataFrame, Series, or scalar.
1273
1274
Parameters
1275
----------
1276
key
1277
Rows / columns to select. This is easiest to explain via example. Suppose
1278
we have a DataFrame with columns `'a'`, `'d'`, `'c'`, `'b'`. Here is what
1279
various types of `key` would do:
1280
1281
- `df[0, 'a']` extracts the first element of column `'a'` and returns a
1282
scalar.
1283
- `df[0]` extracts the first row and returns a Dataframe.
1284
- `df['a']` extracts column `'a'` and returns a Series.
1285
- `df[0:2]` extracts the first two rows and returns a Dataframe.
1286
- `df[0:2, 'a']` extracts the first two rows from column `'a'` and returns
1287
a Series.
1288
- `df[0:2, 0]` extracts the first two rows from the first column and returns
1289
a Series.
1290
- `df[[0, 1], [0, 1, 2]]` extracts the first two rows and the first three
1291
columns and returns a Dataframe.
1292
- `df[0: 2, ['a', 'c']]` extracts the first two rows from columns `'a'` and
1293
`'c'` and returns a Dataframe.
1294
- `df[:, 0: 2]` extracts all rows from the first two columns and returns a
1295
Dataframe.
1296
- `df[:, 'a': 'c']` extracts all rows and all columns positioned between
1297
`'a'` and `'c'` *inclusive* and returns a Dataframe. In our example,
1298
that would extract columns `'a'`, `'d'`, and `'c'`.
1299
1300
Returns
1301
-------
1302
DataFrame, Series, or scalar, depending on `key`.
1303
1304
Examples
1305
--------
1306
>>> df = pl.DataFrame(
1307
... {"a": [1, 2, 3], "d": [4, 5, 6], "c": [1, 3, 2], "b": [7, 8, 9]}
1308
... )
1309
>>> df[0]
1310
shape: (1, 4)
1311
┌─────┬─────┬─────┬─────┐
1312
│ a ┆ d ┆ c ┆ b │
1313
│ --- ┆ --- ┆ --- ┆ --- │
1314
│ i64 ┆ i64 ┆ i64 ┆ i64 │
1315
╞═════╪═════╪═════╪═════╡
1316
│ 1 ┆ 4 ┆ 1 ┆ 7 │
1317
└─────┴─────┴─────┴─────┘
1318
>>> df[0, "a"]
1319
1
1320
>>> df["a"]
1321
shape: (3,)
1322
Series: 'a' [i64]
1323
[
1324
1
1325
2
1326
3
1327
]
1328
>>> df[0:2]
1329
shape: (2, 4)
1330
┌─────┬─────┬─────┬─────┐
1331
│ a ┆ d ┆ c ┆ b │
1332
│ --- ┆ --- ┆ --- ┆ --- │
1333
│ i64 ┆ i64 ┆ i64 ┆ i64 │
1334
╞═════╪═════╪═════╪═════╡
1335
│ 1 ┆ 4 ┆ 1 ┆ 7 │
1336
│ 2 ┆ 5 ┆ 3 ┆ 8 │
1337
└─────┴─────┴─────┴─────┘
1338
>>> df[0:2, "a"]
1339
shape: (2,)
1340
Series: 'a' [i64]
1341
[
1342
1
1343
2
1344
]
1345
>>> df[0:2, 0]
1346
shape: (2,)
1347
Series: 'a' [i64]
1348
[
1349
1
1350
2
1351
]
1352
>>> df[[0, 1], [0, 1, 2]]
1353
shape: (2, 3)
1354
┌─────┬─────┬─────┐
1355
│ a ┆ d ┆ c │
1356
│ --- ┆ --- ┆ --- │
1357
│ i64 ┆ i64 ┆ i64 │
1358
╞═════╪═════╪═════╡
1359
│ 1 ┆ 4 ┆ 1 │
1360
│ 2 ┆ 5 ┆ 3 │
1361
└─────┴─────┴─────┘
1362
>>> df[0:2, ["a", "c"]]
1363
shape: (2, 2)
1364
┌─────┬─────┐
1365
│ a ┆ c │
1366
│ --- ┆ --- │
1367
│ i64 ┆ i64 │
1368
╞═════╪═════╡
1369
│ 1 ┆ 1 │
1370
│ 2 ┆ 3 │
1371
└─────┴─────┘
1372
>>> df[:, 0:2]
1373
shape: (3, 2)
1374
┌─────┬─────┐
1375
│ a ┆ d │
1376
│ --- ┆ --- │
1377
│ i64 ┆ i64 │
1378
╞═════╪═════╡
1379
│ 1 ┆ 4 │
1380
│ 2 ┆ 5 │
1381
│ 3 ┆ 6 │
1382
└─────┴─────┘
1383
>>> df[:, "a":"c"]
1384
shape: (3, 3)
1385
┌─────┬─────┬─────┐
1386
│ a ┆ d ┆ c │
1387
│ --- ┆ --- ┆ --- │
1388
│ i64 ┆ i64 ┆ i64 │
1389
╞═════╪═════╪═════╡
1390
│ 1 ┆ 4 ┆ 1 │
1391
│ 2 ┆ 5 ┆ 3 │
1392
│ 3 ┆ 6 ┆ 2 │
1393
└─────┴─────┴─────┘
1394
"""
1395
return get_df_item_by_key(self, key)
1396
1397
def __setitem__(
1398
self,
1399
key: str | Sequence[int] | Sequence[str] | tuple[Any, str | int],
1400
value: Any,
1401
) -> None: # pragma: no cover
1402
"""
1403
Modify DataFrame elements in place, using assignment syntax.
1404
1405
Parameters
1406
----------
1407
key : str | Sequence[int] | Sequence[str] | tuple[Any, str | int]
1408
Specifies the location(s) within the DataFrame to assign new values.
1409
The behavior varies based on the type of `key`:
1410
1411
- Str: `df["a"] = value`:
1412
Not supported. Raises a `TypeError`. Use `df.with_columns(...)`
1413
to add or modify columns.
1414
1415
- Sequence[str]: `df[["a", "b"]] = value`:
1416
Assigns multiple columns at once. `value` must be a 2D array-like
1417
structure with the same number of columns as the list
1418
of column names provided.
1419
1420
- tuple[Any, str | int]: `df[row_idx, "a"] = value`:
1421
Assigns a new value to a specific element in the DataFrame, where
1422
`row_idx` specifies the row and `"a"` specifies the column.
1423
1424
- `df[row_idx, col_idx] = value`:
1425
Similar to the above, but `col_idx` is the integer index of the column.
1426
1427
value : Any
1428
The new value(s) to assign. The expected structure of `value` depends on the
1429
form of `key`:
1430
1431
- For multiple column assignment (`df[["a", "b"]] = value`), `value` should
1432
be a 2D array-like object with shape (n_rows, n_columns).
1433
1434
- For single element assignment (`df[row_idx, "a"] = value`), `value` should
1435
be a scalar.
1436
1437
Raises
1438
------
1439
TypeError
1440
If an unsupported assignment is attempted, such as assigning a Series
1441
directly to a column using `df["a"] = series`.
1442
1443
ValueError
1444
If the shape of `value` does not match the expected shape based on `key`.
1445
1446
Examples
1447
--------
1448
Sequence[str] : `df[["a", "b"]] = value`:
1449
1450
>>> import numpy as np
1451
>>> df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
1452
>>> df[["a", "b"]] = np.array([[10, 40], [20, 50], [30, 60]])
1453
>>> df
1454
shape: (3, 2)
1455
┌─────┬─────┐
1456
│ a ┆ b │
1457
│ --- ┆ --- │
1458
│ i64 ┆ i64 │
1459
╞═════╪═════╡
1460
│ 10 ┆ 40 │
1461
│ 20 ┆ 50 │
1462
│ 30 ┆ 60 │
1463
└─────┴─────┘
1464
1465
tuple[Any, str | int] : `df[row_idx, "a"] = value`:
1466
1467
>>> df[1, "a"] = 100
1468
>>> df
1469
shape: (3, 2)
1470
┌─────┬─────┐
1471
│ a ┆ b │
1472
│ --- ┆ --- │
1473
│ i64 ┆ i64 │
1474
╞═════╪═════╡
1475
│ 10 ┆ 40 │
1476
│ 100 ┆ 50 │
1477
│ 30 ┆ 60 │
1478
└─────┴─────┘
1479
1480
`df[row_idx, col_idx] = value`:
1481
1482
>>> df[0, 1] = 30
1483
>>> df
1484
shape: (3, 2)
1485
┌─────┬─────┐
1486
│ a ┆ b │
1487
│ --- ┆ --- │
1488
│ i64 ┆ i64 │
1489
╞═════╪═════╡
1490
│ 10 ┆ 30 │
1491
│ 100 ┆ 50 │
1492
│ 30 ┆ 60 │
1493
└─────┴─────┘
1494
"""
1495
# df["foo"] = series
1496
if isinstance(key, str):
1497
msg = (
1498
"DataFrame object does not support `Series` assignment by index"
1499
"\n\nUse `DataFrame.with_columns`."
1500
)
1501
raise TypeError(msg)
1502
1503
# df[["C", "D"]]
1504
elif isinstance(key, list):
1505
# TODO: Use python sequence constructors
1506
value = np.array(value)
1507
if value.ndim != 2:
1508
msg = "can only set multiple columns with 2D matrix"
1509
raise ValueError(msg)
1510
if value.shape[1] != len(key):
1511
msg = "matrix columns should be equal to list used to determine column names"
1512
raise ValueError(msg)
1513
1514
# TODO: we can parallelize this by calling from_numpy
1515
columns = []
1516
for i, name in enumerate(key):
1517
columns.append(pl.Series(name, value[:, i]))
1518
self._df = self.with_columns(columns)._df
1519
1520
# df[a, b]
1521
elif isinstance(key, tuple):
1522
row_selection, col_selection = key
1523
1524
if (
1525
isinstance(row_selection, pl.Series) and row_selection.dtype == Boolean
1526
) or is_bool_sequence(row_selection):
1527
msg = (
1528
"not allowed to set DataFrame by boolean mask in the row position"
1529
"\n\nConsider using `DataFrame.with_columns`."
1530
)
1531
raise TypeError(msg)
1532
1533
# get series column selection
1534
if isinstance(col_selection, str):
1535
s = self.__getitem__(col_selection)
1536
elif isinstance(col_selection, int):
1537
s = self[:, col_selection]
1538
else:
1539
msg = f"unexpected column selection {col_selection!r}"
1540
raise TypeError(msg)
1541
1542
# dispatch to __setitem__ of Series to do modification
1543
s[row_selection] = value
1544
1545
# now find the location to place series
1546
# df[idx]
1547
if isinstance(col_selection, int):
1548
self.replace_column(col_selection, s)
1549
# df["foo"]
1550
elif isinstance(col_selection, str):
1551
self._replace(col_selection, s)
1552
else:
1553
msg = (
1554
f"cannot use `__setitem__` on DataFrame"
1555
f" with key {key!r} of type {type(key).__name__!r}"
1556
f" and value {value!r} of type {type(value).__name__!r}"
1557
)
1558
raise TypeError(msg)
1559
1560
def __len__(self) -> int:
1561
return self.height
1562
1563
def __copy__(self) -> DataFrame:
1564
return self.clone()
1565
1566
def __deepcopy__(self, memo: None = None) -> DataFrame:
1567
return self.clone()
1568
1569
def _ipython_key_completions_(self) -> list[str]:
1570
return self.columns
1571
1572
def __arrow_c_stream__(self, requested_schema: object | None = None) -> object:
1573
"""
1574
Export a DataFrame via the Arrow PyCapsule Interface.
1575
1576
https://arrow.apache.org/docs/dev/format/CDataInterface/PyCapsuleInterface.html
1577
"""
1578
return self._df.__arrow_c_stream__(requested_schema)
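# Illustrative sketch (not part of the library source): consumers of the Arrow
# PyCapsule Interface can ingest a DataFrame directly; for example, recent
# pyarrow versions accept it in `pa.table(...)`.
#
# >>> import pyarrow as pa
# >>> pa.table(pl.DataFrame({"a": [1, 2]}))  # doctest: +SKIP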
1579
1580
def _repr_html_(self, *, _from_series: bool = False) -> str:
1581
"""
1582
Format output data in HTML for display in Jupyter Notebooks.
1583
1584
Output rows and columns can be modified by setting the following ENVIRONMENT
1585
variables:
1586
1587
* POLARS_FMT_MAX_COLS: set the number of columns
1588
* POLARS_FMT_MAX_ROWS: set the number of rows
1589
"""
1590
max_cols = int(os.environ.get("POLARS_FMT_MAX_COLS", default=75))
1591
if max_cols < 0:
1592
max_cols = self.width
1593
1594
max_rows = int(os.environ.get("POLARS_FMT_MAX_ROWS", default=10))
1595
if max_rows < 0:
1596
max_rows = self.height
1597
1598
return "".join(
1599
NotebookFormatter(
1600
self,
1601
max_cols=max_cols,
1602
max_rows=max_rows,
1603
from_series=_from_series,
1604
).render()
1605
)
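# Illustrative sketch (not part of the library source): the notebook output
# limits read above can be adjusted through the environment before rendering.
#
# >>> import os
# >>> os.environ["POLARS_FMT_MAX_ROWS"] = "20"  # doctest: +SKIP
# >>> os.environ["POLARS_FMT_MAX_COLS"] = "8"  # doctest: +SKIP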
1606
1607
def collect_schema(self) -> Schema:
1608
"""
1609
Get an ordered mapping of column names to their data type.
1610
1611
This is an alias for the :attr:`schema` property.
1612
1613
See Also
1614
--------
1615
schema
1616
1617
Notes
1618
-----
1619
This method is included to facilitate writing code that is generic for both
1620
DataFrame and LazyFrame.
1621
1622
Examples
1623
--------
1624
Determine the schema.
1625
1626
>>> df = pl.DataFrame(
1627
... {
1628
... "foo": [1, 2, 3],
1629
... "bar": [6.0, 7.0, 8.0],
1630
... "ham": ["a", "b", "c"],
1631
... }
1632
... )
1633
>>> df.collect_schema()
1634
Schema({'foo': Int64, 'bar': Float64, 'ham': String})
1635
1636
Access various properties of the schema using the :class:`Schema` object.
1637
1638
>>> schema = df.collect_schema()
1639
>>> schema["bar"]
1640
Float64
1641
>>> schema.names()
1642
['foo', 'bar', 'ham']
1643
>>> schema.dtypes()
1644
[Int64, Float64, String]
1645
>>> schema.len()
1646
3
1647
"""
1648
return self.schema
1649
1650
def item(self, row: int | None = None, column: int | str | None = None) -> Any:
1651
"""
1652
Return the DataFrame as a scalar, or return the element at the given row/column.
1653
1654
Parameters
1655
----------
1656
row
1657
Optional row index.
1658
column
1659
Optional column index or name.
1660
1661
See Also
1662
--------
1663
row : Get the values of a single row, either by index or by predicate.
1664
1665
Notes
1666
-----
1667
If row/col not provided, this is equivalent to `df[0,0]`, with a check that
1668
the shape is (1,1). With row/col, this is equivalent to `df[row,col]`.
1669
1670
Examples
1671
--------
1672
>>> df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
1673
>>> df.select((pl.col("a") * pl.col("b")).sum()).item()
1674
32
1675
>>> df.item(1, 1)
1676
5
1677
>>> df.item(2, "b")
1678
6
1679
"""
1680
if row is None and column is None:
1681
if self.shape != (1, 1):
1682
msg = (
1683
"can only call `.item()` if the dataframe is of shape (1, 1),"
1684
" or if explicit row/col values are provided;"
1685
f" frame has shape {self.shape!r}"
1686
)
1687
raise ValueError(msg)
1688
return self._df.to_series(0).get_index(0)
1689
1690
elif row is None or column is None:
1691
msg = "cannot call `.item()` with only one of `row` or `column`"
1692
raise ValueError(msg)
1693
1694
s = (
1695
self._df.to_series(column)
1696
if isinstance(column, int)
1697
else self._df.get_column(column)
1698
)
1699
return s.get_index_signed(row)
1700
1701
@deprecate_renamed_parameter("future", "compat_level", version="1.1")
1702
def to_arrow(self, *, compat_level: CompatLevel | None = None) -> pa.Table:
1703
"""
1704
Collect the underlying arrow arrays in an Arrow Table.
1705
1706
This operation is mostly zero copy.
1707
1708
Data types that do copy:
1709
- CategoricalType
1710
1711
.. versionchanged:: 1.1
1712
The `future` parameter was renamed `compat_level`.
1713
1714
Parameters
1715
----------
1716
compat_level
1717
Use a specific compatibility level
1718
when exporting Polars' internal data structures.
1719
1720
Examples
1721
--------
1722
>>> df = pl.DataFrame(
1723
... {"foo": [1, 2, 3, 4, 5, 6], "bar": ["a", "b", "c", "d", "e", "f"]}
1724
... )
1725
>>> df.to_arrow()
1726
pyarrow.Table
1727
foo: int64
1728
bar: large_string
1729
----
1730
foo: [[1,2,3,4,5,6]]
1731
bar: [["a","b","c","d","e","f"]]
1732
"""
1733
if not self.width: # 0x0 dataframe, cannot infer schema from batches
1734
return pa.table({})
1735
1736
compat_level_py: int | bool
1737
if compat_level is None:
1738
compat_level_py = False
1739
elif isinstance(compat_level, CompatLevel):
1740
compat_level_py = compat_level._version
1741
1742
record_batches = self._df.to_arrow(compat_level_py)
1743
return pa.Table.from_batches(record_batches)
1744
1745
@overload
1746
def to_dict(self, *, as_series: Literal[True] = ...) -> dict[str, Series]: ...
1747
1748
@overload
1749
def to_dict(self, *, as_series: Literal[False]) -> dict[str, list[Any]]: ...
1750
1751
@overload
1752
def to_dict(
1753
self, *, as_series: bool
1754
) -> dict[str, Series] | dict[str, list[Any]]: ...
1755
1756
def to_dict(
1757
self, *, as_series: bool = True
1758
) -> dict[str, Series] | dict[str, list[Any]]:
1759
"""
1760
Convert DataFrame to a dictionary mapping column name to values.
1761
1762
Parameters
1763
----------
1764
as_series
1765
True -> Values are Series
1766
False -> Values are List[Any]
1767
1768
See Also
1769
--------
1770
rows_by_key
1771
to_dicts
1772
1773
Examples
1774
--------
1775
>>> df = pl.DataFrame(
1776
... {
1777
... "A": [1, 2, 3, 4, 5],
1778
... "fruits": ["banana", "banana", "apple", "apple", "banana"],
1779
... "B": [5, 4, 3, 2, 1],
1780
... "cars": ["beetle", "audi", "beetle", "beetle", "beetle"],
1781
... "optional": [28, 300, None, 2, -30],
1782
... }
1783
... )
1784
>>> df
1785
shape: (5, 5)
1786
┌─────┬────────┬─────┬────────┬──────────┐
1787
│ A ┆ fruits ┆ B ┆ cars ┆ optional │
1788
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
1789
│ i64 ┆ str ┆ i64 ┆ str ┆ i64 │
1790
╞═════╪════════╪═════╪════════╪══════════╡
1791
│ 1 ┆ banana ┆ 5 ┆ beetle ┆ 28 │
1792
│ 2 ┆ banana ┆ 4 ┆ audi ┆ 300 │
1793
│ 3 ┆ apple ┆ 3 ┆ beetle ┆ null │
1794
│ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 │
1795
│ 5 ┆ banana ┆ 1 ┆ beetle ┆ -30 │
1796
└─────┴────────┴─────┴────────┴──────────┘
1797
>>> df.to_dict(as_series=False)
1798
{'A': [1, 2, 3, 4, 5],
1799
'fruits': ['banana', 'banana', 'apple', 'apple', 'banana'],
1800
'B': [5, 4, 3, 2, 1],
1801
'cars': ['beetle', 'audi', 'beetle', 'beetle', 'beetle'],
1802
'optional': [28, 300, None, 2, -30]}
1803
>>> df.to_dict(as_series=True)
1804
{'A': shape: (5,)
1805
Series: 'A' [i64]
1806
[
1807
1
1808
2
1809
3
1810
4
1811
5
1812
], 'fruits': shape: (5,)
1813
Series: 'fruits' [str]
1814
[
1815
"banana"
1816
"banana"
1817
"apple"
1818
"apple"
1819
"banana"
1820
], 'B': shape: (5,)
1821
Series: 'B' [i64]
1822
[
1823
5
1824
4
1825
3
1826
2
1827
1
1828
], 'cars': shape: (5,)
1829
Series: 'cars' [str]
1830
[
1831
"beetle"
1832
"audi"
1833
"beetle"
1834
"beetle"
1835
"beetle"
1836
], 'optional': shape: (5,)
1837
Series: 'optional' [i64]
1838
[
1839
28
1840
300
1841
null
1842
2
1843
-30
1844
]}
1845
"""
1846
if as_series:
1847
return {s.name: s for s in self}
1848
else:
1849
return {s.name: s.to_list() for s in self}
1850
1851
def to_dicts(self) -> list[dict[str, Any]]:
1852
"""
1853
Convert every row to a dictionary of Python-native values.
1854
1855
Notes
1856
-----
1857
If you have `ns`-precision temporal values you should be aware that Python
1858
natively only supports up to `μs`-precision; `ns`-precision values will be
1859
truncated to microseconds on conversion to Python. If this matters to your
1860
use-case you should export to a different format (such as Arrow or NumPy).
1861
1862
Examples
1863
--------
1864
>>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]})
1865
>>> df.to_dicts()
1866
[{'foo': 1, 'bar': 4}, {'foo': 2, 'bar': 5}, {'foo': 3, 'bar': 6}]
1867
"""
1868
return self.rows(named=True)
1869
1870
def to_numpy(
1871
self,
1872
*,
1873
order: IndexOrder = "fortran",
1874
writable: bool = False,
1875
allow_copy: bool = True,
1876
structured: bool = False,
1877
use_pyarrow: bool | None = None,
1878
) -> np.ndarray[Any, Any]:
1879
"""
1880
Convert this DataFrame to a NumPy ndarray.
1881
1882
This operation copies data only when necessary. The conversion is zero copy when
1883
all of the following hold:
1884
1885
- The DataFrame is fully contiguous in memory, with all Series back-to-back and
1886
all Series consisting of a single chunk.
1887
- The data type is an integer or float.
1888
- The DataFrame contains no null values.
1889
- The `order` parameter is set to `fortran` (default).
1890
- The `writable` parameter is set to `False` (default).
1891
1892
Parameters
1893
----------
1894
order
1895
The index order of the returned NumPy array, either C-like or
1896
Fortran-like. In general, using the Fortran-like index order is faster.
1897
However, the C-like order might be more appropriate to use for downstream
1898
applications to prevent cloning data, e.g. when reshaping into a
1899
one-dimensional array.
1900
writable
1901
Ensure the resulting array is writable. This will force a copy of the data
1902
if the array was created without copy, as the underlying Arrow data is
1903
immutable.
1904
allow_copy
1905
Allow memory to be copied to perform the conversion. If set to `False`,
1906
causes conversions that are not zero-copy to fail.
1907
structured
1908
Return a `structured array`_ with a data type that corresponds to the
1909
DataFrame schema. If set to `False` (default), a 2D ndarray is
1910
returned instead.
1911
1912
.. _structured array: https://numpy.org/doc/stable/user/basics.rec.html
1913
1914
use_pyarrow
1915
Use `pyarrow.Array.to_numpy
1916
<https://arrow.apache.org/docs/python/generated/pyarrow.Array.html#pyarrow.Array.to_numpy>`_
1917
1918
function for the conversion to NumPy if necessary.
1919
1920
.. deprecated:: 0.20.28
1921
Polars now uses its native engine by default for conversion to NumPy.
1922
1923
Examples
1924
--------
1925
Numeric data without nulls can be converted without copying data in some cases.
1926
The resulting array will not be writable.
1927
1928
>>> df = pl.DataFrame({"a": [1, 2, 3]})
1929
>>> arr = df.to_numpy()
1930
>>> arr
1931
array([[1],
1932
[2],
1933
[3]])
1934
>>> arr.flags.writeable
1935
False
1936
1937
Set `writable=True` to force data copy to make the array writable.
1938
1939
>>> df.to_numpy(writable=True).flags.writeable
1940
True
1941
1942
If the DataFrame contains different numeric data types, the resulting data type
1943
will be the supertype. This requires data to be copied. Integer types with
1944
nulls are cast to a float type with `nan` representing a null value.
1945
1946
>>> df = pl.DataFrame({"a": [1, 2, None], "b": [4.0, 5.0, 6.0]})
1947
>>> df.to_numpy()
1948
array([[ 1., 4.],
1949
[ 2., 5.],
1950
[nan, 6.]])
1951
1952
Set `allow_copy=False` to raise an error if data would be copied.
1953
1954
>>> df.to_numpy(allow_copy=False) # doctest: +SKIP
1955
Traceback (most recent call last):
1956
...
1957
RuntimeError: copy not allowed: cannot convert to a NumPy array without copying data
1958
1959
Polars defaults to F-contiguous order. Use `order="c"` to force the resulting
1960
array to be C-contiguous.
1961
1962
>>> df.to_numpy(order="c").flags.c_contiguous
1963
True
1964
1965
DataFrames with mixed types will result in an array with an object dtype.
1966
1967
>>> df = pl.DataFrame(
1968
... {
1969
... "foo": [1, 2, 3],
1970
... "bar": [6.5, 7.0, 8.5],
1971
... "ham": ["a", "b", "c"],
1972
... },
1973
... schema_overrides={"foo": pl.UInt8, "bar": pl.Float32},
1974
... )
1975
>>> df.to_numpy()
1976
array([[1, 6.5, 'a'],
1977
[2, 7.0, 'b'],
1978
[3, 8.5, 'c']], dtype=object)
1979
1980
Set `structured=True` to convert to a structured array, which can better
1981
preserve individual column data such as name and data type.
1982
1983
>>> df.to_numpy(structured=True)
1984
array([(1, 6.5, 'a'), (2, 7. , 'b'), (3, 8.5, 'c')],
1985
dtype=[('foo', 'u1'), ('bar', '<f4'), ('ham', '<U1')])
1986
""" # noqa: W505
1987
if use_pyarrow is not None:
1988
issue_deprecation_warning(
1989
"the `use_pyarrow` parameter for `DataFrame.to_numpy` is deprecated."
1990
" Polars now uses its native engine by default for conversion to NumPy.",
1991
version="0.20.28",
1992
)
1993
1994
if structured:
1995
if not allow_copy and not self.is_empty():
1996
msg = "copy not allowed: cannot create structured array without copying data"
1997
raise RuntimeError(msg)
1998
1999
arrays = []
2000
struct_dtype = []
2001
for s in self.iter_columns():
2002
if s.dtype == Struct:
2003
arr = s.struct.unnest().to_numpy(
2004
structured=True,
2005
allow_copy=True,
2006
use_pyarrow=use_pyarrow,
2007
)
2008
else:
2009
arr = s.to_numpy(use_pyarrow=use_pyarrow)
2010
2011
if s.dtype == String and not s.has_nulls():
2012
arr = arr.astype(str, copy=False)
2013
arrays.append(arr)
2014
struct_dtype.append((s.name, arr.dtype, arr.shape[1:]))
2015
2016
out = np.empty(self.height, dtype=struct_dtype)
2017
for idx, c in enumerate(self.columns):
2018
out[c] = arrays[idx]
2019
return out
2020
2021
return self._df.to_numpy(order, writable=writable, allow_copy=allow_copy)
2022
2023
@overload
2024
def to_jax(
2025
self,
2026
return_type: Literal["array"] = ...,
2027
*,
2028
device: jax.Device | str | None = ...,
2029
label: str | Expr | Sequence[str | Expr] | None = ...,
2030
features: str | Expr | Sequence[str | Expr] | None = ...,
2031
dtype: PolarsDataType | None = ...,
2032
order: IndexOrder = ...,
2033
) -> jax.Array: ...
2034
2035
@overload
2036
def to_jax(
2037
self,
2038
return_type: Literal["dict"],
2039
*,
2040
device: jax.Device | str | None = ...,
2041
label: str | Expr | Sequence[str | Expr] | None = ...,
2042
features: str | Expr | Sequence[str | Expr] | None = ...,
2043
dtype: PolarsDataType | None = ...,
2044
order: IndexOrder = ...,
2045
) -> dict[str, jax.Array]: ...
2046
2047
@unstable()
2048
def to_jax(
2049
self,
2050
return_type: JaxExportType = "array",
2051
*,
2052
device: jax.Device | str | None = None,
2053
label: str | Expr | Sequence[str | Expr] | None = None,
2054
features: str | Expr | Sequence[str | Expr] | None = None,
2055
dtype: PolarsDataType | None = None,
2056
order: IndexOrder = "fortran",
2057
) -> jax.Array | dict[str, jax.Array]:
2058
"""
2059
Convert DataFrame to a Jax Array, or dict of Jax Arrays.
2060
2061
.. versionadded:: 0.20.27
2062
2063
.. warning::
2064
This functionality is currently considered **unstable**. It may be
2065
changed at any point without it being considered a breaking change.
2066
2067
Parameters
2068
----------
2069
return_type : {"array", "dict"}
2070
Set return type; a Jax Array, or dict of Jax Arrays.
2071
device
2072
Specify the jax `Device` on which the array will be created; can provide
2073
a string (such as "cpu", "gpu", or "tpu") in which case the device is
2074
retrieved as `jax.devices(string)[0]`. For more specific control you
2075
can supply the instantiated `Device` directly. If None, arrays are
2076
created on the default device.
2077
label
2078
One or more column names, expressions, or selectors that label the feature
2079
data; results in a `{"label": ..., "features": ...}` dict being returned
2080
when `return_type` is "dict" instead of a `{"col": array, }` dict.
2081
features
2082
One or more column names, expressions, or selectors that contain the feature
2083
data; if omitted, all columns that are not designated as part of the label
2084
are used. Only applies when `return_type` is "dict".
2085
dtype
2086
Unify the dtype of all returned arrays; this casts any column that is
2087
not already of the required dtype before converting to Array. Note that
2088
export will be single-precision (32bit) unless the Jax config/environment
2089
directs otherwise (eg: "jax_enable_x64" was set True in the config object
2090
at startup, or "JAX_ENABLE_X64" is set to "1" in the environment).
2091
order : {"c", "fortran"}
2092
The index order of the returned Jax array, either C-like (row-major) or
2093
Fortran-like (column-major).
2094
2095
See Also
2096
--------
2097
to_dummies
2098
to_numpy
2099
to_torch
2100
2101
Examples
2102
--------
2103
>>> df = pl.DataFrame(
2104
... {
2105
... "lbl": [0, 1, 2, 3],
2106
... "feat1": [1, 0, 0, 1],
2107
... "feat2": [1.5, -0.5, 0.0, -2.25],
2108
... }
2109
... )
2110
2111
Standard return type (2D Array), on the standard device:
2112
2113
>>> df.to_jax()
2114
Array([[ 0. , 1. , 1.5 ],
2115
[ 1. , 0. , -0.5 ],
2116
[ 2. , 0. , 0. ],
2117
[ 3. , 1. , -2.25]], dtype=float32)
2118
2119
Create the Array on the default GPU device:
2120
2121
>>> a = df.to_jax(device="gpu") # doctest: +SKIP
2122
>>> a.device() # doctest: +SKIP
2123
GpuDevice(id=0, process_index=0)
2124
2125
Create the Array on a specific GPU device:
2126
2127
>>> gpu_device = jax.devices("gpu")[1] # doctest: +SKIP
2128
>>> a = df.to_jax(device=gpu_device) # doctest: +SKIP
2129
>>> a.device() # doctest: +SKIP
2130
GpuDevice(id=1, process_index=0)
2131
2132
As a dictionary of individual Arrays:
2133
2134
>>> df.to_jax("dict")
2135
{'lbl': Array([0, 1, 2, 3], dtype=int32),
2136
'feat1': Array([1, 0, 0, 1], dtype=int32),
2137
'feat2': Array([ 1.5 , -0.5 , 0. , -2.25], dtype=float32)}
2138
2139
As a "label" and "features" dictionary; note that as "features" is not
2140
declared, it defaults to all the columns that are not in "label":
2141
2142
>>> df.to_jax("dict", label="lbl")
2143
{'label': Array([[0],
2144
[1],
2145
[2],
2146
[3]], dtype=int32),
2147
'features': Array([[ 1. , 1.5 ],
2148
[ 0. , -0.5 ],
2149
[ 0. , 0. ],
2150
[ 1. , -2.25]], dtype=float32)}
2151
2152
As a "label" and "features" dictionary where each is designated using
2153
a col or selector expression (which can also be used to cast the data
2154
if the label and features are better-represented with different dtypes):
2155
2156
>>> import polars.selectors as cs
2157
>>> df.to_jax(
2158
... return_type="dict",
2159
... features=cs.float(),
2160
... label=pl.col("lbl").cast(pl.UInt8),
2161
... )
2162
{'label': Array([[0],
2163
[1],
2164
[2],
2165
[3]], dtype=uint8),
2166
'features': Array([[ 1.5 ],
2167
[-0.5 ],
2168
[ 0. ],
2169
[-2.25]], dtype=float32)}
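The `dtype` parameter casts every column before export (a minimal sketch;
exporting as a 64-bit dtype would additionally require enabling x64 support
in the Jax config/environment):

>>> df.to_jax(dtype=pl.Float32)  # doctest: +SKIP
Array([[ 0.  ,  1.  ,  1.5 ],
       [ 1.  ,  0.  , -0.5 ],
       [ 2.  ,  0.  ,  0.  ],
       [ 3.  ,  1.  , -2.25]], dtype=float32)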
2170
"""
2171
if return_type != "dict" and (label is not None or features is not None):
2172
msg = "`label` and `features` only apply when `return_type` is 'dict'"
2173
raise ValueError(msg)
2174
elif return_type == "dict" and label is None and features is not None:
2175
msg = "`label` is required if setting `features` when `return_type='dict'"
2176
raise ValueError(msg)
2177
2178
jx = import_optional(
2179
"jax",
2180
install_message="Please see `https://jax.readthedocs.io/en/latest/installation.html` "
2181
"for specific installation recommendations for the Jax package",
2182
)
2183
enabled_double_precision = jx.config.jax_enable_x64 or bool(
2184
int(os.environ.get("JAX_ENABLE_X64", "0"))
2185
)
2186
if dtype:
2187
frame = self.cast(dtype)
2188
elif not enabled_double_precision:
2189
# enforce single-precision unless environment/config directs otherwise
2190
frame = self.cast({Float64: Float32, Int64: Int32, UInt64: UInt32})
2191
else:
2192
frame = self
2193
2194
if isinstance(device, str):
2195
device = jx.devices(device)[0]
2196
2197
with contextlib.nullcontext() if device is None else jx.default_device(device):
2198
if return_type == "array":
2199
# note: jax arrays are immutable, so can avoid a copy (vs torch)
2200
from polars.ml.utilities import frame_to_numpy
2201
2202
arr = frame_to_numpy(
2203
df=frame,
2204
order=order,
2205
writable=False,
2206
target="Jax Array",
2207
)
2208
return jx.numpy.asarray(a=arr, order="K")
2209
2210
elif return_type == "dict":
2211
if label is not None:
2212
# return a {"label": array(s), "features": array(s)} dict
2213
label_frame = frame.select(label)
2214
features_frame = (
2215
frame.select(features)
2216
if features is not None
2217
else frame.drop(*label_frame.columns)
2218
)
2219
return {
2220
"label": label_frame.to_jax(),
2221
"features": features_frame.to_jax(),
2222
}
2223
else:
2224
# return a {"col": array} dict
2225
return {srs.name: srs.to_jax() for srs in frame}
2226
else:
2227
valid_jax_types = ", ".join(get_args(JaxExportType))
2228
msg = f"invalid `return_type`: {return_type!r}\nExpected one of: {valid_jax_types}"
2229
raise ValueError(msg)
2230
2231
@overload
2232
def to_torch(
2233
self,
2234
return_type: Literal["tensor"] = ...,
2235
*,
2236
label: str | Expr | Sequence[str | Expr] | None = ...,
2237
features: str | Expr | Sequence[str | Expr] | None = ...,
2238
dtype: PolarsDataType | None = ...,
2239
) -> torch.Tensor: ...
2240
2241
@overload
2242
def to_torch(
2243
self,
2244
return_type: Literal["dataset"],
2245
*,
2246
label: str | Expr | Sequence[str | Expr] | None = ...,
2247
features: str | Expr | Sequence[str | Expr] | None = ...,
2248
dtype: PolarsDataType | None = ...,
2249
) -> PolarsDataset: ...
2250
2251
@overload
2252
def to_torch(
2253
self,
2254
return_type: Literal["dict"],
2255
*,
2256
label: str | Expr | Sequence[str | Expr] | None = ...,
2257
features: str | Expr | Sequence[str | Expr] | None = ...,
2258
dtype: PolarsDataType | None = ...,
2259
) -> dict[str, torch.Tensor]: ...
2260
2261
@unstable()
2262
def to_torch(
2263
self,
2264
return_type: TorchExportType = "tensor",
2265
*,
2266
label: str | Expr | Sequence[str | Expr] | None = None,
2267
features: str | Expr | Sequence[str | Expr] | None = None,
2268
dtype: PolarsDataType | None = None,
2269
) -> torch.Tensor | dict[str, torch.Tensor] | PolarsDataset:
2270
"""
2271
Convert DataFrame to a PyTorch Tensor, Dataset, or dict of Tensors.
2272
2273
.. versionadded:: 0.20.23
2274
2275
.. warning::
2276
This functionality is currently considered **unstable**. It may be
2277
changed at any point without it being considered a breaking change.
2278
2279
Parameters
2280
----------
2281
return_type : {"tensor", "dataset", "dict"}
2282
Set return type; a PyTorch Tensor, PolarsDataset (a frame-specialized
2283
TensorDataset), or dict of Tensors.
2284
label
2285
One or more column names, expressions, or selectors that label the feature
2286
data; when `return_type` is "dataset", the PolarsDataset will return
2287
`(features, label)` tensor tuples for each row. Otherwise, it returns
2288
`(features,)` tensor tuples where the feature contains all the row data.
2289
features
2290
One or more column names, expressions, or selectors that contain the feature
2291
data; if omitted, all columns that are not designated as part of the label
2292
are used.
2293
dtype
2294
Unify the dtype of all returned tensors; this casts any column that is
2295
not of the required dtype before converting to Tensor. This includes
2296
the label column *unless* the label is an expression (such as
2297
`pl.col("label_column").cast(pl.Int16)`).
2298
2299
See Also
2300
--------
2301
to_dummies
2302
to_jax
2303
to_numpy
2304
2305
Examples
2306
--------
2307
>>> df = pl.DataFrame(
2308
... {
2309
... "lbl": [0, 1, 2, 3],
2310
... "feat1": [1, 0, 0, 1],
2311
... "feat2": [1.5, -0.5, 0.0, -2.25],
2312
... }
2313
... )
2314
2315
Standard return type (Tensor), with f32 supertype:
2316
2317
>>> df.to_torch(dtype=pl.Float32)
2318
tensor([[ 0.0000, 1.0000, 1.5000],
2319
[ 1.0000, 0.0000, -0.5000],
2320
[ 2.0000, 0.0000, 0.0000],
2321
[ 3.0000, 1.0000, -2.2500]])
2322
2323
As a dictionary of individual Tensors:
2324
2325
>>> df.to_torch("dict")
2326
{'lbl': tensor([0, 1, 2, 3]),
2327
'feat1': tensor([1, 0, 0, 1]),
2328
'feat2': tensor([ 1.5000, -0.5000, 0.0000, -2.2500], dtype=torch.float64)}
2329
2330
As a "label" and "features" dictionary; note that as "features" is not
2331
declared, it defaults to all the columns that are not in "label":
2332
2333
>>> df.to_torch("dict", label="lbl", dtype=pl.Float32)
2334
{'label': tensor([[0.],
2335
[1.],
2336
[2.],
2337
[3.]]),
2338
'features': tensor([[ 1.0000, 1.5000],
2339
[ 0.0000, -0.5000],
2340
[ 0.0000, 0.0000],
2341
[ 1.0000, -2.2500]])}
2342
2343
As a PolarsDataset, with f64 supertype:
2344
2345
>>> ds = df.to_torch("dataset", dtype=pl.Float64)
2346
>>> ds[3]
2347
(tensor([ 3.0000, 1.0000, -2.2500], dtype=torch.float64),)
2348
>>> ds[:2]
2349
(tensor([[ 0.0000, 1.0000, 1.5000],
2350
[ 1.0000, 0.0000, -0.5000]], dtype=torch.float64),)
2351
>>> ds[[0, 3]]
2352
(tensor([[ 0.0000, 1.0000, 1.5000],
2353
[ 3.0000, 1.0000, -2.2500]], dtype=torch.float64),)
2354
2355
As a convenience the PolarsDataset can opt in to half-precision data
2356
for experimentation (usually this would be set on the model/pipeline):
2357
2358
>>> list(ds.half())
2359
[(tensor([0.0000, 1.0000, 1.5000], dtype=torch.float16),),
2360
(tensor([ 1.0000, 0.0000, -0.5000], dtype=torch.float16),),
2361
(tensor([2., 0., 0.], dtype=torch.float16),),
2362
(tensor([ 3.0000, 1.0000, -2.2500], dtype=torch.float16),)]
2363
2364
Pass PolarsDataset to a DataLoader, designating the label:
2365
2366
>>> from torch.utils.data import DataLoader
2367
>>> ds = df.to_torch("dataset", label="lbl")
2368
>>> dl = DataLoader(ds, batch_size=2)
2369
>>> batches = list(dl)
2370
>>> batches[0]
2371
[tensor([[ 1.0000, 1.5000],
2372
[ 0.0000, -0.5000]], dtype=torch.float64), tensor([0, 1])]
2373
2374
Note that labels can be given as expressions, allowing them to have
2375
a dtype independent of the feature columns (multi-column labels are
2376
supported).
2377
2378
>>> ds = df.to_torch(
2379
... return_type="dataset",
2380
... dtype=pl.Float32,
2381
... label=pl.col("lbl").cast(pl.Int16),
2382
... )
2383
>>> ds[:2]
2384
(tensor([[ 1.0000, 1.5000],
2385
[ 0.0000, -0.5000]]), tensor([0, 1], dtype=torch.int16))
2386
2387
Easily integrate with (for example) scikit-learn and other datasets:
2388
2389
>>> from sklearn.datasets import fetch_california_housing # doctest: +SKIP
2390
>>> housing = fetch_california_housing() # doctest: +SKIP
2391
>>> df = pl.DataFrame(
2392
... data=housing.data,
2393
... schema=housing.feature_names,
2394
... ).with_columns(
2395
... Target=housing.target,
2396
... ) # doctest: +SKIP
2397
>>> train = df.to_torch("dataset", label="Target") # doctest: +SKIP
2398
>>> loader = DataLoader(
2399
... train,
2400
... shuffle=True,
2401
... batch_size=64,
2402
... ) # doctest: +SKIP
2403
"""
2404
if return_type not in ("dataset", "dict") and (
2405
label is not None or features is not None
2406
):
2407
msg = "`label` and `features` only apply when `return_type` is 'dataset' or 'dict'"
2408
raise ValueError(msg)
2409
elif return_type == "dict" and label is None and features is not None:
2410
msg = "`label` is required if setting `features` when `return_type='dict'"
2411
raise ValueError(msg)
2412
2413
torch = import_optional("torch")
2414
2415
# Cast columns.
2416
if dtype in (UInt16, UInt32, UInt64):
2417
msg = f"PyTorch does not support u16, u32, or u64 dtypes; given {dtype}"
2418
raise ValueError(msg)
2419
2420
to_dtype = dtype or {UInt16: Int32, UInt32: Int64, UInt64: Int64}
2421
2422
if label is not None:
2423
label_frame = self.select(label)
2424
# Avoid casting the label if it's an expression.
2425
if not isinstance(label, pl.Expr):
2426
label_frame = label_frame.cast(to_dtype) # type: ignore[arg-type]
2427
features_frame = (
2428
self.select(features)
2429
if features is not None
2430
else self.drop(*label_frame.columns)
2431
).cast(to_dtype) # type: ignore[arg-type]
2432
frame = F.concat([label_frame, features_frame], how="horizontal")
2433
else:
2434
frame = (self.select(features) if features is not None else self).cast(
2435
to_dtype # type: ignore[arg-type]
2436
)
2437
2438
if return_type == "tensor":
2439
# note: torch tensors are not immutable, so we must consider them writable
2440
from polars.ml.utilities import frame_to_numpy
2441
2442
arr = frame_to_numpy(frame, writable=True, target="Tensor")
2443
return torch.from_numpy(arr)
2444
2445
elif return_type == "dict":
2446
if label is not None:
2447
# return a {"label": tensor(s), "features": tensor(s)} dict
2448
return {
2449
"label": label_frame.to_torch(),
2450
"features": features_frame.to_torch(),
2451
}
2452
else:
2453
# return a {"col": tensor} dict
2454
return {srs.name: srs.to_torch() for srs in frame}
2455
2456
elif return_type == "dataset":
2457
# return a torch Dataset object
2458
from polars.ml.torch import PolarsDataset
2459
2460
pds_label = None if label is None else label_frame.columns
2461
return PolarsDataset(frame, label=pds_label, features=features)
2462
else:
2463
valid_torch_types = ", ".join(get_args(TorchExportType))
2464
msg = f"invalid `return_type`: {return_type!r}\nExpected one of: {valid_torch_types}"
2465
raise ValueError(msg)
2466
2467
def to_pandas(
2468
self,
2469
*,
2470
use_pyarrow_extension_array: bool = False,
2471
**kwargs: Any,
2472
) -> pd.DataFrame:
2473
"""
2474
Convert this DataFrame to a pandas DataFrame.
2475
2476
This operation copies data if `use_pyarrow_extension_array` is not enabled.
2477
2478
Parameters
2479
----------
2480
use_pyarrow_extension_array
2481
Use PyArrow-backed extension arrays instead of NumPy arrays for the columns
2482
of the pandas DataFrame. This allows zero copy operations and preservation
2483
of null values. Subsequent operations on the resulting pandas DataFrame may
2484
trigger conversion to NumPy if those operations are not supported by PyArrow
2485
compute functions.
2486
**kwargs
2487
Additional keyword arguments to be passed to
2488
:meth:`pyarrow.Table.to_pandas`.
2489
2490
Returns
2491
-------
2492
:class:`pandas.DataFrame`
2493
2494
Notes
2495
-----
2496
This operation requires that both :mod:`pandas` and :mod:`pyarrow` are
2497
installed.
2498
2499
Examples
2500
--------
2501
>>> df = pl.DataFrame(
2502
... {
2503
... "foo": [1, 2, 3],
2504
... "bar": [6.0, 7.0, 8.0],
2505
... "ham": ["a", "b", "c"],
2506
... }
2507
... )
2508
>>> df.to_pandas()
2509
foo bar ham
2510
0 1 6.0 a
2511
1 2 7.0 b
2512
2 3 8.0 c
2513
2514
Null values in numeric columns are converted to `NaN`.
2515
2516
>>> df = pl.DataFrame(
2517
... {
2518
... "foo": [1, 2, None],
2519
... "bar": [6.0, None, 8.0],
2520
... "ham": [None, "b", "c"],
2521
... }
2522
... )
2523
>>> df.to_pandas()
2524
foo bar ham
2525
0 1.0 6.0 None
2526
1 2.0 NaN b
2527
2 NaN 8.0 c
2528
2529
Pass `use_pyarrow_extension_array=True` to get a pandas DataFrame with columns
2530
backed by PyArrow extension arrays. This will preserve null values.
2531
2532
>>> df.to_pandas(use_pyarrow_extension_array=True)
2533
foo bar ham
2534
0 1 6.0 <NA>
2535
1 2 <NA> b
2536
2 <NA> 8.0 c
2537
>>> _.dtypes
2538
foo int64[pyarrow]
2539
bar double[pyarrow]
2540
ham large_string[pyarrow]
2541
dtype: object
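Additional keyword arguments are forwarded to :meth:`pyarrow.Table.to_pandas`;
for example `split_blocks` (a minimal sketch, assuming pyarrow is installed;
skipped as illustrative only):

>>> df.to_pandas(split_blocks=True)  # doctest: +SKIP
   foo  bar   ham
0  1.0  6.0  None
1  2.0  NaN     b
2  NaN  8.0     c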
2542
"""
2543
if use_pyarrow_extension_array:
2544
if parse_version(pd.__version__) < (1, 5):
2545
msg = f'pandas>=1.5.0 is required for `to_pandas(use_pyarrow_extension_array=True)`, found pandas {pd.__version__!r}'
2546
raise ModuleUpgradeRequiredError(msg)
2547
if not _PYARROW_AVAILABLE or parse_version(pa.__version__) < (8, 0):
2548
msg = "pyarrow>=8.0.0 is required for `to_pandas(use_pyarrow_extension_array=True)`"
2549
if _PYARROW_AVAILABLE:
2550
msg += f", found pyarrow {pa.__version__!r}."
2551
raise ModuleUpgradeRequiredError(msg)
2552
else:
2553
raise ModuleNotFoundError(msg)
2554
2555
# handle Object columns separately (Arrow does not convert them correctly)
2556
if Object in self.dtypes:
2557
return self._to_pandas_with_object_columns(
2558
use_pyarrow_extension_array=use_pyarrow_extension_array, **kwargs
2559
)
2560
2561
return self._to_pandas_without_object_columns(
2562
self, use_pyarrow_extension_array=use_pyarrow_extension_array, **kwargs
2563
)
2564
2565
def _to_pandas_with_object_columns(
2566
self,
2567
*,
2568
use_pyarrow_extension_array: bool,
2569
**kwargs: Any,
2570
) -> pd.DataFrame:
2571
# Find which columns are of type pl.Object, and which aren't:
2572
object_columns = []
2573
not_object_columns = []
2574
for i, dtype in enumerate(self.dtypes):
2575
if dtype.is_object():
2576
object_columns.append(i)
2577
else:
2578
not_object_columns.append(i)
2579
2580
# Export columns that aren't pl.Object, in the same order:
2581
if not_object_columns:
2582
df_without_objects = self[:, not_object_columns]
2583
pandas_df = self._to_pandas_without_object_columns(
2584
df_without_objects,
2585
use_pyarrow_extension_array=use_pyarrow_extension_array,
2586
**kwargs,
2587
)
2588
else:
2589
pandas_df = pd.DataFrame()
2590
2591
# Add columns that are pl.Object, using Series' custom to_pandas()
2592
# logic for this case. We do this in order, so the original index for
2593
# the next column in this dataframe is correct for the partially
2594
# constructed Pandas dataframe, since there are no additional or
2595
# missing columns to the inserted column's left.
2596
for i in object_columns:
2597
name = self.columns[i]
2598
pandas_df.insert(i, name, self.to_series(i).to_pandas())
2599
2600
return pandas_df
2601
2602
def _to_pandas_without_object_columns(
2603
self,
2604
df: DataFrame,
2605
*,
2606
use_pyarrow_extension_array: bool,
2607
**kwargs: Any,
2608
) -> pd.DataFrame:
2609
if not df.width: # Empty dataframe, cannot infer schema from batches
2610
return pd.DataFrame()
2611
2612
record_batches = df._df.to_pandas()
2613
tbl = pa.Table.from_batches(record_batches)
2614
if use_pyarrow_extension_array:
2615
return tbl.to_pandas(
2616
self_destruct=True,
2617
split_blocks=True,
2618
types_mapper=lambda pa_dtype: pd.ArrowDtype(pa_dtype),
2619
**kwargs,
2620
)
2621
2622
date_as_object = kwargs.pop("date_as_object", False)
2623
return tbl.to_pandas(date_as_object=date_as_object, **kwargs)
2624
2625
def to_series(self, index: int = 0) -> Series:
2626
"""
2627
Select column as Series at index location.
2628
2629
Parameters
2630
----------
2631
index
2632
Location of selection.
2633
2634
See Also
2635
--------
2636
get_column
2637
2638
Examples
2639
--------
2640
>>> df = pl.DataFrame(
2641
... {
2642
... "foo": [1, 2, 3],
2643
... "bar": [6, 7, 8],
2644
... "ham": ["a", "b", "c"],
2645
... }
2646
... )
2647
>>> df.to_series(1)
2648
shape: (3,)
2649
Series: 'bar' [i64]
2650
[
2651
6
2652
7
2653
8
2654
]
2655
"""
2656
return wrap_s(self._df.to_series(index))
2657
2658
def to_init_repr(self, n: int = 1000) -> str:
2659
"""
2660
Convert DataFrame to instantiable string representation.
2661
2662
Parameters
2663
----------
2664
n
2665
Only use first n rows.
2666
2667
See Also
2668
--------
2669
polars.Series.to_init_repr
2670
polars.from_repr
2671
2672
Examples
2673
--------
2674
>>> df = pl.DataFrame(
2675
... [
2676
... pl.Series("foo", [1, 2, 3], dtype=pl.UInt8),
2677
... pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32),
2678
... pl.Series("ham", ["a", "b", "c"], dtype=pl.String),
2679
... ]
2680
... )
2681
>>> print(df.to_init_repr())
2682
pl.DataFrame(
2683
[
2684
pl.Series('foo', [1, 2, 3], dtype=pl.UInt8),
2685
pl.Series('bar', [6.0, 7.0, 8.0], dtype=pl.Float32),
2686
pl.Series('ham', ['a', 'b', 'c'], dtype=pl.String),
2687
]
2688
)
2689
2690
>>> df_from_str_repr = eval(df.to_init_repr())
2691
>>> df_from_str_repr
2692
shape: (3, 3)
2693
┌─────┬─────┬─────┐
2694
│ foo ┆ bar ┆ ham │
2695
│ --- ┆ --- ┆ --- │
2696
│ u8 ┆ f32 ┆ str │
2697
╞═════╪═════╪═════╡
2698
│ 1 ┆ 6.0 ┆ a │
2699
│ 2 ┆ 7.0 ┆ b │
2700
│ 3 ┆ 8.0 ┆ c │
2701
└─────┴─────┴─────┘
2702
"""
2703
output = StringIO()
2704
output.write("pl.DataFrame(\n [\n")
2705
2706
for i in range(self.width):
2707
output.write(" ")
2708
output.write(self.to_series(i).to_init_repr(n))
2709
output.write(",\n")
2710
2711
output.write(" ]\n)\n")
2712
2713
return output.getvalue()
2714
2715
@overload
2716
def serialize(
2717
self, file: None = ..., *, format: Literal["binary"] = ...
2718
) -> bytes: ...
2719
2720
@overload
2721
def serialize(self, file: None = ..., *, format: Literal["json"]) -> str: ...
2722
2723
@overload
2724
def serialize(
2725
self, file: IOBase | str | Path, *, format: SerializationFormat = ...
2726
) -> None: ...
2727
2728
def serialize(
2729
self,
2730
file: IOBase | str | Path | None = None,
2731
*,
2732
format: SerializationFormat = "binary",
2733
) -> bytes | str | None:
2734
r"""
2735
Serialize this DataFrame to a file or string in binary or JSON format.
2736
2737
Parameters
2738
----------
2739
file
2740
File path or writable file-like object to which the result will be written.
2741
If set to `None` (default), the output is returned as a string instead.
2742
format
2743
The format in which to serialize. Options:
2744
2745
- `"binary"`: Serialize to binary format (bytes). This is the default.
2746
- `"json"`: Serialize to JSON format (string).
2747
2748
Notes
2749
-----
2750
Serialization is not stable across Polars versions: a DataFrame serialized
2751
in one Polars version may not be deserializable in another Polars version.
2752
2753
Examples
2754
--------
2755
Serialize the DataFrame into a binary representation.
2756
2757
>>> df = pl.DataFrame(
2758
... {
2759
... "foo": [1, 2, 3],
2760
... "bar": [6, 7, 8],
2761
... }
2762
... )
2763
>>> bytes = df.serialize()
2764
>>> type(bytes)
2765
<class 'bytes'>
2766
2767
The bytes can later be deserialized back into a DataFrame.
2768
2769
>>> import io
2770
>>> pl.DataFrame.deserialize(io.BytesIO(bytes))
2771
shape: (3, 2)
2772
┌─────┬─────┐
2773
│ foo ┆ bar │
2774
│ --- ┆ --- │
2775
│ i64 ┆ i64 │
2776
╞═════╪═════╡
2777
│ 1 ┆ 6 │
2778
│ 2 ┆ 7 │
2779
│ 3 ┆ 8 │
2780
└─────┴─────┘
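Serializing to JSON instead returns a human-readable string (a minimal
sketch; the exact JSON layout is not stable across Polars versions):

>>> json_str = df.serialize(format="json")
>>> type(json_str)
<class 'str'>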
2781
"""
2782
if format == "binary":
2783
serializer = self._df.serialize_binary
2784
elif format == "json":
2785
serializer = self._df.serialize_json
2786
else:
2787
msg = f"`format` must be one of {{'binary', 'json'}}, got {format!r}"
2788
raise ValueError(msg)
2789
2790
return serialize_polars_object(serializer, file, format)
2791
2792
@overload
2793
def write_json(self, file: None = ...) -> str: ...
2794
2795
@overload
2796
def write_json(self, file: IOBase | str | Path) -> None: ...
2797
2798
def write_json(self, file: IOBase | str | Path | None = None) -> str | None:
2799
"""
2800
Serialize to JSON representation.
2801
2802
Parameters
2803
----------
2804
file
2805
File path or writable file-like object to which the result will be written.
2806
If set to `None` (default), the output is returned as a string instead.
2807
2808
See Also
2809
--------
2810
DataFrame.write_ndjson
2811
2812
Examples
2813
--------
2814
>>> df = pl.DataFrame(
2815
... {
2816
... "foo": [1, 2, 3],
2817
... "bar": [6, 7, 8],
2818
... }
2819
... )
2820
>>> df.write_json()
2821
'[{"foo":1,"bar":6},{"foo":2,"bar":7},{"foo":3,"bar":8}]'
2822
"""
2823
2824
def write_json_to_string() -> str:
2825
with BytesIO() as buf:
2826
self._df.write_json(buf)
2827
json_bytes = buf.getvalue()
2828
return json_bytes.decode("utf8")
2829
2830
if file is None:
2831
return write_json_to_string()
2832
elif isinstance(file, StringIO):
2833
json_str = write_json_to_string()
2834
file.write(json_str)
2835
return None
2836
elif isinstance(file, (str, Path)):
2837
file = normalize_filepath(file)
2838
self._df.write_json(file)
2839
return None
2840
else:
2841
self._df.write_json(file)
2842
return None
2843
2844
@overload
2845
def write_ndjson(self, file: None = None) -> str: ...
2846
2847
@overload
2848
def write_ndjson(self, file: str | Path | IO[bytes] | IO[str]) -> None: ...
2849
2850
def write_ndjson(
2851
self, file: str | Path | IO[bytes] | IO[str] | None = None
2852
) -> str | None:
2853
r"""
2854
Serialize to newline delimited JSON representation.
2855
2856
Parameters
2857
----------
2858
file
2859
File path or writable file-like object to which the result will be written.
2860
If set to `None` (default), the output is returned as a string instead.
2861
2862
Examples
2863
--------
2864
>>> df = pl.DataFrame(
2865
... {
2866
... "foo": [1, 2, 3],
2867
... "bar": [6, 7, 8],
2868
... }
2869
... )
2870
>>> df.write_ndjson()
2871
'{"foo":1,"bar":6}\n{"foo":2,"bar":7}\n{"foo":3,"bar":8}\n'
2872
"""
2873
should_return_buffer = False
2874
target: str | Path | IO[bytes] | IO[str]
2875
if file is None:
2876
target = cast("IO[bytes]", BytesIO())
2877
should_return_buffer = True
2878
elif isinstance(file, (str, os.PathLike)):
2879
target = normalize_filepath(file)
2880
else:
2881
target = file
2882
2883
engine: EngineType = "in-memory"
2884
2885
from polars.lazyframe.opt_flags import QueryOptFlags
2886
2887
self.lazy().sink_ndjson(
2888
target,
2889
optimizations=QueryOptFlags._eager(),
2890
engine=engine,
2891
)
2892
2893
if should_return_buffer:
2894
return str(target.getvalue(), encoding="utf-8") # type: ignore[union-attr]
2895
2896
return None
2897
2898
@overload
2899
def write_csv(
2900
self,
2901
file: None = None,
2902
*,
2903
include_bom: bool = ...,
2904
include_header: bool = ...,
2905
separator: str = ...,
2906
line_terminator: str = ...,
2907
quote_char: str = ...,
2908
batch_size: int = ...,
2909
datetime_format: str | None = ...,
2910
date_format: str | None = ...,
2911
time_format: str | None = ...,
2912
float_scientific: bool | None = ...,
2913
float_precision: int | None = ...,
2914
decimal_comma: bool = ...,
2915
null_value: str | None = ...,
2916
quote_style: CsvQuoteStyle | None = ...,
2917
storage_options: dict[str, Any] | None = ...,
2918
credential_provider: CredentialProviderFunction | Literal["auto"] | None = ...,
2919
retries: int = ...,
2920
) -> str: ...
2921
2922
@overload
2923
def write_csv(
2924
self,
2925
file: str | Path | IO[str] | IO[bytes],
2926
*,
2927
include_bom: bool = ...,
2928
include_header: bool = ...,
2929
separator: str = ...,
2930
line_terminator: str = ...,
2931
quote_char: str = ...,
2932
batch_size: int = ...,
2933
datetime_format: str | None = ...,
2934
date_format: str | None = ...,
2935
time_format: str | None = ...,
2936
float_scientific: bool | None = ...,
2937
float_precision: int | None = ...,
2938
decimal_comma: bool = ...,
2939
null_value: str | None = ...,
2940
quote_style: CsvQuoteStyle | None = ...,
2941
storage_options: dict[str, Any] | None = ...,
2942
credential_provider: CredentialProviderFunction | Literal["auto"] | None = ...,
2943
retries: int = ...,
2944
) -> None: ...
2945
2946
def write_csv(
2947
self,
2948
file: str | Path | IO[str] | IO[bytes] | None = None,
2949
*,
2950
include_bom: bool = False,
2951
include_header: bool = True,
2952
separator: str = ",",
2953
line_terminator: str = "\n",
2954
quote_char: str = '"',
2955
batch_size: int = 1024,
2956
datetime_format: str | None = None,
2957
date_format: str | None = None,
2958
time_format: str | None = None,
2959
float_scientific: bool | None = None,
2960
float_precision: int | None = None,
2961
decimal_comma: bool = False,
2962
null_value: str | None = None,
2963
quote_style: CsvQuoteStyle | None = None,
2964
storage_options: dict[str, Any] | None = None,
2965
credential_provider: (
2966
CredentialProviderFunction | Literal["auto"] | None
2967
) = "auto",
2968
retries: int = 2,
2969
) -> str | None:
2970
"""
2971
Write to comma-separated values (CSV) file.
2972
2973
Parameters
2974
----------
2975
file
2976
File path or writable file-like object to which the result will be written.
2977
If set to `None` (default), the output is returned as a string instead.
2978
include_bom
2979
Whether to include UTF-8 BOM in the CSV output.
2980
include_header
2981
Whether to include header in the CSV output.
2982
separator
2983
Separate CSV fields with this symbol.
2984
line_terminator
2985
String used to end each row.
2986
quote_char
2987
Byte to use as quoting character.
2988
batch_size
2989
Number of rows that will be processed per thread.
2990
datetime_format
2991
A format string, with the specifiers defined by the
2992
`chrono <https://docs.rs/chrono/latest/chrono/format/strftime/index.html>`_
2993
Rust crate. If no format is specified, the default fractional-second
2994
precision is inferred from the maximum timeunit found in the frame's
2995
Datetime cols (if any).
2996
date_format
2997
A format string, with the specifiers defined by the
2998
`chrono <https://docs.rs/chrono/latest/chrono/format/strftime/index.html>`_
2999
Rust crate.
3000
time_format
3001
A format string, with the specifiers defined by the
3002
`chrono <https://docs.rs/chrono/latest/chrono/format/strftime/index.html>`_
3003
Rust crate.
3004
float_scientific
3005
Whether to use scientific form always (true), never (false), or
3006
automatically (None) for `Float32` and `Float64` datatypes.
3007
float_precision
3008
Number of decimal places to write, applied to both `Float32` and
3009
`Float64` datatypes.
3010
decimal_comma
3011
Use a comma as the decimal separator instead of a point in standard
3012
notation. Floats will be encapsulated in quotes if necessary; set the
3013
field separator to override.
3014
null_value
3015
A string representing null values (defaulting to the empty string).
3016
quote_style : {'necessary', 'always', 'non_numeric', 'never'}
3017
Determines the quoting strategy used.
3018
3019
- necessary (default): This puts quotes around fields only when necessary.
3020
They are necessary when fields contain a quote,
3021
separator or record terminator.
3022
Quotes are also necessary when writing an empty record
3023
(which is indistinguishable from a record with one empty field).
3024
This is the default.
3025
- always: This puts quotes around every field. Always.
3026
- never: This never puts quotes around fields, even if that results in
3027
invalid CSV data (e.g.: by not quoting strings containing the separator).
3028
- non_numeric: This puts quotes around all fields that are non-numeric.
3029
Namely, when writing a field that does not parse as a valid float
3030
or integer, then quotes will be used even if they aren't strictly
3031
necessary.
3032
storage_options
3033
Options that indicate how to connect to a cloud provider.
3034
3035
The cloud providers currently supported are AWS, GCP, and Azure.
3036
See supported keys here:
3037
3038
* `aws <https://docs.rs/object_store/latest/object_store/aws/enum.AmazonS3ConfigKey.html>`_
3039
* `gcp <https://docs.rs/object_store/latest/object_store/gcp/enum.GoogleConfigKey.html>`_
3040
* `azure <https://docs.rs/object_store/latest/object_store/azure/enum.AzureConfigKey.html>`_
3041
* Hugging Face (`hf://`): Accepts an API key under the `token` parameter: \
3042
`{'token': '...'}`, or by setting the `HF_TOKEN` environment variable.
3043
3044
If `storage_options` is not provided, Polars will try to infer the
3045
information from environment variables.
3046
credential_provider
3047
Provide a function that can be called to provide cloud storage
3048
credentials. The function is expected to return a dictionary of
3049
credential keys along with an optional credential expiry time.
3050
3051
.. warning::
3052
This functionality is considered **unstable**. It may be changed
3053
at any point without it being considered a breaking change.
3054
retries
3055
Number of retries if accessing a cloud instance fails.
3056
3057
Examples
3058
--------
3059
>>> import pathlib
3060
>>>
3061
>>> df = pl.DataFrame(
3062
... {
3063
... "foo": [1, 2, 3, 4, 5],
3064
... "bar": [6, 7, 8, 9, 10],
3065
... "ham": ["a", "b", "c", "d", "e"],
3066
... }
3067
... )
3068
>>> path: pathlib.Path = dirpath / "new_file.csv"
3069
>>> df.write_csv(path, separator=",")
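Passing `file=None` returns the CSV data as a string instead (a minimal
sketch; parameters such as `null_value` apply in exactly the same way):

>>> print(df.write_csv(file=None))  # doctest: +SKIP
foo,bar,ham
1,6,a
2,7,b
3,8,c
4,9,d
5,10,e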
3070
"""
3071
from polars.io.csv._utils import _check_arg_is_1byte
3072
3073
_check_arg_is_1byte("separator", separator, can_be_empty=False)
3074
_check_arg_is_1byte("quote_char", quote_char, can_be_empty=True)
3075
if not null_value:
3076
null_value = None
3077
3078
should_return_buffer = False
3079
target: str | Path | IO[bytes] | IO[str]
3080
if file is None:
3081
target = cast("IO[bytes]", BytesIO())
3082
should_return_buffer = True
3083
elif isinstance(file, (str, os.PathLike)):
3084
target = normalize_filepath(file)
3085
else:
3086
target = file
3087
3088
engine: EngineType = "in-memory"
3089
3090
from polars.lazyframe.opt_flags import QueryOptFlags
3091
3092
self.lazy().sink_csv(
3093
target,
3094
include_bom=include_bom,
3095
include_header=include_header,
3096
separator=separator,
3097
line_terminator=line_terminator,
3098
quote_char=quote_char,
3099
batch_size=batch_size,
3100
datetime_format=datetime_format,
3101
date_format=date_format,
3102
time_format=time_format,
3103
float_scientific=float_scientific,
3104
float_precision=float_precision,
3105
decimal_comma=decimal_comma,
3106
null_value=null_value,
3107
quote_style=quote_style,
3108
storage_options=storage_options,
3109
credential_provider=credential_provider,
3110
retries=retries,
3111
optimizations=QueryOptFlags._eager(),
3112
engine=engine,
3113
)
3114
3115
if should_return_buffer:
3116
return str(target.getvalue(), encoding="utf-8") # type: ignore[union-attr]
3117
3118
return None
3119
3120
def write_clipboard(self, *, separator: str = "\t", **kwargs: Any) -> None:
3121
"""
3122
Copy `DataFrame` in CSV format to the system clipboard with `write_csv`.
3123
3124
Useful for pasting into Excel or other similar spreadsheet software.
3125
3126
Parameters
3127
----------
3128
separator
3129
Separate CSV fields with this symbol.
3130
kwargs
3131
Additional arguments to pass to `write_csv`.
3132
3133
See Also
3134
--------
3135
polars.read_clipboard: Read a DataFrame from the clipboard.
3136
write_csv: Write to comma-separated values (CSV) file.
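Examples
--------
Copy a small frame to the clipboard (a minimal sketch; skipped because it
requires clipboard access on the host system):

>>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": ["a", "b", "c"]})
>>> df.write_clipboard()  # doctest: +SKIP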
3137
"""
3138
result: str = self.write_csv(file=None, separator=separator, **kwargs)
3139
_write_clipboard_string(result)
3140
3141
def write_avro(
3142
self,
3143
file: str | Path | IO[bytes],
3144
compression: AvroCompression = "uncompressed",
3145
name: str = "",
3146
) -> None:
3147
"""
3148
Write to Apache Avro file.
3149
3150
Parameters
3151
----------
3152
file
3153
File path or writable file-like object to which the data will be written.
3154
compression : {'uncompressed', 'snappy', 'deflate'}
3155
Compression method. Defaults to "uncompressed".
3156
name
3157
Schema name. Defaults to empty string.
3158
3159
Examples
3160
--------
3161
>>> import pathlib
3162
>>>
3163
>>> df = pl.DataFrame(
3164
... {
3165
... "foo": [1, 2, 3, 4, 5],
3166
... "bar": [6, 7, 8, 9, 10],
3167
... "ham": ["a", "b", "c", "d", "e"],
3168
... }
3169
... )
3170
>>> path: pathlib.Path = dirpath / "new_file.avro"
3171
>>> df.write_avro(path)
3172
"""
3173
if compression is None:
3174
compression = "uncompressed"
3175
if isinstance(file, (str, Path)):
3176
file = normalize_filepath(file)
3177
if name is None:
3178
name = ""
3179
3180
self._df.write_avro(file, compression, name)
3181
3182
def write_excel(
3183
self,
3184
workbook: str | Workbook | IO[bytes] | Path | None = None,
3185
worksheet: str | Worksheet | None = None,
3186
*,
3187
position: tuple[int, int] | str = "A1",
3188
table_style: str | dict[str, Any] | None = None,
3189
table_name: str | None = None,
3190
column_formats: ColumnFormatDict | None = None,
3191
dtype_formats: dict[OneOrMoreDataTypes, str] | None = None,
3192
conditional_formats: ConditionalFormatDict | None = None,
3193
header_format: dict[str, Any] | None = None,
3194
column_totals: ColumnTotalsDefinition | None = None,
3195
column_widths: ColumnWidthsDefinition | None = None,
3196
row_totals: RowTotalsDefinition | None = None,
3197
row_heights: dict[int | tuple[int, ...], int] | int | None = None,
3198
sparklines: dict[str, Sequence[str] | dict[str, Any]] | None = None,
3199
formulas: dict[str, str | dict[str, str]] | None = None,
3200
float_precision: int = 3,
3201
include_header: bool = True,
3202
autofilter: bool = True,
3203
autofit: bool = False,
3204
hidden_columns: Sequence[str] | SelectorType | None = None,
3205
hide_gridlines: bool = False,
3206
sheet_zoom: int | None = None,
3207
freeze_panes: (
3208
str
3209
| tuple[int, int]
3210
| tuple[str, int, int]
3211
| tuple[int, int, int, int]
3212
| None
3213
) = None,
3214
) -> Workbook:
3215
"""
3216
Write frame data to a table in an Excel workbook/worksheet.
3217
3218
Parameters
3219
----------
3220
workbook : {str, Workbook}
3221
String name or path of the workbook to create, BytesIO object, file opened
3222
in binary-mode, or an `xlsxwriter.Workbook` object that has not been closed.
3223
If None, writes to a `dataframe.xlsx` workbook in the working directory.
3224
worksheet : {str, Worksheet}
3225
Name of target worksheet or an `xlsxwriter.Worksheet` object (in which
3226
case `workbook` must be the parent `xlsxwriter.Workbook` object); if None,
3227
writes to "Sheet1" when creating a new workbook (note that writing to an
3228
existing workbook requires a valid existing -or new- worksheet name).
3229
position : {str, tuple}
3230
Table position in Excel notation (eg: "A1"), or a (row,col) integer tuple.
3231
table_style : {str, dict}
3232
A named Excel table style, such as "Table Style Medium 4", or a dictionary
3233
of `{"key":value,}` options containing one or more of the following keys:
3234
"style", "first_column", "last_column", "banded_columns, "banded_rows".
3235
table_name : str
3236
Name of the output table object in the worksheet; can then be referred to
3237
in the sheet by formulae/charts, or by subsequent `xlsxwriter` operations.
3238
column_formats : dict
3239
A `{colname(s):str,}` or `{selector:str,}` dictionary for applying an
3240
Excel format string to the given columns. Formats defined here (such as
3241
"dd/mm/yyyy", "0.00%", etc) will override any defined in `dtype_formats`.
3242
dtype_formats : dict
3243
A `{dtype:str,}` dictionary that sets the default Excel format for the
3244
given dtype. (This can be overridden on a per-column basis by the
3245
`column_formats` param).
3246
conditional_formats : dict
3247
A dictionary of colname (or selector) keys to a format str, dict, or list
3248
that defines conditional formatting options for the specified columns.
3249
3250
* If supplying a string typename, should be one of the valid `xlsxwriter`
3251
types such as "3_color_scale", "data_bar", etc.
3252
* If supplying a dictionary you can make use of any/all `xlsxwriter`
3253
supported options, including icon sets, formulae, etc.
3254
* Supplying multiple columns as a tuple/key will apply a single format
3255
across all columns - this is effective in creating a heatmap, as the
3256
min/max values will be determined across the entire range, not per-column.
3257
* Finally, you can also supply a list made up from the above options
3258
in order to apply *more* than one conditional format to the same range.
3259
header_format : dict
3260
A `{key:value,}` dictionary of `xlsxwriter` format options to apply
3261
to the table header row, such as `{"bold":True, "font_color":"#702963"}`.
3262
column_totals : {bool, list, dict}
3263
Add a column-total row to the exported table.
3264
3265
* If True, all numeric columns will have an associated total using "sum".
3266
* If passing a string, it must be one of the valid total function names
3267
and all numeric columns will have an associated total using that function.
3268
* If passing a list of colnames, only those given will have a total.
3269
* For more control, pass a `{colname:funcname,}` dict.
3270
3271
Valid column-total function names are "average", "count_nums", "count",
3272
"max", "min", "std_dev", "sum", and "var".
3273
column_widths : {dict, int}
3274
A `{colname:int,}` or `{selector:int,}` dict or a single integer that
3275
sets (or overrides if autofitting) table column widths, in integer pixel
3276
units. If given as an integer the same value is used for all table columns.
3277
row_totals : {dict, list, bool}
3278
Add a row-total column to the right-hand side of the exported table.
3279
3280
* If True, a column called "total" will be added at the end of the table
3281
that applies a "sum" function row-wise across all numeric columns.
3282
* If passing a list/sequence of column names, only the matching columns
3283
will participate in the sum.
3284
* Can also pass a `{colname:columns,}` dictionary to create one or
3285
more total columns with distinct names, referencing different columns.
3286
row_heights : {dict, int}
3287
An int or `{row_index:int,}` dictionary that sets the height of the given
3288
rows (if providing a dictionary) or all rows (if providing an integer) that
3289
intersect with the table body (including any header and total row) in
3290
integer pixel units. Note that `row_index` starts at zero and will be
3291
the header row (unless `include_header` is False).
3292
sparklines : dict
3293
A `{colname:list,}` or `{colname:dict,}` dictionary defining one or more
3294
sparklines to be written into a new column in the table.
3295
3296
* If passing a list of colnames (used as the source of the sparkline data)
3297
the default sparkline settings are used (eg: line chart with no markers).
3298
* For more control an `xlsxwriter`-compliant options dict can be supplied,
3299
in which case three additional polars-specific keys are available:
3300
"columns", "insert_before", and "insert_after". These allow you to define
3301
the source columns and position the sparkline(s) with respect to other
3302
table columns. If no position directive is given, sparklines are added to
3303
the end of the table (eg: to the far right) in the order they are given.
3304
formulas : dict
3305
A `{colname:formula,}` or `{colname:dict,}` dictionary defining one or
3306
more formulas to be written into a new column in the table. Note that you
3307
are strongly advised to use structured references in your formulae wherever
3308
possible to make it simple to reference columns by name.
3309
3310
* If providing a string formula (such as "=[@colx]*[@coly]") the column will
3311
be added to the end of the table (eg: to the far right), after any default
3312
sparklines and before any row_totals.
3313
* For the most control supply an options dictionary with the following keys:
3314
"formula" (mandatory), one of "insert_before" or "insert_after", and
3315
optionally "return_dtype". The latter is used to appropriately format the
3316
output of the formula and allow it to participate in row/column totals.
3317
float_precision : int
3318
Default number of decimals displayed for floating point columns (note that
3319
this is purely a formatting directive; the actual values are not rounded).
3320
include_header : bool
3321
Indicate if the table should be created with a header row.
3322
autofilter : bool
3323
If the table has headers, provide autofilter capability.
3324
autofit : bool
3325
Calculate individual column widths from the data.
3326
hidden_columns : str | list
3327
A column name, list of column names, or a selector representing table
3328
columns to mark as hidden in the output worksheet.
3329
hide_gridlines : bool
3330
Do not display any gridlines on the output worksheet.
3331
sheet_zoom : int
3332
Set the default zoom level of the output worksheet.
3333
freeze_panes : str | (str, int, int) | (int, int) | (int, int, int, int)
3334
Freeze workbook panes.
3335
3336
* If (row, col) is supplied, panes are split at the top-left corner of the
3337
specified cell, which are 0-indexed. Thus, to freeze only the top row,
3338
supply (1, 0).
3339
* Alternatively, cell notation can be used to supply the cell. For example,
3340
"A2" indicates the split occurs at the top-left of cell A2, which is the
3341
equivalent of (1, 0).
3342
* If (row, col, top_row, top_col) are supplied, the panes are split based on
3343
the `row` and `col`, and the scrolling region is initialized to begin at
3344
the `top_row` and `top_col`. Thus, to freeze only the top row and have the
3345
scrolling region begin at row 10, column D (5th col), supply (1, 0, 9, 4).
3346
Using cell notation for (row, col), supplying ("A2", 9, 4) is equivalent.
3347
3348
Notes
3349
-----
3350
* A list of compatible `xlsxwriter` format property names can be found here:
3351
https://xlsxwriter.readthedocs.io/format.html#format-methods-and-format-properties
3352
3353
* Conditional formatting dictionaries should provide xlsxwriter-compatible
3354
definitions; polars will take care of how they are applied on the worksheet
3355
with respect to the relative sheet/column position. For supported options,
3356
see: https://xlsxwriter.readthedocs.io/working_with_conditional_formats.html
3357
3358
* Similarly, sparkline option dictionaries should contain xlsxwriter-compatible
3359
key/values, as well as a mandatory polars "columns" key that defines the
3360
sparkline source data; these source columns should all be adjacent. Two other
3361
polars-specific keys are available to help define where the sparkline appears
3362
in the table: "insert_after", and "insert_before". The value associated with
3363
these keys should be the name of a column in the exported table.
3364
https://xlsxwriter.readthedocs.io/working_with_sparklines.html
3365
3366
* Formula dictionaries *must* contain a key called "formula", and then optional
3367
"insert_after", "insert_before", and/or "return_dtype" keys. These additional
3368
keys allow the column to be injected into the table at a specific location,
3369
and/or to define the return type of the formula (eg: "Int64", "Float64", etc).
3370
Formulas that refer to table columns should use Excel's structured references
3371
syntax to ensure the formula is applied correctly and is table-relative.
3372
https://support.microsoft.com/en-us/office/using-structured-references-with-excel-tables-f5ed2452-2337-4f71-bed3-c8ae6d2b276e
3373
3374
Examples
3375
--------
3376
Instantiate a basic DataFrame:
3377
3378
>>> from random import uniform
3379
>>> from datetime import date
3380
>>>
3381
>>> df = pl.DataFrame(
3382
... {
3383
... "dtm": [date(2023, 1, 1), date(2023, 1, 2), date(2023, 1, 3)],
3384
... "num": [uniform(-500, 500), uniform(-500, 500), uniform(-500, 500)],
3385
... "val": [10_000, 20_000, 30_000],
3386
... }
3387
... )
3388
3389
Export to "dataframe.xlsx" (the default workbook name, if not specified) in the
3390
working directory, add column totals ("sum" by default) on all numeric columns,
3391
then autofit:
3392
3393
>>> df.write_excel(column_totals=True, autofit=True) # doctest: +SKIP
3394
3395
Write frame to a specific location on the sheet, set a named table style,
3396
apply US-style date formatting, increase default float precision, apply a
3397
non-default total function to a single column, autofit:
3398
3399
>>> df.write_excel( # doctest: +SKIP
3400
... position="B4",
3401
... table_style="Table Style Light 16",
3402
... dtype_formats={pl.Date: "mm/dd/yyyy"},
3403
... column_totals={"num": "average"},
3404
... float_precision=6,
3405
... autofit=True,
3406
... )
3407
3408
Write the same frame to a named worksheet twice, applying different styles
3409
and conditional formatting to each table, adding table titles using explicit
3410
xlsxwriter integration:
3411
3412
>>> from xlsxwriter import Workbook
3413
>>> with Workbook("multi_frame.xlsx") as wb: # doctest: +SKIP
3414
... # basic/default conditional formatting
3415
... df.write_excel(
3416
... workbook=wb,
3417
... worksheet="data",
3418
... position=(3, 1), # specify position as (row,col) coordinates
3419
... conditional_formats={"num": "3_color_scale", "val": "data_bar"},
3420
... table_style="Table Style Medium 4",
3421
... )
3422
...
3423
... # advanced conditional formatting, custom styles
3424
... df.write_excel(
3425
... workbook=wb,
3426
... worksheet="data",
3427
... position=(df.height + 7, 1),
3428
... table_style={
3429
... "style": "Table Style Light 4",
3430
... "first_column": True,
3431
... },
3432
... conditional_formats={
3433
... "num": {
3434
... "type": "3_color_scale",
3435
... "min_color": "#76933c",
3436
... "mid_color": "#c4d79b",
3437
... "max_color": "#ebf1de",
3438
... },
3439
... "val": {
3440
... "type": "data_bar",
3441
... "data_bar_2010": True,
3442
... "bar_color": "#9bbb59",
3443
... "bar_negative_color_same": True,
3444
... "bar_negative_border_color_same": True,
3445
... },
3446
... },
3447
... column_formats={"num": "#,##0.000;[White]-#,##0.000"},
3448
... column_widths={"val": 125},
3449
... autofit=True,
3450
... )
3451
...
3452
... # add some table titles (with a custom format)
3453
... ws = wb.get_worksheet_by_name("data")
3454
... fmt_title = wb.add_format(
3455
... {
3456
... "font_color": "#4f6228",
3457
... "font_size": 12,
3458
... "italic": True,
3459
... "bold": True,
3460
... }
3461
... )
3462
... ws.write(2, 1, "Basic/default conditional formatting", fmt_title)
3463
... ws.write(
3464
... df.height + 6, 1, "Customised conditional formatting", fmt_title
3465
... )
3466
3467
Export a table containing two different types of sparklines. Use default
3468
options for the "trend" sparkline and customized options (and positioning)
3469
for the "+/-" win_loss sparkline, with non-default integer dtype formatting,
3470
column totals, a subtle two-tone heatmap and hidden worksheet gridlines:
3471
3472
>>> df = pl.DataFrame(
3473
... {
3474
... "id": ["aaa", "bbb", "ccc", "ddd", "eee"],
3475
... "q1": [100, 55, -20, 0, 35],
3476
... "q2": [30, -10, 15, 60, 20],
3477
... "q3": [-50, 0, 40, 80, 80],
3478
... "q4": [75, 55, 25, -10, -55],
3479
... }
3480
... )
3481
>>> df.write_excel( # doctest: +SKIP
3482
... table_style="Table Style Light 2",
3483
... # apply accounting format to all flavours of integer
3484
... dtype_formats={dt: "#,##0_);(#,##0)" for dt in [pl.Int32, pl.Int64]},
3485
... sparklines={
3486
... # default options; just provide source cols
3487
... "trend": ["q1", "q2", "q3", "q4"],
3488
... # customized sparkline type, with positioning directive
3489
... "+/-": {
3490
... "columns": ["q1", "q2", "q3", "q4"],
3491
... "insert_after": "id",
3492
... "type": "win_loss",
3493
... },
3494
... },
3495
... conditional_formats={
3496
... # create a unified multi-column heatmap
3497
... ("q1", "q2", "q3", "q4"): {
3498
... "type": "2_color_scale",
3499
... "min_color": "#95b3d7",
3500
... "max_color": "#ffffff",
3501
... },
3502
... },
3503
... column_totals=["q1", "q2", "q3", "q4"],
3504
... row_totals=True,
3505
... hide_gridlines=True,
3506
... )
3507
3508
Export a table containing an Excel formula-based column that calculates a
3509
standardised Z-score, showing use of structured references in conjunction
3510
with positioning directives, column totals, and custom formatting.
3511
3512
>>> df = pl.DataFrame(
3513
... {
3514
... "id": ["a123", "b345", "c567", "d789", "e101"],
3515
... "points": [99, 45, 50, 85, 35],
3516
... }
3517
... )
3518
>>> df.write_excel( # doctest: +SKIP
3519
... table_style={
3520
... "style": "Table Style Medium 15",
3521
... "first_column": True,
3522
... },
3523
... column_formats={
3524
... "id": {"font": "Consolas"},
3525
... "points": {"align": "center"},
3526
... "z-score": {"align": "center"},
3527
... },
3528
... column_totals="average",
3529
... formulas={
3530
... "z-score": {
3531
... # use structured references to refer to the table columns and 'totals' row
3532
... "formula": "=STANDARDIZE([@points], [[#Totals],[points]], STDEV([points]))",
3533
... "insert_after": "points",
3534
... "return_dtype": pl.Float64,
3535
... }
3536
... },
3537
... hide_gridlines=True,
3538
... sheet_zoom=125,
3539
... )
3540
3541
Create and reference a Worksheet object directly, adding a basic chart.
3542
Taking advantage of structured references to set chart series values and
3543
categories is strongly recommended so that you do not have to calculate
3544
cell positions with respect to the frame data and worksheet:
3545
3546
>>> with Workbook("basic_chart.xlsx") as wb: # doctest: +SKIP
3547
... # create worksheet object and write frame data to it
3548
... ws = wb.add_worksheet("demo")
3549
... df.write_excel(
3550
... workbook=wb,
3551
... worksheet=ws,
3552
... table_name="DataTable",
3553
... table_style="Table Style Medium 26",
3554
... hide_gridlines=True,
3555
... )
3556
... # create chart object, point to the written table
3557
... # data using structured references, and style it
3558
... chart = wb.add_chart({"type": "column"})
3559
... chart.set_title({"name": "Example Chart"})
3560
... chart.set_legend({"none": True})
3561
... chart.set_style(38)
3562
... chart.add_series(
3563
... { # note the use of structured references
3564
... "values": "=DataTable[points]",
3565
... "categories": "=DataTable[id]",
3566
... "data_labels": {"value": True},
3567
... }
3568
... )
3569
... # add chart to the worksheet
3570
... ws.insert_chart("D1", chart)
3571
""" # noqa: W505
3572
from polars.io.spreadsheet._write_utils import (
3573
_unpack_multi_column_dict,
3574
_xl_apply_conditional_formats,
3575
_xl_inject_sparklines,
3576
_xl_setup_table_columns,
3577
_xl_setup_table_options,
3578
_xl_setup_workbook,
3579
_xl_unique_table_name,
3580
_XLFormatCache,
3581
)
3582
3583
xlsxwriter = import_optional("xlsxwriter", err_prefix="Excel export requires")
3584
from xlsxwriter.utility import xl_cell_to_rowcol
3585
3586
# setup workbook/worksheet
3587
wb, ws, can_close = _xl_setup_workbook(workbook, worksheet)
3588
df, is_empty = self, self.is_empty()
3589
3590
# note: `_xl_setup_table_columns` converts nested data (List, Struct, etc.) to
3591
# string, so we keep a reference to the original so that column selection with
3592
# selectors that target such types remains correct
3593
df_original = df
3594
3595
# setup table format/columns
3596
fmt_cache = _XLFormatCache(wb)
3597
column_formats = column_formats or {}
3598
table_style, table_options = _xl_setup_table_options(table_style)
3599
table_name = table_name or _xl_unique_table_name(wb)
3600
table_columns, column_formats, df = _xl_setup_table_columns( # type: ignore[assignment]
3601
df=df,
3602
format_cache=fmt_cache,
3603
column_formats=column_formats,
3604
column_totals=column_totals,
3605
dtype_formats=dtype_formats,
3606
header_format=header_format,
3607
float_precision=float_precision,
3608
table_style=table_style,
3609
row_totals=row_totals,
3610
sparklines=sparklines,
3611
formulas=formulas,
3612
)
3613
3614
# normalise cell refs (eg: "B3" => (2,1)) and establish table start/finish,
3615
# accounting for potential presence/absence of headers and a totals row.
3616
table_start = (
3617
xl_cell_to_rowcol(position) if isinstance(position, str) else position
3618
)
3619
table_finish = (
3620
table_start[0]
3621
+ df.height
3622
+ int(is_empty)
3623
- int(not include_header)
3624
+ int(bool(column_totals)),
3625
table_start[1] + df.width - 1,
3626
)
3627
3628
excel_max_valid_rows = 1048575
3629
excel_max_valid_cols = 16384
3630
3631
if (
3632
table_finish[0] > excel_max_valid_rows
3633
or table_finish[1] > excel_max_valid_cols
3634
):
3635
msg = f"writing {df.height}x{df.width} frame at {position!r} does not fit worksheet dimensions of {excel_max_valid_rows} rows and {excel_max_valid_cols} columns"
3636
raise InvalidOperationError(msg)
3637
3638
# write table structure and formats into the target sheet
3639
if not is_empty or include_header:
3640
ws.add_table(
3641
*table_start,
3642
*table_finish,
3643
{
3644
"data": df.rows(),
3645
"style": table_style,
3646
"columns": table_columns,
3647
"header_row": include_header,
3648
"autofilter": autofilter,
3649
"total_row": bool(column_totals) and not is_empty,
3650
"name": table_name,
3651
**table_options,
3652
},
3653
)
3654
3655
# apply conditional formats
3656
if conditional_formats:
3657
_xl_apply_conditional_formats(
3658
df=df,
3659
ws=ws,
3660
conditional_formats=conditional_formats,
3661
table_start=table_start,
3662
include_header=include_header,
3663
format_cache=fmt_cache,
3664
)
3665
3666
# additional column-level properties
3667
if hidden_columns is None:
3668
hidden = set()
3669
elif isinstance(hidden_columns, str):
3670
hidden = {hidden_columns}
3671
else:
3672
hidden = set(_expand_selectors(df_original, hidden_columns))
3673
3674
# Autofit section needs to be present above column_widths section
3675
# to ensure that parameters provided in the column_widths section
3676
# are not overwritten by autofit
3677
#
3678
# table/rows all written; apply (optional) autofit
3679
if autofit and not is_empty:
3680
xlv = xlsxwriter.__version__
3681
if parse_version(xlv) < (3, 0, 8):
3682
msg = f"`autofit=True` requires xlsxwriter 3.0.8 or higher, found {xlv}"
3683
raise ModuleUpgradeRequiredError(msg)
3684
ws.autofit()
3685
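# normalise `column_widths`: an integer applies the same width to every
# column, while dict keys (which may be selectors or multi-column tuples)
# are expanded and unpacked below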
3686
if isinstance(column_widths, int):
3687
column_widths = dict.fromkeys(df.columns, column_widths)
3688
else:
3689
column_widths = _expand_selector_dicts( # type: ignore[assignment]
3690
df_original, column_widths, expand_keys=True, expand_values=False
3691
)
3692
column_widths = _unpack_multi_column_dict(column_widths or {}) # type: ignore[assignment]
3693
3694
for column in df.columns:
3695
options = {"hidden": True} if column in hidden else {}
3696
col_idx = table_start[1] + df.get_column_index(column)
3697
if column in column_widths: # type: ignore[operator]
3698
ws.set_column_pixels(
3699
col_idx,
3700
col_idx,
3701
column_widths[column], # type: ignore[index]
3702
None,
3703
options,
3704
)
3705
elif options:
3706
ws.set_column(col_idx, col_idx, None, None, options)
3707
3708
# finally, inject any sparklines into the table
3709
for column, params in (sparklines or {}).items():
3710
_xl_inject_sparklines(
3711
ws,
3712
df,
3713
table_start,
3714
column,
3715
include_header=include_header,
3716
params=params,
3717
)
3718
3719
# worksheet options
3720
if hide_gridlines:
3721
ws.hide_gridlines(2)
3722
if sheet_zoom:
3723
ws.set_zoom(sheet_zoom)
3724
if row_heights:
3725
if isinstance(row_heights, int):
3726
for idx in range(table_start[0], table_finish[0] + 1):
3727
ws.set_row_pixels(idx, row_heights)
3728
elif isinstance(row_heights, dict):
3729
for idx, height in _unpack_multi_column_dict(row_heights).items(): # type: ignore[assignment]
3730
ws.set_row_pixels(idx, height)
3731
3732
if freeze_panes:
3733
if isinstance(freeze_panes, str):
3734
ws.freeze_panes(freeze_panes)
3735
else:
3736
ws.freeze_panes(*freeze_panes)
3737
3738
if can_close:
3739
wb.close()
3740
return wb
3741
3742
@overload
3743
def write_ipc(
3744
self,
3745
file: None,
3746
*,
3747
compression: IpcCompression = "uncompressed",
3748
compat_level: CompatLevel | None = None,
3749
storage_options: dict[str, Any] | None = None,
3750
credential_provider: (
3751
CredentialProviderFunction | Literal["auto"] | None
3752
) = "auto",
3753
retries: int = 2,
3754
) -> BytesIO: ...
3755
3756
@overload
3757
def write_ipc(
3758
self,
3759
file: str | Path | IO[bytes],
3760
*,
3761
compression: IpcCompression = "uncompressed",
3762
compat_level: CompatLevel | None = None,
3763
storage_options: dict[str, Any] | None = None,
3764
credential_provider: (
3765
CredentialProviderFunction | Literal["auto"] | None
3766
) = "auto",
3767
retries: int = 2,
3768
) -> None: ...
3769
3770
@deprecate_renamed_parameter("future", "compat_level", version="1.1")
3771
def write_ipc(
3772
self,
3773
file: str | Path | IO[bytes] | None,
3774
*,
3775
compression: IpcCompression = "uncompressed",
3776
compat_level: CompatLevel | None = None,
3777
storage_options: dict[str, Any] | None = None,
3778
credential_provider: (
3779
CredentialProviderFunction | Literal["auto"] | None
3780
) = "auto",
3781
retries: int = 2,
3782
) -> BytesIO | None:
3783
"""
3784
Write to Arrow IPC binary stream or Feather file.
3785
3786
See "File or Random Access format" in https://arrow.apache.org/docs/python/ipc.html.
3787
3788
.. versionchanged:: 1.1
3789
The `future` parameter was renamed `compat_level`.
3790
3791
Parameters
3792
----------
3793
file
3794
Path or writable file-like object to which the IPC data will be
3795
written. If set to `None`, the output is returned as a BytesIO object.
3796
compression : {'uncompressed', 'lz4', 'zstd'}
3797
Compression method. Defaults to "uncompressed".
3798
compat_level
3799
Use a specific compatibility level
3800
when exporting Polars' internal data structures.
3801
storage_options
3802
Options that indicate how to connect to a cloud provider.
3803
3804
The cloud providers currently supported are AWS, GCP, and Azure.
3805
See supported keys here:
3806
3807
* `aws <https://docs.rs/object_store/latest/object_store/aws/enum.AmazonS3ConfigKey.html>`_
3808
* `gcp <https://docs.rs/object_store/latest/object_store/gcp/enum.GoogleConfigKey.html>`_
3809
* `azure <https://docs.rs/object_store/latest/object_store/azure/enum.AzureConfigKey.html>`_
3810
* Hugging Face (`hf://`): Accepts an API key under the `token` parameter: \
3811
`{'token': '...'}`, or by setting the `HF_TOKEN` environment variable.
3812
3813
If `storage_options` is not provided, Polars will try to infer the
3814
information from environment variables.
3815
credential_provider
3816
Provide a function that can be called to provide cloud storage
3817
credentials. The function is expected to return a dictionary of
3818
credential keys along with an optional credential expiry time.
3819
3820
.. warning::
3821
This functionality is considered **unstable**. It may be changed
3822
at any point without it being considered a breaking change.
3823
retries
3824
Number of retries if accessing a cloud instance fails.
3825
3826
Examples
3827
--------
3828
>>> import pathlib
3829
>>>
3830
>>> df = pl.DataFrame(
3831
... {
3832
... "foo": [1, 2, 3, 4, 5],
3833
... "bar": [6, 7, 8, 9, 10],
3834
... "ham": ["a", "b", "c", "d", "e"],
3835
... }
3836
... )
3837
>>> path: pathlib.Path = dirpath / "new_file.arrow"
3838
>>> df.write_ipc(path)
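
If `file` is None the frame is written to an in-memory buffer instead; a
minimal sketch of the round trip (rewinding the buffer before reading it
back with `read_ipc`; the variable names are illustrative):

>>> buf = df.write_ipc(None)  # doctest: +SKIP
>>> _ = buf.seek(0)  # doctest: +SKIP
>>> df_again = pl.read_ipc(buf)  # doctest: +SKIP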
3839
"""
3840
return_bytes = file is None
3841
target: str | Path | IO[bytes]
3842
if file is None:
3843
target = BytesIO()
3844
else:
3845
target = file
3846
3847
from polars.lazyframe.opt_flags import QueryOptFlags
3848
3849
self.lazy().sink_ipc(
3850
target,
3851
compression=compression,
3852
compat_level=compat_level,
3853
storage_options=storage_options,
3854
credential_provider=credential_provider,
3855
retries=retries,
3856
optimizations=QueryOptFlags._eager(),
3857
engine="in-memory",
3858
)
3859
return target if return_bytes else None # type: ignore[return-value]
3860
3861
@overload
3862
def write_ipc_stream(
3863
self,
3864
file: None,
3865
*,
3866
compression: IpcCompression = "uncompressed",
3867
compat_level: CompatLevel | None = None,
3868
) -> BytesIO: ...
3869
3870
@overload
3871
def write_ipc_stream(
3872
self,
3873
file: str | Path | IO[bytes],
3874
*,
3875
compression: IpcCompression = "uncompressed",
3876
compat_level: CompatLevel | None = None,
3877
) -> None: ...
3878
3879
@deprecate_renamed_parameter("future", "compat_level", version="1.1")
3880
def write_ipc_stream(
3881
self,
3882
file: str | Path | IO[bytes] | None,
3883
*,
3884
compression: IpcCompression = "uncompressed",
3885
compat_level: CompatLevel | None = None,
3886
) -> BytesIO | None:
3887
"""
3888
Write to Arrow IPC record batch stream.
3889
3890
See "Streaming format" in https://arrow.apache.org/docs/python/ipc.html.
3891
3892
.. versionchanged:: 1.1
3893
The `future` parameter was renamed `compat_level`.
3894
3895
Parameters
3896
----------
3897
file
3898
Path or writable file-like object to which the IPC record batch data will
3899
be written. If set to `None`, the output is returned as a BytesIO object.
3900
compression : {'uncompressed', 'lz4', 'zstd'}
3901
Compression method. Defaults to "uncompressed".
3902
compat_level
3903
Use a specific compatibility level
3904
when exporting Polars' internal data structures.
3905
3906
Examples
3907
--------
3908
>>> import pathlib
3909
>>>
3910
>>> df = pl.DataFrame(
3911
... {
3912
... "foo": [1, 2, 3, 4, 5],
3913
... "bar": [6, 7, 8, 9, 10],
3914
... "ham": ["a", "b", "c", "d", "e"],
3915
... }
3916
... )
3917
>>> path: pathlib.Path = dirpath / "new_file.arrow"
3918
>>> df.write_ipc_stream(path)
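
Writing to `None` returns the stream as an in-memory buffer; a minimal
sketch of reading it back with `read_ipc_stream` (variable names are
illustrative):

>>> buf = df.write_ipc_stream(None)  # doctest: +SKIP
>>> _ = buf.seek(0)  # doctest: +SKIP
>>> df_again = pl.read_ipc_stream(buf)  # doctest: +SKIP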
3919
"""
3920
return_bytes = file is None
3921
if return_bytes:
3922
file = BytesIO()
3923
elif isinstance(file, (str, Path)):
3924
file = normalize_filepath(file)
3925
3926
compat_level_py: int | bool
3927
if compat_level is None:
3928
compat_level_py = True
3929
elif isinstance(compat_level, CompatLevel):
3930
compat_level_py = compat_level._version
3931
3932
if compression is None:
3933
compression = "uncompressed"
3934
3935
self._df.write_ipc_stream(file, compression, compat_level_py)
3936
return file if return_bytes else None # type: ignore[return-value]
3937
3938
def write_parquet(
3939
self,
3940
file: str | Path | IO[bytes],
3941
*,
3942
compression: ParquetCompression = "zstd",
3943
compression_level: int | None = None,
3944
statistics: bool | str | dict[str, bool] = True,
3945
row_group_size: int | None = None,
3946
data_page_size: int | None = None,
3947
use_pyarrow: bool = False,
3948
pyarrow_options: dict[str, Any] | None = None,
3949
partition_by: str | Sequence[str] | None = None,
3950
partition_chunk_size_bytes: int = 4_294_967_296,
3951
storage_options: dict[str, Any] | None = None,
3952
credential_provider: (
3953
CredentialProviderFunction | Literal["auto"] | None
3954
) = "auto",
3955
retries: int = 2,
3956
metadata: ParquetMetadata | None = None,
3957
mkdir: bool = False,
3958
) -> None:
3959
"""
3960
Write to Apache Parquet file.
3961
3962
Parameters
3963
----------
3964
file
3965
File path or writable file-like object to which the result will be written.
3966
This should be a path to a directory if writing a partitioned dataset.
3967
compression : {'lz4', 'uncompressed', 'snappy', 'gzip', 'lzo', 'brotli', 'zstd'}
3968
Choose "zstd" for good compression performance.
3969
Choose "lz4" for fast compression/decompression.
3970
Choose "snappy" for more backwards compatibility guarantees
3971
when you deal with older parquet readers.
3972
compression_level
3973
The level of compression to use. Higher compression means smaller files on
3974
disk.
3975
3976
- "gzip" : min-level: 0, max-level: 9.
3977
- "brotli" : min-level: 0, max-level: 11.
3978
- "zstd" : min-level: 1, max-level: 22.
3979
3980
statistics
3981
Write statistics to the parquet headers. This is the default behavior.
3982
3983
Possible values:
3984
3985
- `True`: enable default set of statistics (default). Some
3986
statistics may be disabled.
3987
- `False`: disable all statistics
3988
- "full": calculate and write all available statistics. Cannot be
3989
combined with `use_pyarrow`.
3990
- `{ "statistic-key": True / False, ... }`. Cannot be combined with
3991
`use_pyarrow`. Available keys:
3992
3993
- "min": column minimum value (default: `True`)
3994
- "max": column maximum value (default: `True`)
3995
- "distinct_count": number of unique column values (default: `False`)
3996
- "null_count": number of null values in column (default: `True`)
3997
row_group_size
3998
Size of the row groups in number of rows. Defaults to 512^2 rows.
3999
data_page_size
4000
Size of the data page in bytes. Defaults to 1024^2 bytes.
4001
use_pyarrow
4002
Use C++ parquet implementation vs Rust parquet implementation.
4003
At the moment C++ supports more features.
4004
pyarrow_options
4005
Arguments passed to `pyarrow.parquet.write_table`.
4006
4007
If you pass `partition_cols` here, the dataset will be written
4008
using `pyarrow.parquet.write_to_dataset`.
4009
The `partition_cols` parameter causes the dataset to be written to a
4010
directory, similar to Spark's partitioned datasets.
4011
partition_by
4012
Column(s) to partition by. A partitioned dataset will be written if this is
4013
specified. This parameter is considered unstable and is subject to change.
4014
partition_chunk_size_bytes
4015
Approximate size to split DataFrames within a single partition when
4016
writing. Note this is calculated using the size of the DataFrame in
4017
memory - the size of the output file may differ depending on the
4018
file format / compression.
4019
storage_options
4020
Options that indicate how to connect to a cloud provider.
4021
4022
The cloud providers currently supported are AWS, GCP, and Azure.
4023
See supported keys here:
4024
4025
* `aws <https://docs.rs/object_store/latest/object_store/aws/enum.AmazonS3ConfigKey.html>`_
4026
* `gcp <https://docs.rs/object_store/latest/object_store/gcp/enum.GoogleConfigKey.html>`_
4027
* `azure <https://docs.rs/object_store/latest/object_store/azure/enum.AzureConfigKey.html>`_
4028
* Hugging Face (`hf://`): Accepts an API key under the `token` parameter: \
4029
`{'token': '...'}`, or by setting the `HF_TOKEN` environment variable.
4030
4031
If `storage_options` is not provided, Polars will try to infer the
4032
information from environment variables.
4033
credential_provider
4034
Provide a function that can be called to provide cloud storage
4035
credentials. The function is expected to return a dictionary of
4036
credential keys along with an optional credential expiry time.
4037
4038
.. warning::
4039
This functionality is considered **unstable**. It may be changed
4040
at any point without it being considered a breaking change.
4041
retries
4042
Number of retries if accessing a cloud instance fails.
4043
metadata
4044
A dictionary or callback used to add key-value pairs to the file-level Parquet
4045
metadata.
4046
4047
.. warning::
4048
This functionality is considered **experimental**. It may be removed or
4049
changed at any point without it being considered a breaking change.
4050
mkdir: bool
4051
Recursively create all the directories in the path.
4052
4053
.. warning::
4054
This functionality is considered **unstable**. It may be changed at any
4055
point without it being considered a breaking change.
4056
4057
Examples
4058
--------
4059
>>> import pathlib
4060
>>>
4061
>>> df = pl.DataFrame(
4062
... {
4063
... "foo": [1, 2, 3, 4, 5],
4064
... "bar": [6, 7, 8, 9, 10],
4065
... "ham": ["a", "b", "c", "d", "e"],
4066
... }
4067
... )
4068
>>> path: pathlib.Path = dirpath / "new_file.parquet"
4069
>>> df.write_parquet(path)
4070
4071
We can use pyarrow (`use_pyarrow=True`, with `partition_cols` in `pyarrow_options`)
4072
to write partitioned datasets. The following example will
4073
write the first row to ../watermark=1/*.parquet and the
4074
other rows to ../watermark=2/*.parquet.
4075
4076
>>> df = pl.DataFrame({"a": [1, 2, 3], "watermark": [1, 2, 2]})
4077
>>> path: pathlib.Path = dirpath / "partitioned_object"
4078
>>> df.write_parquet(
4079
... path,
4080
... use_pyarrow=True,
4081
... pyarrow_options={"partition_cols": ["watermark"]},
4082
... )
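
Fine-grained control over the written statistics, and an explicit row group
size, can be requested as sketched below (the file name and values are
purely illustrative):

>>> df.write_parquet(  # doctest: +SKIP
...     "stats.parquet",
...     statistics={
...         "min": True,
...         "max": True,
...         "null_count": True,
...         "distinct_count": False,
...     },
...     row_group_size=100_000,
... )

A partitioned dataset can also be written natively (without pyarrow) via the
unstable `partition_by` parameter; the output directory below is illustrative:

>>> df.write_parquet(  # doctest: +SKIP
...     "partitioned_dataset/",
...     partition_by=["watermark"],
... )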
4083
"""
4084
if compression is None:
4085
compression = "uncompressed"
4086
if isinstance(file, (str, Path)):
4087
if partition_by is not None or (
4088
pyarrow_options is not None and pyarrow_options.get("partition_cols")
4089
):
4090
file = normalize_filepath(file, check_not_directory=False)
4091
else:
4092
file = normalize_filepath(file)
4093
4094
if use_pyarrow:
4095
if statistics == "full" or isinstance(statistics, dict):
4096
msg = "write_parquet with `use_pyarrow=True` allows only boolean values for `statistics`"
4097
raise ValueError(msg)
4098
if metadata is not None:
4099
msg = "write_parquet with `use_pyarrow=True` cannot be combined with `metadata`"
4100
raise ValueError(msg)
4101
if mkdir:
4102
msg = "write_parquet with `use_pyarrow=True` cannot be combined with `mkdir`"
4103
raise ValueError(msg)
4104
4105
tbl = self.to_arrow()
4106
data = {}
4107
4108
for i, column in enumerate(tbl):
4109
# extract the name before casting
4110
name = f"column_{i}" if column._name is None else column._name
4111
4112
data[name] = column
4113
4114
tbl = pa.table(data)
4115
4116
# do not remove this import!
4117
# needed below
4118
import pyarrow.parquet # noqa: F401
4119
4120
if pyarrow_options is None:
4121
pyarrow_options = {}
4122
pyarrow_options["compression"] = (
4123
None if compression == "uncompressed" else compression
4124
)
4125
pyarrow_options["compression_level"] = compression_level
4126
pyarrow_options["write_statistics"] = statistics
4127
pyarrow_options["row_group_size"] = row_group_size
4128
pyarrow_options["data_page_size"] = data_page_size
4129
4130
if pyarrow_options.get("partition_cols"):
4131
pa.parquet.write_to_dataset(
4132
table=tbl,
4133
root_path=file,
4134
**(pyarrow_options or {}),
4135
)
4136
else:
4137
pa.parquet.write_table(
4138
table=tbl,
4139
where=file,
4140
**(pyarrow_options or {}),
4141
)
4142
4143
return
4144
4145
target: str | Path | IO[bytes] | PartitioningScheme = file
4146
engine: EngineType = "in-memory"
4147
if partition_by is not None:
4148
if not isinstance(file, str):
4149
msg = "expected file to be a `str` since partition-by is set"
4150
raise TypeError(msg)
4151
4152
from polars.io import PartitionByKey
4153
4154
target = PartitionByKey(file, by=partition_by)
4155
mkdir = True
4156
engine = "streaming"
4157
4158
from polars.lazyframe.opt_flags import QueryOptFlags
4159
4160
self.lazy().sink_parquet(
4161
target,
4162
compression=compression,
4163
compression_level=compression_level,
4164
statistics=statistics,
4165
row_group_size=row_group_size,
4166
data_page_size=data_page_size,
4167
storage_options=storage_options,
4168
credential_provider=credential_provider,
4169
retries=retries,
4170
metadata=metadata,
4171
engine=engine,
4172
mkdir=mkdir,
4173
optimizations=QueryOptFlags._eager(),
4174
)
4175
4176
def write_database(
4177
self,
4178
table_name: str,
4179
connection: ConnectionOrCursor | str,
4180
*,
4181
if_table_exists: DbWriteMode = "fail",
4182
engine: DbWriteEngine | None = None,
4183
engine_options: dict[str, Any] | None = None,
4184
) -> int:
4185
"""
4186
Write the data in a Polars DataFrame to a database.
4187
4188
.. versionadded:: 0.20.26
4189
Support for instantiated connection objects in addition to URI strings, and
4190
a new `engine_options` parameter.
4191
4192
Parameters
4193
----------
4194
table_name
4195
Schema-qualified name of the table to create or append to in the target
4196
SQL database. If your table name contains special characters, it should
4197
be quoted.
4198
connection
4199
An existing SQLAlchemy or ADBC connection against the target database, or
4200
a URI string that will be used to instantiate such a connection, such as:
4201
4202
* "postgresql://user:pass@server:port/database"
4203
* "sqlite:////path/to/database.db"
4204
if_table_exists : {'append', 'replace', 'fail'}
4205
The insert mode:
4206
4207
* 'replace' will create a new database table, overwriting an existing one.
4208
* 'append' will append to an existing table.
4209
* 'fail' will fail if the table already exists.
4210
engine : {'sqlalchemy', 'adbc'}
4211
Select the engine to use for writing frame data; only necessary when
4212
supplying a URI string (defaults to 'sqlalchemy' if unset)
4213
engine_options
4214
Additional options to pass to the insert method associated with the engine
4215
specified by the option `engine`.
4216
4217
* Setting `engine` to "sqlalchemy" currently inserts using Pandas' `to_sql`
4218
method (though this will eventually be phased out in favor of a native
4219
solution).
4220
* Setting `engine` to "adbc" inserts using the ADBC cursor's `adbc_ingest`
4221
method.
4222
4223
Examples
4224
--------
4225
Insert into a temporary table using a PostgreSQL URI and the ADBC engine:
4226
4227
>>> df.write_database(
4228
... table_name="target_table",
4229
... connection="postgresql://user:pass@server:port/database",
4230
... engine="adbc",
4231
... engine_options={"temporary": True},
4232
... ) # doctest: +SKIP
4233
4234
Insert into a table using a `pyodbc` SQLAlchemy connection to SQL Server
4235
that was instantiated with "fast_executemany=True" to improve performance:
4236
4237
>>> pyodbc_uri = (
4238
... "mssql+pyodbc://user:pass@server:1433/test?"
4239
... "driver=ODBC+Driver+18+for+SQL+Server"
4240
... )
4241
>>> engine = create_engine(pyodbc_uri, fast_executemany=True) # doctest: +SKIP
4242
>>> df.write_database(
4243
... table_name="target_table",
4244
... connection=engine,
4245
... ) # doctest: +SKIP
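
Replace the contents of an existing table using a SQLAlchemy URI string
(the URI and table name below are purely illustrative):

>>> df.write_database(  # doctest: +SKIP
...     table_name="target_table",
...     connection="sqlite:////path/to/database.db",
...     if_table_exists="replace",
... )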
4246
4247
Returns
4248
-------
4249
int
4250
The number of rows affected, if the driver provides this information.
4251
Otherwise, returns -1.
4252
"""
4253
if if_table_exists not in (valid_write_modes := get_args(DbWriteMode)):
4254
allowed = ", ".join(repr(m) for m in valid_write_modes)
4255
msg = f"write_database `if_table_exists` must be one of {{{allowed}}}, got {if_table_exists!r}"
4256
raise ValueError(msg)
4257
4258
connection_module_root = type(connection).__module__.split(".", 1)[0]
4259
4260
if engine is None:
4261
if isinstance(connection, str) or connection_module_root == "sqlalchemy":
4262
engine = "sqlalchemy"
4263
elif connection_module_root.startswith("adbc"):
4264
engine = "adbc"
4265
4266
def unpack_table_name(name: str) -> tuple[str | None, str | None, str]:
4267
"""Unpack optionally qualified table name to catalog/schema/table tuple."""
4268
from csv import reader as delimited_read
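# e.g. 'catalog.schema.tbl' -> ('catalog', 'schema', 'tbl'), while a bare
# 'tbl' unpacks to (None, None, 'tbl'); the csv reader splits on '.'
# while respecting double-quoted identifiers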
4269
4270
components: list[str | None] = next(delimited_read([name], delimiter=".")) # type: ignore[arg-type]
4271
if len(components) > 3:
4272
msg = f"`table_name` appears to be invalid: '{name}'"
4273
raise ValueError(msg)
4274
catalog, schema, tbl = ([None] * (3 - len(components))) + components
4275
return catalog, schema, tbl # type: ignore[return-value]
4276
4277
if engine == "adbc":
4278
from polars.io.database._utils import (
4279
_get_adbc_module_name_from_uri,
4280
_import_optional_adbc_driver,
4281
_open_adbc_connection,
4282
)
4283
4284
conn, can_close_conn = (
4285
(_open_adbc_connection(connection), True)
4286
if isinstance(connection, str)
4287
else (connection, False)
4288
)
4289
4290
driver_manager = import_optional("adbc_driver_manager")
4291
4292
# base class for ADBC connections
4293
if not isinstance(conn, driver_manager.dbapi.Connection):
4294
msg = f"unrecognised connection type {connection!r}"
4295
raise TypeError(msg)
4296
4297
driver_manager_str_version = getattr(driver_manager, "__version__", "0.0")
4298
driver_manager_version = parse_version(driver_manager_str_version)
4299
4300
if if_table_exists == "fail":
4301
# if the table exists, 'create' will raise an error,
4302
# resulting in behaviour equivalent to 'fail'
4303
mode = "create"
4304
elif if_table_exists == "replace":
4305
if driver_manager_version < (0, 7):
4306
msg = (
4307
"`if_table_exists = 'replace'` requires ADBC version >= 0.7, "
4308
f"found {driver_manager_str_version}"
4309
)
4310
raise ModuleUpgradeRequiredError(msg)
4311
mode = "replace"
4312
elif if_table_exists == "append":
4313
mode = "append"
4314
else:
4315
msg = (
4316
f"unexpected value for `if_table_exists`: {if_table_exists!r}"
4317
f"\n\nChoose one of {{'fail', 'replace', 'append'}}"
4318
)
4319
raise ValueError(msg)
4320
4321
with (
4322
conn if can_close_conn else contextlib.nullcontext(),
4323
conn.cursor() as cursor,
4324
):
4325
catalog, db_schema, unpacked_table_name = unpack_table_name(table_name)
4326
n_rows: int
4327
4328
adbc_module_name = (
4329
_get_adbc_module_name_from_uri(connection)
4330
if isinstance(connection, str)
4331
else connection_module_root
4332
)
4333
adbc_driver = _import_optional_adbc_driver(
4334
adbc_module_name, dbapi_submodule=False
4335
)
4336
adbc_driver_str_version = getattr(adbc_driver, "__version__", "0.0")
4337
adbc_driver_version = parse_version(adbc_driver_str_version)
4338
4339
if adbc_module_name.split("_")[-1] == "sqlite":
4340
catalog, db_schema = db_schema, None
4341
4342
# note: ADBC didn't support 'replace' until adbc-driver-sqlite
4343
# version 0.11 (it was released for other drivers in version 0.7)
4344
if (
4345
driver_manager_version >= (0, 7)
4346
and adbc_driver_version < (0, 11)
4347
and if_table_exists == "replace"
4348
):
4349
cursor.execute(f"DROP TABLE IF EXISTS {table_name}")
4350
mode = "create"
4351
4352
# As of adbc_driver_manager 1.6.0, adbc_ingest can take a Polars
4353
# DataFrame via the PyCapsule interface
4354
data = self if driver_manager_version >= (1, 6) else self.to_arrow()
4355
4356
# use of schema-qualified table names was released in
4357
# adbc-driver-manager 0.7.0, and works reliably from driver
4358
# (e.g. adbc-driver-postgresql) version 0.8.0
4359
if driver_manager_version >= (0, 7) and adbc_driver_version >= (0, 8):
4360
n_rows = cursor.adbc_ingest(
4361
unpacked_table_name,
4362
data=data,
4363
mode=mode,
4364
catalog_name=catalog,
4365
db_schema_name=db_schema,
4366
**(engine_options or {}),
4367
)
4368
elif db_schema is not None:
4369
adbc_driver_pypi_name = adbc_module_name.replace("_", "-")
4370
msg = (
4371
"use of schema-qualified table names requires "
4372
"adbc-driver-manager version >= 0.7.0, found "
4373
f"{driver_manager_str_version} and {adbc_driver_pypi_name} "
4374
f"version >= 0.8.0, found {adbc_driver_str_version}"
4375
)
4376
raise ModuleUpgradeRequiredError(
4377
# https://github.com/apache/arrow-adbc/issues/1000
4378
# https://github.com/apache/arrow-adbc/issues/1109
4379
msg
4380
)
4381
else:
4382
n_rows = cursor.adbc_ingest(
4383
table_name=unpacked_table_name,
4384
data=data,
4385
mode=mode,
4386
**(engine_options or {}),
4387
)
4388
conn.commit()
4389
return n_rows
4390
4391
elif engine == "sqlalchemy":
4392
if not _PANDAS_AVAILABLE:
4393
msg = "writing with 'sqlalchemy' engine currently requires pandas.\n\nInstall with: pip install pandas"
4394
raise ModuleNotFoundError(msg)
4395
elif (pd_version := parse_version(pd.__version__)) < (1, 5):
4396
msg = f"writing with 'sqlalchemy' engine requires pandas >= 1.5; found {pd.__version__!r}"
4397
raise ModuleUpgradeRequiredError(msg)
4398
4399
import_optional(
4400
module_name="sqlalchemy",
4401
min_version=("2.0" if pd_version >= (2, 2) else "1.4"),
4402
min_err_prefix="pandas >= 2.2 requires",
4403
)
4404
# note: the catalog (database) should be a part of the connection string
4405
from sqlalchemy.engine import Connectable, create_engine
4406
from sqlalchemy.orm import Session
4407
4408
sa_object: Connectable
4409
if isinstance(connection, str):
4410
sa_object = create_engine(connection)
4411
elif isinstance(connection, Session):
4412
sa_object = connection.connection()
4413
elif isinstance(connection, Connectable):
4414
sa_object = connection
4415
else:
4416
msg = f"unrecognised connection type {connection!r}"
4417
raise TypeError(msg)
4418
4419
catalog, db_schema, unpacked_table_name = unpack_table_name(table_name)
4420
if catalog:
4421
msg = f"Unexpected three-part table name; provide the database/catalog ({catalog!r}) on the connection URI"
4422
raise ValueError(msg)
4423
4424
# ensure conversion to pandas uses the pyarrow extension array option
4425
# so that we can make use of the sql/db export *without* copying data
4426
res: int | None = self.to_pandas(
4427
use_pyarrow_extension_array=True,
4428
).to_sql(
4429
name=unpacked_table_name,
4430
schema=db_schema,
4431
con=sa_object,
4432
if_exists=if_table_exists,
4433
index=False,
4434
**(engine_options or {}),
4435
)
4436
return -1 if res is None else res
4437
4438
elif isinstance(engine, str):
4439
msg = f"engine {engine!r} is not supported"
4440
raise ValueError(msg)
4441
else:
4442
msg = f"unrecognised connection type {connection!r}"
4443
raise TypeError(msg)
4444
4445
@unstable()
4446
def write_iceberg(
4447
self,
4448
target: str | pyiceberg.table.Table,
4449
mode: Literal["append", "overwrite"],
4450
) -> None:
4451
"""
4452
Write DataFrame to an Iceberg table.
4453
4454
.. warning::
4455
This functionality is currently considered **unstable**. It may be
4456
changed at any point without it being considered a breaking change.
4457
4458
Parameters
4459
----------
4460
target
4461
Name of the table or the Table object representing an Iceberg table.
4462
mode : {'append', 'overwrite'}
4463
How to handle existing data.
4464
4465
- If 'append', will add new data.
4466
- If 'overwrite', will replace table with new data.
4467
4468
"""
4469
from pyiceberg.catalog import load_catalog
4470
4471
if isinstance(target, str):
4472
catalog = load_catalog()
4473
table = catalog.load_table(target)
4474
else:
4475
table = target
4476
4477
data = self.to_arrow(compat_level=CompatLevel.oldest())
4478
4479
if mode == "append":
4480
table.append(data)
4481
else:
4482
table.overwrite(data)
4483
4484
@overload
4485
def write_delta(
4486
self,
4487
target: str | Path | deltalake.DeltaTable,
4488
*,
4489
mode: Literal["error", "append", "overwrite", "ignore"] = ...,
4490
overwrite_schema: bool | None = ...,
4491
storage_options: dict[str, str] | None = ...,
4492
credential_provider: CredentialProviderFunction | Literal["auto"] | None = ...,
4493
delta_write_options: dict[str, Any] | None = ...,
4494
) -> None: ...
4495
4496
@overload
4497
def write_delta(
4498
self,
4499
target: str | Path | deltalake.DeltaTable,
4500
*,
4501
mode: Literal["merge"],
4502
overwrite_schema: bool | None = ...,
4503
storage_options: dict[str, str] | None = ...,
4504
credential_provider: CredentialProviderFunction | Literal["auto"] | None = ...,
4505
delta_merge_options: dict[str, Any],
4506
) -> deltalake.table.TableMerger: ...
4507
4508
def write_delta(
4509
self,
4510
target: str | Path | deltalake.DeltaTable,
4511
*,
4512
mode: Literal["error", "append", "overwrite", "ignore", "merge"] = "error",
4513
overwrite_schema: bool | None = None,
4514
storage_options: dict[str, str] | None = None,
4515
credential_provider: CredentialProviderFunction
4516
| Literal["auto"]
4517
| None = "auto",
4518
delta_write_options: dict[str, Any] | None = None,
4519
delta_merge_options: dict[str, Any] | None = None,
4520
) -> deltalake.table.TableMerger | None:
4521
"""
4522
Write DataFrame as delta table.
4523
4524
Parameters
4525
----------
4526
target
4527
URI of a table or a DeltaTable object.
4528
mode : {'error', 'append', 'overwrite', 'ignore', 'merge'}
4529
How to handle existing data.
4530
4531
- If 'error', throw an error if the table already exists (default).
4532
- If 'append', will add new data.
4533
- If 'overwrite', will replace table with new data.
4534
- If 'ignore', will not write anything if table already exists.
4535
- If 'merge', return a `TableMerger` object to merge data from the DataFrame
4536
with the existing data.
4537
overwrite_schema
4538
If True, allows updating the schema of the table.
4539
4540
.. deprecated:: 0.20.14
4541
Use the parameter `delta_write_options` instead and pass
4542
`{"schema_mode": "overwrite"}`.
4543
storage_options
4544
Extra options for the storage backends supported by `deltalake`.
4545
For cloud storages, this may include configurations for authentication etc.
4546
4547
- See a list of supported storage options for S3 `here <https://docs.rs/object_store/latest/object_store/aws/enum.AmazonS3ConfigKey.html#variants>`__.
4548
- See a list of supported storage options for GCS `here <https://docs.rs/object_store/latest/object_store/gcp/enum.GoogleConfigKey.html#variants>`__.
4549
- See a list of supported storage options for Azure `here <https://docs.rs/object_store/latest/object_store/azure/enum.AzureConfigKey.html#variants>`__.
4550
credential_provider
4551
Provide a function that can be called to provide cloud storage
4552
credentials. The function is expected to return a dictionary of
4553
credential keys along with an optional credential expiry time.
4554
4555
.. warning::
4556
This functionality is considered **unstable**. It may be changed
4557
at any point without it being considered a breaking change.
4558
delta_write_options
4559
Additional keyword arguments while writing a Delta lake Table.
4560
See a list of supported write options `here <https://delta-io.github.io/delta-rs/api/delta_writer/#deltalake.write_deltalake>`__.
4561
delta_merge_options
4562
Keyword arguments which are required to `MERGE` a Delta lake Table.
4563
See a list of supported merge options `here <https://delta-io.github.io/delta-rs/api/delta_table/#deltalake.DeltaTable.merge>`__.
4564
4565
Raises
4566
------
4567
TypeError
4568
If the DataFrame contains unsupported data types.
4569
ArrowInvalidError
4570
If the DataFrame contains data types that could not be cast to their
4571
primitive type.
4572
TableNotFoundError
4573
If the delta table doesn't exist and a MERGE action is triggered.
4574
4575
Notes
4576
-----
4577
The Polars data types :class:`Null` and :class:`Time` are not supported
4578
by the delta protocol specification and will raise a TypeError. Columns
4579
using the :class:`Categorical` data type will be converted to
4580
normal (non-categorical) strings when written.
4581
4582
Polars columns are always nullable. To write data to a delta table with
4583
non-nullable columns, a custom pyarrow schema has to be passed to the
4584
`delta_write_options`. See the example with non-nullable columns below.
4585
4586
Examples
4587
--------
4588
Write a dataframe to the local filesystem as a Delta Lake table.
4589
4590
>>> df = pl.DataFrame(
4591
... {
4592
... "foo": [1, 2, 3, 4, 5],
4593
... "bar": [6, 7, 8, 9, 10],
4594
... "ham": ["a", "b", "c", "d", "e"],
4595
... }
4596
... )
4597
>>> table_path = "/path/to/delta-table/"
4598
>>> df.write_delta(table_path) # doctest: +SKIP
4599
4600
Append data to an existing Delta Lake table on the local filesystem.
4601
Note that this will fail if the schema of the new data does not match the
4602
schema of the existing table.
4603
4604
>>> df.write_delta(table_path, mode="append") # doctest: +SKIP
4605
4606
Overwrite a Delta Lake table as a new version.
4607
If the schemas of the new and old data are the same, specifying the
4608
`schema_mode` is not required.
4609
4610
>>> existing_table_path = "/path/to/delta-table/"
4611
>>> df.write_delta(
4612
... existing_table_path,
4613
... mode="overwrite",
4614
... delta_write_options={"schema_mode": "overwrite"},
4615
... ) # doctest: +SKIP
4616
4617
Write a DataFrame as a Delta Lake table to a cloud object store like S3.
4618
4619
>>> table_path = "s3://bucket/prefix/to/delta-table/"
4620
>>> df.write_delta(
4621
... table_path,
4622
... storage_options={
4623
... "AWS_REGION": "THE_AWS_REGION",
4624
... "AWS_ACCESS_KEY_ID": "THE_AWS_ACCESS_KEY_ID",
4625
... "AWS_SECRET_ACCESS_KEY": "THE_AWS_SECRET_ACCESS_KEY",
4626
... },
4627
... ) # doctest: +SKIP
4628
4629
Write DataFrame as a Delta Lake table with non-nullable columns.
4630
4631
>>> import pyarrow as pa
4632
>>> existing_table_path = "/path/to/delta-table/"
4633
>>> df.write_delta(
4634
... existing_table_path,
4635
... delta_write_options={
4636
... "schema": pa.schema([pa.field("foo", pa.int64(), nullable=False)])
4637
... },
4638
... ) # doctest: +SKIP
4639
4640
Write DataFrame as a Delta Lake table with zstd compression.
4641
For all `delta_write_options` keyword arguments, check the deltalake docs
4642
`here
4643
<https://delta-io.github.io/delta-rs/api/delta_writer/#deltalake.write_deltalake>`__,
4644
and for Writer Properties in particular `here
4645
<https://delta-io.github.io/delta-rs/api/delta_writer/#deltalake.WriterProperties>`__.
4646
4647
>>> import deltalake
4648
>>> df.write_delta(
4649
... table_path,
4650
... delta_write_options={
4651
... "writer_properties": deltalake.WriterProperties(compression="zstd"),
4652
... },
4653
... ) # doctest: +SKIP
4654
4655
Merge the DataFrame with an existing Delta Lake table.
4656
For all `TableMerger` methods, check the deltalake docs
4657
`here <https://delta-io.github.io/delta-rs/api/delta_table/delta_table_merger/>`__.
4658
4659
>>> df = pl.DataFrame(
4660
... {
4661
... "foo": [1, 2, 3, 4, 5],
4662
... "bar": [6, 7, 8, 9, 10],
4663
... "ham": ["a", "b", "c", "d", "e"],
4664
... }
4665
... )
4666
>>> table_path = "/path/to/delta-table/"
4667
>>> (
4668
... df.write_delta(
4669
... table_path,
4670
... mode="merge",
4671
... delta_merge_options={
4672
... "predicate": "s.foo = t.foo",
4673
... "source_alias": "s",
4674
... "target_alias": "t",
4675
... },
4676
... )
4677
... .when_matched_update_all()
4678
... .when_not_matched_insert_all()
4679
... .execute()
4680
... ) # doctest: +SKIP
4681
"""
4682
if overwrite_schema is not None:
4683
issue_deprecation_warning(
4684
"the parameter `overwrite_schema` for `write_delta` is deprecated."
4685
' Use the parameter `delta_write_options` instead and pass `{"schema_mode": "overwrite"}`.',
4686
version="0.20.14",
4687
)
4688
4689
from polars.io.delta import (
4690
_check_for_unsupported_types,
4691
_check_if_delta_available,
4692
_resolve_delta_lake_uri,
4693
)
4694
4695
_check_if_delta_available()
4696
4697
from deltalake import DeltaTable, write_deltalake
4698
4699
_check_for_unsupported_types(self.dtypes)
4700
4701
if isinstance(target, (str, Path)):
4702
target = _resolve_delta_lake_uri(str(target), strict=False)
4703
4704
from polars.io.cloud.credential_provider._builder import (
4705
_init_credential_provider_builder,
4706
)
4707
from polars.io.cloud.credential_provider._providers import (
4708
_get_credentials_from_provider_expiry_aware,
4709
)
4710
4711
if not isinstance(target, DeltaTable):
4712
credential_provider_builder = _init_credential_provider_builder(
4713
credential_provider, target, storage_options, "write_delta"
4714
)
4715
elif credential_provider is not None and credential_provider != "auto":
4716
msg = "cannot use credential_provider when passing a DeltaTable object"
4717
raise ValueError(msg)
4718
else:
4719
credential_provider_builder = None
4720
4721
del credential_provider
4722
4723
credential_provider_creds = {}
4724
4725
if credential_provider_builder and (
4726
provider := credential_provider_builder.build_credential_provider()
4727
):
4728
credential_provider_creds = (
4729
_get_credentials_from_provider_expiry_aware(provider) or {}
4730
)
4731
4732
# We aren't calling into polars-native write functions so we just update
4733
# the storage_options here.
4734
storage_options = (
4735
{**(storage_options or {}), **credential_provider_creds}
4736
if storage_options is not None or credential_provider_builder is not None
4737
else None
4738
)
4739
4740
if mode == "merge":
4741
if delta_merge_options is None:
4742
msg = "you need to pass delta_merge_options with at least a given predicate for `MERGE` to work."
4743
raise ValueError(msg)
4744
if isinstance(target, str):
4745
dt = DeltaTable(table_uri=target, storage_options=storage_options)
4746
else:
4747
dt = target
4748
4749
return dt.merge(self, **delta_merge_options)
4750
4751
else:
4752
if delta_write_options is None:
4753
delta_write_options = {}
4754
4755
if overwrite_schema:
4756
delta_write_options["schema_mode"] = "overwrite"
4757
4758
write_deltalake(
4759
table_or_uri=target,
4760
data=self,
4761
mode=mode,
4762
storage_options=storage_options,
4763
**delta_write_options,
4764
)
4765
return None
4766
4767
def estimated_size(self, unit: SizeUnit = "b") -> int | float:
4768
"""
4769
Return an estimation of the total (heap) allocated size of the `DataFrame`.
4770
4771
Estimated size is given in the specified unit (bytes by default).
4772
4773
This estimation is the sum of the sizes of its buffers and validity bitmaps,
4774
including those of nested arrays. Multiple arrays may share buffers and
4775
bitmaps, so the size of two arrays is not necessarily the sum of the sizes
4776
reported by this function. In particular, a `StructArray`'s size is an upper bound.
4777
4778
When an array is sliced, its allocated size remains constant because the buffer is
4779
unchanged. However, this function will yield a smaller number. This is because
4780
this function returns the visible size of the buffer, not its total capacity.
4781
4782
FFI buffers are included in this estimation.
4783
4784
Notes
4785
-----
4786
For data with Object dtype, the estimated size only reports the pointer
4787
size, which is a huge underestimation.
4788
4789
Parameters
4790
----------
4791
unit : {'b', 'kb', 'mb', 'gb', 'tb'}
4792
Scale the returned size to the given unit.
4793
4794
Examples
4795
--------
4796
>>> df = pl.DataFrame(
4797
... {
4798
... "x": list(reversed(range(1_000_000))),
4799
... "y": [v / 1000 for v in range(1_000_000)],
4800
... "z": [str(v) for v in range(1_000_000)],
4801
... },
4802
... schema=[("x", pl.UInt32), ("y", pl.Float64), ("z", pl.String)],
4803
... )
4804
>>> df.estimated_size()
4805
17888890
4806
>>> df.estimated_size("mb")
4807
17.0601749420166
4808
"""
4809
sz = self._df.estimated_size()
4810
return scale_bytes(sz, unit)
4811
4812
def transpose(
4813
self,
4814
*,
4815
include_header: bool = False,
4816
header_name: str = "column",
4817
column_names: str | Iterable[str] | None = None,
4818
) -> DataFrame:
4819
"""
4820
Transpose a DataFrame over the diagonal.
4821
4822
Parameters
4823
----------
4824
include_header
4825
If set, the column names will be added as the first column.
4826
header_name
4827
If `include_header` is set, this determines the name of the column that will
4828
be inserted.
4829
column_names
4830
Optional iterable yielding strings or a string naming an existing column.
4831
These will name the value (non-header) columns in the transposed data.
4832
4833
Notes
4834
-----
4835
This is a very expensive operation. Perhaps you can do it differently.
4836
4837
Returns
4838
-------
4839
DataFrame
4840
4841
Examples
4842
--------
4843
>>> df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
4844
>>> df.transpose(include_header=True)
4845
shape: (2, 4)
4846
┌────────┬──────────┬──────────┬──────────┐
4847
│ column ┆ column_0 ┆ column_1 ┆ column_2 │
4848
│ --- ┆ --- ┆ --- ┆ --- │
4849
│ str ┆ i64 ┆ i64 ┆ i64 │
4850
╞════════╪══════════╪══════════╪══════════╡
4851
│ a ┆ 1 ┆ 2 ┆ 3 │
4852
│ b ┆ 4 ┆ 5 ┆ 6 │
4853
└────────┴──────────┴──────────┴──────────┘
4854
4855
Replace the auto-generated column names with a list
4856
4857
>>> df.transpose(include_header=False, column_names=["x", "y", "z"])
4858
shape: (2, 3)
4859
┌─────┬─────┬─────┐
4860
│ x ┆ y ┆ z │
4861
│ --- ┆ --- ┆ --- │
4862
│ i64 ┆ i64 ┆ i64 │
4863
╞═════╪═════╪═════╡
4864
│ 1 ┆ 2 ┆ 3 │
4865
│ 4 ┆ 5 ┆ 6 │
4866
└─────┴─────┴─────┘
4867
4868
Include the header as a separate column
4869
4870
>>> df.transpose(
4871
... include_header=True, header_name="foo", column_names=["x", "y", "z"]
4872
... )
4873
shape: (2, 4)
4874
┌─────┬─────┬─────┬─────┐
4875
│ foo ┆ x ┆ y ┆ z │
4876
│ --- ┆ --- ┆ --- ┆ --- │
4877
│ str ┆ i64 ┆ i64 ┆ i64 │
4878
╞═════╪═════╪═════╪═════╡
4879
│ a ┆ 1 ┆ 2 ┆ 3 │
4880
│ b ┆ 4 ┆ 5 ┆ 6 │
4881
└─────┴─────┴─────┴─────┘
4882
4883
Replace the auto-generated column names using a generator function
4884
4885
>>> def name_generator():
4886
... base_name = "my_column_"
4887
... count = 0
4888
... while True:
4889
... yield f"{base_name}{count}"
4890
... count += 1
4891
>>> df.transpose(include_header=False, column_names=name_generator())
4892
shape: (2, 3)
4893
┌─────────────┬─────────────┬─────────────┐
4894
│ my_column_0 ┆ my_column_1 ┆ my_column_2 │
4895
│ --- ┆ --- ┆ --- │
4896
│ i64 ┆ i64 ┆ i64 │
4897
╞═════════════╪═════════════╪═════════════╡
4898
│ 1 ┆ 2 ┆ 3 │
4899
│ 4 ┆ 5 ┆ 6 │
4900
└─────────────┴─────────────┴─────────────┘
4901
4902
Use an existing column as the new column names
4903
4904
>>> df = pl.DataFrame(dict(id=["i", "j", "k"], a=[1, 2, 3], b=[4, 5, 6]))
4905
>>> df.transpose(column_names="id")
4906
shape: (2, 3)
4907
┌─────┬─────┬─────┐
4908
│ i ┆ j ┆ k │
4909
│ --- ┆ --- ┆ --- │
4910
│ i64 ┆ i64 ┆ i64 │
4911
╞═════╪═════╪═════╡
4912
│ 1 ┆ 2 ┆ 3 │
4913
│ 4 ┆ 5 ┆ 6 │
4914
└─────┴─────┴─────┘
4915
>>> df.transpose(include_header=True, header_name="new_id", column_names="id")
4916
shape: (2, 4)
4917
┌────────┬─────┬─────┬─────┐
4918
│ new_id ┆ i ┆ j ┆ k │
4919
│ --- ┆ --- ┆ --- ┆ --- │
4920
│ str ┆ i64 ┆ i64 ┆ i64 │
4921
╞════════╪═════╪═════╪═════╡
4922
│ a ┆ 1 ┆ 2 ┆ 3 │
4923
│ b ┆ 4 ┆ 5 ┆ 6 │
4924
└────────┴─────┴─────┴─────┘
4925
"""
4926
keep_names_as = header_name if include_header else None
4927
column_names_: Sequence[str] | None
4928
if isinstance(column_names, Generator):
4929
column_names_ = [next(column_names) for _ in range(self.height)]
4930
else:
4931
column_names_ = column_names # type: ignore[assignment]
4932
return self._from_pydf(self._df.transpose(keep_names_as, column_names_))
4933
4934
def reverse(self) -> DataFrame:
4935
"""
4936
Reverse the DataFrame.
4937
4938
Examples
4939
--------
4940
>>> df = pl.DataFrame(
4941
... {
4942
... "key": ["a", "b", "c"],
4943
... "val": [1, 2, 3],
4944
... }
4945
... )
4946
>>> df.reverse()
4947
shape: (3, 2)
4948
┌─────┬─────┐
4949
│ key ┆ val │
4950
│ --- ┆ --- │
4951
│ str ┆ i64 │
4952
╞═════╪═════╡
4953
│ c ┆ 3 │
4954
│ b ┆ 2 │
4955
│ a ┆ 1 │
4956
└─────┴─────┘
4957
"""
4958
return self.select(F.col("*").reverse())
4959
4960
def rename(
4961
self, mapping: Mapping[str, str] | Callable[[str], str], *, strict: bool = True
4962
) -> DataFrame:
4963
"""
4964
Rename column names.
4965
4966
Parameters
4967
----------
4968
mapping
4969
Key value pairs that map from old name to new name, or a function
4970
that takes the old name as input and returns the new name.
4971
strict
4972
Validate that all column names exist in the current schema,
4973
and throw an exception if any do not. (Note that this parameter
4974
is a no-op when passing a function to `mapping`).
4975
4976
Examples
4977
--------
4978
>>> df = pl.DataFrame(
4979
... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]}
4980
... )
4981
>>> df.rename({"foo": "apple"})
4982
shape: (3, 3)
4983
┌───────┬─────┬─────┐
4984
│ apple ┆ bar ┆ ham │
4985
│ --- ┆ --- ┆ --- │
4986
│ i64 ┆ i64 ┆ str │
4987
╞═══════╪═════╪═════╡
4988
│ 1 ┆ 6 ┆ a │
4989
│ 2 ┆ 7 ┆ b │
4990
│ 3 ┆ 8 ┆ c │
4991
└───────┴─────┴─────┘
4992
>>> df.rename(lambda column_name: "c" + column_name[1:])
4993
shape: (3, 3)
4994
┌─────┬─────┬─────┐
4995
│ coo ┆ car ┆ cam │
4996
│ --- ┆ --- ┆ --- │
4997
│ i64 ┆ i64 ┆ str │
4998
╞═════╪═════╪═════╡
4999
│ 1 ┆ 6 ┆ a │
5000
│ 2 ┆ 7 ┆ b │
5001
│ 3 ┆ 8 ┆ c │
5002
└─────┴─────┴─────┘
5003
"""
5004
from polars.lazyframe.opt_flags import QueryOptFlags
5005
5006
return (
5007
self.lazy()
5008
.rename(mapping, strict=strict)
5009
.collect(optimizations=QueryOptFlags._eager())
5010
)
5011
5012
def insert_column(self, index: int, column: IntoExprColumn) -> DataFrame:
5013
"""
5014
Insert a Series (or expression) at a certain column index.
5015
5016
This operation is in place.
5017
5018
Parameters
5019
----------
5020
index
5021
Index at which to insert the new column.
5022
column
5023
`Series` or expression to insert.
5024
5025
Examples
5026
--------
5027
Insert a new Series column at the given index:
5028
5029
>>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]})
5030
>>> s = pl.Series("baz", [97, 98, 99])
5031
>>> df.insert_column(1, s)
5032
shape: (3, 3)
5033
┌─────┬─────┬─────┐
5034
│ foo ┆ baz ┆ bar │
5035
│ --- ┆ --- ┆ --- │
5036
│ i64 ┆ i64 ┆ i64 │
5037
╞═════╪═════╪═════╡
5038
│ 1 ┆ 97 ┆ 4 │
5039
│ 2 ┆ 98 ┆ 5 │
5040
│ 3 ┆ 99 ┆ 6 │
5041
└─────┴─────┴─────┘
5042
5043
Insert a new expression column at the given index:
5044
5045
>>> df = pl.DataFrame(
5046
... {"a": [2, 4, 2], "b": [0.5, 4, 10], "c": ["xx", "yy", "zz"]}
5047
... )
5048
>>> expr = (pl.col("b") / pl.col("a")).alias("b_div_a")
5049
>>> df.insert_column(2, expr)
5050
shape: (3, 4)
5051
┌─────┬──────┬─────────┬─────┐
5052
│ a ┆ b ┆ b_div_a ┆ c │
5053
│ --- ┆ --- ┆ --- ┆ --- │
5054
│ i64 ┆ f64 ┆ f64 ┆ str │
5055
╞═════╪══════╪═════════╪═════╡
5056
│ 2 ┆ 0.5 ┆ 0.25 ┆ xx │
5057
│ 4 ┆ 4.0 ┆ 1.0 ┆ yy │
5058
│ 2 ┆ 10.0 ┆ 5.0 ┆ zz │
5059
└─────┴──────┴─────────┴─────┘
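
Negative indices count from the end of the frame; a brief sketch (re-using
the frame above, where `-1` inserts the new column just before the current
last column):

>>> df.insert_column(-1, pl.Series("d", [7, 8, 9]))  # doctest: +SKIP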
5060
"""
5061
if (original_index := index) < 0:
5062
index = self.width + index
5063
if index < 0:
5064
msg = f"column index {original_index} is out of range (frame has {self.width} columns)"
5065
raise IndexError(msg)
5066
elif index > self.width:
5067
msg = f"column index {original_index} is out of range (frame has {self.width} columns)"
5068
raise IndexError(msg)
5069
5070
if isinstance(column, pl.Series):
5071
self._df.insert_column(index, column._s)
5072
else:
5073
if isinstance(column, str):
5074
column = F.col(column)
5075
if isinstance(column, pl.Expr):
5076
cols = self.columns
5077
cols.insert(index, column) # type: ignore[arg-type]
5078
self._df = self.select(cols)._df
5079
else:
5080
msg = f"column must be a Series or Expr, got {column!r} (type={qualified_type_name(column)})"
5081
raise TypeError(msg)
5082
return self
5083
5084
def filter(
5085
self,
5086
*predicates: (
5087
IntoExprColumn
5088
| Iterable[IntoExprColumn]
5089
| bool
5090
| list[bool]
5091
| np.ndarray[Any, Any]
5092
),
5093
**constraints: Any,
5094
) -> DataFrame:
5095
"""
5096
Filter rows, retaining those that match the given predicate expression(s).
5097
5098
The original order of the remaining rows is preserved.
5099
5100
Only rows where the predicate resolves as True are retained; when the
5101
predicate result is False (or null), the row is discarded.
5102
5103
Parameters
5104
----------
5105
predicates
5106
Expression(s) that evaluate to a boolean Series.
5107
constraints
5108
Column filters; use `name = value` to filter columns by the supplied value.
5109
Each constraint will behave the same as `pl.col(name).eq(value)`, and
5110
be implicitly joined with the other filter conditions using `&`.
5111
5112
Notes
5113
-----
5114
If you are transitioning from Pandas, and performing filter operations based on
5115
the comparison of two or more columns, please note that in Polars any comparison
5116
involving `null` values will result in a `null` result, *not* boolean True or
5117
False. As a result, these rows will not be retained. Ensure that null values
5118
are handled appropriately to avoid unexpected behaviour (see examples below).
5119
5120
See Also
5121
--------
5122
remove
5123
5124
Examples
5125
--------
5126
>>> df = pl.DataFrame(
5127
... {
5128
... "foo": [1, 2, 3, None, 4, None, 0],
5129
... "bar": [6, 7, 8, None, None, 9, 0],
5130
... "ham": ["a", "b", "c", None, "d", "e", "f"],
5131
... }
5132
... )
5133
5134
Filter rows matching a condition:
5135
5136
>>> df.filter(pl.col("foo") > 1)
5137
shape: (3, 3)
5138
┌─────┬──────┬─────┐
5139
│ foo ┆ bar ┆ ham │
5140
│ --- ┆ --- ┆ --- │
5141
│ i64 ┆ i64 ┆ str │
5142
╞═════╪══════╪═════╡
5143
│ 2 ┆ 7 ┆ b │
5144
│ 3 ┆ 8 ┆ c │
5145
│ 4 ┆ null ┆ d │
5146
└─────┴──────┴─────┘
5147
5148
Filter on multiple conditions, combined with and/or operators:
5149
5150
>>> df.filter(
5151
... (pl.col("foo") < 3) & (pl.col("ham") == "a"),
5152
... )
5153
shape: (1, 3)
5154
┌─────┬─────┬─────┐
5155
│ foo ┆ bar ┆ ham │
5156
│ --- ┆ --- ┆ --- │
5157
│ i64 ┆ i64 ┆ str │
5158
╞═════╪═════╪═════╡
5159
│ 1 ┆ 6 ┆ a │
5160
└─────┴─────┴─────┘
5161
5162
>>> df.filter(
5163
... (pl.col("foo") == 1) | (pl.col("ham") == "c"),
5164
... )
5165
shape: (2, 3)
5166
┌─────┬─────┬─────┐
5167
│ foo ┆ bar ┆ ham │
5168
│ --- ┆ --- ┆ --- │
5169
│ i64 ┆ i64 ┆ str │
5170
╞═════╪═════╪═════╡
5171
│ 1 ┆ 6 ┆ a │
5172
│ 3 ┆ 8 ┆ c │
5173
└─────┴─────┴─────┘
5174
5175
Provide multiple filters using `*args` syntax:
5176
5177
>>> df.filter(
5178
... pl.col("foo") <= 2,
5179
... ~pl.col("ham").is_in(["b", "c"]),
5180
... )
5181
shape: (2, 3)
5182
┌─────┬─────┬─────┐
5183
│ foo ┆ bar ┆ ham │
5184
│ --- ┆ --- ┆ --- │
5185
│ i64 ┆ i64 ┆ str │
5186
╞═════╪═════╪═════╡
5187
│ 1 ┆ 6 ┆ a │
5188
│ 0 ┆ 0 ┆ f │
5189
└─────┴─────┴─────┘
5190
5191
Provide multiple filters using `**kwargs` syntax:
5192
5193
>>> df.filter(foo=2, ham="b")
5194
shape: (1, 3)
5195
┌─────┬─────┬─────┐
5196
│ foo ┆ bar ┆ ham │
5197
│ --- ┆ --- ┆ --- │
5198
│ i64 ┆ i64 ┆ str │
5199
╞═════╪═════╪═════╡
5200
│ 2 ┆ 7 ┆ b │
5201
└─────┴─────┴─────┘
5202
5203
Filter by comparing two columns against each other:
5204
5205
>>> df.filter(
5206
... pl.col("foo") == pl.col("bar"),
5207
... )
5208
shape: (1, 3)
5209
┌─────┬─────┬─────┐
5210
│ foo ┆ bar ┆ ham │
5211
│ --- ┆ --- ┆ --- │
5212
│ i64 ┆ i64 ┆ str │
5213
╞═════╪═════╪═════╡
5214
│ 0 ┆ 0 ┆ f │
5215
└─────┴─────┴─────┘
5216
5217
>>> df.filter(
5218
... pl.col("foo") != pl.col("bar"),
5219
... )
5220
shape: (3, 3)
5221
┌─────┬─────┬─────┐
5222
│ foo ┆ bar ┆ ham │
5223
│ --- ┆ --- ┆ --- │
5224
│ i64 ┆ i64 ┆ str │
5225
╞═════╪═════╪═════╡
5226
│ 1 ┆ 6 ┆ a │
5227
│ 2 ┆ 7 ┆ b │
5228
│ 3 ┆ 8 ┆ c │
5229
└─────┴─────┴─────┘
5230
5231
Notice how the row with `None` values is filtered out. In order to keep the
5232
same behavior as pandas, use:
5233
5234
>>> df.filter(
5235
... pl.col("foo").ne_missing(pl.col("bar")),
5236
... )
5237
shape: (5, 3)
5238
┌──────┬──────┬─────┐
5239
│ foo ┆ bar ┆ ham │
5240
│ --- ┆ --- ┆ --- │
5241
│ i64 ┆ i64 ┆ str │
5242
╞══════╪══════╪═════╡
5243
│ 1 ┆ 6 ┆ a │
5244
│ 2 ┆ 7 ┆ b │
5245
│ 3 ┆ 8 ┆ c │
5246
│ 4 ┆ null ┆ d │
5247
│ null ┆ 9 ┆ e │
5248
└──────┴──────┴─────┘
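
A pre-computed boolean mask (a Python list or NumPy array with one value per
row) can also be passed directly as a predicate:

>>> df.filter([True, False, True, False, False, False, False])
shape: (2, 3)
┌─────┬─────┬─────┐
│ foo ┆ bar ┆ ham │
│ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ str │
╞═════╪═════╪═════╡
│ 1 ┆ 6 ┆ a │
│ 3 ┆ 8 ┆ c │
└─────┴─────┴─────┘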
5249
"""
5250
from polars.lazyframe.opt_flags import QueryOptFlags
5251
5252
return (
5253
self.lazy()
5254
.filter(*predicates, **constraints)
5255
.collect(optimizations=QueryOptFlags._eager())
5256
)
5257
5258
def remove(
5259
self,
5260
*predicates: (
5261
IntoExprColumn
5262
| Iterable[IntoExprColumn]
5263
| bool
5264
| list[bool]
5265
| np.ndarray[Any, Any]
5266
),
5267
**constraints: Any,
5268
) -> DataFrame:
5269
"""
5270
Remove rows, dropping those that match the given predicate expression(s).
5271
5272
The original order of the remaining rows is preserved.
5273
5274
Rows where the filter predicate does not evaluate to True are retained
5275
(this includes rows where the predicate evaluates as `null`).
5276
5277
Parameters
5278
----------
5279
predicates
5280
Expression(s) that evaluate to a boolean Series.
5281
constraints
5282
Column filters; use `name = value` to filter columns using the supplied
5283
value. Each constraint behaves the same as `pl.col(name).eq(value)`,
5284
and is implicitly joined with the other filter conditions using `&`.
5285
5286
Notes
5287
-----
5288
If you are transitioning from pandas and performing filter operations based on
5289
the comparison of two or more columns, please note that in Polars any comparison
5290
involving `null` values will result in a `null` result, *not* boolean True or
5291
False. As a result, these rows will not be removed. Ensure that null values
5292
are handled appropriately to avoid unexpected behaviour (see examples below).
5293
5294
See Also
5295
--------
5296
filter
5297
5298
Examples
5299
--------
5300
>>> df = pl.DataFrame(
5301
... {
5302
... "foo": [2, 3, None, 4, 0],
5303
... "bar": [5, 6, None, None, 0],
5304
... "ham": ["a", "b", None, "c", "d"],
5305
... }
5306
... )
5307
5308
Remove rows matching a condition:
5309
5310
>>> df.remove(pl.col("bar") >= 5)
5311
shape: (3, 3)
5312
┌──────┬──────┬──────┐
5313
│ foo ┆ bar ┆ ham │
5314
│ --- ┆ --- ┆ --- │
5315
│ i64 ┆ i64 ┆ str │
5316
╞══════╪══════╪══════╡
5317
│ null ┆ null ┆ null │
5318
│ 4 ┆ null ┆ c │
5319
│ 0 ┆ 0 ┆ d │
5320
└──────┴──────┴──────┘
5321
5322
Discard rows based on multiple conditions, combined with and/or operators:
5323
5324
>>> df.remove(
5325
... (pl.col("foo") >= 0) & (pl.col("bar") >= 0),
5326
... )
5327
shape: (2, 3)
5328
┌──────┬──────┬──────┐
5329
│ foo ┆ bar ┆ ham │
5330
│ --- ┆ --- ┆ --- │
5331
│ i64 ┆ i64 ┆ str │
5332
╞══════╪══════╪══════╡
5333
│ null ┆ null ┆ null │
5334
│ 4 ┆ null ┆ c │
5335
└──────┴──────┴──────┘
5336
5337
>>> df.remove(
5338
... (pl.col("foo") >= 0) | (pl.col("bar") >= 0),
5339
... )
5340
shape: (1, 3)
5341
┌──────┬──────┬──────┐
5342
│ foo ┆ bar ┆ ham │
5343
│ --- ┆ --- ┆ --- │
5344
│ i64 ┆ i64 ┆ str │
5345
╞══════╪══════╪══════╡
5346
│ null ┆ null ┆ null │
5347
└──────┴──────┴──────┘
5348
5349
Provide multiple predicates using `*args` syntax:
5350
5351
>>> df.remove(
5352
... pl.col("ham").is_not_null(),
5353
... pl.col("bar") >= 0,
5354
... )
5355
shape: (2, 3)
5356
┌──────┬──────┬──────┐
5357
│ foo ┆ bar ┆ ham │
5358
│ --- ┆ --- ┆ --- │
5359
│ i64 ┆ i64 ┆ str │
5360
╞══════╪══════╪══════╡
5361
│ null ┆ null ┆ null │
5362
│ 4 ┆ null ┆ c │
5363
└──────┴──────┴──────┘
5364
5365
Provide constraint(s) using `**kwargs` syntax:
5366
5367
>>> df.remove(foo=0, bar=0)
5368
shape: (4, 3)
5369
┌──────┬──────┬──────┐
5370
│ foo ┆ bar ┆ ham │
5371
│ --- ┆ --- ┆ --- │
5372
│ i64 ┆ i64 ┆ str │
5373
╞══════╪══════╪══════╡
5374
│ 2 ┆ 5 ┆ a │
5375
│ 3 ┆ 6 ┆ b │
5376
│ null ┆ null ┆ null │
5377
│ 4 ┆ null ┆ c │
5378
└──────┴──────┴──────┘
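Because `remove` retains exactly the rows for which the predicate is not True
(including rows where it evaluates to `null`), it is complementary to
:meth:`filter` for the same predicate; an illustrative cross-check:

>>> df.remove(foo=0, bar=0).height + df.filter(foo=0, bar=0).height == df.height
True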
5379
5380
Remove rows by comparing two columns against each other:
5381
5382
>>> df.remove(
5383
... pl.col("foo").ne_missing(pl.col("bar")),
5384
... )
5385
shape: (2, 3)
5386
┌──────┬──────┬──────┐
5387
│ foo ┆ bar ┆ ham │
5388
│ --- ┆ --- ┆ --- │
5389
│ i64 ┆ i64 ┆ str │
5390
╞══════╪══════╪══════╡
5391
│ null ┆ null ┆ null │
5392
│ 0 ┆ 0 ┆ d │
5393
└──────┴──────┴──────┘
5394
"""
5395
from polars.lazyframe.opt_flags import QueryOptFlags
5396
5397
return (
5398
self.lazy()
5399
.remove(*predicates, **constraints)
5400
.collect(optimizations=QueryOptFlags._eager())
5401
)
5402
5403
@overload
5404
def glimpse(
5405
self,
5406
*,
5407
max_items_per_column: int = ...,
5408
max_colname_length: int = ...,
5409
return_as_string: Literal[False] = ...,
5410
) -> None: ...
5411
5412
@overload
5413
def glimpse(
5414
self,
5415
*,
5416
max_items_per_column: int = ...,
5417
max_colname_length: int = ...,
5418
return_as_string: Literal[True],
5419
) -> str: ...
5420
5421
@overload
5422
def glimpse(
5423
self,
5424
*,
5425
max_items_per_column: int = ...,
5426
max_colname_length: int = ...,
5427
return_as_string: bool,
5428
) -> str | None: ...
5429
5430
def glimpse(
5431
self,
5432
*,
5433
max_items_per_column: int = 10,
5434
max_colname_length: int = 50,
5435
return_as_string: bool = False,
5436
) -> str | None:
5437
"""
5438
Return a dense preview of the DataFrame.
5439
5440
The formatting shows one line per column so that wide dataframes display
5441
cleanly. Each line shows the column name, the data type, and the first
5442
few values.
5443
5444
Parameters
5445
----------
5446
max_items_per_column
5447
Maximum number of items to show per column.
5448
max_colname_length
5449
Maximum length of the displayed column names; names that exceed this
5450
length are truncated with a trailing ellipsis.
5451
return_as_string
5452
If True, return the preview as a string instead of printing to stdout.
5453
5454
See Also
5455
--------
5456
describe, head, tail
5457
5458
Examples
5459
--------
5460
>>> from datetime import date
5461
>>> df = pl.DataFrame(
5462
... {
5463
... "a": [1.0, 2.8, 3.0],
5464
... "b": [4, 5, None],
5465
... "c": [True, False, True],
5466
... "d": [None, "b", "c"],
5467
... "e": ["usd", "eur", None],
5468
... "f": [date(2020, 1, 1), date(2021, 1, 2), date(2022, 1, 1)],
5469
... }
5470
... )
5471
>>> df.glimpse()
5472
Rows: 3
5473
Columns: 6
5474
$ a <f64> 1.0, 2.8, 3.0
5475
$ b <i64> 4, 5, None
5476
$ c <bool> True, False, True
5477
$ d <str> None, 'b', 'c'
5478
$ e <str> 'usd', 'eur', None
5479
$ f <date> 2020-01-01, 2021-01-02, 2022-01-01
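The same preview can be captured as a string by passing `return_as_string=True`;
a small illustrative check on the header rather than the full output:

>>> preview = df.glimpse(return_as_string=True)
>>> preview.startswith("Rows: 3")
True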
5480
"""
5481
# always print at most this number of values (mainly ensures that
5482
# we do not cast long arrays to strings, which would be slow)
5483
max_n_values = min(max_items_per_column, self.height)
5484
schema = self.schema
5485
5486
def _parse_column(col_name: str, dtype: PolarsDataType) -> tuple[str, str, str]:
5487
fn = repr if schema[col_name] == String else str
5488
values = self[:max_n_values, col_name].to_list()
5489
val_str = ", ".join(fn(v) for v in values)
5490
if len(col_name) > max_colname_length:
5491
col_name = col_name[: (max_colname_length - 1)] + "…"
5492
return col_name, f"<{_dtype_str_repr(dtype)}>", val_str
5493
5494
data = [_parse_column(s, dtype) for s, dtype in self.schema.items()]
5495
5496
# determine column layout widths
5497
max_col_name = max((len(col_name) for col_name, _, _ in data))
5498
max_col_dtype = max((len(dtype_str) for _, dtype_str, _ in data))
5499
5500
# print header
5501
output = StringIO()
5502
output.write(f"Rows: {self.height}\nColumns: {self.width}\n")
5503
5504
# print individual columns: one row per column
5505
for col_name, dtype_str, val_str in data:
5506
output.write(
5507
f"$ {col_name:<{max_col_name}} {dtype_str:>{max_col_dtype}} {val_str}\n"
5508
)
5509
5510
s = output.getvalue()
5511
if return_as_string:
5512
return s
5513
5514
print(s, end=None)
5515
return None
5516
5517
def describe(
5518
self,
5519
percentiles: Sequence[float] | float | None = (0.25, 0.50, 0.75),
5520
*,
5521
interpolation: QuantileMethod = "nearest",
5522
) -> DataFrame:
5523
"""
5524
Summary statistics for a DataFrame.
5525
5526
Parameters
5527
----------
5528
percentiles
5529
One or more percentiles to include in the summary statistics.
5530
All values must be in the range `[0, 1]`.
5531
5532
interpolation : {'nearest', 'higher', 'lower', 'midpoint', 'linear', 'equiprobable'}
5533
Interpolation method used when calculating percentiles.
5534
5535
Notes
5536
-----
5537
The median is included by default as the 50% percentile.
5538
5539
Warnings
5540
--------
5541
We do not guarantee the output of `describe` to be stable. It will show
5542
statistics that we deem informative, and may be updated in the future.
5543
Using `describe` programmatically (versus interactive exploration) is
5544
not recommended for this reason.
5545
5546
See Also
5547
--------
5548
glimpse
5549
5550
Examples
5551
--------
5552
>>> from datetime import date, time
5553
>>> df = pl.DataFrame(
5554
... {
5555
... "float": [1.0, 2.8, 3.0],
5556
... "int": [40, 50, None],
5557
... "bool": [True, False, True],
5558
... "str": ["zz", "xx", "yy"],
5559
... "date": [date(2020, 1, 1), date(2021, 7, 5), date(2022, 12, 31)],
5560
... "time": [time(10, 20, 30), time(14, 45, 50), time(23, 15, 10)],
5561
... }
5562
... )
5563
5564
Show default frame statistics:
5565
5566
>>> df.describe()
5567
shape: (9, 7)
5568
┌────────────┬──────────┬──────────┬──────────┬──────┬─────────────────────┬──────────┐
5569
│ statistic ┆ float ┆ int ┆ bool ┆ str ┆ date ┆ time │
5570
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
5571
│ str ┆ f64 ┆ f64 ┆ f64 ┆ str ┆ str ┆ str │
5572
╞════════════╪══════════╪══════════╪══════════╪══════╪═════════════════════╪══════════╡
5573
│ count ┆ 3.0 ┆ 2.0 ┆ 3.0 ┆ 3 ┆ 3 ┆ 3 │
5574
│ null_count ┆ 0.0 ┆ 1.0 ┆ 0.0 ┆ 0 ┆ 0 ┆ 0 │
5575
│ mean ┆ 2.266667 ┆ 45.0 ┆ 0.666667 ┆ null ┆ 2021-07-02 16:00:00 ┆ 16:07:10 │
5576
│ std ┆ 1.101514 ┆ 7.071068 ┆ null ┆ null ┆ null ┆ null │
5577
│ min ┆ 1.0 ┆ 40.0 ┆ 0.0 ┆ xx ┆ 2020-01-01 ┆ 10:20:30 │
5578
│ 25% ┆ 2.8 ┆ 40.0 ┆ null ┆ null ┆ 2021-07-05 ┆ 14:45:50 │
5579
│ 50% ┆ 2.8 ┆ 50.0 ┆ null ┆ null ┆ 2021-07-05 ┆ 14:45:50 │
5580
│ 75% ┆ 3.0 ┆ 50.0 ┆ null ┆ null ┆ 2022-12-31 ┆ 23:15:10 │
5581
│ max ┆ 3.0 ┆ 50.0 ┆ 1.0 ┆ zz ┆ 2022-12-31 ┆ 23:15:10 │
5582
└────────────┴──────────┴──────────┴──────────┴──────┴─────────────────────┴──────────┘
5583
5584
Customize which percentiles are displayed, applying linear interpolation:
5585
5586
>>> with pl.Config(tbl_rows=12):
5587
... df.describe(
5588
... percentiles=[0.1, 0.3, 0.5, 0.7, 0.9],
5589
... interpolation="linear",
5590
... )
5591
shape: (11, 7)
5592
┌────────────┬──────────┬──────────┬──────────┬──────┬─────────────────────┬──────────┐
5593
│ statistic ┆ float ┆ int ┆ bool ┆ str ┆ date ┆ time │
5594
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
5595
│ str ┆ f64 ┆ f64 ┆ f64 ┆ str ┆ str ┆ str │
5596
╞════════════╪══════════╪══════════╪══════════╪══════╪═════════════════════╪══════════╡
5597
│ count ┆ 3.0 ┆ 2.0 ┆ 3.0 ┆ 3 ┆ 3 ┆ 3 │
5598
│ null_count ┆ 0.0 ┆ 1.0 ┆ 0.0 ┆ 0 ┆ 0 ┆ 0 │
5599
│ mean ┆ 2.266667 ┆ 45.0 ┆ 0.666667 ┆ null ┆ 2021-07-02 16:00:00 ┆ 16:07:10 │
5600
│ std ┆ 1.101514 ┆ 7.071068 ┆ null ┆ null ┆ null ┆ null │
5601
│ min ┆ 1.0 ┆ 40.0 ┆ 0.0 ┆ xx ┆ 2020-01-01 ┆ 10:20:30 │
5602
│ 10% ┆ 1.36 ┆ 41.0 ┆ null ┆ null ┆ 2020-04-20 ┆ 11:13:34 │
5603
│ 30% ┆ 2.08 ┆ 43.0 ┆ null ┆ null ┆ 2020-11-26 ┆ 12:59:42 │
5604
│ 50% ┆ 2.8 ┆ 45.0 ┆ null ┆ null ┆ 2021-07-05 ┆ 14:45:50 │
5605
│ 70% ┆ 2.88 ┆ 47.0 ┆ null ┆ null ┆ 2022-02-07 ┆ 18:09:34 │
5606
│ 90% ┆ 2.96 ┆ 49.0 ┆ null ┆ null ┆ 2022-09-13 ┆ 21:33:18 │
5607
│ max ┆ 3.0 ┆ 50.0 ┆ 1.0 ┆ zz ┆ 2022-12-31 ┆ 23:15:10 │
5608
└────────────┴──────────┴──────────┴──────────┴──────┴─────────────────────┴──────────┘
5609
""" # noqa: W505
5610
if not self.columns:
5611
msg = "cannot describe a DataFrame that has no columns"
5612
raise TypeError(msg)
5613
5614
return self.lazy().describe(
5615
percentiles=percentiles, interpolation=interpolation
5616
)
5617
5618
def get_column_index(self, name: str) -> int:
5619
"""
5620
Find the index of a column by name.
5621
5622
Parameters
5623
----------
5624
name
5625
Name of the column to find.
5626
5627
Examples
5628
--------
5629
>>> df = pl.DataFrame(
5630
... {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]}
5631
... )
5632
>>> df.get_column_index("ham")
5633
2
5634
>>> df.get_column_index("sandwich") # doctest: +SKIP
5635
ColumnNotFoundError: sandwich
5636
"""
5637
return self._df.get_column_index(name)
5638
5639
def replace_column(self, index: int, column: Series) -> DataFrame:
5640
"""
5641
Replace a column at an index location.
5642
5643
This operation is in place.
5644
5645
Parameters
5646
----------
5647
index
5648
Column index.
5649
column
5650
Series that will replace the column.
5651
5652
Examples
5653
--------
5654
>>> df = pl.DataFrame(
5655
... {
5656
... "foo": [1, 2, 3],
5657
... "bar": [6, 7, 8],
5658
... "ham": ["a", "b", "c"],
5659
... }
5660
... )
5661
>>> s = pl.Series("apple", [10, 20, 30])
5662
>>> df.replace_column(0, s)
5663
shape: (3, 3)
5664
┌───────┬─────┬─────┐
5665
│ apple ┆ bar ┆ ham │
5666
│ --- ┆ --- ┆ --- │
5667
│ i64 ┆ i64 ┆ str │
5668
╞═══════╪═════╪═════╡
5669
│ 10 ┆ 6 ┆ a │
5670
│ 20 ┆ 7 ┆ b │
5671
│ 30 ┆ 8 ┆ c │
5672
└───────┴─────┴─────┘
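The operation is in place, so `df` now contains the "apple" column. Negative
indices are also accepted and count from the end of the frame; an illustrative
continuation using a new series `s2`:

>>> s2 = pl.Series("pear", [40, 50, 60])
>>> df.replace_column(-1, s2).columns
['apple', 'bar', 'pear']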
5673
"""
5674
if index < 0:
5675
index = self.width + index
5676
self._df.replace_column(index, column._s)
5677
return self
5678
5679
def sort(
5680
self,
5681
by: IntoExpr | Iterable[IntoExpr],
5682
*more_by: IntoExpr,
5683
descending: bool | Sequence[bool] = False,
5684
nulls_last: bool | Sequence[bool] = False,
5685
multithreaded: bool = True,
5686
maintain_order: bool = False,
5687
) -> DataFrame:
5688
"""
5689
Sort the dataframe by the given columns.
5690
5691
Parameters
5692
----------
5693
by
5694
Column(s) to sort by. Accepts expression input, including selectors. Strings
5695
are parsed as column names.
5696
*more_by
5697
Additional columns to sort by, specified as positional arguments.
5698
descending
5699
Sort in descending order. When sorting by multiple columns, can be specified
5700
per column by passing a sequence of booleans.
5701
nulls_last
5702
Place null values last; can specify a single boolean applying to all columns
5703
or a sequence of booleans for per-column control.
5704
multithreaded
5705
Sort using multiple threads.
5706
maintain_order
5707
Whether the order should be maintained if elements are equal.
5708
5709
Examples
5710
--------
5711
Pass a single column name to sort by that column.
5712
5713
>>> df = pl.DataFrame(
5714
... {
5715
... "a": [1, 2, None],
5716
... "b": [6.0, 5.0, 4.0],
5717
... "c": ["a", "c", "b"],
5718
... }
5719
... )
5720
>>> df.sort("a")
5721
shape: (3, 3)
5722
┌──────┬─────┬─────┐
5723
│ a ┆ b ┆ c │
5724
│ --- ┆ --- ┆ --- │
5725
│ i64 ┆ f64 ┆ str │
5726
╞══════╪═════╪═════╡
5727
│ null ┆ 4.0 ┆ b │
5728
│ 1 ┆ 6.0 ┆ a │
5729
│ 2 ┆ 5.0 ┆ c │
5730
└──────┴─────┴─────┘
5731
5732
Sorting by expressions is also supported.
5733
5734
>>> df.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True)
5735
shape: (3, 3)
5736
┌──────┬─────┬─────┐
5737
│ a ┆ b ┆ c │
5738
│ --- ┆ --- ┆ --- │
5739
│ i64 ┆ f64 ┆ str │
5740
╞══════╪═════╪═════╡
5741
│ 2 ┆ 5.0 ┆ c │
5742
│ 1 ┆ 6.0 ┆ a │
5743
│ null ┆ 4.0 ┆ b │
5744
└──────┴─────┴─────┘
5745
5746
Sort by multiple columns by passing a list of columns.
5747
5748
>>> df.sort(["c", "a"], descending=True)
5749
shape: (3, 3)
5750
┌──────┬─────┬─────┐
5751
│ a ┆ b ┆ c │
5752
│ --- ┆ --- ┆ --- │
5753
│ i64 ┆ f64 ┆ str │
5754
╞══════╪═════╪═════╡
5755
│ 2 ┆ 5.0 ┆ c │
5756
│ null ┆ 4.0 ┆ b │
5757
│ 1 ┆ 6.0 ┆ a │
5758
└──────┴─────┴─────┘
5759
5760
Or use positional arguments to sort by multiple columns in the same way.
5761
5762
>>> df.sort("c", "a", descending=[False, True])
5763
shape: (3, 3)
5764
┌──────┬─────┬─────┐
5765
│ a ┆ b ┆ c │
5766
│ --- ┆ --- ┆ --- │
5767
│ i64 ┆ f64 ┆ str │
5768
╞══════╪═════╪═════╡
5769
│ 1 ┆ 6.0 ┆ a │
5770
│ null ┆ 4.0 ┆ b │
5771
│ 2 ┆ 5.0 ┆ c │
5772
└──────┴─────┴─────┘
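A brief illustration of `nulls_last`, checking just the sorted key for brevity:

>>> df.sort("a", nulls_last=True)["a"].to_list()
[1, 2, None]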
5773
"""
5774
from polars.lazyframe import QueryOptFlags
5775
5776
return (
5777
self.lazy()
5778
.sort(
5779
by,
5780
*more_by,
5781
descending=descending,
5782
nulls_last=nulls_last,
5783
multithreaded=multithreaded,
5784
maintain_order=maintain_order,
5785
)
5786
.collect(optimizations=QueryOptFlags._eager())
5787
)
5788
5789
def sql(self, query: str, *, table_name: str = "self") -> DataFrame:
5790
"""
5791
Execute a SQL query against the DataFrame.
5792
5793
.. versionadded:: 0.20.24
5794
5795
.. warning::
5796
This functionality is considered **unstable**, although it is close to
5797
being considered stable. It may be changed at any point without it being
5798
considered a breaking change.
5799
5800
Parameters
5801
----------
5802
query
5803
SQL query to execute.
5804
table_name
5805
Optionally provide an explicit name for the table that represents the
5806
calling frame (defaults to "self").
5807
5808
Notes
5809
-----
5810
* The calling frame is automatically registered as a table in the SQL context
5811
under the name "self". If you want access to the DataFrames and LazyFrames
5812
found in the current globals, use the top-level :meth:`pl.sql <polars.sql>`.
5813
* More control over registration and execution behaviour is available by
5814
using the :class:`SQLContext` object.
5815
* The SQL query executes in lazy mode before being collected and returned
5816
as a DataFrame.
5817
5818
See Also
5819
--------
5820
SQLContext
5821
5822
Examples
5823
--------
5824
>>> from datetime import date
5825
>>> df1 = pl.DataFrame(
5826
... {
5827
... "a": [1, 2, 3],
5828
... "b": ["zz", "yy", "xx"],
5829
... "c": [date(1999, 12, 31), date(2010, 10, 10), date(2077, 8, 8)],
5830
... }
5831
... )
5832
5833
Query the DataFrame using SQL:
5834
5835
>>> df1.sql("SELECT c, b FROM self WHERE a > 1")
5836
shape: (2, 2)
5837
┌────────────┬─────┐
5838
│ c ┆ b │
5839
│ --- ┆ --- │
5840
│ date ┆ str │
5841
╞════════════╪═════╡
5842
│ 2010-10-10 ┆ yy │
5843
│ 2077-08-08 ┆ xx │
5844
└────────────┴─────┘
5845
5846
Apply transformations to a DataFrame using SQL, aliasing "self" to "frame".
5847
5848
>>> df1.sql(
5849
... query='''
5850
... SELECT
5851
... a,
5852
... (a % 2 == 0) AS a_is_even,
5853
... CONCAT_WS(':', b, b) AS b_b,
5854
... EXTRACT(year FROM c) AS year,
5855
... 0::float4 AS "zero",
5856
... FROM frame
5857
... ''',
5858
... table_name="frame",
5859
... )
5860
shape: (3, 5)
5861
┌─────┬───────────┬───────┬──────┬──────┐
5862
│ a ┆ a_is_even ┆ b_b ┆ year ┆ zero │
5863
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
5864
│ i64 ┆ bool ┆ str ┆ i32 ┆ f32 │
5865
╞═════╪═══════════╪═══════╪══════╪══════╡
5866
│ 1 ┆ false ┆ zz:zz ┆ 1999 ┆ 0.0 │
5867
│ 2 ┆ true ┆ yy:yy ┆ 2010 ┆ 0.0 │
5868
│ 3 ┆ false ┆ xx:xx ┆ 2077 ┆ 0.0 │
5869
└─────┴───────────┴───────┴──────┴──────┘
5870
"""
5871
from polars.sql import SQLContext
5872
5873
issue_unstable_warning(
5874
"`sql` is considered **unstable** (although it is close to being considered stable)."
5875
)
5876
with SQLContext(register_globals=False, eager=True) as ctx:
5877
name = table_name if table_name else "self"
5878
ctx.register(name=name, frame=self)
5879
return ctx.execute(query)
5880
5881
@deprecate_renamed_parameter("descending", "reverse", version="1.0.0")
5882
def top_k(
5883
self,
5884
k: int,
5885
*,
5886
by: IntoExpr | Iterable[IntoExpr],
5887
reverse: bool | Sequence[bool] = False,
5888
) -> DataFrame:
5889
"""
5890
Return the `k` largest rows.
5891
5892
Non-null elements are always preferred over null elements, regardless of
5893
the value of `reverse`. The output is not guaranteed to be in any
5894
particular order; call :func:`sort` after this function if you wish the
5895
output to be sorted.
5896
5897
.. versionchanged:: 1.0.0
5898
The `descending` parameter was renamed `reverse`.
5899
5900
Parameters
5901
----------
5902
k
5903
Number of rows to return.
5904
by
5905
Column(s) used to determine the top rows.
5906
Accepts expression input. Strings are parsed as column names.
5907
reverse
5908
Consider the `k` smallest elements of the `by` column(s) (instead of the `k`
5909
largest). This can be specified per column by passing a sequence of
5910
booleans.
5911
5912
See Also
5913
--------
5914
bottom_k
5915
5916
Examples
5917
--------
5918
>>> df = pl.DataFrame(
5919
... {
5920
... "a": ["a", "b", "a", "b", "b", "c"],
5921
... "b": [2, 1, 1, 3, 2, 1],
5922
... }
5923
... )
5924
5925
Get the rows which contain the 4 largest values in column b.
5926
5927
>>> df.top_k(4, by="b")
5928
shape: (4, 2)
5929
┌─────┬─────┐
5930
│ a ┆ b │
5931
│ --- ┆ --- │
5932
│ str ┆ i64 │
5933
╞═════╪═════╡
5934
│ b ┆ 3 │
5935
│ a ┆ 2 │
5936
│ b ┆ 2 │
5937
│ b ┆ 1 │
5938
└─────┴─────┘
5939
5940
Get the rows which contain the 4 largest values when sorting on column b and a.
5941
5942
>>> df.top_k(4, by=["b", "a"])
5943
shape: (4, 2)
5944
┌─────┬─────┐
5945
│ a ┆ b │
5946
│ --- ┆ --- │
5947
│ str ┆ i64 │
5948
╞═════╪═════╡
5949
│ b ┆ 3 │
5950
│ b ┆ 2 │
5951
│ a ┆ 2 │
5952
│ c ┆ 1 │
5953
└─────┴─────┘
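With `reverse=True` the selection flips to the `k` smallest values; the values
are sorted here for illustration because the output order is not guaranteed:

>>> sorted(df.top_k(2, by="b", reverse=True)["b"].to_list())
[1, 1]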
5954
"""
5955
from polars.lazyframe.opt_flags import QueryOptFlags
5956
5957
return (
5958
self.lazy()
5959
.top_k(k, by=by, reverse=reverse)
5960
.collect(
5961
optimizations=QueryOptFlags(
5962
projection_pushdown=False,
5963
predicate_pushdown=False,
5964
comm_subplan_elim=False,
5965
slice_pushdown=True,
5966
)
5967
)
5968
)
5969
5970
@deprecate_renamed_parameter("descending", "reverse", version="1.0.0")
5971
def bottom_k(
5972
self,
5973
k: int,
5974
*,
5975
by: IntoExpr | Iterable[IntoExpr],
5976
reverse: bool | Sequence[bool] = False,
5977
) -> DataFrame:
5978
"""
5979
Return the `k` smallest rows.
5980
5981
Non-null elements are always preferred over null elements, regardless of
5982
the value of `reverse`. The output is not guaranteed to be in any
5983
particular order; call :func:`sort` after this function if you wish the
5984
output to be sorted.
5985
5986
.. versionchanged:: 1.0.0
5987
The `descending` parameter was renamed `reverse`.
5988
5989
Parameters
5990
----------
5991
k
5992
Number of rows to return.
5993
by
5994
Column(s) used to determine the bottom rows.
5995
Accepts expression input. Strings are parsed as column names.
5996
reverse
5997
Consider the `k` largest elements of the `by` column(s) (instead of the `k`
5998
smallest). This can be specified per column by passing a sequence of
5999
booleans.
6000
6001
See Also
6002
--------
6003
top_k
6004
6005
Examples
6006
--------
6007
>>> df = pl.DataFrame(
6008
... {
6009
... "a": ["a", "b", "a", "b", "b", "c"],
6010
... "b": [2, 1, 1, 3, 2, 1],
6011
... }
6012
... )
6013
6014
Get the rows which contain the 4 smallest values in column b.
6015
6016
>>> df.bottom_k(4, by="b")
6017
shape: (4, 2)
6018
┌─────┬─────┐
6019
│ a ┆ b │
6020
│ --- ┆ --- │
6021
│ str ┆ i64 │
6022
╞═════╪═════╡
6023
│ b ┆ 1 │
6024
│ a ┆ 1 │
6025
│ c ┆ 1 │
6026
│ a ┆ 2 │
6027
└─────┴─────┘
6028
6029
Get the rows which contain the 4 smallest values when sorting on column a and b.
6030
6031
>>> df.bottom_k(4, by=["a", "b"])
6032
shape: (4, 2)
6033
┌─────┬─────┐
6034
│ a ┆ b │
6035
│ --- ┆ --- │
6036
│ str ┆ i64 │
6037
╞═════╪═════╡
6038
│ a ┆ 1 │
6039
│ a ┆ 2 │
6040
│ b ┆ 1 │
6041
│ b ┆ 2 │
6042
└─────┴─────┘
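Similarly, `reverse=True` flips the selection to the `k` largest values (sorted
here for illustration because the output order is not guaranteed):

>>> sorted(df.bottom_k(2, by="b", reverse=True)["b"].to_list())
[2, 3]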
6043
"""
6044
from polars.lazyframe.opt_flags import QueryOptFlags
6045
6046
return (
6047
self.lazy()
6048
.bottom_k(k, by=by, reverse=reverse)
6049
.collect(
6050
optimizations=QueryOptFlags(
6051
projection_pushdown=False,
6052
predicate_pushdown=False,
6053
comm_subplan_elim=False,
6054
slice_pushdown=True,
6055
)
6056
)
6057
)
6058
6059
def equals(self, other: DataFrame, *, null_equal: bool = True) -> bool:
6060
"""
6061
Check whether the DataFrame is equal to another DataFrame.
6062
6063
Parameters
6064
----------
6065
other
6066
DataFrame to compare with.
6067
null_equal
6068
Consider null values as equal.
6069
6070
See Also
6071
--------
6072
polars.testing.assert_frame_equal
6073
6074
Examples
6075
--------
6076
>>> df1 = pl.DataFrame(
6077
... {
6078
... "foo": [1, 2, 3],
6079
... "bar": [6.0, 7.0, 8.0],
6080
... "ham": ["a", "b", "c"],
6081
... }
6082
... )
6083
>>> df2 = pl.DataFrame(
6084
... {
6085
... "foo": [3, 2, 1],
6086
... "bar": [8.0, 7.0, 6.0],
6087
... "ham": ["c", "b", "a"],
6088
... }
6089
... )
6090
>>> df1.equals(df1)
6091
True
6092
>>> df1.equals(df2)
6093
False
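A small illustrative sketch of the `null_equal` flag, using a frame `df3` that
contains a null value:

>>> df3 = pl.DataFrame({"foo": [1, None]})
>>> df3.equals(df3.clone())
True
>>> df3.equals(df3.clone(), null_equal=False)
False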
6094
"""
6095
require_same_type(self, other)
6096
return self._df.equals(other._df, null_equal=null_equal)
6097
6098
def slice(self, offset: int, length: int | None = None) -> DataFrame:
6099
"""
6100
Get a slice of this DataFrame.
6101
6102
Parameters
6103
----------
6104
offset
6105
Start index. Negative indexing is supported.
6106
length
6107
Length of the slice. If set to `None`, all rows starting at the offset
6108
will be selected.
6109
6110
Examples
6111
--------
6112
>>> df = pl.DataFrame(
6113
... {
6114
... "foo": [1, 2, 3],
6115
... "bar": [6.0, 7.0, 8.0],
6116
... "ham": ["a", "b", "c"],
6117
... }
6118
... )
6119
>>> df.slice(1, 2)
6120
shape: (2, 3)
6121
┌─────┬─────┬─────┐
6122
│ foo ┆ bar ┆ ham │
6123
│ --- ┆ --- ┆ --- │
6124
│ i64 ┆ f64 ┆ str │
6125
╞═════╪═════╪═════╡
6126
│ 2 ┆ 7.0 ┆ b │
6127
│ 3 ┆ 8.0 ┆ c │
6128
└─────┴─────┴─────┘
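A negative `length` is also accepted and is interpreted relative to the end of
the frame; shown as plain rows for brevity:

>>> df.slice(1, -1).rows()
[(2, 7.0, 'b')]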
6129
"""
6130
if (length is not None) and length < 0:
6131
length = self.height - offset + length
6132
return self._from_pydf(self._df.slice(offset, length))
6133
6134
def head(self, n: int = 5) -> DataFrame:
6135
"""
6136
Get the first `n` rows.
6137
6138
Parameters
6139
----------
6140
n
6141
Number of rows to return. If a negative value is passed, return all rows
6142
except the last `abs(n)`.
6143
6144
See Also
6145
--------
6146
tail, glimpse, slice
6147
6148
Examples
6149
--------
6150
>>> df = pl.DataFrame(
6151
... {
6152
... "foo": [1, 2, 3, 4, 5],
6153
... "bar": [6, 7, 8, 9, 10],
6154
... "ham": ["a", "b", "c", "d", "e"],
6155
... }
6156
... )
6157
>>> df.head(3)
6158
shape: (3, 3)
6159
┌─────┬─────┬─────┐
6160
│ foo ┆ bar ┆ ham │
6161
│ --- ┆ --- ┆ --- │
6162
│ i64 ┆ i64 ┆ str │
6163
╞═════╪═════╪═════╡
6164
│ 1 ┆ 6 ┆ a │
6165
│ 2 ┆ 7 ┆ b │
6166
│ 3 ┆ 8 ┆ c │
6167
└─────┴─────┴─────┘
6168
6169
Pass a negative value to get all rows `except` the last `abs(n)`.
6170
6171
>>> df.head(-3)
6172
shape: (2, 3)
6173
┌─────┬─────┬─────┐
6174
│ foo ┆ bar ┆ ham │
6175
│ --- ┆ --- ┆ --- │
6176
│ i64 ┆ i64 ┆ str │
6177
╞═════╪═════╪═════╡
6178
│ 1 ┆ 6 ┆ a │
6179
│ 2 ┆ 7 ┆ b │
6180
└─────┴─────┴─────┘
6181
"""
6182
if n < 0:
6183
n = max(0, self.height + n)
6184
return self._from_pydf(self._df.head(n))
6185
6186
def tail(self, n: int = 5) -> DataFrame:
6187
"""
6188
Get the last `n` rows.
6189
6190
Parameters
6191
----------
6192
n
6193
Number of rows to return. If a negative value is passed, return all rows
6194
except the first `abs(n)`.
6195
6196
See Also
6197
--------
6198
head, slice
6199
6200
Examples
6201
--------
6202
>>> df = pl.DataFrame(
6203
... {
6204
... "foo": [1, 2, 3, 4, 5],
6205
... "bar": [6, 7, 8, 9, 10],
6206
... "ham": ["a", "b", "c", "d", "e"],
6207
... }
6208
... )
6209
>>> df.tail(3)
6210
shape: (3, 3)
6211
┌─────┬─────┬─────┐
6212
│ foo ┆ bar ┆ ham │
6213
│ --- ┆ --- ┆ --- │
6214
│ i64 ┆ i64 ┆ str │
6215
╞═════╪═════╪═════╡
6216
│ 3 ┆ 8 ┆ c │
6217
│ 4 ┆ 9 ┆ d │
6218
│ 5 ┆ 10 ┆ e │
6219
└─────┴─────┴─────┘
6220
6221
Pass a negative value to get all rows `except` the first `abs(n)`.
6222
6223
>>> df.tail(-3)
6224
shape: (2, 3)
6225
┌─────┬─────┬─────┐
6226
│ foo ┆ bar ┆ ham │
6227
│ --- ┆ --- ┆ --- │
6228
│ i64 ┆ i64 ┆ str │
6229
╞═════╪═════╪═════╡
6230
│ 4 ┆ 9 ┆ d │
6231
│ 5 ┆ 10 ┆ e │
6232
└─────┴─────┴─────┘
6233
"""
6234
if n < 0:
6235
n = max(0, self.height + n)
6236
return self._from_pydf(self._df.tail(n))
6237
6238
def limit(self, n: int = 5) -> DataFrame:
6239
"""
6240
Get the first `n` rows.
6241
6242
Alias for :func:`DataFrame.head`.
6243
6244
Parameters
6245
----------
6246
n
6247
Number of rows to return. If a negative value is passed, return all rows
6248
except the last `abs(n)`.
6249
6250
See Also
6251
--------
6252
head
6253
6254
Examples
6255
--------
6256
Get the first 3 rows of a DataFrame.
6257
6258
>>> df = pl.DataFrame(
6259
... {
6260
... "foo": [1, 2, 3, 4, 5],
6261
... "bar": [6, 7, 8, 9, 10],
6262
... "ham": ["a", "b", "c", "d", "e"],
6263
... }
6264
... )
6265
>>> df.limit(3)
6266
shape: (3, 3)
6267
┌─────┬─────┬─────┐
6268
│ foo ┆ bar ┆ ham │
6269
│ --- ┆ --- ┆ --- │
6270
│ i64 ┆ i64 ┆ str │
6271
╞═════╪═════╪═════╡
6272
│ 1 ┆ 6 ┆ a │
6273
│ 2 ┆ 7 ┆ b │
6274
│ 3 ┆ 8 ┆ c │
6275
└─────┴─────┴─────┘
6276
"""
6277
return self.head(n)
6278
6279
def drop_nans(
6280
self,
6281
subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = None,
6282
) -> DataFrame:
6283
"""
6284
Drop all rows that contain one or more NaN values.
6285
6286
The original order of the remaining rows is preserved.
6287
6288
Parameters
6289
----------
6290
subset
6291
Column name(s) for which NaN values are considered; if set to `None`
6292
(default), use all columns (note that only floating-point columns
6293
can contain NaNs).
6294
6295
See Also
6296
--------
6297
drop_nulls
6298
6299
Notes
6300
-----
6301
A NaN value is not the same as a null value.
6302
To drop null values, use :func:`drop_nulls`.
6303
6304
Examples
6305
--------
6306
>>> df = pl.DataFrame(
6307
... {
6308
... "foo": [-20.5, float("nan"), 80.0],
6309
... "bar": [float("nan"), 110.0, 25.5],
6310
... "ham": ["xxx", "yyy", None],
6311
... }
6312
... )
6313
6314
The default behavior of this method is to drop rows where any single
6315
value in the row is NaN:
6316
6317
>>> df.drop_nans()
6318
shape: (1, 3)
6319
┌──────┬──────┬──────┐
6320
│ foo ┆ bar ┆ ham │
6321
│ --- ┆ --- ┆ --- │
6322
│ f64 ┆ f64 ┆ str │
6323
╞══════╪══════╪══════╡
6324
│ 80.0 ┆ 25.5 ┆ null │
6325
└──────┴──────┴──────┘
6326
6327
This behaviour can be constrained to consider only a subset of columns, as
6328
defined by name, or with a selector. For example, dropping rows only if
6329
there is a NaN in the "bar" column:
6330
6331
>>> df.drop_nans(subset=["bar"])
6332
shape: (2, 3)
6333
┌──────┬───────┬──────┐
6334
│ foo ┆ bar ┆ ham │
6335
│ --- ┆ --- ┆ --- │
6336
│ f64 ┆ f64 ┆ str │
6337
╞══════╪═══════╪══════╡
6338
│ NaN ┆ 110.0 ┆ yyy │
6339
│ 80.0 ┆ 25.5 ┆ null │
6340
└──────┴───────┴──────┘
6341
6342
Dropping a row only if *all* values are NaN requires a different formulation:
6343
6344
>>> df = pl.DataFrame(
6345
... {
6346
... "a": [float("nan"), float("nan"), float("nan"), float("nan")],
6347
... "b": [10.0, 2.5, float("nan"), 5.25],
6348
... "c": [65.75, float("nan"), float("nan"), 10.5],
6349
... }
6350
... )
6351
>>> df.filter(~pl.all_horizontal(pl.all().is_nan()))
6352
shape: (3, 3)
6353
┌─────┬──────┬───────┐
6354
│ a ┆ b ┆ c │
6355
│ --- ┆ --- ┆ --- │
6356
│ f64 ┆ f64 ┆ f64 │
6357
╞═════╪══════╪═══════╡
6358
│ NaN ┆ 10.0 ┆ 65.75 │
6359
│ NaN ┆ 2.5 ┆ NaN │
6360
│ NaN ┆ 5.25 ┆ 10.5 │
6361
└─────┴──────┴───────┘
6362
"""
6363
from polars.lazyframe.opt_flags import QueryOptFlags
6364
6365
return (
6366
self.lazy().drop_nans(subset).collect(optimizations=QueryOptFlags._eager())
6367
)
6368
6369
def drop_nulls(
6370
self,
6371
subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = None,
6372
) -> DataFrame:
6373
"""
6374
Drop all rows that contain one or more null values.
6375
6376
The original order of the remaining rows is preserved.
6377
6378
Parameters
6379
----------
6380
subset
6381
Column name(s) for which null values are considered.
6382
If set to `None` (default), use all columns.
6383
6384
See Also
6385
--------
6386
drop_nans
6387
6388
Notes
6389
-----
6390
A null value is not the same as a NaN value.
6391
To drop NaN values, use :func:`drop_nans`.
6392
6393
Examples
6394
--------
6395
>>> df = pl.DataFrame(
6396
... {
6397
... "foo": [1, 2, 3],
6398
... "bar": [6, None, 8],
6399
... "ham": ["a", "b", None],
6400
... }
6401
... )
6402
6403
The default behavior of this method is to drop rows where any single
6404
value of the row is null.
6405
6406
>>> df.drop_nulls()
6407
shape: (1, 3)
6408
┌─────┬─────┬─────┐
6409
│ foo ┆ bar ┆ ham │
6410
│ --- ┆ --- ┆ --- │
6411
│ i64 ┆ i64 ┆ str │
6412
╞═════╪═════╪═════╡
6413
│ 1 ┆ 6 ┆ a │
6414
└─────┴─────┴─────┘
6415
6416
This behaviour can be constrained to consider only a subset of columns, as
6417
defined by name or with a selector. For example, dropping rows if there is
6418
a null in any of the integer columns:
6419
6420
>>> import polars.selectors as cs
6421
>>> df.drop_nulls(subset=cs.integer())
6422
shape: (2, 3)
6423
┌─────┬─────┬──────┐
6424
│ foo ┆ bar ┆ ham │
6425
│ --- ┆ --- ┆ --- │
6426
│ i64 ┆ i64 ┆ str │
6427
╞═════╪═════╪══════╡
6428
│ 1 ┆ 6 ┆ a │
6429
│ 3 ┆ 8 ┆ null │
6430
└─────┴─────┴──────┘
6431
6432
Below are some additional examples that show how to drop null
6433
values based on other conditions.
6434
6435
>>> df = pl.DataFrame(
6436
... {
6437
... "a": [None, None, None, None],
6438
... "b": [1, 2, None, 1],
6439
... "c": [1, None, None, 1],
6440
... }
6441
... )
6442
>>> df
6443
shape: (4, 3)
6444
┌──────┬──────┬──────┐
6445
│ a ┆ b ┆ c │
6446
│ --- ┆ --- ┆ --- │
6447
│ null ┆ i64 ┆ i64 │
6448
╞══════╪══════╪══════╡
6449
│ null ┆ 1 ┆ 1 │
6450
│ null ┆ 2 ┆ null │
6451
│ null ┆ null ┆ null │
6452
│ null ┆ 1 ┆ 1 │
6453
└──────┴──────┴──────┘
6454
6455
Drop a row only if all values are null:
6456
6457
>>> df.filter(~pl.all_horizontal(pl.all().is_null()))
6458
shape: (3, 3)
6459
┌──────┬─────┬──────┐
6460
│ a ┆ b ┆ c │
6461
│ --- ┆ --- ┆ --- │
6462
│ null ┆ i64 ┆ i64 │
6463
╞══════╪═════╪══════╡
6464
│ null ┆ 1 ┆ 1 │
6465
│ null ┆ 2 ┆ null │
6466
│ null ┆ 1 ┆ 1 │
6467
└──────┴─────┴──────┘
6468
6469
Drop a column if all values are null:
6470
6471
>>> df[[s.name for s in df if not (s.null_count() == df.height)]]
6472
shape: (4, 2)
6473
┌──────┬──────┐
6474
│ b ┆ c │
6475
│ --- ┆ --- │
6476
│ i64 ┆ i64 │
6477
╞══════╪══════╡
6478
│ 1 ┆ 1 │
6479
│ 2 ┆ null │
6480
│ null ┆ null │
6481
│ 1 ┆ 1 │
6482
└──────┴──────┘
6483
"""
6484
from polars.lazyframe.opt_flags import QueryOptFlags
6485
6486
return (
6487
self.lazy().drop_nulls(subset).collect(optimizations=QueryOptFlags._eager())
6488
)
6489
6490
def pipe(
6491
self,
6492
function: Callable[Concatenate[DataFrame, P], T],
6493
*args: P.args,
6494
**kwargs: P.kwargs,
6495
) -> T:
6496
"""
6497
Offers a structured way to apply a sequence of user-defined functions (UDFs).
6498
6499
Parameters
6500
----------
6501
function
6502
Callable; will receive the frame as the first parameter,
6503
followed by any given args/kwargs.
6504
*args
6505
Arguments to pass to the UDF.
6506
**kwargs
6507
Keyword arguments to pass to the UDF.
6508
6509
Notes
6510
-----
6511
It is recommended to use LazyFrame when piping operations, in order
6512
to fully take advantage of query optimization and parallelization.
6513
See :meth:`df.lazy() <polars.DataFrame.lazy>`.
6514
6515
Examples
6516
--------
6517
>>> def cast_str_to_int(data, col_name):
6518
... return data.with_columns(pl.col(col_name).cast(pl.Int64))
6519
>>> df = pl.DataFrame({"a": [1, 2, 3, 4], "b": ["10", "20", "30", "40"]})
6520
>>> df.pipe(cast_str_to_int, col_name="b")
6521
shape: (4, 2)
6522
┌─────┬─────┐
6523
│ a ┆ b │
6524
│ --- ┆ --- │
6525
│ i64 ┆ i64 │
6526
╞═════╪═════╡
6527
│ 1 ┆ 10 │
6528
│ 2 ┆ 20 │
6529
│ 3 ┆ 30 │
6530
│ 4 ┆ 40 │
6531
└─────┴─────┘
6532
6533
>>> df = pl.DataFrame({"b": [1, 2], "a": [3, 4]})
6534
>>> df
6535
shape: (2, 2)
6536
┌─────┬─────┐
6537
│ b ┆ a │
6538
│ --- ┆ --- │
6539
│ i64 ┆ i64 │
6540
╞═════╪═════╡
6541
│ 1 ┆ 3 │
6542
│ 2 ┆ 4 │
6543
└─────┴─────┘
6544
>>> df.pipe(lambda tdf: tdf.select(sorted(tdf.columns)))
6545
shape: (2, 2)
6546
┌─────┬─────┐
6547
│ a ┆ b │
6548
│ --- ┆ --- │
6549
│ i64 ┆ i64 │
6550
╞═════╪═════╡
6551
│ 3 ┆ 1 │
6552
│ 4 ┆ 2 │
6553
└─────┴─────┘
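As recommended in the notes above, the same pattern can be applied to the lazy
frame so that the whole pipeline benefits from query optimization; a minimal
sketch reusing the function defined earlier (not evaluated here):

>>> df.lazy().pipe(cast_str_to_int, col_name="b").collect()  # doctest: +SKIP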
6554
"""
6555
return function(self, *args, **kwargs)
6556
6557
def map_columns(
6558
self,
6559
column_names: str | Sequence[str] | pl.Selector,
6560
function: Callable[[Series], Series],
6561
*args: P.args,
6562
**kwargs: P.kwargs,
6563
) -> DataFrame:
6564
"""
6565
Apply eager functions to columns of a DataFrame.
6566
6567
Users should always prefer :meth:`with_columns` unless they are using
6568
expressions that are only possible on `Series` and not on `Expr`. This is almost
6569
never the case, except for a select few functions that cannot know the
6570
output datatype without looking at the data.
6571
6572
Parameters
6573
----------
6574
column_names
6575
The columns to apply the UDF to.
6576
function
6577
Callable; will receive a column series as the first parameter,
6578
followed by any given args/kwargs.
6579
*args
6580
Arguments to pass to the UDF.
6581
**kwargs
6582
Keyword arguments to pass to the UDF.
6583
6584
Examples
6585
--------
6586
>>> df = pl.DataFrame({"a": [1, 2, 3, 4], "b": ["10", "20", "30", "40"]})
6587
>>> df.map_columns("a", lambda s: s.shrink_dtype())
6588
shape: (4, 2)
6589
┌─────┬─────┐
6590
│ a ┆ b │
6591
│ --- ┆ --- │
6592
│ i8 ┆ str │
6593
╞═════╪═════╡
6594
│ 1 ┆ 10 │
6595
│ 2 ┆ 20 │
6596
│ 3 ┆ 30 │
6597
│ 4 ┆ 40 │
6598
└─────┴─────┘
6599
6600
>>> df = pl.DataFrame(
6601
... {
6602
... "a": ['{"x":"a"}', None, '{"x":"b"}', None],
6603
... "b": ['{"a":1, "b": true}', None, '{"a":2, "b": false}', None],
6604
... }
6605
... )
6606
>>> df.map_columns(["a", "b"], lambda s: s.str.json_decode())
6607
shape: (4, 2)
6608
┌───────────┬───────────┐
6609
│ a ┆ b │
6610
│ --- ┆ --- │
6611
│ struct[1] ┆ struct[2] │
6612
╞═══════════╪═══════════╡
6613
│ {"a"} ┆ {1,true} │
6614
│ null ┆ null │
6615
│ {"b"} ┆ {2,false} │
6616
│ null ┆ null │
6617
└───────────┴───────────┘
6618
>>> import polars.selectors as cs
6619
>>> df.map_columns(cs.all(), lambda s: s.str.json_decode())
6620
shape: (4, 2)
6621
┌───────────┬───────────┐
6622
│ a ┆ b │
6623
│ --- ┆ --- │
6624
│ struct[1] ┆ struct[2] │
6625
╞═══════════╪═══════════╡
6626
│ {"a"} ┆ {1,true} │
6627
│ null ┆ null │
6628
│ {"b"} ┆ {2,false} │
6629
│ null ┆ null │
6630
└───────────┴───────────┘
6631
6632
See Also
6633
--------
6634
with_columns
6635
"""
6636
c_names: list[str]
6637
if isinstance(column_names, (pl.Selector, pl.Expr)):
6638
from polars.selectors import expand_selector
6639
6640
c_names = list(expand_selector(self, column_names))
6641
elif isinstance(column_names, str):
6642
c_names = [column_names]
6643
else:
6644
c_names = list(column_names)
6645
6646
return self.with_columns(
6647
**{c: function(self[c], *args, **kwargs) for c in c_names}
6648
)
6649
6650
def with_row_index(self, name: str = "index", offset: int = 0) -> DataFrame:
6651
"""
6652
Add a row index as the first column in the DataFrame.
6653
6654
Parameters
6655
----------
6656
name
6657
Name of the index column.
6658
offset
6659
Start the index at this offset. Cannot be negative.
6660
6661
Notes
6662
-----
6663
The resulting column does not have any special properties. It is a regular
6664
column of type `UInt32` (or `UInt64` in `polars-u64-idx`).
6665
6666
Examples
6667
--------
6668
>>> df = pl.DataFrame(
6669
... {
6670
... "a": [1, 3, 5],
6671
... "b": [2, 4, 6],
6672
... }
6673
... )
6674
>>> df.with_row_index()
6675
shape: (3, 3)
6676
┌───────┬─────┬─────┐
6677
│ index ┆ a ┆ b │
6678
│ --- ┆ --- ┆ --- │
6679
│ u32 ┆ i64 ┆ i64 │
6680
╞═══════╪═════╪═════╡
6681
│ 0 ┆ 1 ┆ 2 │
6682
│ 1 ┆ 3 ┆ 4 │
6683
│ 2 ┆ 5 ┆ 6 │
6684
└───────┴─────┴─────┘
6685
>>> df.with_row_index("id", offset=1000)
6686
shape: (3, 3)
6687
┌──────┬─────┬─────┐
6688
│ id ┆ a ┆ b │
6689
│ --- ┆ --- ┆ --- │
6690
│ u32 ┆ i64 ┆ i64 │
6691
╞══════╪═════╪═════╡
6692
│ 1000 ┆ 1 ┆ 2 │
6693
│ 1001 ┆ 3 ┆ 4 │
6694
│ 1002 ┆ 5 ┆ 6 │
6695
└──────┴─────┴─────┘
6696
6697
An index column can also be created using the expressions :func:`int_range`
6698
and :func:`len`.
6699
6700
>>> df.select(
6701
... pl.int_range(pl.len(), dtype=pl.UInt32).alias("index"),
6702
... pl.all(),
6703
... )
6704
shape: (3, 3)
6705
┌───────┬─────┬─────┐
6706
│ index ┆ a ┆ b │
6707
│ --- ┆ --- ┆ --- │
6708
│ u32 ┆ i64 ┆ i64 │
6709
╞═══════╪═════╪═════╡
6710
│ 0 ┆ 1 ┆ 2 │
6711
│ 1 ┆ 3 ┆ 4 │
6712
│ 2 ┆ 5 ┆ 6 │
6713
└───────┴─────┴─────┘
6714
"""
6715
try:
6716
return self._from_pydf(self._df.with_row_index(name, offset))
6717
except OverflowError:
6718
issue = "negative" if offset < 0 else "greater than the maximum index value"
6719
msg = f"`offset` input for `with_row_index` cannot be {issue}, got {offset}"
6720
raise ValueError(msg) from None
6721
6722
@deprecated(
6723
"`DataFrame.with_row_count` is deprecated; use `with_row_index` instead."
6724
" Note that the default column name has changed from 'row_nr' to 'index'."
6725
)
6726
def with_row_count(self, name: str = "row_nr", offset: int = 0) -> DataFrame:
6727
"""
6728
Add a column at index 0 that counts the rows.
6729
6730
.. deprecated:: 0.20.4
6731
Use the :meth:`with_row_index` method instead.
6732
Note that the default column name has changed from 'row_nr' to 'index'.
6733
6734
Parameters
6735
----------
6736
name
6737
Name of the column to add.
6738
offset
6739
Start the row count at this offset. Default = 0
6740
6741
Examples
6742
--------
6743
>>> df = pl.DataFrame(
6744
... {
6745
... "a": [1, 3, 5],
6746
... "b": [2, 4, 6],
6747
... }
6748
... )
6749
>>> df.with_row_count() # doctest: +SKIP
6750
shape: (3, 3)
6751
┌────────┬─────┬─────┐
6752
│ row_nr ┆ a ┆ b │
6753
│ --- ┆ --- ┆ --- │
6754
│ u32 ┆ i64 ┆ i64 │
6755
╞════════╪═════╪═════╡
6756
│ 0 ┆ 1 ┆ 2 │
6757
│ 1 ┆ 3 ┆ 4 │
6758
│ 2 ┆ 5 ┆ 6 │
6759
└────────┴─────┴─────┘
6760
"""
6761
return self.with_row_index(name, offset)
6762
6763
def group_by(
6764
self,
6765
*by: IntoExpr | Iterable[IntoExpr],
6766
maintain_order: bool = False,
6767
**named_by: IntoExpr,
6768
) -> GroupBy:
6769
"""
6770
Start a group by operation.
6771
6772
Parameters
6773
----------
6774
*by
6775
Column(s) to group by. Accepts expression input. Strings are parsed as
6776
column names.
6777
maintain_order
6778
Ensure that the order of the groups is consistent with the input data.
6779
This is slower than a default group by.
6780
Setting this to `True` prevents the query from running
6781
on the streaming engine.
6782
6783
.. note::
6784
Within each group, the order of rows is always preserved, regardless
6785
of this argument.
6786
**named_by
6787
Additional columns to group by, specified as keyword arguments.
6788
The columns will be renamed to the keyword used.
6789
6790
Returns
6791
-------
6792
GroupBy
6793
Object which can be used to perform aggregations.
6794
6795
Examples
6796
--------
6797
Group by one column and call `agg` to compute the grouped sum of another
6798
column.
6799
6800
>>> df = pl.DataFrame(
6801
... {
6802
... "a": ["a", "b", "a", "b", "c"],
6803
... "b": [1, 2, 1, 3, 3],
6804
... "c": [5, 4, 3, 2, 1],
6805
... }
6806
... )
6807
>>> df.group_by("a").agg(pl.col("b").sum()) # doctest: +IGNORE_RESULT
6808
shape: (3, 2)
6809
┌─────┬─────┐
6810
│ a ┆ b │
6811
│ --- ┆ --- │
6812
│ str ┆ i64 │
6813
╞═════╪═════╡
6814
│ a ┆ 2 │
6815
│ b ┆ 5 │
6816
│ c ┆ 3 │
6817
└─────┴─────┘
6818
6819
Set `maintain_order=True` to ensure the order of the groups is consistent with
6820
the input.
6821
6822
>>> df.group_by("a", maintain_order=True).agg(pl.col("c"))
6823
shape: (3, 2)
6824
┌─────┬───────────┐
6825
│ a ┆ c │
6826
│ --- ┆ --- │
6827
│ str ┆ list[i64] │
6828
╞═════╪═══════════╡
6829
│ a ┆ [5, 3] │
6830
│ b ┆ [4, 2] │
6831
│ c ┆ [1] │
6832
└─────┴───────────┘
6833
6834
Group by multiple columns by passing a list of column names.
6835
6836
>>> df.group_by(["a", "b"]).agg(pl.max("c")) # doctest: +IGNORE_RESULT
6837
shape: (4, 3)
6838
┌─────┬─────┬─────┐
6839
│ a ┆ b ┆ c │
6840
│ --- ┆ --- ┆ --- │
6841
│ str ┆ i64 ┆ i64 │
6842
╞═════╪═════╪═════╡
6843
│ a ┆ 1 ┆ 5 │
6844
│ b ┆ 2 ┆ 4 │
6845
│ b ┆ 3 ┆ 2 │
6846
│ c ┆ 3 ┆ 1 │
6847
└─────┴─────┴─────┘
6848
6849
Or use positional arguments to group by multiple columns in the same way.
6850
Expressions are also accepted.
6851
6852
>>> df.group_by("a", pl.col("b") // 2).agg(pl.col("c").mean()) # doctest: +SKIP
6853
shape: (3, 3)
6854
┌─────┬─────┬─────┐
6855
│ a ┆ b ┆ c │
6856
│ --- ┆ --- ┆ --- │
6857
│ str ┆ i64 ┆ f64 │
6858
╞═════╪═════╪═════╡
6859
│ a ┆ 0 ┆ 4.0 │
6860
│ b ┆ 1 ┆ 3.0 │
6861
│ c ┆ 1 ┆ 1.0 │
6862
└─────┴─────┴─────┘
6863
6864
The `GroupBy` object returned by this method is iterable, returning the name
6865
and data of each group.
6866
6867
>>> for name, data in df.group_by("a"): # doctest: +SKIP
6868
... print(name)
6869
... print(data)
6870
('a',)
6871
shape: (2, 3)
6872
┌─────┬─────┬─────┐
6873
│ a ┆ b ┆ c │
6874
│ --- ┆ --- ┆ --- │
6875
│ str ┆ i64 ┆ i64 │
6876
╞═════╪═════╪═════╡
6877
│ a ┆ 1 ┆ 5 │
6878
│ a ┆ 1 ┆ 3 │
6879
└─────┴─────┴─────┘
6880
('b',)
6881
shape: (2, 3)
6882
┌─────┬─────┬─────┐
6883
│ a ┆ b ┆ c │
6884
│ --- ┆ --- ┆ --- │
6885
│ str ┆ i64 ┆ i64 │
6886
╞═════╪═════╪═════╡
6887
│ b ┆ 2 ┆ 4 │
6888
│ b ┆ 3 ┆ 2 │
6889
└─────┴─────┴─────┘
6890
('c',)
6891
shape: (1, 3)
6892
┌─────┬─────┬─────┐
6893
│ a ┆ b ┆ c │
6894
│ --- ┆ --- ┆ --- │
6895
│ str ┆ i64 ┆ i64 │
6896
╞═════╪═════╪═════╡
6897
│ c ┆ 3 ┆ 1 │
6898
└─────┴─────┴─────┘
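Keyword arguments can also be used to group by an expression under a new name,
as described for `named_by` above; the result is sorted and shown as plain rows
here because the group order is not guaranteed:

>>> df.group_by(half_b=pl.col("b") // 2).agg(pl.col("c").sum()).sort("half_b").rows()
[(0, 8), (1, 7)]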
6899
"""
6900
for value in named_by.values():
6901
if not isinstance(value, (str, pl.Expr, pl.Series)):
6902
msg = (
6903
f"Expected Polars expression or object convertible to one, got {type(value)}.\n\n"
6904
"Hint: if you tried\n"
6905
f" group_by(by={value!r})\n"
6906
"then you probably want to use this instead:\n"
6907
f" group_by({value!r})"
6908
)
6909
raise TypeError(msg)
6910
return GroupBy(self, *by, **named_by, maintain_order=maintain_order)
6911
6912
@deprecate_renamed_parameter("by", "group_by", version="0.20.14")
6913
def rolling(
6914
self,
6915
index_column: IntoExpr,
6916
*,
6917
period: str | timedelta,
6918
offset: str | timedelta | None = None,
6919
closed: ClosedInterval = "right",
6920
group_by: IntoExpr | Iterable[IntoExpr] | None = None,
6921
) -> RollingGroupBy:
6922
"""
6923
Create rolling groups based on a temporal or integer column.
6924
6925
Unlike `group_by_dynamic`, the windows are determined by the individual
6926
values rather than by a constant interval. For constant intervals use
6927
:func:`DataFrame.group_by_dynamic`.
6928
6929
If you have a time series `<t_0, t_1, ..., t_n>`, then by default the
6930
windows created will be
6931
6932
* (t_0 - period, t_0]
6933
* (t_1 - period, t_1]
6934
* ...
6935
* (t_n - period, t_n]
6936
6937
whereas if you pass a non-default `offset`, then the windows will be
6938
6939
* (t_0 + offset, t_0 + offset + period]
6940
* (t_1 + offset, t_1 + offset + period]
6941
* ...
6942
* (t_n + offset, t_n + offset + period]
6943
6944
The `period` and `offset` arguments are created either from a timedelta, or
6945
by using the following string language:
6946
6947
- 1ns (1 nanosecond)
6948
- 1us (1 microsecond)
6949
- 1ms (1 millisecond)
6950
- 1s (1 second)
6951
- 1m (1 minute)
6952
- 1h (1 hour)
6953
- 1d (1 calendar day)
6954
- 1w (1 calendar week)
6955
- 1mo (1 calendar month)
6956
- 1q (1 calendar quarter)
6957
- 1y (1 calendar year)
6958
- 1i (1 index count)
6959
6960
Or combine them:
6961
"3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
6962
6963
By "calendar day", we mean the corresponding time on the next day (which may
6964
not be 24 hours, due to daylight savings). Similarly for "calendar week",
6965
"calendar month", "calendar quarter", and "calendar year".
6966
6967
.. versionchanged:: 0.20.14
6968
The `by` parameter was renamed `group_by`.
6969
6970
Parameters
6971
----------
6972
index_column
6973
Column used to group based on the time window.
6974
Often of type Date/Datetime.
6975
This column must be sorted in ascending order (or, if `group_by` is
6976
specified, then it must be sorted in ascending order within each group).
6977
6978
In case of a rolling operation on indices, dtype needs to be one of
6979
{UInt32, UInt64, Int32, Int64}. Note that the first three get temporarily
6980
cast to Int64, so if performance matters use an Int64 column.
6981
period
6982
Length of the window - must be non-negative.
6983
offset
6984
Offset of the window. Default is `-period`.
6985
closed : {'right', 'left', 'both', 'none'}
6986
Define which sides of the temporal interval are closed (inclusive).
6987
group_by
6988
Also group by this column/these columns
6989
6990
Returns
6991
-------
6992
RollingGroupBy
6993
Object you can call `.agg` on to aggregate by groups, the result
6994
of which will be sorted by `index_column` (but note that if `group_by`
6995
columns are passed, it will only be sorted within each group).
6996
6997
See Also
6998
--------
6999
group_by_dynamic
7000
7001
Examples
7002
--------
7003
>>> dates = [
7004
... "2020-01-01 13:45:48",
7005
... "2020-01-01 16:42:13",
7006
... "2020-01-01 16:45:09",
7007
... "2020-01-02 18:12:48",
7008
... "2020-01-03 19:45:32",
7009
... "2020-01-08 23:16:43",
7010
... ]
7011
>>> df = pl.DataFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns(
7012
... pl.col("dt").str.strptime(pl.Datetime).set_sorted()
7013
... )
7014
>>> out = df.rolling(index_column="dt", period="2d").agg(
7015
... [
7016
... pl.sum("a").alias("sum_a"),
7017
... pl.min("a").alias("min_a"),
7018
... pl.max("a").alias("max_a"),
7019
... ]
7020
... )
7021
>>> assert out["sum_a"].to_list() == [3, 10, 15, 24, 11, 1]
7022
>>> assert out["max_a"].to_list() == [3, 7, 7, 9, 9, 1]
7023
>>> assert out["min_a"].to_list() == [3, 3, 3, 3, 2, 1]
7024
>>> out
7025
shape: (6, 4)
7026
┌─────────────────────┬───────┬───────┬───────┐
7027
│ dt ┆ sum_a ┆ min_a ┆ max_a │
7028
│ --- ┆ --- ┆ --- ┆ --- │
7029
│ datetime[μs] ┆ i64 ┆ i64 ┆ i64 │
7030
╞═════════════════════╪═══════╪═══════╪═══════╡
7031
│ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │
7032
│ 2020-01-01 16:42:13 ┆ 10 ┆ 3 ┆ 7 │
7033
│ 2020-01-01 16:45:09 ┆ 15 ┆ 3 ┆ 7 │
7034
│ 2020-01-02 18:12:48 ┆ 24 ┆ 3 ┆ 9 │
7035
│ 2020-01-03 19:45:32 ┆ 11 ┆ 2 ┆ 9 │
7036
│ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │
7037
└─────────────────────┴───────┴───────┴───────┘
7038
7039
If you use an index count in `period` or `offset`, then it's based on the
7040
values in `index_column`:
7041
7042
>>> df = pl.DataFrame({"int": [0, 4, 5, 6, 8], "value": [1, 4, 2, 4, 1]})
7043
>>> df.rolling("int", period="3i").agg(pl.col("int").alias("aggregated"))
7044
shape: (5, 2)
7045
┌─────┬────────────┐
7046
│ int ┆ aggregated │
7047
│ --- ┆ --- │
7048
│ i64 ┆ list[i64] │
7049
╞═════╪════════════╡
7050
│ 0 ┆ [0] │
7051
│ 4 ┆ [4] │
7052
│ 5 ┆ [4, 5] │
7053
│ 6 ┆ [4, 5, 6] │
7054
│ 8 ┆ [6, 8] │
7055
└─────┴────────────┘
7056
7057
If you want the index count to be based on row number, then you may want to
7058
combine `rolling` with :meth:`.with_row_index`.
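A minimal sketch of that combination, building on the frame above (not
evaluated here):

>>> df.with_row_index().rolling("index", period="2i").agg(
...     pl.col("value").sum()
... )  # doctest: +SKIP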
7059
"""
7060
return RollingGroupBy(
7061
self,
7062
index_column=index_column,
7063
period=period,
7064
offset=offset,
7065
closed=closed,
7066
group_by=group_by,
7067
)
7068
7069
@deprecate_renamed_parameter("by", "group_by", version="0.20.14")
7070
def group_by_dynamic(
7071
self,
7072
index_column: IntoExpr,
7073
*,
7074
every: str | timedelta,
7075
period: str | timedelta | None = None,
7076
offset: str | timedelta | None = None,
7077
include_boundaries: bool = False,
7078
closed: ClosedInterval = "left",
7079
label: Label = "left",
7080
group_by: IntoExpr | Iterable[IntoExpr] | None = None,
7081
start_by: StartBy = "window",
7082
) -> DynamicGroupBy:
7083
"""
7084
Group based on a time value (or index value of type Int32, Int64).
7085
7086
Time windows are calculated and rows are assigned to windows. Unlike a
7087
normal group by, a row can be a member of multiple groups.
7088
By default, the windows look like:
7089
7090
- [start, start + period)
7091
- [start + every, start + every + period)
7092
- [start + 2*every, start + 2*every + period)
7093
- ...
7094
7095
where `start` is determined by `start_by`, `offset`, `every`, and the earliest
7096
datapoint. See the `start_by` argument description for details.
7097
7098
.. warning::
7099
The index column must be sorted in ascending order. If `group_by` is passed, then
7100
the index column must be sorted in ascending order within each group.
7101
7102
.. versionchanged:: 0.20.14
7103
The `by` parameter was renamed `group_by`.
7104
7105
Parameters
7106
----------
7107
index_column
7108
Column used to group based on the time window.
7109
Often of type Date/Datetime.
7110
This column must be sorted in ascending order (or, if `group_by` is specified,
7111
then it must be sorted in ascending order within each group).
7112
7113
In case of a dynamic group by on indices, dtype needs to be one of
7114
{Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if
7115
performance matters use an Int64 column.
7116
every
7117
Interval of the window.
7118
period
7119
Length of the window; if None, it will equal 'every'.
7120
offset
7121
Offset of the window; does not take effect if `start_by` is 'datapoint'.
7122
Defaults to zero.
7123
include_boundaries
7124
Add the lower and upper bound of the window to the "_lower_boundary" and
7125
"_upper_boundary" columns. This will impact performance because it's harder to
7126
parallelize
7127
closed : {'left', 'right', 'both', 'none'}
7128
Define which sides of the temporal interval are closed (inclusive).
7129
label : {'left', 'right', 'datapoint'}
7130
Define which label to use for the window:
7131
7132
- 'left': lower boundary of the window
7133
- 'right': upper boundary of the window
7134
- 'datapoint': the first value of the index column in the given window.
7135
If you don't need the label to be at one of the boundaries, choose this
7136
option for maximum performance
7137
group_by
7138
Also group by this column/these columns
7139
start_by : {'window', 'datapoint', 'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday'}
7140
The strategy to determine the start of the first window by.
7141
7142
* 'window': Start by taking the earliest timestamp, truncating it with
7143
`every`, and then adding `offset`.
7144
Note that weekly windows start on Monday.
7145
* 'datapoint': Start from the first encountered data point.
7146
* a day of the week (only takes effect if `every` contains `'w'`):
7147
7148
* 'monday': Start the window on the Monday before the first data point.
7149
* 'tuesday': Start the window on the Tuesday before the first data point.
7150
* ...
7151
* 'sunday': Start the window on the Sunday before the first data point.
7152
7153
The resulting window is then shifted back until the earliest datapoint
7154
is in or in front of it.
7155
7156
Returns
7157
-------
7158
DynamicGroupBy
7159
Object you can call `.agg` on to aggregate by groups, the result
7160
of which will be sorted by `index_column` (but note that if `group_by` columns are
7161
passed, it will only be sorted within each group).
7162
7163
See Also
7164
--------
7165
rolling
7166
7167
Notes
7168
-----
7169
1) If you're coming from pandas, then
7170
7171
.. code-block:: python
7172
7173
# polars
7174
df.group_by_dynamic("ts", every="1d").agg(pl.col("value").sum())
7175
7176
is equivalent to
7177
7178
.. code-block:: python
7179
7180
# pandas
7181
df.set_index("ts").resample("D")["value"].sum().reset_index()
7182
7183
though note that, unlike pandas, polars doesn't add extra rows for empty
7184
windows. If you need `index_column` to be evenly spaced, then please combine
7185
with :func:`DataFrame.upsample`.
7186
7187
2) The `every`, `period` and `offset` arguments are created with
7188
the following string language:
7189
7190
- 1ns (1 nanosecond)
7191
- 1us (1 microsecond)
7192
- 1ms (1 millisecond)
7193
- 1s (1 second)
7194
- 1m (1 minute)
7195
- 1h (1 hour)
7196
- 1d (1 calendar day)
7197
- 1w (1 calendar week)
7198
- 1mo (1 calendar month)
7199
- 1q (1 calendar quarter)
7200
- 1y (1 calendar year)
7201
- 1i (1 index count)
7202
7203
Or combine them (except in `every`):
7204
"3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
7205
7206
By "calendar day", we mean the corresponding time on the next day (which may
7207
not be 24 hours, due to daylight savings). Similarly for "calendar week",
7208
"calendar month", "calendar quarter", and "calendar year".
7209
7210
In case of a group_by_dynamic on an integer column, the windows are defined by:
7211
7212
- "1i" # length 1
7213
- "10i" # length 10
7214
7215
Examples
7216
--------
7217
>>> from datetime import datetime
7218
>>> df = pl.DataFrame(
7219
... {
7220
... "time": pl.datetime_range(
7221
... start=datetime(2021, 12, 16),
7222
... end=datetime(2021, 12, 16, 3),
7223
... interval="30m",
7224
... eager=True,
7225
... ),
7226
... "n": range(7),
7227
... }
7228
... )
7229
>>> df
7230
shape: (7, 2)
7231
┌─────────────────────┬─────┐
7232
│ time ┆ n │
7233
│ --- ┆ --- │
7234
│ datetime[μs] ┆ i64 │
7235
╞═════════════════════╪═════╡
7236
│ 2021-12-16 00:00:00 ┆ 0 │
7237
│ 2021-12-16 00:30:00 ┆ 1 │
7238
│ 2021-12-16 01:00:00 ┆ 2 │
7239
│ 2021-12-16 01:30:00 ┆ 3 │
7240
│ 2021-12-16 02:00:00 ┆ 4 │
7241
│ 2021-12-16 02:30:00 ┆ 5 │
7242
│ 2021-12-16 03:00:00 ┆ 6 │
7243
└─────────────────────┴─────┘
7244
7245
Group by windows of 1 hour.
7246
7247
>>> df.group_by_dynamic("time", every="1h", closed="right").agg(pl.col("n"))
7248
shape: (4, 2)
7249
┌─────────────────────┬───────────┐
7250
│ time ┆ n │
7251
│ --- ┆ --- │
7252
│ datetime[μs] ┆ list[i64] │
7253
╞═════════════════════╪═══════════╡
7254
│ 2021-12-15 23:00:00 ┆ [0] │
7255
│ 2021-12-16 00:00:00 ┆ [1, 2] │
7256
│ 2021-12-16 01:00:00 ┆ [3, 4] │
7257
│ 2021-12-16 02:00:00 ┆ [5, 6] │
7258
└─────────────────────┴───────────┘
7259
7260
The window boundaries can also be added to the aggregation result
7261
7262
>>> df.group_by_dynamic(
7263
... "time", every="1h", include_boundaries=True, closed="right"
7264
... ).agg(pl.col("n").mean())
7265
shape: (4, 4)
7266
┌─────────────────────┬─────────────────────┬─────────────────────┬─────┐
7267
│ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │
7268
│ --- ┆ --- ┆ --- ┆ --- │
7269
│ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ f64 │
7270
╞═════════════════════╪═════════════════════╪═════════════════════╪═════╡
7271
│ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 0.0 │
7272
│ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 1.5 │
7273
│ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 3.5 │
7274
│ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 5.5 │
7275
└─────────────────────┴─────────────────────┴─────────────────────┴─────┘
7276
7277
When closed="left", the window excludes the right end of interval:
7278
[lower_bound, upper_bound)
7279
7280
>>> df.group_by_dynamic("time", every="1h", closed="left").agg(pl.col("n"))
7281
shape: (4, 2)
7282
┌─────────────────────┬───────────┐
7283
│ time ┆ n │
7284
│ --- ┆ --- │
7285
│ datetime[μs] ┆ list[i64] │
7286
╞═════════════════════╪═══════════╡
7287
│ 2021-12-16 00:00:00 ┆ [0, 1] │
7288
│ 2021-12-16 01:00:00 ┆ [2, 3] │
7289
│ 2021-12-16 02:00:00 ┆ [4, 5] │
7290
│ 2021-12-16 03:00:00 ┆ [6] │
7291
└─────────────────────┴───────────┘
7292
7293
When closed="both" the time values at the window boundaries belong to 2 groups.
7294
7295
>>> df.group_by_dynamic("time", every="1h", closed="both").agg(pl.col("n"))
7296
shape: (4, 2)
7297
┌─────────────────────┬───────────┐
7298
│ time ┆ n │
7299
│ --- ┆ --- │
7300
│ datetime[μs] ┆ list[i64] │
7301
╞═════════════════════╪═══════════╡
7302
│ 2021-12-16 00:00:00 ┆ [0, 1, 2] │
7303
│ 2021-12-16 01:00:00 ┆ [2, 3, 4] │
7304
│ 2021-12-16 02:00:00 ┆ [4, 5, 6] │
7305
│ 2021-12-16 03:00:00 ┆ [6] │
7306
└─────────────────────┴───────────┘
7307
7308
Dynamic group bys can also be combined with grouping on normal keys
7309
7310
>>> df = df.with_columns(groups=pl.Series(["a", "a", "a", "b", "b", "a", "a"]))
7311
>>> df
7312
shape: (7, 3)
7313
┌─────────────────────┬─────┬────────┐
7314
│ time ┆ n ┆ groups │
7315
│ --- ┆ --- ┆ --- │
7316
│ datetime[μs] ┆ i64 ┆ str │
7317
╞═════════════════════╪═════╪════════╡
7318
│ 2021-12-16 00:00:00 ┆ 0 ┆ a │
7319
│ 2021-12-16 00:30:00 ┆ 1 ┆ a │
7320
│ 2021-12-16 01:00:00 ┆ 2 ┆ a │
7321
│ 2021-12-16 01:30:00 ┆ 3 ┆ b │
7322
│ 2021-12-16 02:00:00 ┆ 4 ┆ b │
7323
│ 2021-12-16 02:30:00 ┆ 5 ┆ a │
7324
│ 2021-12-16 03:00:00 ┆ 6 ┆ a │
7325
└─────────────────────┴─────┴────────┘
7326
>>> df.group_by_dynamic(
7327
... "time",
7328
... every="1h",
7329
... closed="both",
7330
... group_by="groups",
7331
... include_boundaries=True,
7332
... ).agg(pl.col("n"))
7333
shape: (6, 5)
7334
┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬───────────┐
7335
│ groups ┆ _lower_boundary ┆ _upper_boundary ┆ time ┆ n │
7336
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
7337
│ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] ┆ list[i64] │
7338
╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪═══════════╡
7339
│ a ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ [0, 1, 2] │
7340
│ a ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [2] │
7341
│ a ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [5, 6] │
7342
│ a ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ [6] │
7343
│ b ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [3, 4] │
7344
│ b ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [4] │
7345
└────────┴─────────────────────┴─────────────────────┴─────────────────────┴───────────┘
7346
7347
Dynamic group by on an index column
7348
7349
>>> df = pl.DataFrame(
7350
... {
7351
... "idx": pl.int_range(0, 6, eager=True),
7352
... "A": ["A", "A", "B", "B", "B", "C"],
7353
... }
7354
... )
7355
>>> (
7356
... df.group_by_dynamic(
7357
... "idx",
7358
... every="2i",
7359
... period="3i",
7360
... include_boundaries=True,
7361
... closed="right",
7362
... ).agg(pl.col("A").alias("A_agg_list"))
7363
... )
7364
shape: (4, 4)
7365
┌─────────────────┬─────────────────┬─────┬─────────────────┐
7366
│ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list │
7367
│ --- ┆ --- ┆ --- ┆ --- │
7368
│ i64 ┆ i64 ┆ i64 ┆ list[str] │
7369
╞═════════════════╪═════════════════╪═════╪═════════════════╡
7370
│ -2 ┆ 1 ┆ -2 ┆ ["A", "A"] │
7371
│ 0 ┆ 3 ┆ 0 ┆ ["A", "B", "B"] │
7372
│ 2 ┆ 5 ┆ 2 ┆ ["B", "B", "C"] │
7373
│ 4 ┆ 7 ┆ 4 ┆ ["C"] │
7374
└─────────────────┴─────────────────┴─────┴─────────────────┘
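
The `start_by` argument controls where the first window begins. Below is a
minimal illustrative sketch using a small throwaway frame (`days` and `value`
are made-up names; output omitted), anchoring weekly windows to the Monday
before the first data point:

>>> from datetime import date
>>> days = pl.DataFrame(
...     {
...         "day": pl.date_range(date(2021, 12, 13), date(2021, 12, 26), eager=True),
...         "value": range(14),
...     }
... )
>>> days.group_by_dynamic("day", every="1w", start_by="monday").agg(
...     pl.col("value").sum()
... )  # doctest: +IGNORE_RESULT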
7375
""" # noqa: W505
7376
return DynamicGroupBy(
7377
self,
7378
index_column=index_column,
7379
every=every,
7380
period=period,
7381
offset=offset,
7382
label=label,
7383
include_boundaries=include_boundaries,
7384
closed=closed,
7385
group_by=group_by,
7386
start_by=start_by,
7387
)
7388
7389
@deprecate_renamed_parameter("by", "group_by", version="0.20.14")
7390
def upsample(
7391
self,
7392
time_column: str,
7393
*,
7394
every: str | timedelta,
7395
group_by: str | Sequence[str] | None = None,
7396
maintain_order: bool = False,
7397
) -> DataFrame:
7398
"""
7399
Upsample a DataFrame at a regular frequency.
7400
7401
The `every` argument is created with the following string language:
7402
7403
- 1ns (1 nanosecond)
7404
- 1us (1 microsecond)
7405
- 1ms (1 millisecond)
7406
- 1s (1 second)
7407
- 1m (1 minute)
7408
- 1h (1 hour)
7409
- 1d (1 calendar day)
7410
- 1w (1 calendar week)
7411
- 1mo (1 calendar month)
7412
- 1q (1 calendar quarter)
7413
- 1y (1 calendar year)
7414
- 1i (1 index count)
7415
7416
Or combine them:
7417
7418
- "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
7419
7420
By "calendar day", we mean the corresponding time on the next day (which may
7421
not be 24 hours, due to daylight savings). Similarly for "calendar week",
7422
"calendar month", "calendar quarter", and "calendar year".
7423
7424
.. versionchanged:: 0.20.14
7425
The `by` parameter was renamed `group_by`.
7426
7427
Parameters
7428
----------
7429
time_column
7430
Time column that will be used to determine a date_range.
7431
Note that this column has to be sorted for the output to make sense.
7432
every
7433
Interval at which new rows will be inserted; upsampled rows are spaced 'every' apart.
7434
group_by
7435
First group by these columns and then upsample for every group.
7436
maintain_order
7437
Keep the ordering predictable. This is slower.
7438
7439
Returns
7440
-------
7441
DataFrame
7442
Result will be sorted by `time_column` (but note that if `group_by` columns
7443
are passed, it will only be sorted within each group).
7444
7445
Examples
7446
--------
7447
Upsample a DataFrame by a certain interval.
7448
7449
>>> from datetime import datetime
7450
>>> df = pl.DataFrame(
7451
... {
7452
... "time": [
7453
... datetime(2021, 2, 1),
7454
... datetime(2021, 4, 1),
7455
... datetime(2021, 5, 1),
7456
... datetime(2021, 6, 1),
7457
... ],
7458
... "groups": ["A", "B", "A", "B"],
7459
... "values": [0, 1, 2, 3],
7460
... }
7461
... ).set_sorted("time")
7462
>>> df.upsample(
7463
... time_column="time", every="1mo", group_by="groups", maintain_order=True
7464
... ).select(pl.all().fill_null(strategy="forward"))
7465
shape: (7, 3)
7466
┌─────────────────────┬────────┬────────┐
7467
│ time ┆ groups ┆ values │
7468
│ --- ┆ --- ┆ --- │
7469
│ datetime[μs] ┆ str ┆ i64 │
7470
╞═════════════════════╪════════╪════════╡
7471
│ 2021-02-01 00:00:00 ┆ A ┆ 0 │
7472
│ 2021-03-01 00:00:00 ┆ A ┆ 0 │
7473
│ 2021-04-01 00:00:00 ┆ A ┆ 0 │
7474
│ 2021-05-01 00:00:00 ┆ A ┆ 2 │
7475
│ 2021-04-01 00:00:00 ┆ B ┆ 1 │
7476
│ 2021-05-01 00:00:00 ┆ B ┆ 1 │
7477
│ 2021-06-01 00:00:00 ┆ B ┆ 3 │
7478
└─────────────────────┴────────┴────────┘
7479
"""
7480
if group_by is None:
7481
group_by = []
7482
if isinstance(group_by, str):
7483
group_by = [group_by]
7484
7485
every = parse_as_duration_string(every)
7486
7487
return self._from_pydf(
7488
self._df.upsample(group_by, time_column, every, maintain_order)
7489
)
7490
7491
def join_asof(
7492
self,
7493
other: DataFrame,
7494
*,
7495
left_on: str | None | Expr = None,
7496
right_on: str | None | Expr = None,
7497
on: str | None | Expr = None,
7498
by_left: str | Sequence[str] | None = None,
7499
by_right: str | Sequence[str] | None = None,
7500
by: str | Sequence[str] | None = None,
7501
strategy: AsofJoinStrategy = "backward",
7502
suffix: str = "_right",
7503
tolerance: str | int | float | timedelta | None = None,
7504
allow_parallel: bool = True,
7505
force_parallel: bool = False,
7506
coalesce: bool = True,
7507
allow_exact_matches: bool = True,
7508
check_sortedness: bool = True,
7509
) -> DataFrame:
7510
"""
7511
Perform an asof join.
7512
7513
This is similar to a left-join except that we match on nearest key rather than
7514
equal keys.
7515
7516
Both DataFrames must be sorted by the `on` key (within each `by` group, if
7517
specified).
7518
7519
For each row in the left DataFrame:
7520
7521
- A "backward" search selects the last row in the right DataFrame whose
7522
'on' key is less than or equal to the left's key.
7523
7524
- A "forward" search selects the first row in the right DataFrame whose
7525
'on' key is greater than or equal to the left's key.
7526
7527
- A "nearest" search selects the last row in the right DataFrame whose value
7528
is nearest to the left's key. String keys are not currently supported for a
7529
nearest search.
7530
7531
The default is "backward".
7532
7533
Parameters
7534
----------
7535
other
7536
DataFrame to join with.
7537
left_on
7538
Join column of the left DataFrame.
7539
right_on
7540
Join column of the right DataFrame.
7541
on
7542
Join column of both DataFrames. If set, `left_on` and `right_on` should be
7543
None.
7544
by
7545
Join on these columns before doing asof join
7546
by_left
7547
Join on these columns before doing asof join
7548
by_right
7549
Join on these columns before doing asof join
7550
strategy : {'backward', 'forward', 'nearest'}
7551
Join strategy.
7552
suffix
7553
Suffix to append to columns with a duplicate name.
7554
tolerance
7555
Numeric tolerance. By setting this, the join will only be done if the nearest
7556
keys are within this distance. If an asof join is done on columns of dtype
7557
"Date", "Datetime", "Duration" or "Time", use either a datetime.timedelta
7558
object or the following string language:
7559
7560
- 1ns (1 nanosecond)
7561
- 1us (1 microsecond)
7562
- 1ms (1 millisecond)
7563
- 1s (1 second)
7564
- 1m (1 minute)
7565
- 1h (1 hour)
7566
- 1d (1 calendar day)
7567
- 1w (1 calendar week)
7568
- 1mo (1 calendar month)
7569
- 1q (1 calendar quarter)
7570
- 1y (1 calendar year)
7571
7572
Or combine them:
7573
"3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
7574
7575
By "calendar day", we mean the corresponding time on the next day
7576
(which may not be 24 hours, due to daylight savings). Similarly for
7577
"calendar week", "calendar month", "calendar quarter", and
7578
"calendar year".
7579
7580
allow_parallel
7581
Allow the physical plan to optionally evaluate the computation of both
7582
DataFrames up to the join in parallel.
7583
force_parallel
7584
Force the physical plan to evaluate the computation of both DataFrames up to
7585
the join in parallel.
7586
coalesce
7587
Coalescing behavior (merging of `on` / `left_on` / `right_on` columns):
7588
7589
- *True*: Always coalesce join columns.
7590
- *False*: Never coalesce join columns.
7591
7592
Note that joining on any other expressions than `col`
7593
will turn off coalescing.
7594
allow_exact_matches
7595
Whether exact matches are valid join predicates.
7596
7597
- If True, allow matching with the same ``on`` value
7598
(i.e. less-than-or-equal-to / greater-than-or-equal-to)
7599
- If False, don't match the same ``on`` value
7600
(i.e., strictly less-than / strictly greater-than).
7601
check_sortedness
7602
Check the sortedness of the asof keys. If the keys are not sorted Polars
7603
will error. Currently, sortedness cannot be checked if 'by' groups are
7604
provided.
7605
7606
Examples
7607
--------
7608
>>> from datetime import date
7609
>>> gdp = pl.DataFrame(
7610
... {
7611
... "date": pl.date_range(
7612
... date(2016, 1, 1),
7613
... date(2020, 1, 1),
7614
... "1y",
7615
... eager=True,
7616
... ),
7617
... "gdp": [4164, 4411, 4566, 4696, 4827],
7618
... }
7619
... )
7620
>>> gdp
7621
shape: (5, 2)
7622
┌────────────┬──────┐
7623
│ date ┆ gdp │
7624
│ --- ┆ --- │
7625
│ date ┆ i64 │
7626
╞════════════╪══════╡
7627
│ 2016-01-01 ┆ 4164 │
7628
│ 2017-01-01 ┆ 4411 │
7629
│ 2018-01-01 ┆ 4566 │
7630
│ 2019-01-01 ┆ 4696 │
7631
│ 2020-01-01 ┆ 4827 │
7632
└────────────┴──────┘
7633
7634
>>> population = pl.DataFrame(
7635
... {
7636
... "date": [date(2016, 3, 1), date(2018, 8, 1), date(2019, 1, 1)],
7637
... "population": [82.19, 82.66, 83.12],
7638
... }
7639
... ).sort("date")
7640
>>> population
7641
shape: (3, 2)
7642
┌────────────┬────────────┐
7643
│ date ┆ population │
7644
│ --- ┆ --- │
7645
│ date ┆ f64 │
7646
╞════════════╪════════════╡
7647
│ 2016-03-01 ┆ 82.19 │
7648
│ 2018-08-01 ┆ 82.66 │
7649
│ 2019-01-01 ┆ 83.12 │
7650
└────────────┴────────────┘
7651
7652
Note how the dates don't quite match. If we join them using `join_asof` and
7653
`strategy='backward'`, then each date from `population` which doesn't have an
7654
exact match is matched with the closest earlier date from `gdp`:
7655
7656
>>> population.join_asof(gdp, on="date", strategy="backward")
7657
shape: (3, 3)
7658
┌────────────┬────────────┬──────┐
7659
│ date ┆ population ┆ gdp │
7660
│ --- ┆ --- ┆ --- │
7661
│ date ┆ f64 ┆ i64 │
7662
╞════════════╪════════════╪══════╡
7663
│ 2016-03-01 ┆ 82.19 ┆ 4164 │
7664
│ 2018-08-01 ┆ 82.66 ┆ 4566 │
7665
│ 2019-01-01 ┆ 83.12 ┆ 4696 │
7666
└────────────┴────────────┴──────┘
7667
7668
Note how:
7669
7670
- date `2016-03-01` from `population` is matched with `2016-01-01` from `gdp`;
7671
- date `2018-08-01` from `population` is matched with `2018-01-01` from `gdp`.
7672
7673
You can verify this by passing `coalesce=False`:
7674
7675
>>> population.join_asof(gdp, on="date", strategy="backward", coalesce=False)
7676
shape: (3, 4)
7677
┌────────────┬────────────┬────────────┬──────┐
7678
│ date ┆ population ┆ date_right ┆ gdp │
7679
│ --- ┆ --- ┆ --- ┆ --- │
7680
│ date ┆ f64 ┆ date ┆ i64 │
7681
╞════════════╪════════════╪════════════╪══════╡
7682
│ 2016-03-01 ┆ 82.19 ┆ 2016-01-01 ┆ 4164 │
7683
│ 2018-08-01 ┆ 82.66 ┆ 2018-01-01 ┆ 4566 │
7684
│ 2019-01-01 ┆ 83.12 ┆ 2019-01-01 ┆ 4696 │
7685
└────────────┴────────────┴────────────┴──────┘
7686
7687
If we instead use `strategy='forward'`, then each date from `population` which
7688
doesn't have an exact match is matched with the closest later date from `gdp`:
7689
7690
>>> population.join_asof(gdp, on="date", strategy="forward")
7691
shape: (3, 3)
7692
┌────────────┬────────────┬──────┐
7693
│ date ┆ population ┆ gdp │
7694
│ --- ┆ --- ┆ --- │
7695
│ date ┆ f64 ┆ i64 │
7696
╞════════════╪════════════╪══════╡
7697
│ 2016-03-01 ┆ 82.19 ┆ 4411 │
7698
│ 2018-08-01 ┆ 82.66 ┆ 4696 │
7699
│ 2019-01-01 ┆ 83.12 ┆ 4696 │
7700
└────────────┴────────────┴──────┘
7701
7702
Note how:
7703
7704
- date `2016-03-01` from `population` is matched with `2017-01-01` from `gdp`;
7705
- date `2018-08-01` from `population` is matched with `2019-01-01` from `gdp`.
7706
7707
Finally, `strategy='nearest'` gives us a mix of the two results above, as each
7708
date from `population` which doesn't have an exact match is matched with the
7709
closest date from `gdp`, regardless of whether it's earlier or later:
7710
7711
>>> population.join_asof(gdp, on="date", strategy="nearest")
7712
shape: (3, 3)
7713
┌────────────┬────────────┬──────┐
7714
│ date ┆ population ┆ gdp │
7715
│ --- ┆ --- ┆ --- │
7716
│ date ┆ f64 ┆ i64 │
7717
╞════════════╪════════════╪══════╡
7718
│ 2016-03-01 ┆ 82.19 ┆ 4164 │
7719
│ 2018-08-01 ┆ 82.66 ┆ 4696 │
7720
│ 2019-01-01 ┆ 83.12 ┆ 4696 │
7721
└────────────┴────────────┴──────┘
7722
7723
Note how:
7724
7725
- date `2016-03-01` from `population` is matched with `2016-01-01` from `gdp`;
7726
- date `2018-08-01` from `population` is matched with `2019-01-01` from `gdp`.
7727
7728
The `by` argument allows joining on another column first, before the asof join.
7729
In this example we join by `country` first, then asof join by date, as above.
7730
7731
>>> gdp_dates = pl.date_range( # fmt: skip
7732
... date(2016, 1, 1), date(2020, 1, 1), "1y", eager=True
7733
... )
7734
>>> gdp2 = pl.DataFrame(
7735
... {
7736
... "country": ["Germany"] * 5 + ["Netherlands"] * 5,
7737
... "date": pl.concat([gdp_dates, gdp_dates]),
7738
... "gdp": [4164, 4411, 4566, 4696, 4827, 784, 833, 914, 910, 909],
7739
... }
7740
... ).sort("country", "date")
7741
>>>
7742
>>> gdp2
7743
shape: (10, 3)
7744
┌─────────────┬────────────┬──────┐
7745
│ country ┆ date ┆ gdp │
7746
│ --- ┆ --- ┆ --- │
7747
│ str ┆ date ┆ i64 │
7748
╞═════════════╪════════════╪══════╡
7749
│ Germany ┆ 2016-01-01 ┆ 4164 │
7750
│ Germany ┆ 2017-01-01 ┆ 4411 │
7751
│ Germany ┆ 2018-01-01 ┆ 4566 │
7752
│ Germany ┆ 2019-01-01 ┆ 4696 │
7753
│ Germany ┆ 2020-01-01 ┆ 4827 │
7754
│ Netherlands ┆ 2016-01-01 ┆ 784 │
7755
│ Netherlands ┆ 2017-01-01 ┆ 833 │
7756
│ Netherlands ┆ 2018-01-01 ┆ 914 │
7757
│ Netherlands ┆ 2019-01-01 ┆ 910 │
7758
│ Netherlands ┆ 2020-01-01 ┆ 909 │
7759
└─────────────┴────────────┴──────┘
7760
>>> pop2 = pl.DataFrame(
7761
... {
7762
... "country": ["Germany"] * 3 + ["Netherlands"] * 3,
7763
... "date": [
7764
... date(2016, 3, 1),
7765
... date(2018, 8, 1),
7766
... date(2019, 1, 1),
7767
... date(2016, 3, 1),
7768
... date(2018, 8, 1),
7769
... date(2019, 1, 1),
7770
... ],
7771
... "population": [82.19, 82.66, 83.12, 17.11, 17.32, 17.40],
7772
... }
7773
... ).sort("country", "date")
7774
>>>
7775
>>> pop2
7776
shape: (6, 3)
7777
┌─────────────┬────────────┬────────────┐
7778
│ country ┆ date ┆ population │
7779
│ --- ┆ --- ┆ --- │
7780
│ str ┆ date ┆ f64 │
7781
╞═════════════╪════════════╪════════════╡
7782
│ Germany ┆ 2016-03-01 ┆ 82.19 │
7783
│ Germany ┆ 2018-08-01 ┆ 82.66 │
7784
│ Germany ┆ 2019-01-01 ┆ 83.12 │
7785
│ Netherlands ┆ 2016-03-01 ┆ 17.11 │
7786
│ Netherlands ┆ 2018-08-01 ┆ 17.32 │
7787
│ Netherlands ┆ 2019-01-01 ┆ 17.4 │
7788
└─────────────┴────────────┴────────────┘
7789
>>> pop2.join_asof(gdp2, by="country", on="date", strategy="nearest")
7790
shape: (6, 4)
7791
┌─────────────┬────────────┬────────────┬──────┐
7792
│ country ┆ date ┆ population ┆ gdp │
7793
│ --- ┆ --- ┆ --- ┆ --- │
7794
│ str ┆ date ┆ f64 ┆ i64 │
7795
╞═════════════╪════════════╪════════════╪══════╡
7796
│ Germany ┆ 2016-03-01 ┆ 82.19 ┆ 4164 │
7797
│ Germany ┆ 2018-08-01 ┆ 82.66 ┆ 4696 │
7798
│ Germany ┆ 2019-01-01 ┆ 83.12 ┆ 4696 │
7799
│ Netherlands ┆ 2016-03-01 ┆ 17.11 ┆ 784 │
7800
│ Netherlands ┆ 2018-08-01 ┆ 17.32 ┆ 910 │
7801
│ Netherlands ┆ 2019-01-01 ┆ 17.4 ┆ 910 │
7802
└─────────────┴────────────┴────────────┴──────┘
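
The `tolerance` argument can be used to discard matches that are too far
apart. As a minimal illustrative sketch (output omitted), requiring the
matched dates to be at most 180 days apart leaves the `2018-08-01` row
without a `gdp` match:

>>> population.join_asof(
...     gdp, on="date", strategy="backward", tolerance="180d"
... )  # doctest: +IGNORE_RESULT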
7803
"""
7804
require_same_type(self, other)
7805
7806
if on is not None:
7807
if not isinstance(on, (str, pl.Expr)):
7808
msg = (
7809
f"expected `on` to be str or Expr, got {qualified_type_name(on)!r}"
7810
)
7811
raise TypeError(msg)
7812
else:
7813
if not isinstance(left_on, (str, pl.Expr)):
7814
msg = f"expected `left_on` to be str or Expr, got {qualified_type_name(left_on)!r}"
7815
raise TypeError(msg)
7816
elif not isinstance(right_on, (str, pl.Expr)):
7817
msg = f"expected `right_on` to be str or Expr, got {qualified_type_name(right_on)!r}"
7818
raise TypeError(msg)
7819
7820
from polars.lazyframe.opt_flags import QueryOptFlags
7821
7822
return (
7823
self.lazy()
7824
.join_asof(
7825
other.lazy(),
7826
left_on=left_on,
7827
right_on=right_on,
7828
on=on,
7829
by_left=by_left,
7830
by_right=by_right,
7831
by=by,
7832
strategy=strategy,
7833
suffix=suffix,
7834
tolerance=tolerance,
7835
allow_parallel=allow_parallel,
7836
force_parallel=force_parallel,
7837
coalesce=coalesce,
7838
allow_exact_matches=allow_exact_matches,
7839
check_sortedness=check_sortedness,
7840
)
7841
.collect(optimizations=QueryOptFlags._eager())
7842
)
7843
7844
@deprecate_renamed_parameter("join_nulls", "nulls_equal", version="1.24")
7845
def join(
7846
self,
7847
other: DataFrame,
7848
on: str | Expr | Sequence[str | Expr] | None = None,
7849
how: JoinStrategy = "inner",
7850
*,
7851
left_on: str | Expr | Sequence[str | Expr] | None = None,
7852
right_on: str | Expr | Sequence[str | Expr] | None = None,
7853
suffix: str = "_right",
7854
validate: JoinValidation = "m:m",
7855
nulls_equal: bool = False,
7856
coalesce: bool | None = None,
7857
maintain_order: MaintainOrderJoin | None = None,
7858
) -> DataFrame:
7859
"""
7860
Join in SQL-like fashion.
7861
7862
.. versionchanged:: 1.24
7863
The `join_nulls` parameter was renamed `nulls_equal`.
7864
7865
Parameters
7866
----------
7867
other
7868
DataFrame to join with.
7869
on
7870
Name(s) of the join columns in both DataFrames. If set, `left_on` and
7871
`right_on` should be None. This should not be specified if `how='cross'`.
7872
how : {'inner', 'left', 'right', 'full', 'semi', 'anti', 'cross'}
7873
Join strategy.
7874
7875
.. list-table ::
7876
:header-rows: 0
7877
7878
* - **inner**
7879
- *(Default)* Returns rows that have matching values in both tables.
7880
* - **left**
7881
- Returns all rows from the left table, and the matched rows from
7882
the right table.
7883
* - **full**
7884
- Returns all rows when there is a match in either left or right.
7885
* - **cross**
7886
- Returns the Cartesian product of rows from both tables.
7887
* - **semi**
7888
- Returns rows from the left table that have a match in the right
7889
table.
7890
* - **anti**
7891
- Returns rows from the left table that have no match in the right
7892
table.
7893
7894
left_on
7895
Name(s) of the left join column(s).
7896
right_on
7897
Name(s) of the right join column(s).
7898
suffix
7899
Suffix to append to columns with a duplicate name.
7900
validate: {'m:m', 'm:1', '1:m', '1:1'}
7901
Checks if join is of specified type.
7902
7903
.. list-table ::
7904
:header-rows: 0
7905
7906
* - **m:m**
7907
- *(Default)* Many-to-many. Does not result in checks.
7908
* - **1:1**
7909
- One-to-one. Checks if join keys are unique in both left and
7910
right datasets.
7911
* - **1:m**
7912
- One-to-many. Checks if join keys are unique in left dataset.
7913
* - **m:1**
7914
- Many-to-one. Checks if join keys are unique in right dataset.
7915
7916
.. note::
7917
This is currently not supported by the streaming engine.
7918
7919
nulls_equal
7920
Join on null values. By default null values will never produce matches.
7921
coalesce
7922
Coalescing behavior (merging of join columns).
7923
7924
.. list-table ::
7925
:header-rows: 0
7926
7927
* - **None**
7928
- *(Default)* Coalesce unless `how='full'` is specified.
7929
* - **True**
7930
- Always coalesce join columns.
7931
* - **False**
7932
- Never coalesce join columns.
7933
7934
.. note::
7935
Joining on any other expressions than `col`
7936
will turn off coalescing.
7937
maintain_order : {'none', 'left', 'right', 'left_right', 'right_left'}
7938
Which DataFrame row order to preserve, if any.
7939
Do not rely on any observed ordering without explicitly setting this
7940
parameter, as your code may break in a future release.
7941
Not specifying any ordering can improve performance.
7942
Supported for inner, left, right, and full joins.
7943
7944
.. list-table ::
7945
:header-rows: 0
7946
7947
* - **none**
7948
- *(Default)* No specific ordering is desired. The ordering might
7949
differ across Polars versions or even between different runs.
7950
* - **left**
7951
- Preserves the order of the left DataFrame.
7952
* - **right**
7953
- Preserves the order of the right DataFrame.
7954
* - **left_right**
7955
- First preserves the order of the left DataFrame, then the right.
7956
* - **right_left**
7957
- First preserves the order of the right DataFrame, then the left.
7958
7959
See Also
7960
--------
7961
join_asof
7962
7963
Examples
7964
--------
7965
>>> df = pl.DataFrame(
7966
... {
7967
... "foo": [1, 2, 3],
7968
... "bar": [6.0, 7.0, 8.0],
7969
... "ham": ["a", "b", "c"],
7970
... }
7971
... )
7972
>>> other_df = pl.DataFrame(
7973
... {
7974
... "apple": ["x", "y", "z"],
7975
... "ham": ["a", "b", "d"],
7976
... }
7977
... )
7978
>>> df.join(other_df, on="ham")
7979
shape: (2, 4)
7980
┌─────┬─────┬─────┬───────┐
7981
│ foo ┆ bar ┆ ham ┆ apple │
7982
│ --- ┆ --- ┆ --- ┆ --- │
7983
│ i64 ┆ f64 ┆ str ┆ str │
7984
╞═════╪═════╪═════╪═══════╡
7985
│ 1 ┆ 6.0 ┆ a ┆ x │
7986
│ 2 ┆ 7.0 ┆ b ┆ y │
7987
└─────┴─────┴─────┴───────┘
7988
7989
>>> df.join(other_df, on="ham", how="full")
7990
shape: (4, 5)
7991
┌──────┬──────┬──────┬───────┬───────────┐
7992
│ foo ┆ bar ┆ ham ┆ apple ┆ ham_right │
7993
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
7994
│ i64 ┆ f64 ┆ str ┆ str ┆ str │
7995
╞══════╪══════╪══════╪═══════╪═══════════╡
7996
│ 1 ┆ 6.0 ┆ a ┆ x ┆ a │
7997
│ 2 ┆ 7.0 ┆ b ┆ y ┆ b │
7998
│ null ┆ null ┆ null ┆ z ┆ d │
7999
│ 3 ┆ 8.0 ┆ c ┆ null ┆ null │
8000
└──────┴──────┴──────┴───────┴───────────┘
8001
8002
>>> df.join(other_df, on="ham", how="full", coalesce=True)
8003
shape: (4, 4)
8004
┌──────┬──────┬─────┬───────┐
8005
│ foo ┆ bar ┆ ham ┆ apple │
8006
│ --- ┆ --- ┆ --- ┆ --- │
8007
│ i64 ┆ f64 ┆ str ┆ str │
8008
╞══════╪══════╪═════╪═══════╡
8009
│ 1 ┆ 6.0 ┆ a ┆ x │
8010
│ 2 ┆ 7.0 ┆ b ┆ y │
8011
│ null ┆ null ┆ d ┆ z │
8012
│ 3 ┆ 8.0 ┆ c ┆ null │
8013
└──────┴──────┴─────┴───────┘
8014
8015
>>> df.join(other_df, on="ham", how="left")
8016
shape: (3, 4)
8017
┌─────┬─────┬─────┬───────┐
8018
│ foo ┆ bar ┆ ham ┆ apple │
8019
│ --- ┆ --- ┆ --- ┆ --- │
8020
│ i64 ┆ f64 ┆ str ┆ str │
8021
╞═════╪═════╪═════╪═══════╡
8022
│ 1 ┆ 6.0 ┆ a ┆ x │
8023
│ 2 ┆ 7.0 ┆ b ┆ y │
8024
│ 3 ┆ 8.0 ┆ c ┆ null │
8025
└─────┴─────┴─────┴───────┘
8026
8027
>>> df.join(other_df, on="ham", how="semi")
8028
shape: (2, 3)
8029
┌─────┬─────┬─────┐
8030
│ foo ┆ bar ┆ ham │
8031
│ --- ┆ --- ┆ --- │
8032
│ i64 ┆ f64 ┆ str │
8033
╞═════╪═════╪═════╡
8034
│ 1 ┆ 6.0 ┆ a │
8035
│ 2 ┆ 7.0 ┆ b │
8036
└─────┴─────┴─────┘
8037
8038
>>> df.join(other_df, on="ham", how="anti")
8039
shape: (1, 3)
8040
┌─────┬─────┬─────┐
8041
│ foo ┆ bar ┆ ham │
8042
│ --- ┆ --- ┆ --- │
8043
│ i64 ┆ f64 ┆ str │
8044
╞═════╪═════╪═════╡
8045
│ 3 ┆ 8.0 ┆ c │
8046
└─────┴─────┴─────┘
8047
8048
>>> df.join(other_df, how="cross")
8049
shape: (9, 5)
8050
┌─────┬─────┬─────┬───────┬───────────┐
8051
│ foo ┆ bar ┆ ham ┆ apple ┆ ham_right │
8052
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
8053
│ i64 ┆ f64 ┆ str ┆ str ┆ str │
8054
╞═════╪═════╪═════╪═══════╪═══════════╡
8055
│ 1 ┆ 6.0 ┆ a ┆ x ┆ a │
8056
│ 1 ┆ 6.0 ┆ a ┆ y ┆ b │
8057
│ 1 ┆ 6.0 ┆ a ┆ z ┆ d │
8058
│ 2 ┆ 7.0 ┆ b ┆ x ┆ a │
8059
│ 2 ┆ 7.0 ┆ b ┆ y ┆ b │
8060
│ 2 ┆ 7.0 ┆ b ┆ z ┆ d │
8061
│ 3 ┆ 8.0 ┆ c ┆ x ┆ a │
8062
│ 3 ┆ 8.0 ┆ c ┆ y ┆ b │
8063
│ 3 ┆ 8.0 ┆ c ┆ z ┆ d │
8064
└─────┴─────┴─────┴───────┴───────────┘
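
The `validate` and `maintain_order` parameters can additionally be used to
assert the expected join cardinality and to fix the row order of the result;
a minimal illustrative sketch reusing the frames above (output omitted):

>>> df.join(
...     other_df, on="ham", validate="1:1", maintain_order="left"
... )  # doctest: +IGNORE_RESULT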
8065
8066
Notes
8067
-----
8068
For joining on columns with categorical data, see :class:`polars.StringCache`.
8069
"""
8070
require_same_type(self, other)
8071
8072
from polars.lazyframe.opt_flags import QueryOptFlags
8073
8074
return (
8075
self.lazy()
8076
.join(
8077
other=other.lazy(),
8078
left_on=left_on,
8079
right_on=right_on,
8080
on=on,
8081
how=how,
8082
suffix=suffix,
8083
validate=validate,
8084
nulls_equal=nulls_equal,
8085
coalesce=coalesce,
8086
maintain_order=maintain_order,
8087
)
8088
.collect(optimizations=QueryOptFlags._eager())
8089
)
8090
8091
@unstable()
8092
def join_where(
8093
self,
8094
other: DataFrame,
8095
*predicates: Expr | Iterable[Expr],
8096
suffix: str = "_right",
8097
) -> DataFrame:
8098
"""
8099
Perform a join based on one or multiple (in)equality predicates.
8100
8101
This performs an inner join, so only rows where all predicates are true
8102
are included in the result, and a row from either DataFrame may be included
8103
multiple times in the result.
8104
8105
.. note::
8106
The row order of the input DataFrames is not preserved.
8107
8108
.. warning::
8109
This functionality is experimental. It may be
8110
changed at any point without it being considered a breaking change.
8111
8112
Parameters
8113
----------
8114
other
8115
DataFrame to join with.
8116
*predicates
8117
(In)Equality condition to join the two tables on.
8118
When a column name occurs in both tables, the proper suffix must
8119
be applied in the predicate.
8120
suffix
8121
Suffix to append to columns with a duplicate name.
8122
8123
Examples
8124
--------
8125
Join two dataframes together based on two predicates which get AND-ed together.
8126
8127
>>> east = pl.DataFrame(
8128
... {
8129
... "id": [100, 101, 102],
8130
... "dur": [120, 140, 160],
8131
... "rev": [12, 14, 16],
8132
... "cores": [2, 8, 4],
8133
... }
8134
... )
8135
>>> west = pl.DataFrame(
8136
... {
8137
... "t_id": [404, 498, 676, 742],
8138
... "time": [90, 130, 150, 170],
8139
... "cost": [9, 13, 15, 16],
8140
... "cores": [4, 2, 1, 4],
8141
... }
8142
... )
8143
>>> east.join_where(
8144
... west,
8145
... pl.col("dur") < pl.col("time"),
8146
... pl.col("rev") < pl.col("cost"),
8147
... )
8148
shape: (5, 8)
8149
┌─────┬─────┬─────┬───────┬──────┬──────┬──────┬─────────────┐
8150
│ id ┆ dur ┆ rev ┆ cores ┆ t_id ┆ time ┆ cost ┆ cores_right │
8151
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
8152
│ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 │
8153
╞═════╪═════╪═════╪═══════╪══════╪══════╪══════╪═════════════╡
8154
│ 100 ┆ 120 ┆ 12 ┆ 2 ┆ 498 ┆ 130 ┆ 13 ┆ 2 │
8155
│ 100 ┆ 120 ┆ 12 ┆ 2 ┆ 676 ┆ 150 ┆ 15 ┆ 1 │
8156
│ 100 ┆ 120 ┆ 12 ┆ 2 ┆ 742 ┆ 170 ┆ 16 ┆ 4 │
8157
│ 101 ┆ 140 ┆ 14 ┆ 8 ┆ 676 ┆ 150 ┆ 15 ┆ 1 │
8158
│ 101 ┆ 140 ┆ 14 ┆ 8 ┆ 742 ┆ 170 ┆ 16 ┆ 4 │
8159
└─────┴─────┴─────┴───────┴──────┴──────┴──────┴─────────────┘
8160
8161
To OR them together, use a single expression and the `|` operator.
8162
8163
>>> east.join_where(
8164
... west,
8165
... (pl.col("dur") < pl.col("time")) | (pl.col("rev") < pl.col("cost")),
8166
... )
8167
shape: (6, 8)
8168
┌─────┬─────┬─────┬───────┬──────┬──────┬──────┬─────────────┐
8169
│ id ┆ dur ┆ rev ┆ cores ┆ t_id ┆ time ┆ cost ┆ cores_right │
8170
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
8171
│ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 │
8172
╞═════╪═════╪═════╪═══════╪══════╪══════╪══════╪═════════════╡
8173
│ 100 ┆ 120 ┆ 12 ┆ 2 ┆ 498 ┆ 130 ┆ 13 ┆ 2 │
8174
│ 100 ┆ 120 ┆ 12 ┆ 2 ┆ 676 ┆ 150 ┆ 15 ┆ 1 │
8175
│ 100 ┆ 120 ┆ 12 ┆ 2 ┆ 742 ┆ 170 ┆ 16 ┆ 4 │
8176
│ 101 ┆ 140 ┆ 14 ┆ 8 ┆ 676 ┆ 150 ┆ 15 ┆ 1 │
8177
│ 101 ┆ 140 ┆ 14 ┆ 8 ┆ 742 ┆ 170 ┆ 16 ┆ 4 │
8178
│ 102 ┆ 160 ┆ 16 ┆ 4 ┆ 742 ┆ 170 ┆ 16 ┆ 4 │
8179
└─────┴─────┴─────┴───────┴──────┴──────┴──────┴─────────────┘
8180
"""
8181
require_same_type(self, other)
8182
8183
from polars.lazyframe.opt_flags import QueryOptFlags
8184
8185
return (
8186
self.lazy()
8187
.join_where(
8188
other.lazy(),
8189
*predicates,
8190
suffix=suffix,
8191
)
8192
.collect(optimizations=QueryOptFlags._eager())
8193
)
8194
8195
def map_rows(
8196
self,
8197
function: Callable[[tuple[Any, ...]], Any],
8198
return_dtype: PolarsDataType | None = None,
8199
*,
8200
inference_size: int = 256,
8201
) -> DataFrame:
8202
"""
8203
Apply a custom/user-defined function (UDF) over the rows of the DataFrame.
8204
8205
.. warning::
8206
This method is much slower than the native expressions API.
8207
Only use it if you cannot implement your logic otherwise.
8208
8209
The UDF will receive each row as a tuple of values: `udf(row)`.
8210
8211
Implementing logic using a Python function is almost always *significantly*
8212
slower and more memory intensive than implementing the same logic using
8213
the native expression API because:
8214
8215
- The native expression engine runs in Rust; UDFs run in Python.
8216
- Use of Python UDFs forces the DataFrame to be materialized in memory.
8217
- Polars-native expressions can be parallelised (UDFs typically cannot).
8218
- Polars-native expressions can be logically optimised (UDFs cannot).
8219
8220
Wherever possible you should strongly prefer the native expression API
8221
to achieve the best performance.
8222
8223
Parameters
8224
----------
8225
function
8226
Custom function or lambda.
8227
return_dtype
8228
Output type of the operation. If none given, Polars tries to infer the type.
8229
inference_size
8230
Only used in the case when the custom function returns rows.
8231
This uses the first `n` rows to determine the output schema.
8232
8233
Notes
8234
-----
8235
* The frame-level `map_rows` cannot track column names (as the UDF is a
8236
black-box that may arbitrarily drop, rearrange, transform, or add new
8237
columns); if you want to apply a UDF such that column names are preserved,
8238
you should use the expression-level `map_elements` syntax instead.
8239
8240
* If your function is expensive and you don't want it to be called more than
8241
once for a given input, consider applying an `@lru_cache` decorator to it.
8242
If your data is suitable you may achieve *significant* speedups.
8243
8244
Examples
8245
--------
8246
>>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [-1, 5, 8]})
8247
8248
Return a DataFrame by mapping each row to a tuple:
8249
8250
>>> df.map_rows(lambda t: (t[0] * 2, t[1] * 3))
8251
shape: (3, 2)
8252
┌──────────┬──────────┐
8253
│ column_0 ┆ column_1 │
8254
│ --- ┆ --- │
8255
│ i64 ┆ i64 │
8256
╞══════════╪══════════╡
8257
│ 2 ┆ -3 │
8258
│ 4 ┆ 15 │
8259
│ 6 ┆ 24 │
8260
└──────────┴──────────┘
8261
8262
However, it is much better to implement this with a native expression:
8263
8264
>>> df.select(
8265
... pl.col("foo") * 2,
8266
... pl.col("bar") * 3,
8267
... ) # doctest: +IGNORE_RESULT
8268
8269
Return a DataFrame with a single column by mapping each row to a scalar:
8270
8271
>>> df.map_rows(lambda t: (t[0] * 2 + t[1]))
8272
shape: (3, 1)
8273
┌─────┐
8274
│ map │
8275
│ --- │
8276
│ i64 │
8277
╞═════╡
8278
│ 1 │
8279
│ 9 │
8280
│ 14 │
8281
└─────┘
8282
8283
In this case it is better to use the following native expression:
8284
8285
>>> df.select(pl.col("foo") * 2 + pl.col("bar")) # doctest: +IGNORE_RESULT
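
As noted above, if the UDF is expensive and the same input rows may occur more
than once, wrapping it in `functools.lru_cache` can avoid recomputation; this
is only a minimal illustrative sketch (`expensive` is a made-up function):

>>> from functools import lru_cache
>>> @lru_cache(maxsize=None)
... def expensive(foo: int, bar: int) -> int:
...     return foo * 2 + bar  # stand-in for a costly computation
>>> df.map_rows(lambda t: expensive(*t))  # doctest: +IGNORE_RESULT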
8286
"""
8287
# TODO: Enable warning for inefficient map
8288
# from polars._utils.udfs import warn_on_inefficient_map
8289
# warn_on_inefficient_map(function, columns=self.columns, map_target="frame)
8290
8291
out, is_df = self._df.map_rows(function, return_dtype, inference_size)
8292
if is_df:
8293
return self._from_pydf(out)
8294
else:
8295
return wrap_s(out).to_frame()
8296
8297
def hstack(
8298
self, columns: list[Series] | DataFrame, *, in_place: bool = False
8299
) -> DataFrame:
8300
"""
8301
Return a new DataFrame grown horizontally by stacking multiple Series to it.
8302
8303
Parameters
8304
----------
8305
columns
8306
Series to stack.
8307
in_place
8308
Modify in place.
8309
8310
Examples
8311
--------
8312
>>> df = pl.DataFrame(
8313
... {
8314
... "foo": [1, 2, 3],
8315
... "bar": [6, 7, 8],
8316
... "ham": ["a", "b", "c"],
8317
... }
8318
... )
8319
>>> x = pl.Series("apple", [10, 20, 30])
8320
>>> df.hstack([x])
8321
shape: (3, 4)
8322
┌─────┬─────┬─────┬───────┐
8323
│ foo ┆ bar ┆ ham ┆ apple │
8324
│ --- ┆ --- ┆ --- ┆ --- │
8325
│ i64 ┆ i64 ┆ str ┆ i64 │
8326
╞═════╪═════╪═════╪═══════╡
8327
│ 1 ┆ 6 ┆ a ┆ 10 │
8328
│ 2 ┆ 7 ┆ b ┆ 20 │
8329
│ 3 ┆ 8 ┆ c ┆ 30 │
8330
└─────┴─────┴─────┴───────┘
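
With `in_place=True` the frame itself is modified (and returned for
convenience); a minimal illustrative sketch that works on a clone so the
original frame is left untouched (output omitted):

>>> df2 = df.clone()
>>> df2.hstack(
...     [pl.Series("banana", [40, 50, 60])], in_place=True
... )  # doctest: +IGNORE_RESULT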
8331
"""
8332
if not isinstance(columns, list):
8333
columns = columns.get_columns()
8334
if in_place:
8335
self._df.hstack_mut([s._s for s in columns])
8336
return self
8337
else:
8338
return self._from_pydf(self._df.hstack([s._s for s in columns]))
8339
8340
def vstack(self, other: DataFrame, *, in_place: bool = False) -> DataFrame:
8341
"""
8342
Grow this DataFrame vertically by stacking a DataFrame to it.
8343
8344
Parameters
8345
----------
8346
other
8347
DataFrame to stack.
8348
in_place
8349
Modify in place.
8350
8351
See Also
8352
--------
8353
extend
8354
8355
Examples
8356
--------
8357
>>> df1 = pl.DataFrame(
8358
... {
8359
... "foo": [1, 2],
8360
... "bar": [6, 7],
8361
... "ham": ["a", "b"],
8362
... }
8363
... )
8364
>>> df2 = pl.DataFrame(
8365
... {
8366
... "foo": [3, 4],
8367
... "bar": [8, 9],
8368
... "ham": ["c", "d"],
8369
... }
8370
... )
8371
>>> df1.vstack(df2)
8372
shape: (4, 3)
8373
┌─────┬─────┬─────┐
8374
│ foo ┆ bar ┆ ham │
8375
│ --- ┆ --- ┆ --- │
8376
│ i64 ┆ i64 ┆ str │
8377
╞═════╪═════╪═════╡
8378
│ 1 ┆ 6 ┆ a │
8379
│ 2 ┆ 7 ┆ b │
8380
│ 3 ┆ 8 ┆ c │
8381
│ 4 ┆ 9 ┆ d │
8382
└─────┴─────┴─────┘
8383
"""
8384
require_same_type(self, other)
8385
if in_place:
8386
self._df.vstack_mut(other._df)
8387
return self
8388
8389
return self._from_pydf(self._df.vstack(other._df))
8390
8391
def extend(self, other: DataFrame) -> DataFrame:
8392
"""
8393
Extend the memory backed by this `DataFrame` with the values from `other`.
8394
8395
Different from `vstack`, which adds the chunks from `other` to the chunks of
8396
this `DataFrame`, `extend` appends the data from `other` to the underlying
8397
memory locations and thus may cause a reallocation.
8398
8399
If this does not cause a reallocation, the resulting data structure will not
8400
have any extra chunks and thus will yield faster queries.
8401
8402
Prefer `extend` over `vstack` when you want to do a query after a single
8403
append. For instance, during online operations where you add `n` rows and rerun
8404
a query.
8405
8406
Prefer `vstack` over `extend` when you want to append many times before
8407
doing a query. For instance, when you read in multiple files and want to store
8408
them in a single `DataFrame`. In the latter case, finish the sequence of
8409
`vstack` operations with a `rechunk`.
8410
8411
Parameters
8412
----------
8413
other
8414
DataFrame to vertically add.
8415
8416
Warnings
8417
--------
8418
This method modifies the dataframe in-place. The dataframe is returned for
8419
convenience only.
8420
8421
See Also
8422
--------
8423
vstack
8424
8425
Examples
8426
--------
8427
>>> df1 = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]})
8428
>>> df2 = pl.DataFrame({"foo": [10, 20, 30], "bar": [40, 50, 60]})
8429
>>> df1.extend(df2)
8430
shape: (6, 2)
8431
┌─────┬─────┐
8432
│ foo ┆ bar │
8433
│ --- ┆ --- │
8434
│ i64 ┆ i64 │
8435
╞═════╪═════╡
8436
│ 1 ┆ 4 │
8437
│ 2 ┆ 5 │
8438
│ 3 ┆ 6 │
8439
│ 10 ┆ 40 │
8440
│ 20 ┆ 50 │
8441
│ 30 ┆ 60 │
8442
└─────┴─────┘
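
As described above, when appending many frames it can be preferable to
`vstack` repeatedly and finish with a single `rechunk`; a minimal
illustrative sketch using small throwaway frames:

>>> frames = [pl.DataFrame({"foo": [i], "bar": [i * 10]}) for i in range(3)]
>>> out = frames[0]
>>> for frame in frames[1:]:
...     out = out.vstack(frame)
>>> out.rechunk().n_chunks()
1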
8443
"""
8444
require_same_type(self, other)
8445
self._df.extend(other._df)
8446
return self
8447
8448
def drop(
8449
self,
8450
*columns: ColumnNameOrSelector | Iterable[ColumnNameOrSelector],
8451
strict: bool = True,
8452
) -> DataFrame:
8453
"""
8454
Remove columns from the dataframe.
8455
8456
Parameters
8457
----------
8458
*columns
8459
Names of the columns that should be removed from the dataframe.
8460
Accepts column selector input.
8461
strict
8462
Validate that all column names exist in the current schema,
8463
and throw an exception if any do not.
8464
8465
Examples
8466
--------
8467
Drop a single column by passing the name of that column.
8468
8469
>>> df = pl.DataFrame(
8470
... {
8471
... "foo": [1, 2, 3],
8472
... "bar": [6.0, 7.0, 8.0],
8473
... "ham": ["a", "b", "c"],
8474
... }
8475
... )
8476
>>> df.drop("ham")
8477
shape: (3, 2)
8478
┌─────┬─────┐
8479
│ foo ┆ bar │
8480
│ --- ┆ --- │
8481
│ i64 ┆ f64 │
8482
╞═════╪═════╡
8483
│ 1 ┆ 6.0 │
8484
│ 2 ┆ 7.0 │
8485
│ 3 ┆ 8.0 │
8486
└─────┴─────┘
8487
8488
Drop multiple columns by passing a list of column names.
8489
8490
>>> df.drop(["bar", "ham"])
8491
shape: (3, 1)
8492
┌─────┐
8493
│ foo │
8494
│ --- │
8495
│ i64 │
8496
╞═════╡
8497
│ 1 │
8498
│ 2 │
8499
│ 3 │
8500
└─────┘
8501
8502
Drop multiple columns by passing a selector.
8503
8504
>>> import polars.selectors as cs
8505
>>> df.drop(cs.numeric())
8506
shape: (3, 1)
8507
┌─────┐
8508
│ ham │
8509
│ --- │
8510
│ str │
8511
╞═════╡
8512
│ a │
8513
│ b │
8514
│ c │
8515
└─────┘
8516
8517
Use positional arguments to drop multiple columns.
8518
8519
>>> df.drop("foo", "ham")
8520
shape: (3, 1)
8521
┌─────┐
8522
│ bar │
8523
│ --- │
8524
│ f64 │
8525
╞═════╡
8526
│ 6.0 │
8527
│ 7.0 │
8528
│ 8.0 │
8529
└─────┘
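
With `strict=False`, names that are not present in the schema are silently
ignored rather than raising; a minimal illustrative sketch (the column name
"nonexistent" is made up; output omitted):

>>> df.drop("ham", "nonexistent", strict=False)  # doctest: +IGNORE_RESULT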
8530
"""
8531
from polars.lazyframe.opt_flags import QueryOptFlags
8532
8533
return (
8534
self.lazy()
8535
.drop(*columns, strict=strict)
8536
.collect(optimizations=QueryOptFlags._eager())
8537
)
8538
8539
def drop_in_place(self, name: str) -> Series:
8540
"""
8541
Drop a single column in-place and return the dropped column.
8542
8543
Parameters
8544
----------
8545
name
8546
Name of the column to drop.
8547
8548
Returns
8549
-------
8550
Series
8551
The dropped column.
8552
8553
Examples
8554
--------
8555
>>> df = pl.DataFrame(
8556
... {
8557
... "foo": [1, 2, 3],
8558
... "bar": [6, 7, 8],
8559
... "ham": ["a", "b", "c"],
8560
... }
8561
... )
8562
>>> df.drop_in_place("ham")
8563
shape: (3,)
8564
Series: 'ham' [str]
8565
[
8566
"a"
8567
"b"
8568
"c"
8569
]
8570
"""
8571
return wrap_s(self._df.drop_in_place(name))
8572
8573
def cast(
8574
self,
8575
dtypes: (
8576
Mapping[
8577
ColumnNameOrSelector | PolarsDataType, PolarsDataType | PythonDataType
8578
]
8579
| PolarsDataType
8580
),
8581
*,
8582
strict: bool = True,
8583
) -> DataFrame:
8584
"""
8585
Cast DataFrame column(s) to the specified dtype(s).
8586
8587
Parameters
8588
----------
8589
dtypes
8590
Mapping of column names (or selector) to dtypes, or a single dtype
8591
to which all columns will be cast.
8592
strict
8593
Raise if cast is invalid on rows after predicates are pushed down.
8594
If `False`, invalid casts will produce null values.
8595
8596
Examples
8597
--------
8598
>>> from datetime import date
8599
>>> df = pl.DataFrame(
8600
... {
8601
... "foo": [1, 2, 3],
8602
... "bar": [6.0, 7.0, 8.0],
8603
... "ham": [date(2020, 1, 2), date(2021, 3, 4), date(2022, 5, 6)],
8604
... }
8605
... )
8606
8607
Cast specific frame columns to the specified dtypes:
8608
8609
>>> df.cast({"foo": pl.Float32, "bar": pl.UInt8})
8610
shape: (3, 3)
8611
┌─────┬─────┬────────────┐
8612
│ foo ┆ bar ┆ ham │
8613
│ --- ┆ --- ┆ --- │
8614
│ f32 ┆ u8 ┆ date │
8615
╞═════╪═════╪════════════╡
8616
│ 1.0 ┆ 6 ┆ 2020-01-02 │
8617
│ 2.0 ┆ 7 ┆ 2021-03-04 │
8618
│ 3.0 ┆ 8 ┆ 2022-05-06 │
8619
└─────┴─────┴────────────┘
8620
8621
Cast all frame columns matching one dtype (or dtype group) to another dtype:
8622
8623
>>> df.cast({pl.Date: pl.Datetime})
8624
shape: (3, 3)
8625
┌─────┬─────┬─────────────────────┐
8626
│ foo ┆ bar ┆ ham │
8627
│ --- ┆ --- ┆ --- │
8628
│ i64 ┆ f64 ┆ datetime[μs] │
8629
╞═════╪═════╪═════════════════════╡
8630
│ 1 ┆ 6.0 ┆ 2020-01-02 00:00:00 │
8631
│ 2 ┆ 7.0 ┆ 2021-03-04 00:00:00 │
8632
│ 3 ┆ 8.0 ┆ 2022-05-06 00:00:00 │
8633
└─────┴─────┴─────────────────────┘
8634
8635
Use selectors to define the columns being cast:
8636
8637
>>> import polars.selectors as cs
8638
>>> df.cast({cs.numeric(): pl.UInt32, cs.temporal(): pl.String})
8639
shape: (3, 3)
8640
┌─────┬─────┬────────────┐
8641
│ foo ┆ bar ┆ ham │
8642
│ --- ┆ --- ┆ --- │
8643
│ u32 ┆ u32 ┆ str │
8644
╞═════╪═════╪════════════╡
8645
│ 1 ┆ 6 ┆ 2020-01-02 │
8646
│ 2 ┆ 7 ┆ 2021-03-04 │
8647
│ 3 ┆ 8 ┆ 2022-05-06 │
8648
└─────┴─────┴────────────┘
8649
8650
Cast all frame columns to the specified dtype:
8651
8652
>>> df.cast(pl.String).to_dict(as_series=False)
8653
{'foo': ['1', '2', '3'],
8654
'bar': ['6.0', '7.0', '8.0'],
8655
'ham': ['2020-01-02', '2021-03-04', '2022-05-06']}
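
With `strict=False`, values that cannot be converted to the target dtype are
replaced by nulls instead of raising; a minimal illustrative sketch with a
small throwaway frame, where the second value is not a valid integer (output
omitted):

>>> pl.DataFrame({"n": ["1", "x"]}).cast(
...     {"n": pl.Int64}, strict=False
... )  # doctest: +IGNORE_RESULT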
8656
"""
8657
from polars.lazyframe.opt_flags import QueryOptFlags
8658
8659
return (
8660
self.lazy()
8661
.cast(dtypes, strict=strict)
8662
.collect(optimizations=QueryOptFlags._eager())
8663
)
8664
8665
def clear(self, n: int = 0) -> DataFrame:
8666
"""
8667
Create an empty (n=0) or `n`-row null-filled (n>0) copy of the DataFrame.
8668
8669
Returns a `n`-row null-filled DataFrame with an identical schema.
8670
`n` can be greater than the current number of rows in the DataFrame.
8671
8672
Parameters
8673
----------
8674
n
8675
Number of (null-filled) rows to return in the cleared frame.
8676
8677
See Also
8678
--------
8679
clone : Cheap deepcopy/clone.
8680
8681
Examples
8682
--------
8683
>>> df = pl.DataFrame(
8684
... {
8685
... "a": [None, 2, 3, 4],
8686
... "b": [0.5, None, 2.5, 13],
8687
... "c": [True, True, False, None],
8688
... }
8689
... )
8690
>>> df.clear()
8691
shape: (0, 3)
8692
┌─────┬─────┬──────┐
8693
│ a ┆ b ┆ c │
8694
│ --- ┆ --- ┆ --- │
8695
│ i64 ┆ f64 ┆ bool │
8696
╞═════╪═════╪══════╡
8697
└─────┴─────┴──────┘
8698
8699
>>> df.clear(n=2)
8700
shape: (2, 3)
8701
┌──────┬──────┬──────┐
8702
│ a ┆ b ┆ c │
8703
│ --- ┆ --- ┆ --- │
8704
│ i64 ┆ f64 ┆ bool │
8705
╞══════╪══════╪══════╡
8706
│ null ┆ null ┆ null │
8707
│ null ┆ null ┆ null │
8708
└──────┴──────┴──────┘
8709
"""
8710
if n < 0:
8711
msg = f"`n` should be greater than or equal to 0, got {n}"
8712
raise ValueError(msg)
8713
# faster path
8714
if n == 0:
8715
return self._from_pydf(self._df.clear())
8716
return self.__class__(
8717
{
8718
nm: pl.Series(name=nm, dtype=tp).extend_constant(None, n)
8719
for nm, tp in self.schema.items()
8720
}
8721
)
8722
8723
def clone(self) -> DataFrame:
8724
"""
8725
Create a copy of this DataFrame.
8726
8727
This is a cheap operation that does not copy data.
8728
8729
See Also
8730
--------
8731
clear : Create an empty copy of the current DataFrame, with identical
8732
schema but no data.
8733
8734
Examples
8735
--------
8736
>>> df = pl.DataFrame(
8737
... {
8738
... "a": [1, 2, 3, 4],
8739
... "b": [0.5, 4, 10, 13],
8740
... "c": [True, True, False, True],
8741
... }
8742
... )
8743
>>> df.clone()
8744
shape: (4, 3)
8745
┌─────┬──────┬───────┐
8746
│ a ┆ b ┆ c │
8747
│ --- ┆ --- ┆ --- │
8748
│ i64 ┆ f64 ┆ bool │
8749
╞═════╪══════╪═══════╡
8750
│ 1 ┆ 0.5 ┆ true │
8751
│ 2 ┆ 4.0 ┆ true │
8752
│ 3 ┆ 10.0 ┆ false │
8753
│ 4 ┆ 13.0 ┆ true │
8754
└─────┴──────┴───────┘
8755
"""
8756
return self._from_pydf(self._df.clone())
8757
8758
def get_columns(self) -> list[Series]:
8759
"""
8760
Get the DataFrame as a List of Series.
8761
8762
Examples
8763
--------
8764
>>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]})
8765
>>> df.get_columns()
8766
[shape: (3,)
8767
Series: 'foo' [i64]
8768
[
8769
1
8770
2
8771
3
8772
], shape: (3,)
8773
Series: 'bar' [i64]
8774
[
8775
4
8776
5
8777
6
8778
]]
8779
8780
>>> df = pl.DataFrame(
8781
... {
8782
... "a": [1, 2, 3, 4],
8783
... "b": [0.5, 4, 10, 13],
8784
... "c": [True, True, False, True],
8785
... }
8786
... )
8787
>>> df.get_columns()
8788
[shape: (4,)
8789
Series: 'a' [i64]
8790
[
8791
1
8792
2
8793
3
8794
4
8795
], shape: (4,)
8796
Series: 'b' [f64]
8797
[
8798
0.5
8799
4.0
8800
10.0
8801
13.0
8802
], shape: (4,)
8803
Series: 'c' [bool]
8804
[
8805
true
8806
true
8807
false
8808
true
8809
]]
8810
"""
8811
return [wrap_s(s) for s in self._df.get_columns()]
8812
8813
@overload
8814
def get_column(self, name: str, *, default: Series | NoDefault = ...) -> Series: ...
8815
8816
@overload
8817
def get_column(self, name: str, *, default: Any) -> Any: ...
8818
8819
def get_column(
8820
self, name: str, *, default: Any | NoDefault = no_default
8821
) -> Series | Any:
8822
"""
8823
Get a single column by name.
8824
8825
Parameters
8826
----------
8827
name
8828
String name of the column to retrieve.
8829
default
8830
Value to return if the column does not exist; if not explicitly set and
8831
the column is not present a `ColumnNotFoundError` exception is raised.
8832
8833
Returns
8834
-------
8835
Series (or arbitrary default value, if specified).
8836
8837
See Also
8838
--------
8839
to_series
8840
8841
Examples
8842
--------
8843
>>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]})
8844
>>> df.get_column("foo")
8845
shape: (3,)
8846
Series: 'foo' [i64]
8847
[
8848
1
8849
2
8850
3
8851
]
8852
8853
Missing column handling; can optionally provide an arbitrary default value
8854
to the method (otherwise a `ColumnNotFoundError` exception is raised).
8855
8856
>>> df.get_column("baz", default=pl.Series("baz", ["?", "?", "?"]))
8857
shape: (3,)
8858
Series: 'baz' [str]
8859
[
8860
"?"
8861
"?"
8862
"?"
8863
]
8864
>>> res = df.get_column("baz", default=None)
8865
>>> res is None
8866
True
8867
"""
8868
try:
8869
return wrap_s(self._df.get_column(name))
8870
except ColumnNotFoundError:
8871
if default is no_default:
8872
raise
8873
return default
8874
8875
def fill_null(
8876
self,
8877
value: Any | Expr | None = None,
8878
strategy: FillNullStrategy | None = None,
8879
limit: int | None = None,
8880
*,
8881
matches_supertype: bool = True,
8882
) -> DataFrame:
8883
"""
8884
Fill null values using the specified value or strategy.
8885
8886
Parameters
8887
----------
8888
value
8889
Value used to fill null values.
8890
strategy : {None, 'forward', 'backward', 'min', 'max', 'mean', 'zero', 'one'}
8891
Strategy used to fill null values.
8892
limit
8893
Number of consecutive null values to fill when using the 'forward' or
8894
'backward' strategy.
8895
matches_supertype
8896
Fill all matching supertype of the fill `value`.
8897
8898
Returns
8899
-------
8900
DataFrame
8901
DataFrame with None values replaced by the filling strategy.
8902
8903
See Also
8904
--------
8905
fill_nan
8906
8907
Notes
8908
-----
8909
A null value is not the same as a NaN value.
8910
To fill NaN values, use :func:`fill_nan`.
8911
8912
Examples
8913
--------
8914
>>> df = pl.DataFrame(
8915
... {
8916
... "a": [1, 2, None, 4],
8917
... "b": [0.5, 4, None, 13],
8918
... }
8919
... )
8920
>>> df.fill_null(99)
8921
shape: (4, 2)
8922
┌─────┬──────┐
8923
│ a ┆ b │
8924
│ --- ┆ --- │
8925
│ i64 ┆ f64 │
8926
╞═════╪══════╡
8927
│ 1 ┆ 0.5 │
8928
│ 2 ┆ 4.0 │
8929
│ 99 ┆ 99.0 │
8930
│ 4 ┆ 13.0 │
8931
└─────┴──────┘
8932
>>> df.fill_null(strategy="forward")
8933
shape: (4, 2)
8934
┌─────┬──────┐
8935
│ a ┆ b │
8936
│ --- ┆ --- │
8937
│ i64 ┆ f64 │
8938
╞═════╪══════╡
8939
│ 1 ┆ 0.5 │
8940
│ 2 ┆ 4.0 │
8941
│ 2 ┆ 4.0 │
8942
│ 4 ┆ 13.0 │
8943
└─────┴──────┘
8944
8945
>>> df.fill_null(strategy="max")
8946
shape: (4, 2)
8947
┌─────┬──────┐
8948
│ a ┆ b │
8949
│ --- ┆ --- │
8950
│ i64 ┆ f64 │
8951
╞═════╪══════╡
8952
│ 1 ┆ 0.5 │
8953
│ 2 ┆ 4.0 │
8954
│ 4 ┆ 13.0 │
8955
│ 4 ┆ 13.0 │
8956
└─────┴──────┘
8957
8958
>>> df.fill_null(strategy="zero")
8959
shape: (4, 2)
8960
┌─────┬──────┐
8961
│ a ┆ b │
8962
│ --- ┆ --- │
8963
│ i64 ┆ f64 │
8964
╞═════╪══════╡
8965
│ 1 ┆ 0.5 │
8966
│ 2 ┆ 4.0 │
8967
│ 0 ┆ 0.0 │
8968
│ 4 ┆ 13.0 │
8969
└─────┴──────┘
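
The `limit` parameter bounds how many consecutive nulls are filled when using
a directional strategy; a minimal illustrative sketch with a small throwaway
frame, where only the first null after a value is filled (output omitted):

>>> pl.DataFrame({"a": [1, None, None, None, 5]}).fill_null(
...     strategy="forward", limit=1
... )  # doctest: +IGNORE_RESULT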
8970
"""
8971
from polars.lazyframe.opt_flags import QueryOptFlags
8972
8973
return (
8974
self.lazy()
8975
.fill_null(value, strategy, limit, matches_supertype=matches_supertype)
8976
.collect(optimizations=QueryOptFlags._eager())
8977
)
8978
8979
def fill_nan(self, value: Expr | int | float | None) -> DataFrame:
8980
"""
8981
Fill floating point NaN values by an Expression evaluation.
8982
8983
Parameters
8984
----------
8985
value
8986
Value used to fill NaN values.
8987
8988
Returns
8989
-------
8990
DataFrame
8991
DataFrame with NaN values replaced by the given value.
8992
8993
See Also
8994
--------
8995
fill_null
8996
8997
Notes
8998
-----
8999
A NaN value is not the same as a null value.
9000
To fill null values, use :func:`fill_null`.
9001
9002
Examples
9003
--------
9004
>>> df = pl.DataFrame(
9005
... {
9006
... "a": [1.5, 2, float("nan"), 4],
9007
... "b": [0.5, 4, float("nan"), 13],
9008
... }
9009
... )
9010
>>> df.fill_nan(99)
9011
shape: (4, 2)
9012
┌──────┬──────┐
9013
│ a ┆ b │
9014
│ --- ┆ --- │
9015
│ f64 ┆ f64 │
9016
╞══════╪══════╡
9017
│ 1.5 ┆ 0.5 │
9018
│ 2.0 ┆ 4.0 │
9019
│ 99.0 ┆ 99.0 │
9020
│ 4.0 ┆ 13.0 │
9021
└──────┴──────┘
9022
"""
9023
from polars.lazyframe.opt_flags import QueryOptFlags
9024
9025
return self.lazy().fill_nan(value).collect(optimizations=QueryOptFlags._eager())
9026
9027
def explode(
9028
self,
9029
columns: ColumnNameOrSelector | Iterable[ColumnNameOrSelector],
9030
*more_columns: ColumnNameOrSelector,
9031
) -> DataFrame:
9032
"""
9033
Explode the dataframe to long format by exploding the given columns.
9034
9035
Parameters
9036
----------
9037
columns
9038
Column names, expressions, or a selector defining them. The underlying
9039
columns being exploded must be of the `List` or `Array` data type.
9040
*more_columns
9041
Additional names of columns to explode, specified as positional arguments.
9042
9043
Returns
9044
-------
9045
DataFrame
9046
9047
Examples
9048
--------
9049
>>> df = pl.DataFrame(
9050
... {
9051
... "letters": ["a", "a", "b", "c"],
9052
... "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]],
9053
... }
9054
... )
9055
>>> df
9056
shape: (4, 2)
9057
┌─────────┬───────────┐
9058
│ letters ┆ numbers │
9059
│ --- ┆ --- │
9060
│ str ┆ list[i64] │
9061
╞═════════╪═══════════╡
9062
│ a ┆ [1] │
9063
│ a ┆ [2, 3] │
9064
│ b ┆ [4, 5] │
9065
│ c ┆ [6, 7, 8] │
9066
└─────────┴───────────┘
9067
>>> df.explode("numbers")
9068
shape: (8, 2)
9069
┌─────────┬─────────┐
9070
│ letters ┆ numbers │
9071
│ --- ┆ --- │
9072
│ str ┆ i64 │
9073
╞═════════╪═════════╡
9074
│ a ┆ 1 │
9075
│ a ┆ 2 │
9076
│ a ┆ 3 │
9077
│ b ┆ 4 │
9078
│ b ┆ 5 │
9079
│ c ┆ 6 │
9080
│ c ┆ 7 │
9081
│ c ┆ 8 │
9082
└─────────┴─────────┘
9083
"""
9084
from polars.lazyframe.opt_flags import QueryOptFlags
9085
9086
return (
9087
self.lazy()
9088
.explode(columns, *more_columns)
9089
.collect(optimizations=QueryOptFlags._eager())
9090
)
9091
9092
@deprecate_renamed_parameter("columns", "on", version="1.0.0")
9093
def pivot(
9094
self,
9095
on: ColumnNameOrSelector | Sequence[ColumnNameOrSelector],
9096
*,
9097
index: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = None,
9098
values: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = None,
9099
aggregate_function: PivotAgg | Expr | None = None,
9100
maintain_order: bool = True,
9101
sort_columns: bool = False,
9102
separator: str = "_",
9103
) -> DataFrame:
9104
"""
9105
Create a spreadsheet-style pivot table as a DataFrame.
9106
9107
Only available in eager mode. See "Examples" section below for how to do a
9108
"lazy pivot" if you know the unique column values in advance.
9109
9110
.. versionchanged:: 1.0.0
9111
The `columns` parameter was renamed `on`.
9112
9113
Parameters
9114
----------
9115
on
9116
The column(s) whose values will be used as the new columns of the output
9117
DataFrame.
9118
index
9119
The column(s) that remain from the input to the output. The output DataFrame will have one row
9120
for each unique combination of the `index`'s values.
9121
If None, all remaining columns not specified on `on` and `values` will be used. At least one
9122
of `index` and `values` must be specified.
9123
values
9124
The existing column(s) of values which will be moved under the new columns from index. If an
9125
aggregation is specified, these are the values on which the aggregation will be computed.
9126
If None, all remaining columns not specified on `on` and `index` will be used.
9127
At least one of `index` and `values` must be specified.
9128
aggregate_function
9129
Choose from:
9130
9131
- None: no aggregation takes place; an error is raised if there are multiple values in a group.
9132
- A predefined aggregate function string, one of
9133
{'min', 'max', 'first', 'last', 'sum', 'mean', 'median', 'len'}
9134
- An expression to do the aggregation. The expression can only access data from the respective
9135
'values' columns as generated by pivot, through `pl.element()`.
9136
maintain_order
9137
Ensure the values of `index` are sorted by discovery order.
9138
sort_columns
9139
Sort the transposed columns by name. Default is by order of discovery.
9140
separator
9141
Used as separator/delimiter in generated column names in case of multiple
9142
`values` columns.
9143
9144
Returns
9145
-------
9146
DataFrame
9147
9148
Notes
9149
-----
9150
In some other frameworks, you might know this operation as `pivot_wider`.
9151
9152
Examples
9153
--------
9154
You can use `pivot` to reshape a dataframe from "long" to "wide" format.
9155
9156
For example, suppose we have a dataframe of test scores achieved by some
9157
students, where each row represents a distinct test.
9158
9159
>>> df = pl.DataFrame(
9160
... {
9161
... "name": ["Cady", "Cady", "Karen", "Karen"],
9162
... "subject": ["maths", "physics", "maths", "physics"],
9163
... "test_1": [98, 99, 61, 58],
9164
... "test_2": [100, 100, 60, 60],
9165
... }
9166
... )
9167
>>> df
9168
shape: (4, 4)
9169
┌───────┬─────────┬────────┬────────┐
9170
│ name ┆ subject ┆ test_1 ┆ test_2 │
9171
│ --- ┆ --- ┆ --- ┆ --- │
9172
│ str ┆ str ┆ i64 ┆ i64 │
9173
╞═══════╪═════════╪════════╪════════╡
9174
│ Cady ┆ maths ┆ 98 ┆ 100 │
9175
│ Cady ┆ physics ┆ 99 ┆ 100 │
9176
│ Karen ┆ maths ┆ 61 ┆ 60 │
9177
│ Karen ┆ physics ┆ 58 ┆ 60 │
9178
└───────┴─────────┴────────┴────────┘
9179
9180
Using `pivot`, we can reshape so we have one row per student, with different
9181
subjects as columns, and their `test_1` scores as values:
9182
9183
>>> df.pivot("subject", index="name", values="test_1")
9184
shape: (2, 3)
9185
┌───────┬───────┬─────────┐
9186
│ name ┆ maths ┆ physics │
9187
│ --- ┆ --- ┆ --- │
9188
│ str ┆ i64 ┆ i64 │
9189
╞═══════╪═══════╪═════════╡
9190
│ Cady ┆ 98 ┆ 99 │
9191
│ Karen ┆ 61 ┆ 58 │
9192
└───────┴───────┴─────────┘
9193
9194
You can use selectors too - here we include all test scores in the pivoted table:
9195
9196
>>> import polars.selectors as cs
9197
>>> df.pivot("subject", values=cs.starts_with("test"))
9198
shape: (2, 5)
9199
┌───────┬──────────────┬────────────────┬──────────────┬────────────────┐
9200
│ name ┆ test_1_maths ┆ test_1_physics ┆ test_2_maths ┆ test_2_physics │
9201
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
9202
│ str ┆ i64 ┆ i64 ┆ i64 ┆ i64 │
9203
╞═══════╪══════════════╪════════════════╪══════════════╪════════════════╡
9204
│ Cady ┆ 98 ┆ 99 ┆ 100 ┆ 100 │
9205
│ Karen ┆ 61 ┆ 58 ┆ 60 ┆ 60 │
9206
└───────┴──────────────┴────────────────┴──────────────┴────────────────┘
9207
9208
If you end up with multiple values per cell, you can specify how to aggregate
9209
them with `aggregate_function`:
9210
9211
>>> df = pl.DataFrame(
9212
... {
9213
... "ix": [1, 1, 2, 2, 1, 2],
9214
... "col": ["a", "a", "a", "a", "b", "b"],
9215
... "foo": [0, 1, 2, 2, 7, 1],
9216
... "bar": [0, 2, 0, 0, 9, 4],
9217
... }
9218
... )
9219
>>> df.pivot("col", index="ix", aggregate_function="sum")
9220
shape: (2, 5)
9221
┌─────┬───────┬───────┬───────┬───────┐
9222
│ ix ┆ foo_a ┆ foo_b ┆ bar_a ┆ bar_b │
9223
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
9224
│ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 │
9225
╞═════╪═══════╪═══════╪═══════╪═══════╡
9226
│ 1 ┆ 1 ┆ 7 ┆ 2 ┆ 9 │
9227
│ 2 ┆ 4 ┆ 1 ┆ 0 ┆ 4 │
9228
└─────┴───────┴───────┴───────┴───────┘
9229
9230
You can also pass a custom aggregation function using
9231
:meth:`polars.element`:
9232
9233
>>> df = pl.DataFrame(
9234
... {
9235
... "col1": ["a", "a", "a", "b", "b", "b"],
9236
... "col2": ["x", "x", "x", "x", "y", "y"],
9237
... "col3": [6, 7, 3, 2, 5, 7],
9238
... }
9239
... )
9240
>>> df.pivot(
9241
... "col2",
9242
... index="col1",
9243
... values="col3",
9244
... aggregate_function=pl.element().tanh().mean(),
9245
... )
9246
shape: (2, 3)
9247
┌──────┬──────────┬──────────┐
9248
│ col1 ┆ x ┆ y │
9249
│ --- ┆ --- ┆ --- │
9250
│ str ┆ f64 ┆ f64 │
9251
╞══════╪══════════╪══════════╡
9252
│ a ┆ 0.998347 ┆ null │
9253
│ b ┆ 0.964028 ┆ 0.999954 │
9254
└──────┴──────────┴──────────┘
9255
9256
Note that `pivot` is only available in eager mode. If you know the unique
9257
column values in advance, you can use :meth:`polars.LazyFrame.group_by` to
9258
get the same result as above in lazy mode:
9259
9260
>>> index = pl.col("col1")
9261
>>> on = pl.col("col2")
9262
>>> values = pl.col("col3")
9263
>>> unique_column_values = ["x", "y"]
9264
>>> aggregate_function = lambda col: col.tanh().mean()
9265
>>> df.lazy().group_by(index).agg(
9266
... aggregate_function(values.filter(on == value)).alias(value)
9267
... for value in unique_column_values
9268
... ).collect() # doctest: +IGNORE_RESULT
9269
shape: (2, 3)
9270
┌──────┬──────────┬──────────┐
9271
│ col1 ┆ x ┆ y │
9272
│ --- ┆ --- ┆ --- │
9273
│ str ┆ f64 ┆ f64 │
9274
╞══════╪══════════╪══════════╡
9275
│ a ┆ 0.998347 ┆ null │
9276
│ b ┆ 0.964028 ┆ 0.999954 │
9277
└──────┴──────────┴──────────┘
9278
""" # noqa: W505
9279
on = _expand_selectors(self, on)
9280
if values is not None:
9281
values = _expand_selectors(self, values)
9282
if index is not None:
9283
index = _expand_selectors(self, index)
9284
9285
if isinstance(aggregate_function, str):
9286
if aggregate_function == "first":
9287
aggregate_expr = F.element().first()._pyexpr
9288
elif aggregate_function == "sum":
9289
aggregate_expr = F.element().sum()._pyexpr
9290
elif aggregate_function == "max":
9291
aggregate_expr = F.element().max()._pyexpr
9292
elif aggregate_function == "min":
9293
aggregate_expr = F.element().min()._pyexpr
9294
elif aggregate_function == "mean":
9295
aggregate_expr = F.element().mean()._pyexpr
9296
elif aggregate_function == "median":
9297
aggregate_expr = F.element().median()._pyexpr
9298
elif aggregate_function == "last":
9299
aggregate_expr = F.element().last()._pyexpr
9300
elif aggregate_function == "len":
9301
aggregate_expr = F.len()._pyexpr
9302
elif aggregate_function == "count":
9303
issue_deprecation_warning(
9304
"`aggregate_function='count'` input for `pivot` is deprecated."
9305
" Please use `aggregate_function='len'`.",
9306
version="0.20.5",
9307
)
9308
aggregate_expr = F.len()._pyexpr
9309
else:
9310
msg = f"invalid input for `aggregate_function` argument: {aggregate_function!r}"
9311
raise ValueError(msg)
9312
elif aggregate_function is None:
9313
aggregate_expr = None
9314
else:
9315
aggregate_expr = aggregate_function._pyexpr
9316
9317
return self._from_pydf(
9318
self._df.pivot_expr(
9319
on,
9320
index,
9321
values,
9322
maintain_order,
9323
sort_columns,
9324
aggregate_expr,
9325
separator,
9326
)
9327
)
9328
9329
def unpivot(
9330
self,
9331
on: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = None,
9332
*,
9333
index: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = None,
9334
variable_name: str | None = None,
9335
value_name: str | None = None,
9336
) -> DataFrame:
9337
"""
9338
Unpivot a DataFrame from wide to long format.
9339
9340
Optionally leaves identifiers set.
9341
9342
This function is useful to massage a DataFrame into a format where one or more
9343
columns are identifier variables (index) while all other columns, considered
9344
measured variables (on), are "unpivoted" to the row axis, leaving just
9345
two non-identifier columns, 'variable' and 'value'.
9346
9347
Parameters
9348
----------
9349
on
9350
Column(s) or selector(s) to use as value variables; if `on`
is empty, all columns that are not in `index` will be used.
9352
index
9353
Column(s) or selector(s) to use as identifier variables.
9354
variable_name
9355
Name to give to the `variable` column. Defaults to "variable".
9356
value_name
9357
Name to give to the `value` column. Defaults to "value".
9358
9359
Notes
9360
-----
9361
If you're coming from pandas, this is similar to `pandas.DataFrame.melt`,
9362
but with `index` replacing `id_vars` and `on` replacing `value_vars`.
9363
In other frameworks, you might know this operation as `pivot_longer`.
9364
9365
Examples
9366
--------
9367
>>> df = pl.DataFrame(
9368
... {
9369
... "a": ["x", "y", "z"],
9370
... "b": [1, 3, 5],
9371
... "c": [2, 4, 6],
9372
... }
9373
... )
9374
>>> import polars.selectors as cs
9375
>>> df.unpivot(cs.numeric(), index="a")
9376
shape: (6, 3)
9377
┌─────┬──────────┬───────┐
9378
│ a ┆ variable ┆ value │
9379
│ --- ┆ --- ┆ --- │
9380
│ str ┆ str ┆ i64 │
9381
╞═════╪══════════╪═══════╡
9382
│ x ┆ b ┆ 1 │
9383
│ y ┆ b ┆ 3 │
9384
│ z ┆ b ┆ 5 │
9385
│ x ┆ c ┆ 2 │
9386
│ y ┆ c ┆ 4 │
9387
│ z ┆ c ┆ 6 │
9388
└─────┴──────────┴───────┘
9389
"""
9390
on = [] if on is None else _expand_selectors(self, on)
9391
index = [] if index is None else _expand_selectors(self, index)
9392
9393
return self._from_pydf(self._df.unpivot(on, index, value_name, variable_name))
9394
9395
def unstack(
9396
self,
9397
*,
9398
step: int,
9399
how: UnstackDirection = "vertical",
9400
columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = None,
9401
fill_values: list[Any] | None = None,
9402
) -> DataFrame:
9403
"""
9404
Unstack a long table to a wide form without doing an aggregation.
9405
9406
This can be much faster than a pivot, because it can skip the grouping phase.
9407
9408
Parameters
9409
----------
9410
step
9411
Number of rows in the unstacked frame.
9412
how : { 'vertical', 'horizontal' }
9413
Direction of the unstack.
9414
columns
9415
Column name(s) or selector(s) to include in the operation.
9416
If set to `None` (default), use all columns.
9417
fill_values
9418
Value(s) used to fill the extra cells created when the data does not fit the new shape exactly.
9419
9420
Examples
9421
--------
9422
>>> from string import ascii_uppercase
9423
>>> df = pl.DataFrame(
9424
... {
9425
... "x": list(ascii_uppercase[0:8]),
9426
... "y": pl.int_range(1, 9, eager=True),
9427
... }
9428
... ).with_columns(
9429
... z=pl.int_ranges(pl.col("y"), pl.col("y") + 2, dtype=pl.UInt8),
9430
... )
9431
>>> df
9432
shape: (8, 3)
9433
┌─────┬─────┬──────────┐
9434
│ x ┆ y ┆ z │
9435
│ --- ┆ --- ┆ --- │
9436
│ str ┆ i64 ┆ list[u8] │
9437
╞═════╪═════╪══════════╡
9438
│ A ┆ 1 ┆ [1, 2] │
9439
│ B ┆ 2 ┆ [2, 3] │
9440
│ C ┆ 3 ┆ [3, 4] │
9441
│ D ┆ 4 ┆ [4, 5] │
9442
│ E ┆ 5 ┆ [5, 6] │
9443
│ F ┆ 6 ┆ [6, 7] │
9444
│ G ┆ 7 ┆ [7, 8] │
9445
│ H ┆ 8 ┆ [8, 9] │
9446
└─────┴─────┴──────────┘
9447
>>> df.unstack(step=4, how="vertical")
9448
shape: (4, 6)
9449
┌─────┬─────┬─────┬─────┬──────────┬──────────┐
9450
│ x_0 ┆ x_1 ┆ y_0 ┆ y_1 ┆ z_0 ┆ z_1 │
9451
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
9452
│ str ┆ str ┆ i64 ┆ i64 ┆ list[u8] ┆ list[u8] │
9453
╞═════╪═════╪═════╪═════╪══════════╪══════════╡
9454
│ A ┆ E ┆ 1 ┆ 5 ┆ [1, 2] ┆ [5, 6] │
9455
│ B ┆ F ┆ 2 ┆ 6 ┆ [2, 3] ┆ [6, 7] │
9456
│ C ┆ G ┆ 3 ┆ 7 ┆ [3, 4] ┆ [7, 8] │
9457
│ D ┆ H ┆ 4 ┆ 8 ┆ [4, 5] ┆ [8, 9] │
9458
└─────┴─────┴─────┴─────┴──────────┴──────────┘
9459
>>> df.unstack(step=2, how="horizontal")
9460
shape: (4, 6)
9461
┌─────┬─────┬─────┬─────┬──────────┬──────────┐
9462
│ x_0 ┆ x_1 ┆ y_0 ┆ y_1 ┆ z_0 ┆ z_1 │
9463
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
9464
│ str ┆ str ┆ i64 ┆ i64 ┆ list[u8] ┆ list[u8] │
9465
╞═════╪═════╪═════╪═════╪══════════╪══════════╡
9466
│ A ┆ B ┆ 1 ┆ 2 ┆ [1, 2] ┆ [2, 3] │
9467
│ C ┆ D ┆ 3 ┆ 4 ┆ [3, 4] ┆ [4, 5] │
9468
│ E ┆ F ┆ 5 ┆ 6 ┆ [5, 6] ┆ [6, 7] │
9469
│ G ┆ H ┆ 7 ┆ 8 ┆ [7, 8] ┆ [8, 9] │
9470
└─────┴─────┴─────┴─────┴──────────┴──────────┘
9471
>>> import polars.selectors as cs
9472
>>> df.unstack(step=5, columns=cs.numeric(), fill_values=0)
9473
shape: (5, 2)
9474
┌─────┬─────┐
9475
│ y_0 ┆ y_1 │
9476
│ --- ┆ --- │
9477
│ i64 ┆ i64 │
9478
╞═════╪═════╡
9479
│ 1 ┆ 6 │
9480
│ 2 ┆ 7 │
9481
│ 3 ┆ 8 │
9482
│ 4 ┆ 0 │
9483
│ 5 ┆ 0 │
9484
└─────┴─────┘
9485
"""
9486
import math
9487
9488
df = self.select(columns) if columns is not None else self
9489
9490
height = df.height
9491
if how == "vertical":
9492
n_rows = step
9493
n_cols = math.ceil(height / n_rows)
9494
else:
9495
n_cols = step
9496
n_rows = math.ceil(height / n_cols)
9497
9498
if n_fill := n_cols * n_rows - height:
9499
if not isinstance(fill_values, list):
9500
fill_values = [fill_values for _ in range(df.width)]
9501
9502
df = df.select(
9503
s.extend_constant(next_fill, n_fill)
9504
for s, next_fill in zip(df, fill_values)
9505
)
9506
9507
if how == "horizontal":
9508
df = (
9509
df.with_columns(
9510
(F.int_range(0, n_cols * n_rows, eager=True) % n_cols).alias(
9511
"__sort_order"
9512
),
9513
)
9514
.sort("__sort_order")
9515
.drop("__sort_order")
9516
)
9517
9518
zfill_val = math.floor(math.log10(n_cols)) + 1
9519
slices = [
9520
s.slice(slice_nbr * n_rows, n_rows).alias(
9521
s.name + "_" + str(slice_nbr).zfill(zfill_val)
9522
)
9523
for s in df
9524
for slice_nbr in range(n_cols)
9525
]
9526
9527
return DataFrame(slices)
9528
9529
@overload
9530
def partition_by(
9531
self,
9532
by: ColumnNameOrSelector | Sequence[ColumnNameOrSelector],
9533
*more_by: ColumnNameOrSelector,
9534
maintain_order: bool = ...,
9535
include_key: bool = ...,
9536
as_dict: Literal[False] = ...,
9537
) -> list[DataFrame]: ...
9538
9539
@overload
9540
def partition_by(
9541
self,
9542
by: ColumnNameOrSelector | Sequence[ColumnNameOrSelector],
9543
*more_by: ColumnNameOrSelector,
9544
maintain_order: bool = ...,
9545
include_key: bool = ...,
9546
as_dict: Literal[True],
9547
) -> dict[tuple[Any, ...], DataFrame]: ...
9548
9549
@overload
9550
def partition_by(
9551
self,
9552
by: ColumnNameOrSelector | Sequence[ColumnNameOrSelector],
9553
*more_by: ColumnNameOrSelector,
9554
maintain_order: bool = ...,
9555
include_key: bool = ...,
9556
as_dict: bool,
9557
) -> list[DataFrame] | dict[tuple[Any, ...], DataFrame]: ...
9558
9559
def partition_by(
9560
self,
9561
by: ColumnNameOrSelector | Sequence[ColumnNameOrSelector],
9562
*more_by: ColumnNameOrSelector,
9563
maintain_order: bool = True,
9564
include_key: bool = True,
9565
as_dict: bool = False,
9566
) -> list[DataFrame] | dict[tuple[Any, ...], DataFrame]:
9567
"""
9568
Group by the given columns and return the groups as separate dataframes.
9569
9570
Parameters
9571
----------
9572
by
9573
Column name(s) or selector(s) to group by.
9574
*more_by
9575
Additional names of columns to group by, specified as positional arguments.
9576
maintain_order
9577
Ensure that the order of the groups is consistent with the input data.
9578
This is slower than a default partition by operation.
9579
include_key
9580
Include the columns used to partition the DataFrame in the output.
9581
as_dict
9582
Return a dictionary instead of a list. The dictionary keys are tuples of
9583
the distinct group values that identify each group.
9584
9585
Examples
9586
--------
9587
Pass a single column name to partition by that column.
9588
9589
>>> df = pl.DataFrame(
9590
... {
9591
... "a": ["a", "b", "a", "b", "c"],
9592
... "b": [1, 2, 1, 3, 3],
9593
... "c": [5, 4, 3, 2, 1],
9594
... }
9595
... )
9596
>>> df.partition_by("a") # doctest: +IGNORE_RESULT
9597
[shape: (2, 3)
9598
┌─────┬─────┬─────┐
9599
│ a ┆ b ┆ c │
9600
│ --- ┆ --- ┆ --- │
9601
│ str ┆ i64 ┆ i64 │
9602
╞═════╪═════╪═════╡
9603
│ a ┆ 1 ┆ 5 │
9604
│ a ┆ 1 ┆ 3 │
9605
└─────┴─────┴─────┘,
9606
shape: (2, 3)
9607
┌─────┬─────┬─────┐
9608
│ a ┆ b ┆ c │
9609
│ --- ┆ --- ┆ --- │
9610
│ str ┆ i64 ┆ i64 │
9611
╞═════╪═════╪═════╡
9612
│ b ┆ 2 ┆ 4 │
9613
│ b ┆ 3 ┆ 2 │
9614
└─────┴─────┴─────┘,
9615
shape: (1, 3)
9616
┌─────┬─────┬─────┐
9617
│ a ┆ b ┆ c │
9618
│ --- ┆ --- ┆ --- │
9619
│ str ┆ i64 ┆ i64 │
9620
╞═════╪═════╪═════╡
9621
│ c ┆ 3 ┆ 1 │
9622
└─────┴─────┴─────┘]
9623
9624
Partition by multiple columns by either passing a list of column names, or by
9625
specifying each column name as a positional argument.
9626
9627
>>> df.partition_by("a", "b") # doctest: +IGNORE_RESULT
9628
[shape: (2, 3)
9629
┌─────┬─────┬─────┐
9630
│ a ┆ b ┆ c │
9631
│ --- ┆ --- ┆ --- │
9632
│ str ┆ i64 ┆ i64 │
9633
╞═════╪═════╪═════╡
9634
│ a ┆ 1 ┆ 5 │
9635
│ a ┆ 1 ┆ 3 │
9636
└─────┴─────┴─────┘,
9637
shape: (1, 3)
9638
┌─────┬─────┬─────┐
9639
│ a ┆ b ┆ c │
9640
│ --- ┆ --- ┆ --- │
9641
│ str ┆ i64 ┆ i64 │
9642
╞═════╪═════╪═════╡
9643
│ b ┆ 2 ┆ 4 │
9644
└─────┴─────┴─────┘,
9645
shape: (1, 3)
9646
┌─────┬─────┬─────┐
9647
│ a ┆ b ┆ c │
9648
│ --- ┆ --- ┆ --- │
9649
│ str ┆ i64 ┆ i64 │
9650
╞═════╪═════╪═════╡
9651
│ b ┆ 3 ┆ 2 │
9652
└─────┴─────┴─────┘,
9653
shape: (1, 3)
9654
┌─────┬─────┬─────┐
9655
│ a ┆ b ┆ c │
9656
│ --- ┆ --- ┆ --- │
9657
│ str ┆ i64 ┆ i64 │
9658
╞═════╪═════╪═════╡
9659
│ c ┆ 3 ┆ 1 │
9660
└─────┴─────┴─────┘]
9661
9662
Return the partitions as a dictionary by specifying `as_dict=True`.
9663
9664
>>> import polars.selectors as cs
9665
>>> df.partition_by(cs.string(), as_dict=True) # doctest: +IGNORE_RESULT
9666
{('a',): shape: (2, 3)
9667
┌─────┬─────┬─────┐
9668
│ a ┆ b ┆ c │
9669
│ --- ┆ --- ┆ --- │
9670
│ str ┆ i64 ┆ i64 │
9671
╞═════╪═════╪═════╡
9672
│ a ┆ 1 ┆ 5 │
9673
│ a ┆ 1 ┆ 3 │
9674
└─────┴─────┴─────┘,
9675
('b',): shape: (2, 3)
9676
┌─────┬─────┬─────┐
9677
│ a ┆ b ┆ c │
9678
│ --- ┆ --- ┆ --- │
9679
│ str ┆ i64 ┆ i64 │
9680
╞═════╪═════╪═════╡
9681
│ b ┆ 2 ┆ 4 │
9682
│ b ┆ 3 ┆ 2 │
9683
└─────┴─────┴─────┘,
9684
('c',): shape: (1, 3)
9685
┌─────┬─────┬─────┐
9686
│ a ┆ b ┆ c │
9687
│ --- ┆ --- ┆ --- │
9688
│ str ┆ i64 ┆ i64 │
9689
╞═════╪═════╪═════╡
9690
│ c ┆ 3 ┆ 1 │
9691
└─────┴─────┴─────┘}
9692
"""
9693
by_parsed = _expand_selectors(self, by, *more_by)
9694
9695
partitions = [
9696
self._from_pydf(_df)
9697
for _df in self._df.partition_by(by_parsed, maintain_order, include_key)
9698
]
9699
9700
if as_dict:
9701
if include_key:
9702
names = [p.select(by_parsed).row(0) for p in partitions]
9703
else:
9704
if not maintain_order: # Group keys cannot be matched to partitions
9705
msg = "cannot use `partition_by` with `maintain_order=False, include_key=False, as_dict=True`"
9706
raise ValueError(msg)
9707
names = self.select(by_parsed).unique(maintain_order=True).rows()
9708
9709
return dict(zip(names, partitions))
9710
9711
return partitions
9712
9713
def shift(self, n: int = 1, *, fill_value: IntoExpr | None = None) -> DataFrame:
9714
"""
9715
Shift values by the given number of indices.
9716
9717
Parameters
9718
----------
9719
n
9720
Number of indices to shift forward. If a negative value is passed, values
9721
are shifted in the opposite direction instead.
9722
fill_value
9723
Fill the resulting null values with this value. Accepts scalar expression
9724
input. Non-expression inputs are parsed as literals.
9725
9726
Notes
9727
-----
9728
This method is similar to the `LAG` operation in SQL when the value for `n`
9729
is positive. With a negative value for `n`, it is similar to `LEAD`.
9730
9731
Examples
9732
--------
9733
By default, values are shifted forward by one index.
9734
9735
>>> df = pl.DataFrame(
9736
... {
9737
... "a": [1, 2, 3, 4],
9738
... "b": [5, 6, 7, 8],
9739
... }
9740
... )
9741
>>> df.shift()
9742
shape: (4, 2)
9743
┌──────┬──────┐
9744
│ a ┆ b │
9745
│ --- ┆ --- │
9746
│ i64 ┆ i64 │
9747
╞══════╪══════╡
9748
│ null ┆ null │
9749
│ 1 ┆ 5 │
9750
│ 2 ┆ 6 │
9751
│ 3 ┆ 7 │
9752
└──────┴──────┘
9753
9754
Pass a negative value to shift in the opposite direction instead.
9755
9756
>>> df.shift(-2)
9757
shape: (4, 2)
9758
┌──────┬──────┐
9759
│ a ┆ b │
9760
│ --- ┆ --- │
9761
│ i64 ┆ i64 │
9762
╞══════╪══════╡
9763
│ 3 ┆ 7 │
9764
│ 4 ┆ 8 │
9765
│ null ┆ null │
9766
│ null ┆ null │
9767
└──────┴──────┘
9768
9769
Specify `fill_value` to fill the resulting null values.
9770
9771
>>> df.shift(-2, fill_value=100)
9772
shape: (4, 2)
9773
┌─────┬─────┐
9774
│ a ┆ b │
9775
│ --- ┆ --- │
9776
│ i64 ┆ i64 │
9777
╞═════╪═════╡
9778
│ 3 ┆ 7 │
9779
│ 4 ┆ 8 │
9780
│ 100 ┆ 100 │
9781
│ 100 ┆ 100 │
9782
└─────┴─────┘
9783
"""
9784
from polars.lazyframe.opt_flags import QueryOptFlags
9785
9786
return (
9787
self.lazy()
9788
.shift(n, fill_value=fill_value)
9789
.collect(optimizations=QueryOptFlags._eager())
9790
)
9791
9792
def is_duplicated(self) -> Series:
9793
"""
9794
Get a mask of all duplicated rows in this DataFrame.
9795
9796
Examples
9797
--------
9798
>>> df = pl.DataFrame(
9799
... {
9800
... "a": [1, 2, 3, 1],
9801
... "b": ["x", "y", "z", "x"],
9802
... }
9803
... )
9804
>>> df.is_duplicated()
9805
shape: (4,)
9806
Series: '' [bool]
9807
[
9808
true
9809
false
9810
false
9811
true
9812
]
9813
9814
This mask can be used to visualize the duplicated rows like this:
9815
9816
>>> df.filter(df.is_duplicated())
9817
shape: (2, 2)
9818
┌─────┬─────┐
9819
│ a ┆ b │
9820
│ --- ┆ --- │
9821
│ i64 ┆ str │
9822
╞═════╪═════╡
9823
│ 1 ┆ x │
9824
│ 1 ┆ x │
9825
└─────┴─────┘
9826
"""
9827
return wrap_s(self._df.is_duplicated())
9828
9829
def is_unique(self) -> Series:
9830
"""
9831
Get a mask of all unique rows in this DataFrame.
9832
9833
Examples
9834
--------
9835
>>> df = pl.DataFrame(
9836
... {
9837
... "a": [1, 2, 3, 1],
9838
... "b": ["x", "y", "z", "x"],
9839
... }
9840
... )
9841
>>> df.is_unique()
9842
shape: (4,)
9843
Series: '' [bool]
9844
[
9845
false
9846
true
9847
true
9848
false
9849
]
9850
9851
This mask can be used to visualize the unique rows like this:
9852
9853
>>> df.filter(df.is_unique())
9854
shape: (2, 2)
9855
┌─────┬─────┐
9856
│ a ┆ b │
9857
│ --- ┆ --- │
9858
│ i64 ┆ str │
9859
╞═════╪═════╡
9860
│ 2 ┆ y │
9861
│ 3 ┆ z │
9862
└─────┴─────┘
9863
"""
9864
return wrap_s(self._df.is_unique())
9865
9866
def lazy(self) -> LazyFrame:
9867
"""
9868
Start a lazy query from this point. This returns a `LazyFrame` object.
9869
9870
Operations on a `LazyFrame` are not executed until you trigger execution
by calling one of:
9872
9873
* :meth:`.collect() <polars.LazyFrame.collect>`
9874
(run on all data)
9875
* :meth:`.explain() <polars.LazyFrame.explain>`
9876
(print the query plan)
9877
* :meth:`.show_graph() <polars.LazyFrame.show_graph>`
9878
(show the query plan as graphviz graph)
9879
* :meth:`.collect_schema() <polars.LazyFrame.collect_schema>`
9880
(return the final frame schema)
9881
9882
Lazy operations are recommended because they allow for query optimization and
9883
additional parallelism.
9884
9885
Returns
9886
-------
9887
LazyFrame
9888
9889
Examples
9890
--------
9891
>>> df = pl.DataFrame(
9892
... {
9893
... "a": [None, 2, 3, 4],
9894
... "b": [0.5, None, 2.5, 13],
9895
... "c": [True, True, False, None],
9896
... }
9897
... )
9898
>>> df.lazy()
9899
<LazyFrame at ...>
9900
"""
9901
return wrap_ldf(self._df.lazy())
9902
9903
def select(
9904
self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr
9905
) -> DataFrame:
9906
"""
9907
Select columns from this DataFrame.
9908
9909
Parameters
9910
----------
9911
*exprs
9912
Column(s) to select, specified as positional arguments.
9913
Accepts expression input. Strings are parsed as column names,
9914
other non-expression inputs are parsed as literals.
9915
**named_exprs
9916
Additional columns to select, specified as keyword arguments.
9917
The columns will be renamed to the keyword used.
9918
9919
Examples
9920
--------
9921
Pass the name of a column to select that column.
9922
9923
>>> df = pl.DataFrame(
9924
... {
9925
... "foo": [1, 2, 3],
9926
... "bar": [6, 7, 8],
9927
... "ham": ["a", "b", "c"],
9928
... }
9929
... )
9930
>>> df.select("foo")
9931
shape: (3, 1)
9932
┌─────┐
9933
│ foo │
9934
│ --- │
9935
│ i64 │
9936
╞═════╡
9937
│ 1 │
9938
│ 2 │
9939
│ 3 │
9940
└─────┘
9941
9942
Multiple columns can be selected by passing a list of column names.
9943
9944
>>> df.select(["foo", "bar"])
9945
shape: (3, 2)
9946
┌─────┬─────┐
9947
│ foo ┆ bar │
9948
│ --- ┆ --- │
9949
│ i64 ┆ i64 │
9950
╞═════╪═════╡
9951
│ 1 ┆ 6 │
9952
│ 2 ┆ 7 │
9953
│ 3 ┆ 8 │
9954
└─────┴─────┘
9955
9956
Multiple columns can also be selected using positional arguments instead of a
9957
list. Expressions are also accepted.
9958
9959
>>> df.select(pl.col("foo"), pl.col("bar") + 1)
9960
shape: (3, 2)
9961
┌─────┬─────┐
9962
│ foo ┆ bar │
9963
│ --- ┆ --- │
9964
│ i64 ┆ i64 │
9965
╞═════╪═════╡
9966
│ 1 ┆ 7 │
9967
│ 2 ┆ 8 │
9968
│ 3 ┆ 9 │
9969
└─────┴─────┘
9970
9971
Use keyword arguments to easily name your expression inputs.
9972
9973
>>> df.select(threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0))
9974
shape: (3, 1)
9975
┌───────────┐
9976
│ threshold │
9977
│ --- │
9978
│ i32 │
9979
╞═══════════╡
9980
│ 0 │
9981
│ 0 │
9982
│ 10 │
9983
└───────────┘
9984
"""
9985
from polars.lazyframe.opt_flags import QueryOptFlags
9986
9987
return (
9988
self.lazy()
9989
.select(*exprs, **named_exprs)
9990
.collect(optimizations=QueryOptFlags._eager())
9991
)
9992
9993
def select_seq(
9994
self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr
9995
) -> DataFrame:
9996
"""
9997
Select columns from this DataFrame.
9998
9999
This will run all expressions sequentially instead of in parallel.
10000
Use this when the work per expression is cheap.
10001
10002
Parameters
10003
----------
10004
*exprs
10005
Column(s) to select, specified as positional arguments.
10006
Accepts expression input. Strings are parsed as column names,
10007
other non-expression inputs are parsed as literals.
10008
**named_exprs
10009
Additional columns to select, specified as keyword arguments.
10010
The columns will be renamed to the keyword used.
10011
10012
See Also
10013
--------
10014
select
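
Examples
--------
A minimal sketch; apart from evaluating the expressions sequentially, the
result matches :meth:`select`.

>>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [6, 7, 8]})
>>> df.select_seq("foo", total=pl.col("foo") + pl.col("bar")).equals(
...     df.select("foo", total=pl.col("foo") + pl.col("bar"))
... )
True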
10015
"""
10016
from polars.lazyframe.opt_flags import QueryOptFlags
10017
10018
return (
10019
self.lazy()
10020
.select_seq(*exprs, **named_exprs)
10021
.collect(optimizations=QueryOptFlags._eager())
10022
)
10023
10024
def with_columns(
10025
self,
10026
*exprs: IntoExpr | Iterable[IntoExpr],
10027
**named_exprs: IntoExpr,
10028
) -> DataFrame:
10029
"""
10030
Add columns to this DataFrame.
10031
10032
Added columns will replace existing columns with the same name.
10033
10034
Parameters
10035
----------
10036
*exprs
10037
Column(s) to add, specified as positional arguments.
10038
Accepts expression input. Strings are parsed as column names, other
10039
non-expression inputs are parsed as literals.
10040
**named_exprs
10041
Additional columns to add, specified as keyword arguments.
10042
The columns will be renamed to the keyword used.
10043
10044
Returns
10045
-------
10046
DataFrame
10047
A new DataFrame with the columns added.
10048
10049
Notes
10050
-----
10051
Creating a new DataFrame using this method does not create a new copy of
10052
existing data.
10053
10054
Examples
10055
--------
10056
Pass an expression to add it as a new column.
10057
10058
>>> df = pl.DataFrame(
10059
... {
10060
... "a": [1, 2, 3, 4],
10061
... "b": [0.5, 4, 10, 13],
10062
... "c": [True, True, False, True],
10063
... }
10064
... )
10065
>>> df.with_columns((pl.col("a") ** 2).alias("a^2"))
10066
shape: (4, 4)
10067
┌─────┬──────┬───────┬─────┐
10068
│ a ┆ b ┆ c ┆ a^2 │
10069
│ --- ┆ --- ┆ --- ┆ --- │
10070
│ i64 ┆ f64 ┆ bool ┆ i64 │
10071
╞═════╪══════╪═══════╪═════╡
10072
│ 1 ┆ 0.5 ┆ true ┆ 1 │
10073
│ 2 ┆ 4.0 ┆ true ┆ 4 │
10074
│ 3 ┆ 10.0 ┆ false ┆ 9 │
10075
│ 4 ┆ 13.0 ┆ true ┆ 16 │
10076
└─────┴──────┴───────┴─────┘
10077
10078
Added columns will replace existing columns with the same name.
10079
10080
>>> df.with_columns(pl.col("a").cast(pl.Float64))
10081
shape: (4, 3)
10082
┌─────┬──────┬───────┐
10083
│ a ┆ b ┆ c │
10084
│ --- ┆ --- ┆ --- │
10085
│ f64 ┆ f64 ┆ bool │
10086
╞═════╪══════╪═══════╡
10087
│ 1.0 ┆ 0.5 ┆ true │
10088
│ 2.0 ┆ 4.0 ┆ true │
10089
│ 3.0 ┆ 10.0 ┆ false │
10090
│ 4.0 ┆ 13.0 ┆ true │
10091
└─────┴──────┴───────┘
10092
10093
Multiple columns can be added using positional arguments.
10094
10095
>>> df.with_columns(
10096
... (pl.col("a") ** 2).alias("a^2"),
10097
... (pl.col("b") / 2).alias("b/2"),
10098
... (pl.col("c").not_()).alias("not c"),
10099
... )
10100
shape: (4, 6)
10101
┌─────┬──────┬───────┬─────┬──────┬───────┐
10102
│ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │
10103
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
10104
│ i64 ┆ f64 ┆ bool ┆ i64 ┆ f64 ┆ bool │
10105
╞═════╪══════╪═══════╪═════╪══════╪═══════╡
10106
│ 1 ┆ 0.5 ┆ true ┆ 1 ┆ 0.25 ┆ false │
10107
│ 2 ┆ 4.0 ┆ true ┆ 4 ┆ 2.0 ┆ false │
10108
│ 3 ┆ 10.0 ┆ false ┆ 9 ┆ 5.0 ┆ true │
10109
│ 4 ┆ 13.0 ┆ true ┆ 16 ┆ 6.5 ┆ false │
10110
└─────┴──────┴───────┴─────┴──────┴───────┘
10111
10112
Multiple columns can also be added by passing a list of expressions.
10113
10114
>>> df.with_columns(
10115
... [
10116
... (pl.col("a") ** 2).alias("a^2"),
10117
... (pl.col("b") / 2).alias("b/2"),
10118
... (pl.col("c").not_()).alias("not c"),
10119
... ]
10120
... )
10121
shape: (4, 6)
10122
┌─────┬──────┬───────┬─────┬──────┬───────┐
10123
│ a ┆ b ┆ c ┆ a^2 ┆ b/2 ┆ not c │
10124
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
10125
│ i64 ┆ f64 ┆ bool ┆ i64 ┆ f64 ┆ bool │
10126
╞═════╪══════╪═══════╪═════╪══════╪═══════╡
10127
│ 1 ┆ 0.5 ┆ true ┆ 1 ┆ 0.25 ┆ false │
10128
│ 2 ┆ 4.0 ┆ true ┆ 4 ┆ 2.0 ┆ false │
10129
│ 3 ┆ 10.0 ┆ false ┆ 9 ┆ 5.0 ┆ true │
10130
│ 4 ┆ 13.0 ┆ true ┆ 16 ┆ 6.5 ┆ false │
10131
└─────┴──────┴───────┴─────┴──────┴───────┘
10132
10133
Use keyword arguments to easily name your expression inputs.
10134
10135
>>> df.with_columns(
10136
... ab=pl.col("a") * pl.col("b"),
10137
... not_c=pl.col("c").not_(),
10138
... )
10139
shape: (4, 5)
10140
┌─────┬──────┬───────┬──────┬───────┐
10141
│ a ┆ b ┆ c ┆ ab ┆ not_c │
10142
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
10143
│ i64 ┆ f64 ┆ bool ┆ f64 ┆ bool │
10144
╞═════╪══════╪═══════╪══════╪═══════╡
10145
│ 1 ┆ 0.5 ┆ true ┆ 0.5 ┆ false │
10146
│ 2 ┆ 4.0 ┆ true ┆ 8.0 ┆ false │
10147
│ 3 ┆ 10.0 ┆ false ┆ 30.0 ┆ true │
10148
│ 4 ┆ 13.0 ┆ true ┆ 52.0 ┆ false │
10149
└─────┴──────┴───────┴──────┴───────┘
10150
"""
10151
from polars.lazyframe.opt_flags import QueryOptFlags
10152
10153
return (
10154
self.lazy()
10155
.with_columns(*exprs, **named_exprs)
10156
.collect(optimizations=QueryOptFlags._eager())
10157
)
10158
10159
def with_columns_seq(
10160
self,
10161
*exprs: IntoExpr | Iterable[IntoExpr],
10162
**named_exprs: IntoExpr,
10163
) -> DataFrame:
10164
"""
10165
Add columns to this DataFrame.
10166
10167
Added columns will replace existing columns with the same name.
10168
10169
This will run all expressions sequentially instead of in parallel.
10170
Use this when the work per expression is cheap.
10171
10172
Parameters
10173
----------
10174
*exprs
10175
Column(s) to add, specified as positional arguments.
10176
Accepts expression input. Strings are parsed as column names, other
10177
non-expression inputs are parsed as literals.
10178
**named_exprs
10179
Additional columns to add, specified as keyword arguments.
10180
The columns will be renamed to the keyword used.
10181
10182
Returns
10183
-------
10184
DataFrame
10185
A new DataFrame with the columns added.
10186
10187
See Also
10188
--------
10189
with_columns
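
Examples
--------
A small illustrative example; apart from evaluating the expressions
sequentially, the behaviour matches :meth:`with_columns`.

>>> df = pl.DataFrame({"a": [1, 2], "b": [3.0, 4.0]})
>>> df.with_columns_seq((pl.col("a") * 2).alias("a2"), b_sq=pl.col("b") ** 2)
shape: (2, 4)
┌─────┬─────┬─────┬──────┐
│ a   ┆ b   ┆ a2  ┆ b_sq │
│ --- ┆ --- ┆ --- ┆ ---  │
│ i64 ┆ f64 ┆ i64 ┆ f64  │
╞═════╪═════╪═════╪══════╡
│ 1   ┆ 3.0 ┆ 2   ┆ 9.0  │
│ 2   ┆ 4.0 ┆ 4   ┆ 16.0 │
└─────┴─────┴─────┴──────┘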
10190
"""
10191
from polars.lazyframe.opt_flags import QueryOptFlags
10192
10193
return (
10194
self.lazy()
10195
.with_columns_seq(*exprs, **named_exprs)
10196
.collect(optimizations=QueryOptFlags._eager())
10197
)
10198
10199
@overload
10200
def n_chunks(self, strategy: Literal["first"] = ...) -> int: ...
10201
10202
@overload
10203
def n_chunks(self, strategy: Literal["all"]) -> list[int]: ...
10204
10205
def n_chunks(self, strategy: Literal["first", "all"] = "first") -> int | list[int]:
10206
"""
10207
Get number of chunks used by the ChunkedArrays of this DataFrame.
10208
10209
Parameters
10210
----------
10211
strategy : {'first', 'all'}
10212
Return the number of chunks of the 'first' column,
10213
or 'all' columns in this DataFrame.
10214
10215
10216
Examples
10217
--------
10218
>>> df = pl.DataFrame(
10219
... {
10220
... "a": [1, 2, 3, 4],
10221
... "b": [0.5, 4, 10, 13],
10222
... "c": [True, True, False, True],
10223
... }
10224
... )
10225
>>> df.n_chunks()
10226
1
10227
>>> df.n_chunks(strategy="all")
10228
[1, 1, 1]
10229
"""
10230
if strategy == "first":
10231
return self._df.n_chunks()
10232
elif strategy == "all":
10233
return [s.n_chunks() for s in self.__iter__()]
10234
else:
10235
msg = (
10236
f"unexpected input for `strategy`: {strategy!r}"
10237
f"\n\nChoose one of {{'first', 'all'}}"
10238
)
10239
raise ValueError(msg)
10240
10241
def max(self) -> DataFrame:
10242
"""
10243
Aggregate the columns of this DataFrame to their maximum value.
10244
10245
Examples
10246
--------
10247
>>> df = pl.DataFrame(
10248
... {
10249
... "foo": [1, 2, 3],
10250
... "bar": [6, 7, 8],
10251
... "ham": ["a", "b", "c"],
10252
... }
10253
... )
10254
>>> df.max()
10255
shape: (1, 3)
10256
┌─────┬─────┬─────┐
10257
│ foo ┆ bar ┆ ham │
10258
│ --- ┆ --- ┆ --- │
10259
│ i64 ┆ i64 ┆ str │
10260
╞═════╪═════╪═════╡
10261
│ 3 ┆ 8 ┆ c │
10262
└─────┴─────┴─────┘
10263
"""
10264
from polars.lazyframe.opt_flags import QueryOptFlags
10265
10266
return self.lazy().max().collect(optimizations=QueryOptFlags._eager())
10267
10268
def max_horizontal(self) -> Series:
10269
"""
10270
Get the maximum value horizontally across columns.
10271
10272
Returns
10273
-------
10274
Series
10275
A Series named `"max"`.
10276
10277
Examples
10278
--------
10279
>>> df = pl.DataFrame(
10280
... {
10281
... "foo": [1, 2, 3],
10282
... "bar": [4.0, 5.0, 6.0],
10283
... }
10284
... )
10285
>>> df.max_horizontal()
10286
shape: (3,)
10287
Series: 'max' [f64]
10288
[
10289
4.0
10290
5.0
10291
6.0
10292
]
10293
"""
10294
return self.select(max=F.max_horizontal(F.all())).to_series()
10295
10296
def min(self) -> DataFrame:
10297
"""
10298
Aggregate the columns of this DataFrame to their minimum value.
10299
10300
Examples
10301
--------
10302
>>> df = pl.DataFrame(
10303
... {
10304
... "foo": [1, 2, 3],
10305
... "bar": [6, 7, 8],
10306
... "ham": ["a", "b", "c"],
10307
... }
10308
... )
10309
>>> df.min()
10310
shape: (1, 3)
10311
┌─────┬─────┬─────┐
10312
│ foo ┆ bar ┆ ham │
10313
│ --- ┆ --- ┆ --- │
10314
│ i64 ┆ i64 ┆ str │
10315
╞═════╪═════╪═════╡
10316
│ 1 ┆ 6 ┆ a │
10317
└─────┴─────┴─────┘
10318
"""
10319
from polars.lazyframe.opt_flags import QueryOptFlags
10320
10321
return self.lazy().min().collect(optimizations=QueryOptFlags._eager())
10322
10323
def min_horizontal(self) -> Series:
10324
"""
10325
Get the minimum value horizontally across columns.
10326
10327
Returns
10328
-------
10329
Series
10330
A Series named `"min"`.
10331
10332
Examples
10333
--------
10334
>>> df = pl.DataFrame(
10335
... {
10336
... "foo": [1, 2, 3],
10337
... "bar": [4.0, 5.0, 6.0],
10338
... }
10339
... )
10340
>>> df.min_horizontal()
10341
shape: (3,)
10342
Series: 'min' [f64]
10343
[
10344
1.0
10345
2.0
10346
3.0
10347
]
10348
"""
10349
return self.select(min=F.min_horizontal(F.all())).to_series()
10350
10351
def sum(self) -> DataFrame:
10352
"""
10353
Aggregate the columns of this DataFrame to their sum value.
10354
10355
Examples
10356
--------
10357
>>> df = pl.DataFrame(
10358
... {
10359
... "foo": [1, 2, 3],
10360
... "bar": [6, 7, 8],
10361
... "ham": ["a", "b", "c"],
10362
... }
10363
... )
10364
>>> df.sum()
10365
shape: (1, 3)
10366
┌─────┬─────┬──────┐
10367
│ foo ┆ bar ┆ ham │
10368
│ --- ┆ --- ┆ --- │
10369
│ i64 ┆ i64 ┆ str │
10370
╞═════╪═════╪══════╡
10371
│ 6 ┆ 21 ┆ null │
10372
└─────┴─────┴──────┘
10373
"""
10374
from polars.lazyframe.opt_flags import QueryOptFlags
10375
10376
return self.lazy().sum().collect(optimizations=QueryOptFlags._eager())
10377
10378
def sum_horizontal(self, *, ignore_nulls: bool = True) -> Series:
10379
"""
10380
Sum all values horizontally across columns.
10381
10382
Parameters
10383
----------
10384
ignore_nulls
10385
Ignore null values (default).
10386
If set to `False`, any null value in the input will lead to a null output.
10387
10388
Returns
10389
-------
10390
Series
10391
A Series named `"sum"`.
10392
10393
Examples
10394
--------
10395
>>> df = pl.DataFrame(
10396
... {
10397
... "foo": [1, 2, 3],
10398
... "bar": [4.0, 5.0, 6.0],
10399
... }
10400
... )
10401
>>> df.sum_horizontal()
10402
shape: (3,)
10403
Series: 'sum' [f64]
10404
[
10405
5.0
10406
7.0
10407
9.0
10408
]
10409
"""
10410
return self.select(
10411
sum=F.sum_horizontal(F.all(), ignore_nulls=ignore_nulls)
10412
).to_series()
10413
10414
def mean(self) -> DataFrame:
10415
"""
10416
Aggregate the columns of this DataFrame to their mean value.
10417
10418
Examples
10419
--------
10420
>>> df = pl.DataFrame(
10421
... {
10422
... "foo": [1, 2, 3],
10423
... "bar": [6, 7, 8],
10424
... "ham": ["a", "b", "c"],
10425
... "spam": [True, False, None],
10426
... }
10427
... )
10428
>>> df.mean()
10429
shape: (1, 4)
10430
┌─────┬─────┬──────┬──────┐
10431
│ foo ┆ bar ┆ ham ┆ spam │
10432
│ --- ┆ --- ┆ --- ┆ --- │
10433
│ f64 ┆ f64 ┆ str ┆ f64 │
10434
╞═════╪═════╪══════╪══════╡
10435
│ 2.0 ┆ 7.0 ┆ null ┆ 0.5 │
10436
└─────┴─────┴──────┴──────┘
10437
"""
10438
from polars.lazyframe.opt_flags import QueryOptFlags
10439
10440
return self.lazy().mean().collect(optimizations=QueryOptFlags._eager())
10441
10442
def mean_horizontal(self, *, ignore_nulls: bool = True) -> Series:
10443
"""
10444
Take the mean of all values horizontally across columns.
10445
10446
Parameters
10447
----------
10448
ignore_nulls
10449
Ignore null values (default).
10450
If set to `False`, any null value in the input will lead to a null output.
10451
10452
Returns
10453
-------
10454
Series
10455
A Series named `"mean"`.
10456
10457
Examples
10458
--------
10459
>>> df = pl.DataFrame(
10460
... {
10461
... "foo": [1, 2, 3],
10462
... "bar": [4.0, 5.0, 6.0],
10463
... }
10464
... )
10465
>>> df.mean_horizontal()
10466
shape: (3,)
10467
Series: 'mean' [f64]
10468
[
10469
2.5
10470
3.5
10471
4.5
10472
]
10473
"""
10474
return self.select(
10475
mean=F.mean_horizontal(F.all(), ignore_nulls=ignore_nulls)
10476
).to_series()
10477
10478
def std(self, ddof: int = 1) -> DataFrame:
10479
"""
10480
Aggregate the columns of this DataFrame to their standard deviation value.
10481
10482
Parameters
10483
----------
10484
ddof
10485
“Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof,
10486
where N represents the number of elements.
10487
By default ddof is 1.
10488
10489
Examples
10490
--------
10491
>>> df = pl.DataFrame(
10492
... {
10493
... "foo": [1, 2, 3],
10494
... "bar": [6, 7, 8],
10495
... "ham": ["a", "b", "c"],
10496
... }
10497
... )
10498
>>> df.std()
10499
shape: (1, 3)
10500
┌─────┬─────┬──────┐
10501
│ foo ┆ bar ┆ ham │
10502
│ --- ┆ --- ┆ --- │
10503
│ f64 ┆ f64 ┆ str │
10504
╞═════╪═════╪══════╡
10505
│ 1.0 ┆ 1.0 ┆ null │
10506
└─────┴─────┴──────┘
10507
>>> df.std(ddof=0)
10508
shape: (1, 3)
10509
┌──────────┬──────────┬──────┐
10510
│ foo ┆ bar ┆ ham │
10511
│ --- ┆ --- ┆ --- │
10512
│ f64 ┆ f64 ┆ str │
10513
╞══════════╪══════════╪══════╡
10514
│ 0.816497 ┆ 0.816497 ┆ null │
10515
└──────────┴──────────┴──────┘
10516
"""
10517
from polars.lazyframe.opt_flags import QueryOptFlags
10518
10519
return self.lazy().std(ddof).collect(optimizations=QueryOptFlags._eager())
10520
10521
def var(self, ddof: int = 1) -> DataFrame:
10522
"""
10523
Aggregate the columns of this DataFrame to their variance value.
10524
10525
Parameters
10526
----------
10527
ddof
10528
“Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof,
10529
where N represents the number of elements.
10530
By default ddof is 1.
10531
10532
Examples
10533
--------
10534
>>> df = pl.DataFrame(
10535
... {
10536
... "foo": [1, 2, 3],
10537
... "bar": [6, 7, 8],
10538
... "ham": ["a", "b", "c"],
10539
... }
10540
... )
10541
>>> df.var()
10542
shape: (1, 3)
10543
┌─────┬─────┬──────┐
10544
│ foo ┆ bar ┆ ham │
10545
│ --- ┆ --- ┆ --- │
10546
│ f64 ┆ f64 ┆ str │
10547
╞═════╪═════╪══════╡
10548
│ 1.0 ┆ 1.0 ┆ null │
10549
└─────┴─────┴──────┘
10550
>>> df.var(ddof=0)
10551
shape: (1, 3)
10552
┌──────────┬──────────┬──────┐
10553
│ foo ┆ bar ┆ ham │
10554
│ --- ┆ --- ┆ --- │
10555
│ f64 ┆ f64 ┆ str │
10556
╞══════════╪══════════╪══════╡
10557
│ 0.666667 ┆ 0.666667 ┆ null │
10558
└──────────┴──────────┴──────┘
10559
"""
10560
from polars.lazyframe.opt_flags import QueryOptFlags
10561
10562
return self.lazy().var(ddof).collect(optimizations=QueryOptFlags._eager())
10563
10564
def median(self) -> DataFrame:
10565
"""
10566
Aggregate the columns of this DataFrame to their median value.
10567
10568
Examples
10569
--------
10570
>>> df = pl.DataFrame(
10571
... {
10572
... "foo": [1, 2, 3],
10573
... "bar": [6, 7, 8],
10574
... "ham": ["a", "b", "c"],
10575
... }
10576
... )
10577
>>> df.median()
10578
shape: (1, 3)
10579
┌─────┬─────┬──────┐
10580
│ foo ┆ bar ┆ ham │
10581
│ --- ┆ --- ┆ --- │
10582
│ f64 ┆ f64 ┆ str │
10583
╞═════╪═════╪══════╡
10584
│ 2.0 ┆ 7.0 ┆ null │
10585
└─────┴─────┴──────┘
10586
"""
10587
from polars.lazyframe.opt_flags import QueryOptFlags
10588
10589
return self.lazy().median().collect(optimizations=QueryOptFlags._eager())
10590
10591
def product(self) -> DataFrame:
10592
"""
10593
Aggregate the columns of this DataFrame to their product values.
10594
10595
Examples
10596
--------
10597
>>> df = pl.DataFrame(
10598
... {
10599
... "a": [1, 2, 3],
10600
... "b": [0.5, 4, 10],
10601
... "c": [True, True, False],
10602
... }
10603
... )
10604
10605
>>> df.product()
10606
shape: (1, 3)
10607
┌─────┬──────┬─────┐
10608
│ a ┆ b ┆ c │
10609
│ --- ┆ --- ┆ --- │
10610
│ i64 ┆ f64 ┆ i64 │
10611
╞═════╪══════╪═════╡
10612
│ 6 ┆ 20.0 ┆ 0 │
10613
└─────┴──────┴─────┘
10614
"""
10615
exprs = []
10616
for name, dt in self.schema.items():
10617
if dt.is_numeric() or isinstance(dt, Boolean):
10618
exprs.append(F.col(name).product())
10619
else:
10620
exprs.append(F.lit(None).alias(name))
10621
10622
return self.select(exprs)
10623
10624
def quantile(
10625
self, quantile: float, interpolation: QuantileMethod = "nearest"
10626
) -> DataFrame:
10627
"""
10628
Aggregate the columns of this DataFrame to their quantile value.
10629
10630
Parameters
10631
----------
10632
quantile
10633
Quantile between 0.0 and 1.0.
10634
interpolation : {'nearest', 'higher', 'lower', 'midpoint', 'linear', 'equiprobable'}
10635
Interpolation method.
10636
10637
Examples
10638
--------
10639
>>> df = pl.DataFrame(
10640
... {
10641
... "foo": [1, 2, 3],
10642
... "bar": [6, 7, 8],
10643
... "ham": ["a", "b", "c"],
10644
... }
10645
... )
10646
>>> df.quantile(0.5, "nearest")
10647
shape: (1, 3)
10648
┌─────┬─────┬──────┐
10649
│ foo ┆ bar ┆ ham │
10650
│ --- ┆ --- ┆ --- │
10651
│ f64 ┆ f64 ┆ str │
10652
╞═════╪═════╪══════╡
10653
│ 2.0 ┆ 7.0 ┆ null │
10654
└─────┴─────┴──────┘
10655
""" # noqa: W505
10656
from polars.lazyframe.opt_flags import QueryOptFlags
10657
10658
return (
10659
self.lazy()
10660
.quantile(quantile, interpolation)
10661
.collect(optimizations=QueryOptFlags._eager())
10662
)
10663
10664
def to_dummies(
10665
self,
10666
columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = None,
10667
*,
10668
separator: str = "_",
10669
drop_first: bool = False,
10670
drop_nulls: bool = False,
10671
) -> DataFrame:
10672
"""
10673
Convert categorical variables into dummy/indicator variables.
10674
10675
Parameters
10676
----------
10677
columns
10678
Column name(s) or selector(s) that should be converted to dummy
10679
variables. If set to `None` (default), convert all columns.
10680
separator
10681
Separator/delimiter used when generating column names.
10682
drop_first
10683
Remove the first category from the variables being encoded.
10684
drop_nulls
10685
If there are `None` values in the series, a `null` column is not generated
10686
10687
Examples
10688
--------
10689
>>> df = pl.DataFrame(
10690
... {
10691
... "foo": [1, 2],
10692
... "bar": [3, 4],
10693
... "ham": ["a", "b"],
10694
... }
10695
... )
10696
>>> df.to_dummies()
10697
shape: (2, 6)
10698
┌───────┬───────┬───────┬───────┬───────┬───────┐
10699
│ foo_1 ┆ foo_2 ┆ bar_3 ┆ bar_4 ┆ ham_a ┆ ham_b │
10700
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
10701
│ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 │
10702
╞═══════╪═══════╪═══════╪═══════╪═══════╪═══════╡
10703
│ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 │
10704
│ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 0 ┆ 1 │
10705
└───────┴───────┴───────┴───────┴───────┴───────┘
10706
10707
>>> df.to_dummies(drop_first=True)
10708
shape: (2, 3)
10709
┌───────┬───────┬───────┐
10710
│ foo_2 ┆ bar_4 ┆ ham_b │
10711
│ --- ┆ --- ┆ --- │
10712
│ u8 ┆ u8 ┆ u8 │
10713
╞═══════╪═══════╪═══════╡
10714
│ 0 ┆ 0 ┆ 0 │
10715
│ 1 ┆ 1 ┆ 1 │
10716
└───────┴───────┴───────┘
10717
10718
>>> import polars.selectors as cs
10719
>>> df.to_dummies(cs.integer(), separator=":")
10720
shape: (2, 5)
10721
┌───────┬───────┬───────┬───────┬─────┐
10722
│ foo:1 ┆ foo:2 ┆ bar:3 ┆ bar:4 ┆ ham │
10723
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
10724
│ u8 ┆ u8 ┆ u8 ┆ u8 ┆ str │
10725
╞═══════╪═══════╪═══════╪═══════╪═════╡
10726
│ 1 ┆ 0 ┆ 1 ┆ 0 ┆ a │
10727
│ 0 ┆ 1 ┆ 0 ┆ 1 ┆ b │
10728
└───────┴───────┴───────┴───────┴─────┘
10729
10730
>>> df.to_dummies(cs.integer(), drop_first=True, separator=":")
10731
shape: (2, 3)
10732
┌───────┬───────┬─────┐
10733
│ foo:2 ┆ bar:4 ┆ ham │
10734
│ --- ┆ --- ┆ --- │
10735
│ u8 ┆ u8 ┆ str │
10736
╞═══════╪═══════╪═════╡
10737
│ 0 ┆ 0 ┆ a │
10738
│ 1 ┆ 1 ┆ b │
10739
└───────┴───────┴─────┘
10740
"""
10741
if columns is not None:
10742
columns = _expand_selectors(self, columns)
10743
return self._from_pydf(
10744
self._df.to_dummies(columns, separator, drop_first, drop_nulls)
10745
)
10746
10747
def unique(
10748
self,
10749
subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = None,
10750
*,
10751
keep: UniqueKeepStrategy = "any",
10752
maintain_order: bool = False,
10753
) -> DataFrame:
10754
"""
10755
Drop duplicate rows from this DataFrame.
10756
10757
Parameters
10758
----------
10759
subset
10760
Column name(s) or selector(s), to consider when identifying
10761
duplicate rows. If set to `None` (default), use all columns.
10762
keep : {'first', 'last', 'any', 'none'}
10763
Which of the duplicate rows to keep.
10764
10765
* 'any': Does not give any guarantee of which row is kept.
10766
This allows more optimizations.
10767
* 'none': Don't keep duplicate rows.
10768
* 'first': Keep first unique row.
10769
* 'last': Keep last unique row.
10770
maintain_order
10771
Keep the same order as the original DataFrame. This is more expensive to
10772
compute.
10773
Setting this to `True` prevents the query from running on the
streaming engine.
10775
10776
Returns
10777
-------
10778
DataFrame
10779
DataFrame with unique rows.
10780
10781
Warnings
10782
--------
10783
This method will fail if there is a column of type `List` in the DataFrame or
10784
subset.
10785
10786
Notes
10787
-----
10788
If you're coming from pandas, this is similar to
10789
`pandas.DataFrame.drop_duplicates`.
10790
10791
Examples
10792
--------
10793
>>> df = pl.DataFrame(
10794
... {
10795
... "foo": [1, 2, 3, 1],
10796
... "bar": ["a", "a", "a", "a"],
10797
... "ham": ["b", "b", "b", "b"],
10798
... }
10799
... )
10800
>>> df.unique(maintain_order=True)
10801
shape: (3, 3)
10802
┌─────┬─────┬─────┐
10803
│ foo ┆ bar ┆ ham │
10804
│ --- ┆ --- ┆ --- │
10805
│ i64 ┆ str ┆ str │
10806
╞═════╪═════╪═════╡
10807
│ 1 ┆ a ┆ b │
10808
│ 2 ┆ a ┆ b │
10809
│ 3 ┆ a ┆ b │
10810
└─────┴─────┴─────┘
10811
>>> df.unique(subset=["bar", "ham"], maintain_order=True)
10812
shape: (1, 3)
10813
┌─────┬─────┬─────┐
10814
│ foo ┆ bar ┆ ham │
10815
│ --- ┆ --- ┆ --- │
10816
│ i64 ┆ str ┆ str │
10817
╞═════╪═════╪═════╡
10818
│ 1 ┆ a ┆ b │
10819
└─────┴─────┴─────┘
10820
>>> df.unique(keep="last", maintain_order=True)
10821
shape: (3, 3)
10822
┌─────┬─────┬─────┐
10823
│ foo ┆ bar ┆ ham │
10824
│ --- ┆ --- ┆ --- │
10825
│ i64 ┆ str ┆ str │
10826
╞═════╪═════╪═════╡
10827
│ 2 ┆ a ┆ b │
10828
│ 3 ┆ a ┆ b │
10829
│ 1 ┆ a ┆ b │
10830
└─────┴─────┴─────┘
10831
"""
10832
from polars.lazyframe.opt_flags import QueryOptFlags
10833
10834
return (
10835
self.lazy()
10836
.unique(subset=subset, keep=keep, maintain_order=maintain_order)
10837
.collect(optimizations=QueryOptFlags._eager())
10838
)
10839
10840
def n_unique(self, subset: str | Expr | Sequence[str | Expr] | None = None) -> int:
10841
"""
10842
Return the number of unique rows, or the number of unique row-subsets.
10843
10844
Parameters
10845
----------
10846
subset
10847
One or more columns/expressions that define what to count;
10848
omit to return the count of unique rows.
10849
10850
Notes
10851
-----
10852
This method operates at the `DataFrame` level; to operate on subsets at the
10853
expression level you can make use of struct-packing instead, for example:
10854
10855
>>> expr_unique_subset = pl.struct("a", "b").n_unique()
10856
10857
If instead you want to count the number of unique values per-column, you can
10858
also use expression-level syntax to return a new frame containing that result:
10859
10860
>>> df = pl.DataFrame(
10861
... [[1, 2, 3], [1, 2, 4]], schema=["a", "b", "c"], orient="row"
10862
... )
10863
>>> df_nunique = df.select(pl.all().n_unique())
10864
10865
In aggregate context there is also an equivalent method for returning the
10866
unique values per-group:
10867
10868
>>> df_agg_nunique = df.group_by("a").n_unique()
10869
10870
Examples
10871
--------
10872
>>> df = pl.DataFrame(
10873
... {
10874
... "a": [1, 1, 2, 3, 4, 5],
10875
... "b": [0.5, 0.5, 1.0, 2.0, 3.0, 3.0],
10876
... "c": [True, True, True, False, True, True],
10877
... }
10878
... )
10879
>>> df.n_unique()
10880
5
10881
10882
Simple columns subset.
10883
10884
>>> df.n_unique(subset=["b", "c"])
10885
4
10886
10887
Expression subset.
10888
10889
>>> df.n_unique(
10890
... subset=[
10891
... (pl.col("a") // 2),
10892
... (pl.col("c") | (pl.col("b") >= 2)),
10893
... ],
10894
... )
10895
3
10896
"""
10897
if isinstance(subset, str):
10898
expr = F.col(subset)
10899
elif isinstance(subset, pl.Expr):
10900
expr = subset
10901
elif isinstance(subset, Sequence) and len(subset) == 1:
10902
expr = wrap_expr(parse_into_expression(subset[0]))
10903
else:
10904
struct_fields = F.all() if (subset is None) else subset
10905
expr = F.struct(struct_fields)
10906
10907
from polars.lazyframe.opt_flags import QueryOptFlags
10908
10909
df = (
10910
self.lazy()
10911
.select(expr.n_unique())
10912
.collect(optimizations=QueryOptFlags._eager())
10913
)
10914
return 0 if df.is_empty() else df.row(0)[0]
10915
10916
@deprecated(
10917
"`DataFrame.approx_n_unique` is deprecated; "
10918
"use `select(pl.all().approx_n_unique())` instead."
10919
)
10920
def approx_n_unique(self) -> DataFrame:
10921
"""
10922
Approximate count of unique values.
10923
10924
.. deprecated:: 0.20.11
10925
Use the `select(pl.all().approx_n_unique())` method instead.
10926
10927
This is done using the HyperLogLog++ algorithm for cardinality estimation.
10928
10929
Examples
10930
--------
10931
>>> df = pl.DataFrame(
10932
... {
10933
... "a": [1, 2, 3, 4],
10934
... "b": [1, 2, 1, 1],
10935
... }
10936
... )
10937
>>> df.approx_n_unique() # doctest: +SKIP
10938
shape: (1, 2)
10939
┌─────┬─────┐
10940
│ a ┆ b │
10941
│ --- ┆ --- │
10942
│ u32 ┆ u32 │
10943
╞═════╪═════╡
10944
│ 4 ┆ 2 │
10945
└─────┴─────┘
10946
"""
10947
from polars.lazyframe.opt_flags import QueryOptFlags
10948
10949
return (
10950
self.lazy().approx_n_unique().collect(optimizations=QueryOptFlags._eager())
10951
)
10952
10953
def rechunk(self) -> DataFrame:
10954
"""
10955
Rechunk the data in this DataFrame to a contiguous allocation.
10956
10957
This will make sure all subsequent operations have optimal and predictable
10958
performance.
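
Examples
--------
A small sketch of how rechunking affects the chunk count; it assumes that
concatenating with `rechunk=False` leaves one chunk per input frame.

>>> df1 = pl.DataFrame({"a": [1, 2]})
>>> df2 = pl.DataFrame({"a": [3, 4]})
>>> df = pl.concat([df1, df2], rechunk=False)
>>> df.n_chunks()
2
>>> df.rechunk().n_chunks()
1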
10959
"""
10960
return self._from_pydf(self._df.rechunk())
10961
10962
def null_count(self) -> DataFrame:
10963
"""
10964
Create a new DataFrame that shows the null counts per column.
10965
10966
Examples
10967
--------
10968
>>> df = pl.DataFrame(
10969
... {
10970
... "foo": [1, None, 3],
10971
... "bar": [6, 7, None],
10972
... "ham": ["a", "b", "c"],
10973
... }
10974
... )
10975
>>> df.null_count()
10976
shape: (1, 3)
10977
┌─────┬─────┬─────┐
10978
│ foo ┆ bar ┆ ham │
10979
│ --- ┆ --- ┆ --- │
10980
│ u32 ┆ u32 ┆ u32 │
10981
╞═════╪═════╪═════╡
10982
│ 1 ┆ 1 ┆ 0 │
10983
└─────┴─────┴─────┘
10984
"""
10985
return self._from_pydf(self._df.null_count())
10986
10987
def sample(
10988
self,
10989
n: int | Series | None = None,
10990
*,
10991
fraction: float | Series | None = None,
10992
with_replacement: bool = False,
10993
shuffle: bool = False,
10994
seed: int | None = None,
10995
) -> DataFrame:
10996
"""
10997
Sample from this DataFrame.
10998
10999
Parameters
11000
----------
11001
n
11002
Number of items to return. Cannot be used with `fraction`. Defaults to 1 if
11003
`fraction` is None.
11004
fraction
11005
Fraction of items to return. Cannot be used with `n`.
11006
with_replacement
11007
Allow values to be sampled more than once.
11008
shuffle
11009
If set to True, the order of the sampled rows will be shuffled. If
11010
set to False (default), the order of the returned rows will be
11011
neither stable nor fully random.
11012
seed
11013
Seed for the random number generator. If set to None (default), a
11014
random seed is generated for each sample operation.
11015
11016
Examples
11017
--------
11018
>>> df = pl.DataFrame(
11019
... {
11020
... "foo": [1, 2, 3],
11021
... "bar": [6, 7, 8],
11022
... "ham": ["a", "b", "c"],
11023
... }
11024
... )
11025
>>> df.sample(n=2, seed=0) # doctest: +IGNORE_RESULT
11026
shape: (2, 3)
11027
┌─────┬─────┬─────┐
11028
│ foo ┆ bar ┆ ham │
11029
│ --- ┆ --- ┆ --- │
11030
│ i64 ┆ i64 ┆ str │
11031
╞═════╪═════╪═════╡
11032
│ 3 ┆ 8 ┆ c │
11033
│ 2 ┆ 7 ┆ b │
11034
└─────┴─────┴─────┘
11035
"""
11036
if n is not None and fraction is not None:
11037
msg = "cannot specify both `n` and `fraction`"
11038
raise ValueError(msg)
11039
11040
if seed is None:
11041
seed = random.randint(0, 10000)
11042
11043
if n is None and fraction is not None:
11044
if not isinstance(fraction, pl.Series):
11045
fraction = pl.Series("frac", [fraction])
11046
11047
return self._from_pydf(
11048
self._df.sample_frac(fraction._s, with_replacement, shuffle, seed)
11049
)
11050
11051
if n is None:
11052
n = 1
11053
11054
if not isinstance(n, pl.Series):
11055
n = pl.Series("", [n])
11056
11057
return self._from_pydf(self._df.sample_n(n._s, with_replacement, shuffle, seed))
11058
11059
def fold(self, operation: Callable[[Series, Series], Series]) -> Series:
11060
"""
11061
Apply a horizontal reduction on a DataFrame.
11062
11063
This can be used to compute row-level aggregations, and can be applied to
any DataType that can be supercast (cast to a similar parent type).
11066
11067
Examples of the supercast rules when applying an arithmetic operation on
two DataTypes:
11069
11070
- Int8 + String = String
11071
- Float32 + Int64 = Float32
11072
- Float32 + Float64 = Float64
11073
11074
Examples
11075
--------
11076
A horizontal sum operation:
11077
11078
>>> df = pl.DataFrame(
11079
... {
11080
... "a": [2, 1, 3],
11081
... "b": [1, 2, 3],
11082
... "c": [1.0, 2.0, 3.0],
11083
... }
11084
... )
11085
>>> df.fold(lambda s1, s2: s1 + s2)
11086
shape: (3,)
11087
Series: 'a' [f64]
11088
[
11089
4.0
11090
5.0
11091
9.0
11092
]
11093
11094
A horizontal minimum operation:
11095
11096
>>> df = pl.DataFrame({"a": [2, 1, 3], "b": [1, 2, 3], "c": [1.0, 2.0, 3.0]})
11097
>>> df.fold(lambda s1, s2: s1.zip_with(s1 < s2, s2))
11098
shape: (3,)
11099
Series: 'a' [f64]
11100
[
11101
1.0
11102
1.0
11103
3.0
11104
]
11105
11106
A horizontal string concatenation:
11107
11108
>>> df = pl.DataFrame(
11109
... {
11110
... "a": ["foo", "bar", None],
11111
... "b": [1, 2, 3],
11112
... "c": [1.0, 2.0, 3.0],
11113
... }
11114
... )
11115
>>> df.fold(lambda s1, s2: s1 + s2)
11116
shape: (3,)
11117
Series: 'a' [str]
11118
[
11119
"foo11.0"
11120
"bar22.0"
11121
null
11122
]
11123
11124
A horizontal boolean or, similar to a row-wise .any():
11125
11126
>>> df = pl.DataFrame(
11127
... {
11128
... "a": [False, False, True],
11129
... "b": [False, True, False],
11130
... }
11131
... )
11132
>>> df.fold(lambda s1, s2: s1 | s2)
11133
shape: (3,)
11134
Series: 'a' [bool]
11135
[
11136
false
11137
true
11138
true
11139
]
11140
11141
Parameters
11142
----------
11143
operation
11144
Function that takes two `Series` and returns a `Series`.
11145
"""
11146
acc = self.to_series(0)
11147
11148
for i in range(1, self.width):
11149
acc = operation(acc, self.to_series(i))
11150
return acc
11151
11152
@overload
11153
def row(
11154
self,
11155
index: int | None = ...,
11156
*,
11157
by_predicate: Expr | None = ...,
11158
named: Literal[False] = ...,
11159
) -> tuple[Any, ...]: ...
11160
11161
@overload
11162
def row(
11163
self,
11164
index: int | None = ...,
11165
*,
11166
by_predicate: Expr | None = ...,
11167
named: Literal[True],
11168
) -> dict[str, Any]: ...
11169
11170
def row(
11171
self,
11172
index: int | None = None,
11173
*,
11174
by_predicate: Expr | None = None,
11175
named: bool = False,
11176
) -> tuple[Any, ...] | dict[str, Any]:
11177
"""
11178
Get the values of a single row, either by index or by predicate.
11179
11180
Parameters
11181
----------
11182
index
11183
Row index.
11184
by_predicate
11185
Select the row according to a given expression/predicate.
11186
named
11187
Return a dictionary instead of a tuple. The dictionary is a mapping of
11188
column name to row value. This is more expensive than returning a regular
11189
tuple, but allows for accessing values by column name.
11190
11191
Returns
11192
-------
11193
tuple (default) or dictionary of row values
11194
11195
Notes
11196
-----
11197
The `index` and `by_predicate` params are mutually exclusive. Additionally,
11198
to ensure clarity, the `by_predicate` parameter must be supplied by keyword.
11199
11200
When using `by_predicate` it is an error condition if anything other than
11201
one row is returned; more than one row raises `TooManyRowsReturnedError`, and
11202
zero rows will raise `NoRowsReturnedError` (both inherit from `RowsError`).
11203
11204
Warnings
11205
--------
11206
You should NEVER use this method to iterate over a DataFrame; if you require
11207
row-iteration you should strongly prefer use of `iter_rows()` instead.
11208
11209
See Also
11210
--------
11211
iter_rows : Row iterator over frame data (does not materialise all rows).
11212
rows : Materialise all frame data as a list of rows (potentially expensive).
11213
item: Return dataframe element as a scalar.
11214
11215
Examples
11216
--------
11217
Specify an index to return the row at the given index as a tuple.
11218
11219
>>> df = pl.DataFrame(
11220
... {
11221
... "foo": [1, 2, 3],
11222
... "bar": [6, 7, 8],
11223
... "ham": ["a", "b", "c"],
11224
... }
11225
... )
11226
>>> df.row(2)
11227
(3, 8, 'c')
11228
11229
Specify `named=True` to get a dictionary instead with a mapping of column
11230
names to row values.
11231
11232
>>> df.row(2, named=True)
11233
{'foo': 3, 'bar': 8, 'ham': 'c'}
11234
11235
Use `by_predicate` to return the row that matches the given predicate.
11236
11237
>>> df.row(by_predicate=(pl.col("ham") == "b"))
11238
(2, 7, 'b')
11239
"""
11240
if index is not None and by_predicate is not None:
11241
msg = "cannot set both 'index' and 'by_predicate'; mutually exclusive"
11242
raise ValueError(msg)
11243
elif isinstance(index, pl.Expr):
11244
msg = "expressions should be passed to the `by_predicate` parameter"
11245
raise TypeError(msg)
11246
11247
if index is not None:
11248
row = self._df.row_tuple(index)
11249
if named:
11250
return dict(zip(self.columns, row))
11251
else:
11252
return row
11253
11254
elif by_predicate is not None:
11255
if not isinstance(by_predicate, pl.Expr):
11256
msg = f"expected `by_predicate` to be an expression, got {qualified_type_name(by_predicate)!r}"
11257
raise TypeError(msg)
11258
rows = self.filter(by_predicate).rows()
11259
n_rows = len(rows)
11260
if n_rows > 1:
11261
msg = f"predicate <{by_predicate!s}> returned {n_rows} rows"
11262
raise TooManyRowsReturnedError(msg)
11263
elif n_rows == 0:
11264
msg = f"predicate <{by_predicate!s}> returned no rows"
11265
raise NoRowsReturnedError(msg)
11266
11267
row = rows[0]
11268
if named:
11269
return dict(zip(self.columns, row))
11270
else:
11271
return row
11272
else:
11273
msg = "one of `index` or `by_predicate` must be set"
11274
raise ValueError(msg)
11275
11276
@overload
11277
def rows(self, *, named: Literal[False] = ...) -> list[tuple[Any, ...]]: ...
11278
11279
@overload
11280
def rows(self, *, named: Literal[True]) -> list[dict[str, Any]]: ...
11281
11282
def rows(
11283
self, *, named: bool = False
11284
) -> list[tuple[Any, ...]] | list[dict[str, Any]]:
11285
"""
11286
Returns all data in the DataFrame as a list of rows of python-native values.
11287
11288
By default, each row is returned as a tuple of values given in the same order
11289
as the frame columns. Setting `named=True` will return rows of dictionaries
11290
instead.
11291
11292
Parameters
11293
----------
11294
named
11295
Return dictionaries instead of tuples. The dictionaries are a mapping of
11296
column name to row value. This is more expensive than returning a regular
11297
tuple, but allows for accessing values by column name.
11298
11299
Notes
11300
-----
11301
If you have `ns`-precision temporal values you should be aware that Python
11302
natively only supports up to `μs`-precision; `ns`-precision values will be
11303
truncated to microseconds on conversion to Python. If this matters to your
11304
use-case you should export to a different format (such as Arrow or NumPy).
11305
11306
Warnings
11307
--------
11308
Row-iteration is not optimal as the underlying data is stored in columnar form;
11309
where possible, prefer export via one of the dedicated export/output methods.
11310
You should also consider using `iter_rows` instead, to avoid materialising all
11311
the data at once; there is little performance difference between the two, but
11312
peak memory can be reduced if processing rows in batches.
11313
11314
Returns
11315
-------
11316
list of row value tuples (default), or list of dictionaries (if `named=True`).
11317
11318
See Also
11319
--------
11320
iter_rows : Row iterator over frame data (does not materialise all rows).
11321
rows_by_key : Materialises frame data as a key-indexed dictionary.
11322
11323
Examples
11324
--------
11325
>>> df = pl.DataFrame(
11326
... {
11327
... "x": ["a", "b", "b", "a"],
11328
... "y": [1, 2, 3, 4],
11329
... "z": [0, 3, 6, 9],
11330
... }
11331
... )
11332
>>> df.rows()
11333
[('a', 1, 0), ('b', 2, 3), ('b', 3, 6), ('a', 4, 9)]
11334
>>> df.rows(named=True)
11335
[{'x': 'a', 'y': 1, 'z': 0},
11336
{'x': 'b', 'y': 2, 'z': 3},
11337
{'x': 'b', 'y': 3, 'z': 6},
11338
{'x': 'a', 'y': 4, 'z': 9}]
11339
"""
11340
if named:
11341
# Load these into the local namespace for a minor performance boost
11342
dict_, zip_, columns = dict, zip, self.columns
11343
return [dict_(zip_(columns, row)) for row in self._df.row_tuples()]
11344
else:
11345
return self._df.row_tuples()
11346
11347
@overload
11348
def rows_by_key(
11349
self,
11350
key: ColumnNameOrSelector | Sequence[ColumnNameOrSelector],
11351
*,
11352
named: Literal[False] = ...,
11353
include_key: bool = ...,
11354
unique: Literal[False] = ...,
11355
) -> dict[Any, list[Any]]: ...
11356
11357
@overload
11358
def rows_by_key(
11359
self,
11360
key: ColumnNameOrSelector | Sequence[ColumnNameOrSelector],
11361
*,
11362
named: Literal[False] = ...,
11363
include_key: bool = ...,
11364
unique: Literal[True],
11365
) -> dict[Any, Any]: ...
11366
11367
@overload
11368
def rows_by_key(
11369
self,
11370
key: ColumnNameOrSelector | Sequence[ColumnNameOrSelector],
11371
*,
11372
named: Literal[True],
11373
include_key: bool = ...,
11374
unique: Literal[False] = ...,
11375
) -> dict[Any, list[dict[str, Any]]]: ...
11376
11377
@overload
11378
def rows_by_key(
11379
self,
11380
key: ColumnNameOrSelector | Sequence[ColumnNameOrSelector],
11381
*,
11382
named: Literal[True],
11383
include_key: bool = ...,
11384
unique: Literal[True],
11385
) -> dict[Any, dict[str, Any]]: ...
11386
11387
def rows_by_key(
11388
self,
11389
key: ColumnNameOrSelector | Sequence[ColumnNameOrSelector],
11390
*,
11391
named: bool = False,
11392
include_key: bool = False,
11393
unique: bool = False,
11394
) -> dict[Any, Any]:
11395
"""
11396
Returns all data as a dictionary of python-native values keyed by some column.
11397
11398
This method is like `rows`, but instead of returning rows in a flat list, rows
11399
are grouped by the values in the `key` column(s) and returned as a dictionary.
11400
11401
Note that this method should not be used in place of native operations, due to
11402
the high cost of materializing all frame data out into a dictionary; it should
11403
be used only when you need to move the values out into a Python data structure
11404
or other object that cannot operate directly with Polars/Arrow.
11405
11406
Parameters
11407
----------
11408
key
11409
The column(s) to use as the key for the returned dictionary. If multiple
11410
columns are specified, the key will be a tuple of those values, otherwise
11411
the key will be the value of that single column.
11412
named
11413
Return dictionary rows instead of tuples, mapping column name to row value.
11414
include_key
11415
Include key values inline with the associated data (by default the key
11416
values are omitted as a memory/performance optimisation, as they can be
11417
reconstructed from the key).
11418
unique
11419
Indicate that the key is unique; this will result in a 1:1 mapping from
11420
key to a single associated row. Note that if the key is *not* actually
11421
unique the last row with the given key will be returned.
11422
11423
Notes
11424
-----
11425
If you have `ns`-precision temporal values you should be aware that Python
11426
natively only supports up to `μs`-precision; `ns`-precision values will be
11427
truncated to microseconds on conversion to Python. If this matters to your
11428
use-case you should export to a different format (such as Arrow or NumPy).
11429
11430
See Also
11431
--------
11432
rows : Materialize all frame data as a list of rows (potentially expensive).
11433
iter_rows : Row iterator over frame data (does not materialize all rows).
11434
to_dict : Convert DataFrame to a dictionary mapping column name to values.
11435
11436
Examples
11437
--------
11438
>>> df = pl.DataFrame(
11439
... {
11440
... "w": ["a", "b", "b", "a"],
11441
... "x": ["q", "q", "q", "k"],
11442
... "y": [1.0, 2.5, 3.0, 4.5],
11443
... "z": [9, 8, 7, 6],
11444
... }
11445
... )
11446
11447
Group rows by the given key column(s):
11448
11449
>>> df.rows_by_key(key=["w"])
11450
defaultdict(<class 'list'>,
11451
{'a': [('q', 1.0, 9), ('k', 4.5, 6)],
11452
'b': [('q', 2.5, 8), ('q', 3.0, 7)]})
11453
11454
Return the same row groupings as dictionaries:
11455
11456
>>> df.rows_by_key(key=["w"], named=True)
11457
defaultdict(<class 'list'>,
11458
{'a': [{'x': 'q', 'y': 1.0, 'z': 9},
11459
{'x': 'k', 'y': 4.5, 'z': 6}],
11460
'b': [{'x': 'q', 'y': 2.5, 'z': 8},
11461
{'x': 'q', 'y': 3.0, 'z': 7}]})
11462
11463
Return row groupings, assuming keys are unique:
11464
11465
>>> df.rows_by_key(key=["z"], unique=True)
11466
{9: ('a', 'q', 1.0),
11467
8: ('b', 'q', 2.5),
11468
7: ('b', 'q', 3.0),
11469
6: ('a', 'k', 4.5)}
11470
11471
Return row groupings as dictionaries, assuming keys are unique:
11472
11473
>>> df.rows_by_key(key=["z"], named=True, unique=True)
11474
{9: {'w': 'a', 'x': 'q', 'y': 1.0},
11475
8: {'w': 'b', 'x': 'q', 'y': 2.5},
11476
7: {'w': 'b', 'x': 'q', 'y': 3.0},
11477
6: {'w': 'a', 'x': 'k', 'y': 4.5}}
11478
11479
Return dictionary rows grouped by a compound key, including key values:
11480
11481
>>> df.rows_by_key(key=["w", "x"], named=True, include_key=True)
11482
defaultdict(<class 'list'>,
11483
{('a', 'q'): [{'w': 'a', 'x': 'q', 'y': 1.0, 'z': 9}],
11484
('b', 'q'): [{'w': 'b', 'x': 'q', 'y': 2.5, 'z': 8},
11485
{'w': 'b', 'x': 'q', 'y': 3.0, 'z': 7}],
11486
('a', 'k'): [{'w': 'a', 'x': 'k', 'y': 4.5, 'z': 6}]})
11487
"""
11488
key = _expand_selectors(self, key)
11489
11490
keys = (
11491
iter(self.get_column(key[0]))
11492
if len(key) == 1
11493
else self.select(key).iter_rows()
11494
)
11495
11496
if include_key:
11497
values = self
11498
else:
11499
data_cols = [k for k in self.schema if k not in key]
11500
values = self.select(data_cols)
11501
11502
zipped = zip(keys, values.iter_rows(named=named)) # type: ignore[call-overload]
11503
11504
# if unique, we expect to write just one entry per key; otherwise, we're
11505
# returning a list of rows for each key, so append into a defaultdict.
11506
if unique:
11507
rows = dict(zipped)
11508
else:
11509
rows = defaultdict(list)
11510
for key, data in zipped:
11511
rows[key].append(data)
11512
11513
return rows
11514
11515
@overload
11516
def iter_rows(
11517
self, *, named: Literal[False] = ..., buffer_size: int = ...
11518
) -> Iterator[tuple[Any, ...]]: ...
11519
11520
@overload
11521
def iter_rows(
11522
self, *, named: Literal[True], buffer_size: int = ...
11523
) -> Iterator[dict[str, Any]]: ...
11524
11525
def iter_rows(
11526
self, *, named: bool = False, buffer_size: int = 512
11527
) -> Iterator[tuple[Any, ...]] | Iterator[dict[str, Any]]:
11528
"""
11529
Returns an iterator over the DataFrame of rows of python-native values.
11530
11531
Parameters
11532
----------
11533
named
11534
Return dictionaries instead of tuples. The dictionaries are a mapping of
11535
column name to row value. This is more expensive than returning a regular
11536
tuple, but allows for accessing values by column name.
11537
buffer_size
11538
Determines the number of rows that are buffered internally while iterating
11539
over the data; you should only modify this in very specific cases where the
11540
default value is determined not to be a good fit to your access pattern, as
11541
the speedup from using the buffer is significant (~2-4x). Setting this
11542
value to zero disables row buffering (not recommended).
11543
11544
Notes
11545
-----
11546
If you have `ns`-precision temporal values you should be aware that Python
11547
natively only supports up to `μs`-precision; `ns`-precision values will be
11548
truncated to microseconds on conversion to Python. If this matters to your
11549
use-case you should export to a different format (such as Arrow or NumPy).
11550
11551
Warnings
11552
--------
11553
Row iteration is not optimal as the underlying data is stored in columnar form;
11554
where possible, prefer export via one of the dedicated export/output methods
11555
that deals with columnar data.
11556
11557
Returns
11558
-------
11559
iterator of tuples (default) or dictionaries (if named) of python row values
11560
11561
See Also
11562
--------
11563
rows : Materialises all frame data as a list of rows (potentially expensive).
11564
rows_by_key : Materialises frame data as a key-indexed dictionary.
11565
11566
Examples
11567
--------
11568
>>> df = pl.DataFrame(
11569
... {
11570
... "a": [1, 3, 5],
11571
... "b": [2, 4, 6],
11572
... }
11573
... )
11574
>>> [row[0] for row in df.iter_rows()]
11575
[1, 3, 5]
11576
>>> [row["b"] for row in df.iter_rows(named=True)]
11577
[2, 4, 6]
11578
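Buffering can be tuned, or disabled entirely, when the default is not a good
fit for the access pattern (shown here purely for illustration):

>>> next(df.iter_rows(buffer_size=0))
(1, 2)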
"""
11579
# load into the local namespace for a (minor) performance boost in the hot loops
11580
columns, get_row, dict_, zip_ = self.columns, self.row, dict, zip
11581
has_object = Object in self.dtypes
11582
11583
# note: buffering rows results in a 2-4x speedup over individual calls
11584
# to ".row(i)", so it should only be disabled in extremely specific cases.
11585
if buffer_size and not has_object:
11586
for offset in range(0, self.height, buffer_size):
11587
zerocopy_slice = self.slice(offset, buffer_size)
11588
if named:
11589
for row in zerocopy_slice.rows(named=False):
11590
yield dict_(zip_(columns, row))
11591
else:
11592
yield from zerocopy_slice.rows(named=False)
11593
elif named:
11594
for i in range(self.height):
11595
yield dict_(zip_(columns, get_row(i)))
11596
else:
11597
for i in range(self.height):
11598
yield get_row(i)
11599
11600
def iter_columns(self) -> Iterator[Series]:
11601
"""
11602
Returns an iterator over the columns of this DataFrame.
11603
11604
Yields
11605
------
11606
Series
11607
11608
Notes
11609
-----
11610
Consider whether you can use :func:`all` instead.
11611
If you can, it will be more efficient.
11612
11613
Examples
11614
--------
11615
>>> df = pl.DataFrame(
11616
... {
11617
... "a": [1, 3, 5],
11618
... "b": [2, 4, 6],
11619
... }
11620
... )
11621
>>> [s.name for s in df.iter_columns()]
11622
['a', 'b']
11623
11624
If you're using this to modify a dataframe's columns, e.g.
11625
11626
>>> # Do NOT do this
11627
>>> pl.DataFrame(column * 2 for column in df.iter_columns())
11628
shape: (3, 2)
11629
┌─────┬─────┐
11630
│ a ┆ b │
11631
│ --- ┆ --- │
11632
│ i64 ┆ i64 │
11633
╞═════╪═════╡
11634
│ 2 ┆ 4 │
11635
│ 6 ┆ 8 │
11636
│ 10 ┆ 12 │
11637
└─────┴─────┘
11638
11639
then consider whether you can use :func:`all` instead:
11640
11641
>>> df.select(pl.all() * 2)
11642
shape: (3, 2)
11643
┌─────┬─────┐
11644
│ a ┆ b │
11645
│ --- ┆ --- │
11646
│ i64 ┆ i64 │
11647
╞═════╪═════╡
11648
│ 2 ┆ 4 │
11649
│ 6 ┆ 8 │
11650
│ 10 ┆ 12 │
11651
└─────┴─────┘
11652
"""
11653
for s in self._df.get_columns():
11654
yield wrap_s(s)
11655
11656
def iter_slices(self, n_rows: int = 10_000) -> Iterator[DataFrame]:
11657
r"""
11658
Returns a non-copying iterator of slices over the underlying DataFrame.
11659
11660
Parameters
11661
----------
11662
n_rows
11663
Determines the number of rows contained in each DataFrame slice.
11664
11665
Examples
11666
--------
11667
>>> from datetime import date
11668
>>> df = pl.DataFrame(
11669
... data={
11670
... "a": range(17_500),
11671
... "b": date(2023, 1, 1),
11672
... "c": "klmnoopqrstuvwxyz",
11673
... },
11674
... schema_overrides={"a": pl.Int32},
11675
... )
11676
>>> for idx, frame in enumerate(df.iter_slices()):
11677
... print(f"{type(frame).__name__}:[{idx}]:{len(frame)}")
11678
DataFrame:[0]:10000
11679
DataFrame:[1]:7500
11680
11681
Using `iter_slices` is an efficient way to chunk-iterate over DataFrames and
11682
any supported frame export/conversion types; for example, as RecordBatches:
11683
11684
>>> for frame in df.iter_slices(n_rows=15_000):
11685
... record_batch = frame.to_arrow().to_batches()[0]
11686
... print(f"{record_batch.schema}\n<< {len(record_batch)}")
11687
a: int32
11688
b: date32[day]
11689
c: large_string
11690
<< 15000
11691
a: int32
11692
b: date32[day]
11693
c: large_string
11694
<< 2500
11695
11696
See Also
11697
--------
11698
iter_rows : Row iterator over frame data (does not materialise all rows).
11699
partition_by : Split into multiple DataFrames, partitioned by groups.
11700
"""
11701
for offset in range(0, self.height, n_rows):
11702
yield self.slice(offset, n_rows)
11703
11704
def shrink_to_fit(self, *, in_place: bool = False) -> DataFrame:
11705
"""
11706
Shrink DataFrame memory usage.
11707
11708
Shrinks to fit the exact capacity needed to hold the data.
11709
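Examples
--------
A minimal usage sketch; the actual memory saved depends on the data and is
not shown here:

>>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": ["a", "b", "c"]})
>>> df = df.shrink_to_fit()
>>> df.shrink_to_fit(in_place=True)  # doctest: +IGNORE_RESULT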
"""
11710
if in_place:
11711
self._df.shrink_to_fit()
11712
return self
11713
else:
11714
df = self.clone()
11715
df._df.shrink_to_fit()
11716
return df
11717
11718
def gather_every(self, n: int, offset: int = 0) -> DataFrame:
11719
"""
11720
Take every nth row in the DataFrame and return as a new DataFrame.
11721
11722
Parameters
11723
----------
11724
n
11725
Gather every *n*-th row.
11726
offset
11727
Starting index.
11728
11729
Examples
11730
--------
11731
>>> s = pl.DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]})
11732
>>> s.gather_every(2)
11733
shape: (2, 2)
11734
┌─────┬─────┐
11735
│ a ┆ b │
11736
│ --- ┆ --- │
11737
│ i64 ┆ i64 │
11738
╞═════╪═════╡
11739
│ 1 ┆ 5 │
11740
│ 3 ┆ 7 │
11741
└─────┴─────┘
11742
11743
>>> s.gather_every(2, offset=1)
11744
shape: (2, 2)
11745
┌─────┬─────┐
11746
│ a ┆ b │
11747
│ --- ┆ --- │
11748
│ i64 ┆ i64 │
11749
╞═════╪═════╡
11750
│ 2 ┆ 6 │
11751
│ 4 ┆ 8 │
11752
└─────┴─────┘
11753
"""
11754
return self.select(F.col("*").gather_every(n, offset))
11755
11756
def hash_rows(
11757
self,
11758
seed: int = 0,
11759
seed_1: int | None = None,
11760
seed_2: int | None = None,
11761
seed_3: int | None = None,
11762
) -> Series:
11763
"""
11764
Hash and combine the rows in this DataFrame.
11765
11766
The hash value is of type `UInt64`.
11767
11768
Parameters
11769
----------
11770
seed
11771
Random seed parameter. Defaults to 0.
11772
seed_1
11773
Random seed parameter. Defaults to `seed` if not set.
11774
seed_2
11775
Random seed parameter. Defaults to `seed` if not set.
11776
seed_3
11777
Random seed parameter. Defaults to `seed` if not set.
11778
11779
Notes
11780
-----
11781
This implementation of `hash_rows` does not guarantee stable results
11782
across different Polars versions. Its stability is only guaranteed within a
11783
single version.
11784
11785
Examples
11786
--------
11787
>>> df = pl.DataFrame(
11788
... {
11789
... "foo": [1, None, 3, 4],
11790
... "ham": ["a", "b", None, "d"],
11791
... }
11792
... )
11793
>>> df.hash_rows(seed=42) # doctest: +IGNORE_RESULT
11794
shape: (4,)
11795
Series: '' [u64]
11796
[
11797
10783150408545073287
11798
1438741209321515184
11799
10047419486152048166
11800
2047317070637311557
11801
]
11802
"""
11803
k0 = seed
11804
k1 = seed_1 if seed_1 is not None else seed
11805
k2 = seed_2 if seed_2 is not None else seed
11806
k3 = seed_3 if seed_3 is not None else seed
11807
return wrap_s(self._df.hash_rows(k0, k1, k2, k3))
11808
11809
def interpolate(self) -> DataFrame:
11810
"""
11811
Interpolate intermediate values. The interpolation method is linear.
11812
11813
Nulls at the beginning and end of the series remain null.
11814
11815
Examples
11816
--------
11817
>>> df = pl.DataFrame(
11818
... {
11819
... "foo": [1, None, 9, 10],
11820
... "bar": [6, 7, 9, None],
11821
... "baz": [1, None, None, 9],
11822
... }
11823
... )
11824
>>> df.interpolate()
11825
shape: (4, 3)
11826
┌──────┬──────┬──────────┐
11827
│ foo ┆ bar ┆ baz │
11828
│ --- ┆ --- ┆ --- │
11829
│ f64 ┆ f64 ┆ f64 │
11830
╞══════╪══════╪══════════╡
11831
│ 1.0 ┆ 6.0 ┆ 1.0 │
11832
│ 5.0 ┆ 7.0 ┆ 3.666667 │
11833
│ 9.0 ┆ 9.0 ┆ 6.333333 │
11834
│ 10.0 ┆ null ┆ 9.0 │
11835
└──────┴──────┴──────────┘
11836
"""
11837
return self.select(F.col("*").interpolate())
11838
11839
def is_empty(self) -> bool:
11840
"""
11841
Returns `True` if the DataFrame contains no rows.
11842
11843
Examples
11844
--------
11845
>>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]})
11846
>>> df.is_empty()
11847
False
11848
>>> df.filter(pl.col("foo") > 99).is_empty()
11849
True
11850
"""
11851
return self._df.is_empty()
11852
11853
def to_struct(self, name: str = "") -> Series:
11854
"""
11855
Convert a `DataFrame` to a `Series` of type `Struct`.
11856
11857
Parameters
11858
----------
11859
name
11860
Name for the struct Series
11861
11862
Examples
11863
--------
11864
>>> df = pl.DataFrame(
11865
... {
11866
... "a": [1, 2, 3, 4, 5],
11867
... "b": ["one", "two", "three", "four", "five"],
11868
... }
11869
... )
11870
>>> df.to_struct("nums")
11871
shape: (5,)
11872
Series: 'nums' [struct[2]]
11873
[
11874
{1,"one"}
11875
{2,"two"}
11876
{3,"three"}
11877
{4,"four"}
11878
{5,"five"}
11879
]
11880
"""
11881
return wrap_s(self._df.to_struct(name, []))
11882
11883
def unnest(
11884
self,
11885
columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector],
11886
*more_columns: ColumnNameOrSelector,
11887
) -> DataFrame:
11888
"""
11889
Decompose struct columns into separate columns for each of their fields.
11890
11891
The new columns will be inserted into the dataframe at the location of the
11892
struct column.
11893
11894
Parameters
11895
----------
11896
columns
11897
Name of the struct column(s) that should be unnested.
11898
*more_columns
11899
Additional columns to unnest, specified as positional arguments.
11900
11901
Examples
11902
--------
11903
>>> df = pl.DataFrame(
11904
... {
11905
... "before": ["foo", "bar"],
11906
... "t_a": [1, 2],
11907
... "t_b": ["a", "b"],
11908
... "t_c": [True, None],
11909
... "t_d": [[1, 2], [3]],
11910
... "after": ["baz", "womp"],
11911
... }
11912
... ).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after")
11913
>>> df
11914
shape: (2, 3)
11915
┌────────┬─────────────────────┬───────┐
11916
│ before ┆ t_struct ┆ after │
11917
│ --- ┆ --- ┆ --- │
11918
│ str ┆ struct[4] ┆ str │
11919
╞════════╪═════════════════════╪═══════╡
11920
│ foo ┆ {1,"a",true,[1, 2]} ┆ baz │
11921
│ bar ┆ {2,"b",null,[3]} ┆ womp │
11922
└────────┴─────────────────────┴───────┘
11923
>>> df.unnest("t_struct")
11924
shape: (2, 6)
11925
┌────────┬─────┬─────┬──────┬───────────┬───────┐
11926
│ before ┆ t_a ┆ t_b ┆ t_c ┆ t_d ┆ after │
11927
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
11928
│ str ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str │
11929
╞════════╪═════╪═════╪══════╪═══════════╪═══════╡
11930
│ foo ┆ 1 ┆ a ┆ true ┆ [1, 2] ┆ baz │
11931
│ bar ┆ 2 ┆ b ┆ null ┆ [3] ┆ womp │
11932
└────────┴─────┴─────┴──────┴───────────┴───────┘
11933
"""
11934
from polars.lazyframe.opt_flags import QueryOptFlags
11935
11936
return (
11937
self.lazy()
11938
.unnest(columns, *more_columns)
11939
.collect(optimizations=QueryOptFlags._eager())
11940
)
11941
11942
def corr(self, **kwargs: Any) -> DataFrame:
11943
"""
11944
Return pairwise Pearson product-moment correlation coefficients between columns.
11945
11946
See numpy `corrcoef` for more information:
11947
https://numpy.org/doc/stable/reference/generated/numpy.corrcoef.html
11948
11949
Notes
11950
-----
11951
This functionality requires numpy to be installed.
11952
11953
Parameters
11954
----------
11955
**kwargs
11956
Keyword arguments are passed to numpy `corrcoef`.
11957
11958
Examples
11959
--------
11960
>>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [3, 2, 1], "ham": [7, 8, 9]})
11961
>>> df.corr()
11962
shape: (3, 3)
11963
┌──────┬──────┬──────┐
11964
│ foo ┆ bar ┆ ham │
11965
│ --- ┆ --- ┆ --- │
11966
│ f64 ┆ f64 ┆ f64 │
11967
╞══════╪══════╪══════╡
11968
│ 1.0 ┆ -1.0 ┆ 1.0 │
11969
│ -1.0 ┆ 1.0 ┆ -1.0 │
11970
│ 1.0 ┆ -1.0 ┆ 1.0 │
11971
└──────┴──────┴──────┘
11972
"""
11973
correlation_matrix = np.corrcoef(self.to_numpy(), rowvar=False, **kwargs)
11974
if self.width == 1:
11975
correlation_matrix = np.array([correlation_matrix])
11976
return DataFrame(correlation_matrix, schema=self.columns)
11977
11978
def merge_sorted(self, other: DataFrame, key: str) -> DataFrame:
11979
"""
11980
Take two sorted DataFrames and merge them by the sorted key.
11981
11982
The output of this operation will also be sorted.
11983
It is the caller's responsibility to ensure that the frames
11984
are sorted in ascending order by that key; otherwise
11985
the output will not make sense.
11986
11987
The schemas of both DataFrames must be equal.
11988
11989
Parameters
11990
----------
11991
other
11992
Other DataFrame that must be merged
11993
key
11994
Key that is sorted.
11995
11996
Examples
11997
--------
11998
>>> df0 = pl.DataFrame(
11999
... {"name": ["steve", "elise", "bob"], "age": [42, 44, 18]}
12000
... ).sort("age")
12001
>>> df0
12002
shape: (3, 2)
12003
┌───────┬─────┐
12004
│ name ┆ age │
12005
│ --- ┆ --- │
12006
│ str ┆ i64 │
12007
╞═══════╪═════╡
12008
│ bob ┆ 18 │
12009
│ steve ┆ 42 │
12010
│ elise ┆ 44 │
12011
└───────┴─────┘
12012
>>> df1 = pl.DataFrame(
12013
... {"name": ["anna", "megan", "steve", "thomas"], "age": [21, 33, 42, 20]}
12014
... ).sort("age")
12015
>>> df1
12016
shape: (4, 2)
12017
┌────────┬─────┐
12018
│ name ┆ age │
12019
│ --- ┆ --- │
12020
│ str ┆ i64 │
12021
╞════════╪═════╡
12022
│ thomas ┆ 20 │
12023
│ anna ┆ 21 │
12024
│ megan ┆ 33 │
12025
│ steve ┆ 42 │
12026
└────────┴─────┘
12027
>>> df0.merge_sorted(df1, key="age")
12028
shape: (7, 2)
12029
┌────────┬─────┐
12030
│ name ┆ age │
12031
│ --- ┆ --- │
12032
│ str ┆ i64 │
12033
╞════════╪═════╡
12034
│ bob ┆ 18 │
12035
│ thomas ┆ 20 │
12036
│ anna ┆ 21 │
12037
│ megan ┆ 33 │
12038
│ steve ┆ 42 │
12039
│ steve ┆ 42 │
12040
│ elise ┆ 44 │
12041
└────────┴─────┘
12042
12043
Notes
12044
-----
12045
No guarantee is given over the output row order when the key is equal
12046
between both frames.
12047
12048
The key must be sorted in ascending order.
12049
"""
12050
from polars.lazyframe.opt_flags import QueryOptFlags
12051
12052
require_same_type(self, other)
12053
12054
return (
12055
self.lazy()
12056
.merge_sorted(other.lazy(), key)
12057
.collect(optimizations=QueryOptFlags._eager())
12058
)
12059
12060
def set_sorted(
12061
self,
12062
column: str,
12063
*,
12064
descending: bool = False,
12065
) -> DataFrame:
12066
"""
12067
Flag a column as sorted.
12068
12069
This can speed up future operations.
12070
12071
Parameters
12072
----------
12073
column
12074
Column that is sorted
12075
descending
12076
Whether the column is sorted in descending order.
12077
12078
Warnings
12079
--------
12080
This can lead to incorrect results if the data is NOT sorted!!
12081
Use with care!
12082
12083
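Examples
--------
A minimal sketch; `Series.flags` is shown only to illustrate that the
sortedness flag has been set:

>>> df = pl.DataFrame({"a": [1, 2, 3]}).set_sorted("a")
>>> df["a"].flags  # doctest: +SKIP
{'SORTED_ASC': True, 'SORTED_DESC': False}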
"""
12084
# NOTE: Only accepts 1 column on purpose! Users might otherwise think the frame is sorted by
12085
# the combined multicolumn values.
12086
from polars.lazyframe.opt_flags import QueryOptFlags
12087
12088
return (
12089
self.lazy()
12090
.set_sorted(column, descending=descending)
12091
.collect(optimizations=QueryOptFlags._eager())
12092
)
12093
12094
@unstable()
12095
def update(
12096
self,
12097
other: DataFrame,
12098
on: str | Sequence[str] | None = None,
12099
how: Literal["left", "inner", "full"] = "left",
12100
*,
12101
left_on: str | Sequence[str] | None = None,
12102
right_on: str | Sequence[str] | None = None,
12103
include_nulls: bool = False,
12104
maintain_order: MaintainOrderJoin | None = "left",
12105
) -> DataFrame:
12106
"""
12107
Update the values in this `DataFrame` with the values in `other`.
12108
12109
.. warning::
12110
This functionality is considered **unstable**. It may be changed
12111
at any point without it being considered a breaking change.
12112
12113
Parameters
12114
----------
12115
other
12116
DataFrame that will be used to update the values
12117
on
12118
Column names that will be joined on. If set to `None` (default),
12119
the implicit row index of each frame is used as a join key.
12120
how : {'left', 'inner', 'full'}
12121
* 'left' will keep all rows from the left table; rows may be duplicated
12122
if multiple rows in the right frame match the left row's key.
12123
* 'inner' keeps only those rows where the key exists in both frames.
12124
* 'full' will update existing rows where the key matches while also
12125
adding any new rows contained in the given frame.
12126
left_on
12127
Join column(s) of the left DataFrame.
12128
right_on
12129
Join column(s) of the right DataFrame.
12130
include_nulls
12131
Overwrite values in the left frame with null values from the right frame.
12132
If set to `False` (default), null values in the right frame are ignored.
12133
maintain_order : {'none', 'left', 'right', 'left_right', 'right_left'}
12134
Which order of rows from the inputs to preserve. See :func:`~DataFrame.join`
12135
for details. Unlike `join` this function preserves the left order by
12136
default.
12137
12138
Notes
12139
-----
12140
This is syntactic sugar for a left/inner join that preserves the order
12141
of the left `DataFrame` by default, with an optional coalesce when
12142
`include_nulls = False`.
12143
12144
Examples
12145
--------
12146
>>> df = pl.DataFrame(
12147
... {
12148
... "A": [1, 2, 3, 4],
12149
... "B": [400, 500, 600, 700],
12150
... }
12151
... )
12152
>>> df
12153
shape: (4, 2)
12154
┌─────┬─────┐
12155
│ A ┆ B │
12156
│ --- ┆ --- │
12157
│ i64 ┆ i64 │
12158
╞═════╪═════╡
12159
│ 1 ┆ 400 │
12160
│ 2 ┆ 500 │
12161
│ 3 ┆ 600 │
12162
│ 4 ┆ 700 │
12163
└─────┴─────┘
12164
>>> new_df = pl.DataFrame(
12165
... {
12166
... "B": [-66, None, -99],
12167
... "C": [5, 3, 1],
12168
... }
12169
... )
12170
12171
Update `df` values with the non-null values in `new_df`, by row index:
12172
12173
>>> df.update(new_df)
12174
shape: (4, 2)
12175
┌─────┬─────┐
12176
│ A ┆ B │
12177
│ --- ┆ --- │
12178
│ i64 ┆ i64 │
12179
╞═════╪═════╡
12180
│ 1 ┆ -66 │
12181
│ 2 ┆ 500 │
12182
│ 3 ┆ -99 │
12183
│ 4 ┆ 700 │
12184
└─────┴─────┘
12185
12186
Update `df` values with the non-null values in `new_df`, by row index,
12187
but only keeping those rows that are common to both frames:
12188
12189
>>> df.update(new_df, how="inner")
12190
shape: (3, 2)
12191
┌─────┬─────┐
12192
│ A ┆ B │
12193
│ --- ┆ --- │
12194
│ i64 ┆ i64 │
12195
╞═════╪═════╡
12196
│ 1 ┆ -66 │
12197
│ 2 ┆ 500 │
12198
│ 3 ┆ -99 │
12199
└─────┴─────┘
12200
12201
Update `df` values with the non-null values in `new_df`, using a full
12202
outer join strategy that defines explicit join columns in each frame:
12203
12204
>>> df.update(new_df, left_on=["A"], right_on=["C"], how="full")
12205
shape: (5, 2)
12206
┌─────┬─────┐
12207
│ A ┆ B │
12208
│ --- ┆ --- │
12209
│ i64 ┆ i64 │
12210
╞═════╪═════╡
12211
│ 1 ┆ -99 │
12212
│ 2 ┆ 500 │
12213
│ 3 ┆ 600 │
12214
│ 4 ┆ 700 │
12215
│ 5 ┆ -66 │
12216
└─────┴─────┘
12217
12218
Update `df` values including null values in `new_df`, using a full outer
12219
join strategy that defines explicit join columns in each frame:
12220
12221
>>> df.update(new_df, left_on="A", right_on="C", how="full", include_nulls=True)
12222
shape: (5, 2)
12223
┌─────┬──────┐
12224
│ A ┆ B │
12225
│ --- ┆ --- │
12226
│ i64 ┆ i64 │
12227
╞═════╪══════╡
12228
│ 1 ┆ -99 │
12229
│ 2 ┆ 500 │
12230
│ 3 ┆ null │
12231
│ 4 ┆ 700 │
12232
│ 5 ┆ -66 │
12233
└─────┴──────┘
12234
"""
12235
from polars.lazyframe.opt_flags import QueryOptFlags
12236
12237
require_same_type(self, other)
12238
return (
12239
self.lazy()
12240
.update(
12241
other.lazy(),
12242
on,
12243
how,
12244
left_on=left_on,
12245
right_on=right_on,
12246
include_nulls=include_nulls,
12247
maintain_order=maintain_order,
12248
)
12249
.collect(optimizations=QueryOptFlags._eager())
12250
)
12251
12252
def count(self) -> DataFrame:
12253
"""
12254
Return the number of non-null elements for each column.
12255
12256
Examples
12257
--------
12258
>>> df = pl.DataFrame(
12259
... {"a": [1, 2, 3, 4], "b": [1, 2, 1, None], "c": [None, None, None, None]}
12260
... )
12261
>>> df.count()
12262
shape: (1, 3)
12263
┌─────┬─────┬─────┐
12264
│ a ┆ b ┆ c │
12265
│ --- ┆ --- ┆ --- │
12266
│ u32 ┆ u32 ┆ u32 │
12267
╞═════╪═════╪═════╡
12268
│ 4 ┆ 3 ┆ 0 │
12269
└─────┴─────┴─────┘
12270
"""
12271
from polars.lazyframe.opt_flags import QueryOptFlags
12272
12273
return self.lazy().count().collect(optimizations=QueryOptFlags._eager())
12274
12275
@deprecated(
12276
"`DataFrame.melt` is deprecated; use `DataFrame.unpivot` instead, with "
12277
"`index` instead of `id_vars` and `on` instead of `value_vars`"
12278
)
12279
def melt(
12280
self,
12281
id_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = None,
12282
value_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = None,
12283
variable_name: str | None = None,
12284
value_name: str | None = None,
12285
) -> DataFrame:
12286
"""
12287
Unpivot a DataFrame from wide to long format.
12288
12289
Optionally leaves identifiers set.
12290
12291
This function is useful to massage a DataFrame into a format where one or more
12292
columns are identifier variables (id_vars) while all other columns, considered
12293
measured variables (value_vars), are "unpivoted" to the row axis leaving just
12294
two non-identifier columns, 'variable' and 'value'.
12295
12296
.. deprecated:: 1.0.0
12297
Use the :meth:`.unpivot` method instead.
12298
12299
Parameters
12300
----------
12301
id_vars
12302
Column(s) or selector(s) to use as identifier variables.
12303
value_vars
12304
Column(s) or selector(s) to use as values variables; if `value_vars`
12305
is empty all columns that are not in `id_vars` will be used.
12306
variable_name
12307
Name to give to the `variable` column. Defaults to "variable"
12308
value_name
12309
Name to give to the `value` column. Defaults to "value"
12310
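Examples
--------
The deprecated call maps onto :meth:`.unpivot` as follows (illustrative only;
`df` is assumed to have columns "a", "b" and "c"):

>>> df.melt(id_vars="a", value_vars=["b", "c"])  # doctest: +SKIP
>>> df.unpivot(index="a", on=["b", "c"])  # doctest: +SKIP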
"""
12311
return self.unpivot(
12312
index=id_vars,
12313
on=value_vars,
12314
variable_name=variable_name,
12315
value_name=value_name,
12316
)
12317
12318
@unstable()
12319
def match_to_schema(
12320
self,
12321
schema: SchemaDict | Schema,
12322
*,
12323
missing_columns: Literal["insert", "raise"]
12324
| Mapping[str, Literal["insert", "raise"] | Expr] = "raise",
12325
missing_struct_fields: Literal["insert", "raise"]
12326
| Mapping[str, Literal["insert", "raise"]] = "raise",
12327
extra_columns: Literal["ignore", "raise"] = "raise",
12328
extra_struct_fields: Literal["ignore", "raise"]
12329
| Mapping[str, Literal["ignore", "raise"]] = "raise",
12330
integer_cast: Literal["upcast", "forbid"]
12331
| Mapping[str, Literal["upcast", "forbid"]] = "forbid",
12332
float_cast: Literal["upcast", "forbid"]
12333
| Mapping[str, Literal["upcast", "forbid"]] = "forbid",
12334
) -> DataFrame:
12335
"""
12336
Match or evolve the schema of a DataFrame into a specific schema.
12337
12338
By default, match_to_schema raises an error if the input schema does not
12339
exactly match the target schema. It also allows columns to be freely reordered,
12340
with additional coercion rules available through optional parameters.
12341
12342
.. warning::
12343
This functionality is considered **unstable**. It may be changed
12344
at any point without it being considered a breaking change.
12345
12346
Parameters
12347
----------
12348
schema
12349
Target schema to match or evolve to.
12350
missing_columns
12351
Raise or insert missing columns from the input with respect to the `schema`.
12352
12353
This can also be an expression per column with what to insert if it is
12354
missing.
12355
missing_struct_fields
12356
Raise or insert missing struct fields from the input with respect to the
12357
`schema`.
12358
extra_columns
12359
Raise or ignore extra columns from the input with respect to the `schema`.
12360
extra_struct_fields
12361
Raise or ignore extra struct fields from the input with respect to the
12362
`schema`.
12363
integer_cast
12364
Forbid or upcast integer columns from the input to the respective column
12365
in `schema`.
12366
float_cast
12367
Forbid or upcast float columns from the input to the respective column
12368
in `schema`.
12369
12370
Examples
12371
--------
12372
Ensuring the schema matches
12373
12374
>>> df = pl.DataFrame({"a": [1, 2, 3], "b": ["A", "B", "C"]})
12375
>>> df.match_to_schema({"a": pl.Int64, "b": pl.String})
12376
shape: (3, 2)
12377
┌─────┬─────┐
12378
│ a ┆ b │
12379
│ --- ┆ --- │
12380
│ i64 ┆ str │
12381
╞═════╪═════╡
12382
│ 1 ┆ A │
12383
│ 2 ┆ B │
12384
│ 3 ┆ C │
12385
└─────┴─────┘
12386
>>> df.match_to_schema({"a": pl.Int64}) # doctest: +SKIP
12387
polars.exceptions.SchemaError: extra columns in `match_to_schema`: "b"
12388
12389
Adding missing columns
12390
12391
>>> (
12392
... pl.DataFrame({"a": [1, 2, 3]}).match_to_schema(
12393
... {"a": pl.Int64, "b": pl.String},
12394
... missing_columns="insert",
12395
... )
12396
... )
12397
shape: (3, 2)
12398
┌─────┬──────┐
12399
│ a ┆ b │
12400
│ --- ┆ --- │
12401
│ i64 ┆ str │
12402
╞═════╪══════╡
12403
│ 1 ┆ null │
12404
│ 2 ┆ null │
12405
│ 3 ┆ null │
12406
└─────┴──────┘
12407
>>> (
12408
... pl.DataFrame({"a": [1, 2, 3]}).match_to_schema(
12409
... {"a": pl.Int64, "b": pl.String},
12410
... missing_columns={"b": pl.col.a.cast(pl.String)},
12411
... )
12412
... )
12413
shape: (3, 2)
12414
┌─────┬─────┐
12415
│ a ┆ b │
12416
│ --- ┆ --- │
12417
│ i64 ┆ str │
12418
╞═════╪═════╡
12419
│ 1 ┆ 1 │
12420
│ 2 ┆ 2 │
12421
│ 3 ┆ 3 │
12422
└─────┴─────┘
12423
12424
Removing extra columns
12425
12426
>>> (
12427
... pl.DataFrame({"a": [1, 2, 3], "b": ["A", "B", "C"]}).match_to_schema(
12428
... {"a": pl.Int64},
12429
... extra_columns="ignore",
12430
... )
12431
... )
12432
shape: (3, 1)
12433
┌─────┐
12434
│ a │
12435
│ --- │
12436
│ i64 │
12437
╞═════╡
12438
│ 1 │
12439
│ 2 │
12440
│ 3 │
12441
└─────┘
12442
12443
Upcasting integers and floats
12444
12445
>>> (
12446
... pl.DataFrame(
12447
... {"a": [1, 2, 3], "b": [1.0, 2.0, 3.0]},
12448
... schema={"a": pl.Int32, "b": pl.Float32},
12449
... ).match_to_schema(
12450
... {"a": pl.Int64, "b": pl.Float64},
12451
... integer_cast="upcast",
12452
... float_cast="upcast",
12453
... )
12454
... )
12455
shape: (3, 2)
12456
┌─────┬─────┐
12457
│ a ┆ b │
12458
│ --- ┆ --- │
12459
│ i64 ┆ f64 │
12460
╞═════╪═════╡
12461
│ 1 ┆ 1.0 │
12462
│ 2 ┆ 2.0 │
12463
│ 3 ┆ 3.0 │
12464
└─────┴─────┘
12465
"""
12466
from polars.lazyframe.opt_flags import QueryOptFlags
12467
12468
return (
12469
self.lazy()
12470
.match_to_schema(
12471
schema=schema,
12472
missing_columns=missing_columns,
12473
missing_struct_fields=missing_struct_fields,
12474
extra_columns=extra_columns,
12475
extra_struct_fields=extra_struct_fields,
12476
integer_cast=integer_cast,
12477
float_cast=float_cast,
12478
)
12479
.collect(optimizations=QueryOptFlags._eager())
12480
)
12481
12482
def _to_metadata(
12483
self,
12484
columns: None | str | list[str] = None,
12485
stats: None | str | list[str] = None,
12486
) -> DataFrame:
12487
"""
12488
Get all runtime metadata for each column.
12489
12490
This is unstable and is meant for debugging purposes.
12491
12492
Parameters
12493
----------
12494
columns
12495
Column(s) to show the information for
12496
stats
12497
Statistics to show
12498
"""
12499
df = self
12500
12501
if columns is not None:
12502
if isinstance(columns, str):
12503
columns = [columns]
12504
12505
df = df.select(columns)
12506
12507
md = self._from_pydf(df._df._to_metadata())
12508
12509
if stats is not None:
12510
if isinstance(stats, str):
12511
stats = [stats]
12512
12513
if "column_name" not in stats:
12514
stats = ["column_name"] + stats
12515
12516
md = md.select(stats)
12517
12518
return md
12519
12520
def _row_encode(
12521
self,
12522
*,
12523
unordered: bool = False,
12524
descending: list[bool] | None = None,
12525
nulls_last: list[bool] | None = None,
12526
) -> Series:
12527
"""
12528
Row encode the given DataFrame.
12529
12530
This is an internal function not meant for outside consumption and can
12531
be changed or removed at any point in time.
12532
12533
fields have order:
12534
- descending
12535
- nulls_last
12536
- no_order
12537
"""
12538
return self.select_seq(
12539
F._row_encode(
12540
F.all(),
12541
unordered=unordered,
12542
descending=descending,
12543
nulls_last=nulls_last,
12544
)
12545
).to_series()
12546
12547
12548
def _prepare_other_arg(other: Any, length: int | None = None) -> Series:
12549
# if not a series create singleton series such that it will broadcast
12550
value = other
12551
if not isinstance(other, pl.Series):
12552
if isinstance(other, str):
12553
pass
12554
elif isinstance(other, Sequence):
12555
msg = "operation not supported"
12556
raise TypeError(msg)
12557
other = pl.Series("", [other])
12558
12559
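# if a target length is given, extend with (length - 1) copies of the value,
# or slice to an empty Series when length == 0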
if length is not None:
12560
if length > 1:
12561
other = other.extend_constant(value=value, n=length - 1)
12562
elif length == 0:
12563
other = other.slice(0, 0)
12564
12565
return other
12566
12567