# Source: CoCalc view of frame.py
# GitHub repository: pola-rs/polars
# Path: blob/main/py-polars/polars/dataframe/frame.py
# Page views: 6924
1
"""Module containing logic related to eager DataFrames."""
2

3
from __future__ import annotations
4

5
import contextlib
6
import os
7
import random
8
from collections import defaultdict
9
from collections.abc import (
10
    Generator,
11
    Iterable,
12
    Mapping,
13
    Sequence,
14
    Sized,
15
)
16
from io import BytesIO, StringIO
17
from pathlib import Path
18
from typing import (
19
    IO,
20
    TYPE_CHECKING,
21
    Any,
22
    Callable,
23
    ClassVar,
24
    NoReturn,
25
    TypeVar,
26
    cast,
27
    get_args,
28
    overload,
29
)
30

31
import polars._reexport as pl
32
from polars import functions as F
33
from polars._typing import DbWriteMode, JaxExportType, TorchExportType
34
from polars._utils.construction import (
35
    arrow_to_pydf,
36
    dataframe_to_pydf,
37
    dict_to_pydf,
38
    iterable_to_pydf,
39
    numpy_to_pydf,
40
    pandas_to_pydf,
41
    sequence_to_pydf,
42
    series_to_pydf,
43
)
44
from polars._utils.convert import parse_as_duration_string
45
from polars._utils.deprecation import (
46
    deprecate_renamed_parameter,
47
    deprecated,
48
    issue_deprecation_warning,
49
)
50
from polars._utils.getitem import get_df_item_by_key
51
from polars._utils.parse import parse_into_expression
52
from polars._utils.pycapsule import is_pycapsule, pycapsule_to_frame
53
from polars._utils.serde import serialize_polars_object
54
from polars._utils.unstable import issue_unstable_warning, unstable
55
from polars._utils.various import (
56
    is_bool_sequence,
57
    no_default,
58
    normalize_filepath,
59
    parse_version,
60
    qualified_type_name,
61
    require_same_type,
62
    scale_bytes,
63
    warn_null_comparison,
64
)
65
from polars._utils.wrap import wrap_expr, wrap_ldf, wrap_s
66
from polars.dataframe._html import NotebookFormatter
67
from polars.dataframe.group_by import DynamicGroupBy, GroupBy, RollingGroupBy
68
from polars.dataframe.plotting import DataFramePlot
69
from polars.datatypes import (
70
    N_INFER_DEFAULT,
71
    Boolean,
72
    Float32,
73
    Float64,
74
    Int32,
75
    Int64,
76
    Null,
77
    Object,
78
    String,
79
    Struct,
80
    UInt16,
81
    UInt32,
82
    UInt64,
83
)
84
from polars.datatypes.group import INTEGER_DTYPES
85
from polars.dependencies import (
86
    _ALTAIR_AVAILABLE,
87
    _GREAT_TABLES_AVAILABLE,
88
    _PANDAS_AVAILABLE,
89
    _PYARROW_AVAILABLE,
90
    _check_for_numpy,
91
    _check_for_pandas,
92
    _check_for_pyarrow,
93
    _check_for_torch,
94
    altair,
95
    great_tables,
96
    import_optional,
97
    torch,
98
)
99
from polars.dependencies import numpy as np
100
from polars.dependencies import pandas as pd
101
from polars.dependencies import pyarrow as pa
102
from polars.exceptions import (
103
    ColumnNotFoundError,
104
    InvalidOperationError,
105
    ModuleUpgradeRequiredError,
106
    NoRowsReturnedError,
107
    TooManyRowsReturnedError,
108
)
109
from polars.functions import col, lit
110
from polars.interchange.protocol import CompatLevel
111
from polars.schema import Schema
112
from polars.selectors import _expand_selector_dicts, _expand_selectors
113

114
with contextlib.suppress(ImportError):  # Module not available when building docs
115
    from polars._plr import PyDataFrame
116
    from polars._plr import dtype_str_repr as _dtype_str_repr
117
    from polars._plr import write_clipboard_string as _write_clipboard_string
118

119
if TYPE_CHECKING:
120
    import sys
121
    from collections.abc import Collection, Iterator, Mapping
122
    from datetime import timedelta
123
    from io import IOBase
124
    from typing import Literal
125

126
    import deltalake
127
    import jax
128
    import numpy.typing as npt
129
    import pyiceberg
130
    from great_tables import GT
131
    from xlsxwriter import Workbook
132
    from xlsxwriter.worksheet import Worksheet
133

134
    from polars import DataType, Expr, LazyFrame, Series
135
    from polars._typing import (
136
        AsofJoinStrategy,
137
        AvroCompression,
138
        ClosedInterval,
139
        ColumnFormatDict,
140
        ColumnNameOrSelector,
141
        ColumnTotalsDefinition,
142
        ColumnWidthsDefinition,
143
        ComparisonOperator,
144
        ConditionalFormatDict,
145
        ConnectionOrCursor,
146
        CsvQuoteStyle,
147
        DbWriteEngine,
148
        EngineType,
149
        FillNullStrategy,
150
        FrameInitTypes,
151
        IndexOrder,
152
        IntoExpr,
153
        IntoExprColumn,
154
        IpcCompression,
155
        JoinStrategy,
156
        JoinValidation,
157
        Label,
158
        MaintainOrderJoin,
159
        MultiColSelector,
160
        MultiIndexSelector,
161
        OneOrMoreDataTypes,
162
        Orientation,
163
        ParquetCompression,
164
        ParquetMetadata,
165
        PartitioningScheme,
166
        PivotAgg,
167
        PolarsDataType,
168
        PythonDataType,
169
        QuantileMethod,
170
        RowTotalsDefinition,
171
        SchemaDefinition,
172
        SchemaDict,
173
        SelectorType,
174
        SerializationFormat,
175
        SingleColSelector,
176
        SingleIndexSelector,
177
        SizeUnit,
178
        StartBy,
179
        UniqueKeepStrategy,
180
        UnstackDirection,
181
    )
182
    from polars._utils.various import NoDefault
183
    from polars.interchange.dataframe import PolarsDataFrame
184
    from polars.io.cloud import CredentialProviderFunction
185
    from polars.ml.torch import PolarsDataset
186

187
    if sys.version_info >= (3, 10):
188
        from typing import Concatenate, ParamSpec
189
    else:
190
        from typing_extensions import Concatenate, ParamSpec
191

192
    if sys.version_info >= (3, 13):
193
        from warnings import deprecated
194
    else:
195
        from typing_extensions import deprecated  # noqa: TC004
196

197
    T = TypeVar("T")
198
    P = ParamSpec("P")
199

200

201
class DataFrame:
202
    """
203
    Two-dimensional data structure representing data as a table with rows and columns.
204

205
    Parameters
206
    ----------
207
    data : dict, Sequence, ndarray, Series, or pandas.DataFrame
208
        Two-dimensional data in various forms; dict input must contain Sequences,
209
        Generators, or a `range`. Sequence may contain Series or other Sequences.
210
    schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict
211
        The schema of the resulting DataFrame. The schema may be declared in several
212
        ways:
213

214
        * As a dict of {name:type} pairs; if type is None, it will be auto-inferred.
215
        * As a list of column names; in this case types are automatically inferred.
216
        * As a list of (name,type) pairs; this is equivalent to the dictionary form.
217

218
        If you supply a list of column names that does not match the names in the
219
        underlying data, the names given here will overwrite them. The number
220
        of names given in the schema should match the underlying data dimensions.
221

222
        If set to `None` (default), the schema is inferred from the data.
223
    schema_overrides : dict, default None
224
        Support type specification or override of one or more columns; note that
225
        any dtypes inferred from the schema param will be overridden.
226

227
        The number of entries in the schema should match the underlying data
228
        dimensions, unless a sequence of dictionaries is being passed, in which case
229
        a *partial* schema can be declared to prevent specific fields from being loaded.
230
    strict : bool, default True
231
        Throw an error if any `data` value does not exactly match the given or inferred
232
        data type for that column. If set to `False`, values that do not match the data
233
        type are cast to that data type or, if casting is not possible, set to null
234
        instead.
235
    orient : {'col', 'row'}, default None
236
        Whether to interpret two-dimensional data as columns or as rows. If None,
237
        the orientation is inferred by matching the columns and data dimensions. If
238
        this does not yield conclusive results, column orientation is used.
239
    infer_schema_length : int or None
240
        The maximum number of rows to scan for schema inference. If set to `None`, the
241
        full data may be scanned *(this can be slow)*. This parameter only applies if
242
        the input data is a sequence or generator of rows; other input is read as-is.
243
    nan_to_null : bool, default False
244
        If the data comes from one or more numpy arrays, can optionally convert input
245
        data np.nan values to null instead. This is a no-op for all other input data.
246

247
    Notes
248
    -----
249
    Polars explicitly does not support subclassing of its core data types. See
250
    the following GitHub issue for possible workarounds:
251
    https://github.com/pola-rs/polars/issues/2846#issuecomment-1711799869
252

253
    Examples
254
    --------
255
    Constructing a DataFrame from a dictionary:
256

257
    >>> data = {"a": [1, 2], "b": [3, 4]}
258
    >>> df = pl.DataFrame(data)
259
    >>> df
260
    shape: (2, 2)
261
    ┌─────┬─────┐
262
    │ a   ┆ b   │
263
    │ --- ┆ --- │
264
    │ i64 ┆ i64 │
265
    ╞═════╪═════╡
266
    │ 1   ┆ 3   │
267
    │ 2   ┆ 4   │
268
    └─────┴─────┘
269

270
    Notice that the dtypes are automatically inferred as polars Int64:
271

272
    >>> df.dtypes
273
    [Int64, Int64]
274

275
    To specify a more detailed/specific frame schema you can supply the `schema`
276
    parameter with a dictionary of (name,dtype) pairs...
277

278
    >>> data = {"col1": [0, 2], "col2": [3, 7]}
279
    >>> df2 = pl.DataFrame(data, schema={"col1": pl.Float32, "col2": pl.Int64})
280
    >>> df2
281
    shape: (2, 2)
282
    ┌──────┬──────┐
283
    │ col1 ┆ col2 │
284
    │ ---  ┆ ---  │
285
    │ f32  ┆ i64  │
286
    ╞══════╪══════╡
287
    │ 0.0  ┆ 3    │
288
    │ 2.0  ┆ 7    │
289
    └──────┴──────┘
290

291
    ...a sequence of (name,dtype) pairs...
292

293
    >>> data = {"col1": [1, 2], "col2": [3, 4]}
294
    >>> df3 = pl.DataFrame(data, schema=[("col1", pl.Float32), ("col2", pl.Int64)])
295
    >>> df3
296
    shape: (2, 2)
297
    ┌──────┬──────┐
298
    │ col1 ┆ col2 │
299
    │ ---  ┆ ---  │
300
    │ f32  ┆ i64  │
301
    ╞══════╪══════╡
302
    │ 1.0  ┆ 3    │
303
    │ 2.0  ┆ 4    │
304
    └──────┴──────┘
305

306
    ...or a list of typed Series.
307

308
    >>> data = [
309
    ...     pl.Series("col1", [1, 2], dtype=pl.Float32),
310
    ...     pl.Series("col2", [3, 4], dtype=pl.Int64),
311
    ... ]
312
    >>> df4 = pl.DataFrame(data)
313
    >>> df4
314
    shape: (2, 2)
315
    ┌──────┬──────┐
316
    │ col1 ┆ col2 │
317
    │ ---  ┆ ---  │
318
    │ f32  ┆ i64  │
319
    ╞══════╪══════╡
320
    │ 1.0  ┆ 3    │
321
    │ 2.0  ┆ 4    │
322
    └──────┴──────┘
323

324
    Constructing a DataFrame from a numpy ndarray, specifying column names:
325

326
    >>> import numpy as np
327
    >>> data = np.array([(1, 2), (3, 4)], dtype=np.int64)
328
    >>> df5 = pl.DataFrame(data, schema=["a", "b"], orient="col")
329
    >>> df5
330
    shape: (2, 2)
331
    ┌─────┬─────┐
332
    │ a   ┆ b   │
333
    │ --- ┆ --- │
334
    │ i64 ┆ i64 │
335
    ╞═════╪═════╡
336
    │ 1   ┆ 3   │
337
    │ 2   ┆ 4   │
338
    └─────┴─────┘
339

340
    Constructing a DataFrame from a list of lists, row orientation specified:
341

342
    >>> data = [[1, 2, 3], [4, 5, 6]]
343
    >>> df6 = pl.DataFrame(data, schema=["a", "b", "c"], orient="row")
344
    >>> df6
345
    shape: (2, 3)
346
    ┌─────┬─────┬─────┐
347
    │ a   ┆ b   ┆ c   │
348
    │ --- ┆ --- ┆ --- │
349
    │ i64 ┆ i64 ┆ i64 │
350
    ╞═════╪═════╪═════╡
351
    │ 1   ┆ 2   ┆ 3   │
352
    │ 4   ┆ 5   ┆ 6   │
353
    └─────┴─────┴─────┘
354
    """
355

356
    _df: PyDataFrame
357
    _accessors: ClassVar[set[str]] = {"plot", "style"}
358

359
    def __init__(
        self,
        data: FrameInitTypes | None = None,
        schema: SchemaDefinition | None = None,
        *,
        schema_overrides: SchemaDict | None = None,
        strict: bool = True,
        orient: Orientation | None = None,
        infer_schema_length: int | None = N_INFER_DEFAULT,
        nan_to_null: bool = False,
    ) -> None:
        # Pick a construction helper based on the runtime type of `data`. The
        # checks are evaluated top to bottom and the first match wins, so their
        # order is part of the behaviour (e.g. the generic-iterable branch is only
        # reached after the concrete container/array types have been ruled out).
        if data is None:
            # No data at all: build an empty frame from the schema information.
            pydf = dict_to_pydf({}, schema=schema, schema_overrides=schema_overrides)

        elif isinstance(data, dict):
            pydf = dict_to_pydf(
                data,
                schema=schema,
                schema_overrides=schema_overrides,
                strict=strict,
                nan_to_null=nan_to_null,
            )

        elif isinstance(data, (list, tuple, Sequence)):
            pydf = sequence_to_pydf(
                data,
                schema=schema,
                schema_overrides=schema_overrides,
                strict=strict,
                orient=orient,
                infer_schema_length=infer_schema_length,
                nan_to_null=nan_to_null,
            )

        elif isinstance(data, pl.Series):
            pydf = series_to_pydf(
                data, schema=schema, schema_overrides=schema_overrides, strict=strict
            )

        # The `_check_for_*` guards run before the corresponding `isinstance`
        # test on the optional dependency's type.
        elif _check_for_numpy(data) and isinstance(data, np.ndarray):
            pydf = numpy_to_pydf(
                data,
                schema=schema,
                schema_overrides=schema_overrides,
                strict=strict,
                orient=orient,
                nan_to_null=nan_to_null,
            )

        elif _check_for_pyarrow(data) and isinstance(data, pa.Table):
            pydf = arrow_to_pydf(
                data, schema=schema, schema_overrides=schema_overrides, strict=strict
            )

        elif _check_for_pandas(data) and isinstance(data, pd.DataFrame):
            pydf = pandas_to_pydf(
                data, schema=schema, schema_overrides=schema_overrides, strict=strict
            )

        elif _check_for_torch(data) and isinstance(data, torch.Tensor):
            # Tensors are converted to a numpy array first and then follow the
            # numpy construction path.
            pydf = numpy_to_pydf(
                data.numpy(force=False),
                schema=schema,
                schema_overrides=schema_overrides,
                strict=strict,
                orient=orient,
                nan_to_null=nan_to_null,
            )

        elif (
            not hasattr(data, "__arrow_c_stream__")
            and not isinstance(data, Sized)
            and isinstance(data, (Generator, Iterable))
        ):
            # Unsized iterables/generators that do not expose the Arrow C stream
            # interface.
            pydf = iterable_to_pydf(
                data,
                schema=schema,
                schema_overrides=schema_overrides,
                strict=strict,
                orient=orient,
                infer_schema_length=infer_schema_length,
            )

        elif isinstance(data, pl.DataFrame):
            pydf = dataframe_to_pydf(
                data, schema=schema, schema_overrides=schema_overrides, strict=strict
            )

        elif is_pycapsule(data):
            # `pycapsule_to_frame` returns a frame object; keep only its
            # underlying `_df`.
            capsule_frame = pycapsule_to_frame(
                data,
                schema=schema,
                schema_overrides=schema_overrides,
            )
            pydf = capsule_frame._df

        else:
            msg = (
                f"DataFrame constructor called with unsupported type {type(data).__name__!r}"
                " for the `data` parameter"
            )
            raise TypeError(msg)

        self._df = pydf
461

462
    @classmethod
    def deserialize(
        cls, source: str | Path | IOBase, *, format: SerializationFormat = "binary"
    ) -> DataFrame:
        """
        Read a serialized DataFrame from a file.

        Parameters
        ----------
        source
            Path to a file or a file-like object (by file-like object, we refer to
            objects that have a `read()` method, such as a file handler (e.g.
            via builtin `open` function) or `BytesIO`).
        format
            The format with which the DataFrame was serialized. Options:

            - `"binary"`: Deserialize from binary format (bytes). This is the default.
            - `"json"`: Deserialize from JSON format (string).

        Returns
        -------
        DataFrame

        Raises
        ------
        ValueError
            If `format` is neither `"binary"` nor `"json"`.

        See Also
        --------
        DataFrame.serialize

        Notes
        -----
        Serialization is not stable across Polars versions: a DataFrame serialized
        in one Polars version may not be deserializable in another Polars version.

        Examples
        --------
        >>> import io
        >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [4.0, 5.0, 6.0]})
        >>> serialized = df.serialize()
        >>> pl.DataFrame.deserialize(io.BytesIO(serialized))
        shape: (3, 2)
        ┌─────┬─────┐
        │ a   ┆ b   │
        │ --- ┆ --- │
        │ i64 ┆ f64 │
        ╞═════╪═════╡
        │ 1   ┆ 4.0 │
        │ 2   ┆ 5.0 │
        │ 3   ┆ 6.0 │
        └─────┴─────┘
        """
        if isinstance(source, StringIO):
            # In-memory text is re-encoded so the deserializer receives a
            # bytes buffer.
            source = BytesIO(source.getvalue().encode())
        elif isinstance(source, (str, Path)):
            source = normalize_filepath(source)

        # Select the deserializer matching the requested format.
        if format == "binary":
            deserializer = PyDataFrame.deserialize_binary
        elif format == "json":
            deserializer = PyDataFrame.deserialize_json
        else:
            msg = f"`format` must be one of {{'binary', 'json'}}, got {format!r}"
            raise ValueError(msg)

        return cls._from_pydf(deserializer(source))
521

522
    @classmethod
    def _from_pydf(cls, py_df: PyDataFrame) -> DataFrame:
        """Wrap an FFI PyDataFrame object in a Polars DataFrame."""
        # `__new__` is used directly so that `__init__` is not run; the instance
        # only needs its `_df` attribute set.
        instance = cls.__new__(cls)
        instance._df = py_df
        return instance
528

529
    @classmethod
    def _from_arrow(
        cls,
        data: pa.Table | pa.RecordBatch,
        schema: SchemaDefinition | None = None,
        *,
        schema_overrides: SchemaDict | None = None,
        rechunk: bool = True,
    ) -> DataFrame:
        """
        Construct a DataFrame from an Arrow table.

        This operation will be zero copy for the most part. Types that are not
        supported by Polars may be cast to the closest supported type.

        Parameters
        ----------
        data : arrow Table or RecordBatch
            Data representing an Arrow Table or RecordBatch.
        schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict
            The DataFrame schema may be declared in several ways:

            * As a dict of {name:type} pairs; if type is None, it will be auto-inferred.
            * As a list of column names; in this case types are automatically inferred.
            * As a list of (name,type) pairs; this is equivalent to the dictionary form.

            If you supply a list of column names that does not match the names in the
            underlying data, the names given here will overwrite them. The number
            of names given in the schema should match the underlying data dimensions.
        schema_overrides : dict, default None
            Support type specification or override of one or more columns; note that
            any dtypes inferred from the `schema` param will be overridden.
        rechunk : bool, default True
            Make sure that all data is in contiguous memory.

        Returns
        -------
        DataFrame
        """
        pydf = arrow_to_pydf(
            data,
            schema=schema,
            schema_overrides=schema_overrides,
            rechunk=rechunk,
        )
        return cls._from_pydf(pydf)
572

573
    @classmethod
    def _from_pandas(
        cls,
        data: pd.DataFrame,
        schema: SchemaDefinition | None = None,
        *,
        schema_overrides: SchemaDict | None = None,
        rechunk: bool = True,
        nan_to_null: bool = True,
        include_index: bool = False,
    ) -> DataFrame:
        """
        Construct a Polars DataFrame from a pandas DataFrame.

        Parameters
        ----------
        data : pandas DataFrame
            Two-dimensional data represented as a pandas DataFrame.
        schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict
            The DataFrame schema may be declared in several ways:

            * As a dict of {name:type} pairs; if type is None, it will be auto-inferred.
            * As a list of column names; in this case types are automatically inferred.
            * As a list of (name,type) pairs; this is equivalent to the dictionary form.

            If you supply a list of column names that does not match the names in the
            underlying data, the names given here will overwrite them. The number
            of names given in the schema should match the underlying data dimensions.
        schema_overrides : dict, default None
            Support type specification or override of one or more columns; note that
            any dtypes inferred from the `schema` param will be overridden.
        rechunk : bool, default True
            Make sure that all data is in contiguous memory.
        nan_to_null : bool, default True
            If the data contains NaN values they will be converted to null/None.
        include_index : bool, default False
            Load any non-default pandas indexes as columns.

        Returns
        -------
        DataFrame
        """
        pydf = pandas_to_pydf(
            data,
            schema=schema,
            schema_overrides=schema_overrides,
            rechunk=rechunk,
            nan_to_null=nan_to_null,
            include_index=include_index,
        )
        return cls._from_pydf(pydf)
621

622
    def _replace(self, column: str, new_column: Series) -> DataFrame:
        """
        Swap the column named `column` for `new_column`, modifying this frame.

        Returns this same DataFrame object (not a copy).
        """
        pydf = self._df
        pydf.replace(column, new_column._s)
        return self
626

627
    @classmethod
    def _import_columns(cls, pointer: int, width: int) -> DataFrame:
        """
        Wrap the result of `PyDataFrame._import_columns` in a DataFrame.

        NOTE(review): `pointer` is presumably the address of externally exported
        column data and `width` the number of columns to import; confirm against
        the native binding.
        """
        imported = PyDataFrame._import_columns(pointer, width)
        return cls._from_pydf(imported)
630

631
    @property
    @unstable()
    def plot(self) -> DataFramePlot:
        """
        Create a plot namespace.

        .. warning::
            This functionality is currently considered **unstable**. It may be
            changed at any point without it being considered a breaking change.

        .. versionchanged:: 1.6.0
            In prior versions of Polars, HvPlot was the plotting backend. If you would
            like to restore the previous plotting functionality, all you need to do
            is add `import hvplot.polars` at the top of your script and replace
            `df.plot` with `df.hvplot`.

        Polars does not implement plotting logic itself, but instead defers to
        `Altair <https://altair-viz.github.io/>`_:

        - `df.plot.line(**kwargs)`
          is shorthand for
          `alt.Chart(df).mark_line(tooltip=True).encode(**kwargs).interactive()`
        - `df.plot.point(**kwargs)`
          is shorthand for
          `alt.Chart(df).mark_point(tooltip=True).encode(**kwargs).interactive()` (and
          `plot.scatter` is provided as an alias)
        - `df.plot.bar(**kwargs)`
          is shorthand for
          `alt.Chart(df).mark_bar(tooltip=True).encode(**kwargs).interactive()`
        - for any other attribute `attr`, `df.plot.attr(**kwargs)`
          is shorthand for
          `alt.Chart(df).mark_attr(tooltip=True).encode(**kwargs).interactive()`

        For configuration, we suggest reading
        `Chart Configuration <https://altair-viz.github.io/altair-tutorial/notebooks/08-Configuration.html>`_.
        For example, you can:

        - Change the width/height/title with
          ``.properties(width=500, height=350, title="My amazing plot")``.
        - Change the x-axis label rotation with ``.configure_axisX(labelAngle=30)``.
        - Change the opacity of the points in your scatter plot with
          ``.configure_point(opacity=.5)``.

        Raises
        ------
        ModuleUpgradeRequiredError
            If Altair is not available, or its version is older than 5.4.0.

        Examples
        --------
        Scatter plot:

        >>> df = pl.DataFrame(
        ...     {
        ...         "length": [1, 4, 6],
        ...         "width": [4, 5, 6],
        ...         "species": ["setosa", "setosa", "versicolor"],
        ...     }
        ... )
        >>> df.plot.point(x="length", y="width", color="species")  # doctest: +SKIP

        Set the x-axis title by using ``altair.X``:

        >>> import altair as alt
        >>> df.plot.point(
        ...     x=alt.X("length", title="Length"), y="width", color="species"
        ... )  # doctest: +SKIP

        Line plot:

        >>> from datetime import date
        >>> df = pl.DataFrame(
        ...     {
        ...         "date": [date(2020, 1, 2), date(2020, 1, 3), date(2020, 1, 4)] * 2,
        ...         "price": [1, 4, 6, 1, 5, 2],
        ...         "stock": ["a", "a", "a", "b", "b", "b"],
        ...     }
        ... )
        >>> df.plot.line(x="date", y="price", color="stock")  # doctest: +SKIP

        Bar plot:

        >>> df = pl.DataFrame(
        ...     {
        ...         "day": ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"] * 2,
        ...         "group": ["a"] * 7 + ["b"] * 7,
        ...         "value": [1, 3, 2, 4, 5, 6, 1, 1, 3, 2, 4, 5, 1, 2],
        ...     }
        ... )
        >>> df.plot.bar(
        ...     x="day", y="value", color="day", column="group"
        ... )  # doctest: +SKIP

        Or, to make a stacked version of the plot above:

        >>> df.plot.bar(x="day", y="value", color="group")  # doctest: +SKIP
        """
        # The availability flag is tested first so that `altair.__version__` is
        # only read when the flag is truthy.
        altair_usable = _ALTAIR_AVAILABLE and not (
            parse_version(altair.__version__) < (5, 4, 0)
        )
        if altair_usable:
            return DataFramePlot(self)

        msg = "altair>=5.4.0 is required for `.plot`"
        raise ModuleUpgradeRequiredError(msg)
727

728
    @property
    @unstable()
    def style(self) -> GT:
        """
        Create a Great Table for styling.

        .. warning::
            This functionality is currently considered **unstable**. It may be
            changed at any point without it being considered a breaking change.

        Polars does not implement styling logic itself, but instead defers to
        the Great Tables package. Please see the `Great Tables reference <https://posit-dev.github.io/great-tables/reference/>`_
        for more information and documentation.

        Raises
        ------
        ModuleNotFoundError
            If the Great Tables package is not available.

        Examples
        --------
        Import some styling helpers, and create example data:

        >>> import polars.selectors as cs
        >>> from great_tables import loc, style
        >>> df = pl.DataFrame(
        ...     {
        ...         "site_id": [0, 1, 2],
        ...         "measure_a": [5, 4, 6],
        ...         "measure_b": [7, 3, 3],
        ...     }
        ... )

        Emphasize the site_id as row names:

        >>> df.style.tab_stub(rowname_col="site_id")  # doctest: +SKIP

        Fill the background for the highest measure_a value row:

        >>> df.style.tab_style(
        ...     style.fill("yellow"),
        ...     loc.body(rows=pl.col("measure_a") == pl.col("measure_a").max()),
        ... )  # doctest: +SKIP

        Put a spanner (high-level label) over measure columns:

        >>> df.style.tab_spanner(
        ...     "Measures", cs.starts_with("measure")
        ... )  # doctest: +SKIP

        Format measure_b values to two decimal places:

        >>> df.style.fmt_number("measure_b", decimals=2)  # doctest: +SKIP
        """
        if _GREAT_TABLES_AVAILABLE:
            return great_tables.GT(self)

        msg = "great_tables is required for `.style`"
        raise ModuleNotFoundError(msg)
782

783
    @property
    def shape(self) -> tuple[int, int]:
        """
        Get the dimensions of the DataFrame as a two-element tuple.

        Examples
        --------
        >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5]})
        >>> df.shape
        (5, 1)
        """
        dimensions = self._df.shape()
        return dimensions
795

796
    @property
    def height(self) -> int:
        """
        Return how many rows the DataFrame has.

        Returns
        -------
        int

        Examples
        --------
        >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5]})
        >>> df.height
        5
        """
        n_rows = self._df.height()
        return n_rows
812

813
    @property
    def width(self) -> int:
        """
        Return how many columns the DataFrame has.

        Returns
        -------
        int

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     {
        ...         "foo": [1, 2, 3],
        ...         "bar": [4, 5, 6],
        ...     }
        ... )
        >>> df.width
        2
        """
        n_columns = self._df.width()
        return n_columns
834

835
    @property
    def columns(self) -> list[str]:
        """
        Get or set column names.

        Returns
        -------
        list of str
            A list containing the name of each column in order.

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     {
        ...         "foo": [1, 2, 3],
        ...         "bar": [6, 7, 8],
        ...         "ham": ["a", "b", "c"],
        ...     }
        ... )
        >>> df.columns
        ['foo', 'bar', 'ham']

        Set column names:

        >>> df.columns = ["apple", "banana", "orange"]
        >>> df
        shape: (3, 3)
        ┌───────┬────────┬────────┐
        │ apple ┆ banana ┆ orange │
        │ ---   ┆ ---    ┆ ---    │
        │ i64   ┆ i64    ┆ str    │
        ╞═══════╪════════╪════════╡
        │ 1     ┆ 6      ┆ a      │
        │ 2     ┆ 7      ┆ b      │
        │ 3     ┆ 8      ┆ c      │
        └───────┴────────┴────────┘
        """
        column_names = self._df.columns()
        return column_names
873

874
    @columns.setter
    def columns(self, names: Sequence[str]) -> None:
        """
        Rename the columns of the `DataFrame`.

        Parameters
        ----------
        names
            The new column names, one per column.
            The length of the list should be equal to the width of the `DataFrame`.
        """
        pydf = self._df
        pydf.set_column_names(names)
886

887
    @property
    def dtypes(self) -> list[DataType]:
        """
        Return the data type of every column.

        The same information is shown in the column headers when the DataFrame
        is printed.

        Returns
        -------
        list of DataType
            The data type of each column, in column order.

        See Also
        --------
        schema

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     {
        ...         "foo": [1, 2, 3],
        ...         "bar": [6.0, 7.0, 8.0],
        ...         "ham": ["a", "b", "c"],
        ...     }
        ... )
        >>> df.dtypes
        [Int64, Float64, String]
        >>> df
        shape: (3, 3)
        ┌─────┬─────┬─────┐
        │ foo ┆ bar ┆ ham │
        │ --- ┆ --- ┆ --- │
        │ i64 ┆ f64 ┆ str │
        ╞═════╪═════╪═════╡
        │ 1   ┆ 6.0 ┆ a   │
        │ 2   ┆ 7.0 ┆ b   │
        │ 3   ┆ 8.0 ┆ c   │
        └─────┴─────┴─────┘
        """
        column_dtypes = self._df.dtypes()
        return column_dtypes
927

928
    @property
    def flags(self) -> dict[str, dict[str, bool]]:
        """
        Collect the flags set on each column of this DataFrame.

        Returns
        -------
        dict
            Maps every column name to the `flags` of that column.
        """
        flags_by_column: dict[str, dict[str, bool]] = {}
        for column_name in self.columns:
            # Each column is looked up by name and asked for its own flags.
            flags_by_column[column_name] = self[column_name].flags
        return flags_by_column
939

940
    @property
    def schema(self) -> Schema:
        """
        Return the column names paired with their data types, in column order.

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     {
        ...         "foo": [1, 2, 3],
        ...         "bar": [6.0, 7.0, 8.0],
        ...         "ham": ["a", "b", "c"],
        ...     }
        ... )
        >>> df.schema
        Schema({'foo': Int64, 'bar': Float64, 'ham': String})
        """
        names = self.columns
        dtypes = self.dtypes
        name_dtype_pairs = zip(names, dtypes)
        # `check_dtypes=False` is passed through to the `Schema` constructor.
        return Schema(name_dtype_pairs, check_dtypes=False)
958

959
    def __array__(
        self, dtype: npt.DTypeLike | None = None, copy: bool | None = None
    ) -> np.ndarray[Any, Any]:
        """
        Produce a NumPy ndarray of the requested data type from this DataFrame.

        Implementing this method lets a Polars DataFrame be used where NumPy
        expects an array, which is what makes `np.asarray` and NumPy universal
        functions work.

        Parameters
        ----------
        dtype
            Requested NumPy data type, or `None` to keep the converted dtype.
        copy
            `None` permits a copy, `True` requests a writable array and
            `False` forbids copying. Any other value raises a `TypeError`.

        Raises
        ------
        TypeError
            If `copy` is not `None`, `True` or `False`.
        RuntimeError
            If `copy` is `False` and `dtype` differs from the converted dtype.

        See the NumPy documentation for more information:
        https://numpy.org/doc/stable/user/basics.interoperability.html#the-array-method
        """
        # Identity checks on purpose: values such as `1` or `0` are rejected.
        if copy is not None and copy is not True and copy is not False:
            msg = f"invalid input for `copy`: {copy!r}"
            raise TypeError(msg)

        writable = copy is True
        allow_copy = copy is not False
        arr = self.to_numpy(writable=writable, allow_copy=allow_copy)

        needs_cast = dtype is not None and dtype != arr.dtype
        if not needs_cast:
            return arr

        if copy is False:
            # TODO: Only raise when data must be copied
            msg = f"copy not allowed: cast from {arr.dtype} to {dtype} prohibited"
            raise RuntimeError(msg)

        return arr.__array__(dtype)
992

993
    def __dataframe__(
        self,
        nan_as_null: bool = False,  # noqa: FBT001
        allow_copy: bool = True,  # noqa: FBT001
    ) -> PolarsDataFrame:
        """
        Wrap this frame in an object implementing the dataframe interchange protocol.

        Parameters
        ----------
        nan_as_null
            Overwrite null values in the data with `NaN`.

            .. warning::
                This functionality has not been implemented and the parameter will be
                removed in a future version.
                Setting this to `True` will raise a `NotImplementedError`.
        allow_copy
            Whether memory may be copied during the conversion. With `False`,
            conversions that are not zero-copy fail.

        Notes
        -----
        Details on the Python dataframe interchange protocol:
        https://data-apis.org/dataframe-protocol/latest/index.html

        Examples
        --------
        Convert a Polars DataFrame to a generic dataframe object and access some
        properties.

        >>> df = pl.DataFrame({"a": [1, 2], "b": [3.0, 4.0], "c": ["x", "y"]})
        >>> dfi = df.__dataframe__()
        >>> dfi.num_rows()
        2
        >>> dfi.get_column(1).dtype
        (<DtypeKind.FLOAT: 2>, 64, 'g', '=')
        """
        if nan_as_null:
            msg = (
                "functionality for `nan_as_null` has not been implemented"
                " and the parameter will be removed in a future version"
                "\n\n"
                "Use the default `nan_as_null=False`."
            )
            raise NotImplementedError(msg)

        # Imported here rather than at module level, after the argument check.
        from polars.interchange.dataframe import PolarsDataFrame

        interchange_frame = PolarsDataFrame(self, allow_copy=allow_copy)
        return interchange_frame
1042

1043
    def _comp(self, other: Any, op: ComparisonOperator) -> DataFrame:
        """Dispatch a comparison to the DataFrame or non-DataFrame handler."""
        if not isinstance(other, DataFrame):
            return self._compare_to_non_df(other, op)
        return self._compare_to_other_df(other, op)
1049

1050
    def _compare_to_other_df(
        self,
        other: DataFrame,
        op: ComparisonOperator,
    ) -> DataFrame:
        """
        Compare this DataFrame column-by-column with another DataFrame.

        The columns of `other` are selected under a temporary suffix and placed
        next to this frame's columns through a horizontal concat. Each column is
        then compared with its suffixed counterpart.

        Raises
        ------
        ValueError
            If the column names differ, if the shapes differ, or if `op` is not
            one of `"eq"`, `"neq"`, `"gt"`, `"lt"`, `"gt_eq"`, `"lt_eq"`.
        """
        if self.columns != other.columns:
            msg = "DataFrame columns do not match"
            raise ValueError(msg)
        if self.shape != other.shape:
            msg = "DataFrame dimensions do not match"
            raise ValueError(msg)

        suffix = "__POLARS_CMP_OTHER"
        suffixed_other = other.select(F.all().name.suffix(suffix))
        side_by_side = F.concat([self, suffixed_other], how="horizontal")

        # One (left, right) expression pair per column of this frame.
        operand_pairs = [
            (F.col(name), F.col(f"{name}{suffix}")) for name in self.columns
        ]

        if op == "eq":
            comparisons = [left == right for left, right in operand_pairs]
        elif op == "neq":
            comparisons = [left != right for left, right in operand_pairs]
        elif op == "gt":
            comparisons = [left > right for left, right in operand_pairs]
        elif op == "lt":
            comparisons = [left < right for left, right in operand_pairs]
        elif op == "gt_eq":
            comparisons = [left >= right for left, right in operand_pairs]
        elif op == "lt_eq":
            comparisons = [left <= right for left, right in operand_pairs]
        else:
            msg = f"unexpected comparison operator {op!r}"
            raise ValueError(msg)

        return side_by_side.select(comparisons)
1084

1085
    def _compare_to_non_df(
        self,
        other: Any,
        op: ComparisonOperator,
    ) -> DataFrame:
        """
        Compare every column of this DataFrame with a non-DataFrame object.

        `warn_null_comparison` is called on `other` before any comparison is
        built. An unknown `op` raises a `ValueError`.
        """
        warn_null_comparison(other)

        every_column = F.all()
        if op == "eq":
            predicate = every_column == other
        elif op == "neq":
            predicate = every_column != other
        elif op == "gt":
            predicate = every_column > other
        elif op == "lt":
            predicate = every_column < other
        elif op == "gt_eq":
            predicate = every_column >= other
        elif op == "lt_eq":
            predicate = every_column <= other
        else:
            msg = f"unexpected comparison operator {op!r}"
            raise ValueError(msg)

        return self.select(predicate)
1107

1108
    def _div(self, other: Any, *, floordiv: bool) -> DataFrame:
        """
        Shared implementation of `/` and `//`.

        A Series operand is handled with an expression over all columns. Any
        other non-DataFrame operand is expanded into a frame with one column per
        column of `self`. The frame-by-frame division is always a true division;
        for `floordiv`, float columns of the result are floored, and columns
        whose dtype in `self` is integer are cast back to that dtype when the
        matching operand column was integer or Null.
        """
        if isinstance(other, pl.Series):
            divisor = lit(other)
            if floordiv:
                return self.select(F.all() // divisor)
            return self.select(F.all() / divisor)

        if not isinstance(other, DataFrame):
            prepared = _prepare_other_arg(other, length=self.height)
            other = DataFrame(
                [prepared.alias(f"n{idx}") for idx in range(self.width)]
            )

        # Remember the operand dtypes before they are cast for the division.
        orig_dtypes = other.dtypes
        # TODO: Dispatch to a native floordiv
        other = self._cast_all_from_to(other, INTEGER_DTYPES, Float64)
        quotient = self._from_pydf(self._df.div_df(other._df))

        if not floordiv:
            return quotient

        floored = quotient.with_columns(
            [s.floor() for s in quotient if s.dtype.is_float()]
        )
        int_casts = [
            col(column).cast(tp)
            for i, (column, tp) in enumerate(self.schema.items())
            if tp.is_integer()
            and (orig_dtypes[i].is_integer() or orig_dtypes[i] == Null)
        ]
        return floored.with_columns(int_casts) if int_casts else floored
1138

1139
    def _cast_all_from_to(
        self, df: DataFrame, from_: frozenset[PolarsDataType], to: PolarsDataType
    ) -> DataFrame:
        """
        Cast every column of `df` whose dtype is in `from_` to the dtype `to`.

        Returns `df` itself, untouched, when no column needs casting.
        """
        recast_columns = []
        for series in df:
            if series.dtype in from_:
                recast_columns.append(series.cast(to).alias(series.name))

        if not recast_columns:
            return df
        return df.with_columns(recast_columns)
1144

1145
    def __floordiv__(self, other: DataFrame | Series | int | float) -> DataFrame:
        """Implement `self // other` by delegating to `_div` with `floordiv=True`."""
        result = self._div(other, floordiv=True)
        return result
1147

1148
    def __truediv__(self, other: DataFrame | Series | int | float) -> DataFrame:
        """Implement `self / other` by delegating to `_div` with `floordiv=False`."""
        result = self._div(other, floordiv=False)
        return result
1150

1151
    def __bool__(self) -> NoReturn:
        """Always raise `TypeError`: a DataFrame has no single truth value."""
        hint = "Hint: to check if a DataFrame contains any values, use `is_empty()`."
        msg = "the truth value of a DataFrame is ambiguous" + "\n\n" + hint
        raise TypeError(msg)
1157

1158
    def __eq__(self, other: object) -> DataFrame:  # type: ignore[override]
        """Implement `self == other`; returns a DataFrame, not a bool."""
        comparison = self._comp(other, "eq")
        return comparison
1160

1161
    def __ne__(self, other: object) -> DataFrame:  # type: ignore[override]
        """Implement `self != other`; returns a DataFrame, not a bool."""
        comparison = self._comp(other, "neq")
        return comparison
1163

1164
    def __gt__(self, other: Any) -> DataFrame:
        """Implement `self > other` through `_comp`."""
        comparison = self._comp(other, "gt")
        return comparison
1166

1167
    def __lt__(self, other: Any) -> DataFrame:
        """Implement `self < other` through `_comp`."""
        comparison = self._comp(other, "lt")
        return comparison
1169

1170
    def __ge__(self, other: Any) -> DataFrame:
        """Implement `self >= other` through `_comp`."""
        comparison = self._comp(other, "gt_eq")
        return comparison
1172

1173
    def __le__(self, other: Any) -> DataFrame:
        """Implement `self <= other` through `_comp`."""
        comparison = self._comp(other, "lt_eq")
        return comparison
1175

1176
    def __getstate__(self) -> bytes:
        """Return the pickle state: the output of `serialize()`."""
        state = self.serialize()
        return state
1178

1179
    def __setstate__(self, state: bytes) -> None:
        """Restore from pickle state by deserializing it and adopting its `_df`."""
        buffer = BytesIO(state)
        restored = self.deserialize(buffer)
        self._df = restored._df
1181

1182
    def __mul__(self, other: DataFrame | Series | int | float) -> DataFrame:
        """
        Implement `self * other`.

        A DataFrame operand is passed to `mul_df` on the underlying frame. Any
        other operand first goes through `_prepare_other_arg` and then to `mul`.
        """
        if not isinstance(other, DataFrame):
            prepared = _prepare_other_arg(other)
            return self._from_pydf(self._df.mul(prepared._s))

        return self._from_pydf(self._df.mul_df(other._df))
1188

1189
    def __rmul__(self, other: int | float) -> DataFrame:
        """Implement `other * self` by evaluating `self * other`."""
        product = self * other
        return product
1191

1192
    def __add__(
        self, other: DataFrame | Series | int | float | bool | str
    ) -> DataFrame:
        """
        Implement `self + other`.

        A DataFrame operand is passed to `add_df` on the underlying frame. Any
        other operand first goes through `_prepare_other_arg` and then to `add`.
        """
        if not isinstance(other, DataFrame):
            prepared = _prepare_other_arg(other)
            return self._from_pydf(self._df.add(prepared._s))

        return self._from_pydf(self._df.add_df(other._df))
1199

1200
    def __radd__(
        self, other: DataFrame | Series | int | float | bool | str
    ) -> DataFrame:
        """
        Implement `other + self`.

        For a string on the left, the string literal is placed on the left of
        every column via an expression. Everything else evaluates
        `self + other`.
        """
        if not isinstance(other, str):
            return self + other

        # Operand order matters here: the literal must stay on the left.
        prefixed = (lit(other) + F.col("*")).name.keep()
        return self.select(prefixed)
1206

1207
    def __sub__(self, other: DataFrame | Series | int | float) -> DataFrame:
        """
        Implement `self - other`.

        A DataFrame operand is passed to `sub_df` on the underlying frame. Any
        other operand first goes through `_prepare_other_arg` and then to `sub`.
        """
        if not isinstance(other, DataFrame):
            prepared = _prepare_other_arg(other)
            return self._from_pydf(self._df.sub(prepared._s))

        return self._from_pydf(self._df.sub_df(other._df))
1212

1213
    def __mod__(self, other: DataFrame | Series | int | float) -> DataFrame:
        """
        Implement `self % other`.

        A DataFrame operand is passed to `rem_df` on the underlying frame. Any
        other operand first goes through `_prepare_other_arg` and then to `rem`.
        """
        if not isinstance(other, DataFrame):
            prepared = _prepare_other_arg(other)
            return self._from_pydf(self._df.rem(prepared._s))

        return self._from_pydf(self._df.rem_df(other._df))
1218

1219
    def __str__(self) -> str:
        """Return the string form produced by the underlying frame's `as_str()`."""
        text = self._df.as_str()
        return text
1221

1222
    def __repr__(self) -> str:
        """Return the same text as `__str__`."""
        text = self.__str__()
        return text
1224

1225
    def __contains__(self, key: str) -> bool:
        """Return whether `key` is one of the column names."""
        column_names = self.columns
        return key in column_names
1227

1228
    def __iter__(self) -> Iterator[Series]:
        """Iterate over the columns, as returned by `iter_columns()`."""
        column_iterator = self.iter_columns()
        return column_iterator
1230

1231
    def __reversed__(self) -> Iterator[Series]:
        """Iterate over the result of `get_columns()` in reverse order."""
        all_columns = self.get_columns()
        return reversed(all_columns)
1233

1234
    # `str` is itself a `Sequence[str]`, so the overloads below overlap.
    # That is tolerated, but the order of the overloads must not change.
    @overload
    def __getitem__(
        self, key: tuple[SingleIndexSelector, SingleColSelector]
    ) -> Any: ...

    @overload
    def __getitem__(  # type: ignore[overload-overlap]
        self, key: str | tuple[MultiIndexSelector, SingleColSelector]
    ) -> Series: ...

    @overload
    def __getitem__(
        self,
        key: (
            SingleIndexSelector
            | MultiIndexSelector
            | MultiColSelector
            | tuple[SingleIndexSelector, MultiColSelector]
            | tuple[MultiIndexSelector, MultiColSelector]
        ),
    ) -> DataFrame: ...

    def __getitem__(
        self,
        key: (
            SingleIndexSelector
            | SingleColSelector
            | MultiColSelector
            | MultiIndexSelector
            | tuple[SingleIndexSelector, SingleColSelector]
            | tuple[SingleIndexSelector, MultiColSelector]
            | tuple[MultiIndexSelector, SingleColSelector]
            | tuple[MultiIndexSelector, MultiColSelector]
        ),
    ) -> DataFrame | Series | Any:
        """
        Select part of the DataFrame as a new DataFrame, a Series, or a scalar.

        Parameters
        ----------
        key
            Rows / columns to select. The forms are best shown by example. Given
            a DataFrame with columns `'a'`, `'d'`, `'c'`, `'b'` (as in the
            examples below), the different kinds of `key` behave as follows:

            - `df[0, 'a']` extracts the first element of column `'a'` and returns a
              scalar.
            - `df[0]` extracts the first row and returns a DataFrame.
            - `df['a']` extracts column `'a'` and returns a Series.
            - `df[0:2]` extracts the first two rows and returns a DataFrame.
            - `df[0:2, 'a']` extracts the first two rows from column `'a'` and returns
              a Series.
            - `df[0:2, 0]` extracts the first two rows from the first column and returns
              a Series.
            - `df[[0, 1], [0, 1, 2]]` extracts the first two rows and the first three
              columns and returns a DataFrame.
            - `df[0: 2, ['a', 'c']]` extracts the first two rows from columns `'a'` and
              `'c'` and returns a DataFrame.
            - `df[:, 0: 2]` extracts all rows from the first two columns and returns a
              DataFrame.
            - `df[:, 'a': 'c']` extracts all rows and all columns positioned between
              `'a'` and `'c'` *inclusive* and returns a DataFrame. In our example,
              that would extract columns `'a'`, `'d'`, and `'c'`.

        Returns
        -------
        DataFrame, Series, or scalar, depending on `key`.

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     {"a": [1, 2, 3], "d": [4, 5, 6], "c": [1, 3, 2], "b": [7, 8, 9]}
        ... )
        >>> df[0]
        shape: (1, 4)
        ┌─────┬─────┬─────┬─────┐
        │ a   ┆ d   ┆ c   ┆ b   │
        │ --- ┆ --- ┆ --- ┆ --- │
        │ i64 ┆ i64 ┆ i64 ┆ i64 │
        ╞═════╪═════╪═════╪═════╡
        │ 1   ┆ 4   ┆ 1   ┆ 7   │
        └─────┴─────┴─────┴─────┘
        >>> df[0, "a"]
        1
        >>> df["a"]
        shape: (3,)
        Series: 'a' [i64]
        [
            1
            2
            3
        ]
        >>> df[0:2]
        shape: (2, 4)
        ┌─────┬─────┬─────┬─────┐
        │ a   ┆ d   ┆ c   ┆ b   │
        │ --- ┆ --- ┆ --- ┆ --- │
        │ i64 ┆ i64 ┆ i64 ┆ i64 │
        ╞═════╪═════╪═════╪═════╡
        │ 1   ┆ 4   ┆ 1   ┆ 7   │
        │ 2   ┆ 5   ┆ 3   ┆ 8   │
        └─────┴─────┴─────┴─────┘
        >>> df[0:2, "a"]
        shape: (2,)
        Series: 'a' [i64]
        [
            1
            2
        ]
        >>> df[0:2, 0]
        shape: (2,)
        Series: 'a' [i64]
        [
            1
            2
        ]
        >>> df[[0, 1], [0, 1, 2]]
        shape: (2, 3)
        ┌─────┬─────┬─────┐
        │ a   ┆ d   ┆ c   │
        │ --- ┆ --- ┆ --- │
        │ i64 ┆ i64 ┆ i64 │
        ╞═════╪═════╪═════╡
        │ 1   ┆ 4   ┆ 1   │
        │ 2   ┆ 5   ┆ 3   │
        └─────┴─────┴─────┘
        >>> df[0:2, ["a", "c"]]
        shape: (2, 2)
        ┌─────┬─────┐
        │ a   ┆ c   │
        │ --- ┆ --- │
        │ i64 ┆ i64 │
        ╞═════╪═════╡
        │ 1   ┆ 1   │
        │ 2   ┆ 3   │
        └─────┴─────┘
        >>> df[:, 0:2]
        shape: (3, 2)
        ┌─────┬─────┐
        │ a   ┆ d   │
        │ --- ┆ --- │
        │ i64 ┆ i64 │
        ╞═════╪═════╡
        │ 1   ┆ 4   │
        │ 2   ┆ 5   │
        │ 3   ┆ 6   │
        └─────┴─────┘
        >>> df[:, "a":"c"]
        shape: (3, 3)
        ┌─────┬─────┬─────┐
        │ a   ┆ d   ┆ c   │
        │ --- ┆ --- ┆ --- │
        │ i64 ┆ i64 ┆ i64 │
        ╞═════╪═════╪═════╡
        │ 1   ┆ 4   ┆ 1   │
        │ 2   ┆ 5   ┆ 3   │
        │ 3   ┆ 6   ┆ 2   │
        └─────┴─────┴─────┘
        """
        # All key handling lives in the shared `get_df_item_by_key` helper.
        selection = get_df_item_by_key(self, key)
        return selection
1396

1397
    def __setitem__(
        self,
        key: str | Sequence[int] | Sequence[str] | tuple[Any, str | int],
        value: Any,
    ) -> None:  # pragma: no cover
        """
        Assign into the DataFrame in place with `df[key] = value` syntax.

        Parameters
        ----------
        key : str | Sequence[int] | Sequence[str] | tuple[Any, str | int]
            Where in the DataFrame the new values go.
            What happens depends on the type of `key`:

            - Str: `df["a"] = value`:
                Not supported. Raises a `TypeError`. Use `df.with_columns(...)`
                to add or modify columns.

            - Sequence[str]: `df[["a", "b"]] = value`:
                Assigns multiple columns at once. `value` must be a 2D array-like
                structure with the same number of columns as the list
                of column names provided.

            - tuple[Any, str | int]: `df[row_idx, "a"] = value`:
                Assigns a new value to a specific element in the DataFrame, where
                `row_idx` specifies the row and `"a"` specifies the column.

            - `df[row_idx, col_idx] = value`:
                Similar to the above, but `col_idx` is the integer index of the column.

        value : Any
            The new value(s). The structure expected of `value` follows from the
            form of `key`:

            - For multiple column assignment (`df[["a", "b"]] = value`), `value` should
              be a 2D array-like object with shape (n_rows, n_columns).

            - For single element assignment (`df[row_idx, "a"] = value`), `value` should
              be a scalar.

        Raises
        ------
        TypeError
            If an unsupported assignment is attempted, such as assigning a Series
            directly to a column using `df["a"] = series`.

        ValueError
            If the shape of `value` does not match the expected shape based on `key`.

        Examples
        --------
        Sequence[str] :  `df[["a", "b"]] = value`:

        >>> import numpy as np
        >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
        >>> df[["a", "b"]] = np.array([[10, 40], [20, 50], [30, 60]])
        >>> df
        shape: (3, 2)
        ┌─────┬─────┐
        │ a   ┆ b   │
        │ --- ┆ --- │
        │ i64 ┆ i64 │
        ╞═════╪═════╡
        │ 10  ┆ 40  │
        │ 20  ┆ 50  │
        │ 30  ┆ 60  │
        └─────┴─────┘

        tuple[Any, str | int] : `df[row_idx, "a"] = value`:

        >>> df[1, "a"] = 100
        >>> df
        shape: (3, 2)
        ┌─────┬─────┐
        │ a   ┆ b   │
        │ --- ┆ --- │
        │ i64 ┆ i64 │
        ╞═════╪═════╡
        │ 10  ┆ 40  │
        │ 100 ┆ 50  │
        │ 30  ┆ 60  │
        └─────┴─────┘

        `df[row_idx, col_idx] = value`:

        >>> df[0, 1] = 30
        >>> df
        shape: (3, 2)
        ┌─────┬─────┐
        │ a   ┆ b   │
        │ --- ┆ --- │
        │ i64 ┆ i64 │
        ╞═════╪═════╡
        │ 10  ┆ 30  │
        │ 100 ┆ 50  │
        │ 30  ┆ 60  │
        └─────┴─────┘
        """
        # df["foo"] = series
        if isinstance(key, str):
            msg = (
                "DataFrame object does not support `Series` assignment by index"
                "\n\n"
                "Use `DataFrame.with_columns`."
            )
            raise TypeError(msg)

        # df[["C", "D"]]
        if isinstance(key, list):
            # TODO: Use python sequence constructors
            matrix = np.array(value)
            if matrix.ndim != 2:
                msg = "can only set multiple columns with 2D matrix"
                raise ValueError(msg)
            if matrix.shape[1] != len(key):
                msg = "matrix columns should be equal to list used to determine column names"
                raise ValueError(msg)

            # TODO: we can parallelize this by calling from_numpy
            new_columns = [
                pl.Series(name, matrix[:, idx]) for idx, name in enumerate(key)
            ]
            self._df = self.with_columns(new_columns)._df
            return

        if not isinstance(key, tuple):
            msg = (
                f"cannot use `__setitem__` on DataFrame"
                f" with key {key!r} of type {type(key).__name__!r}"
                f" and value {value!r} of type {type(value).__name__!r}"
            )
            raise TypeError(msg)

        # df[a, b]
        row_selection, col_selection = key

        is_boolean_series = (
            isinstance(row_selection, pl.Series) and row_selection.dtype == Boolean
        )
        if is_boolean_series or is_bool_sequence(row_selection):
            msg = (
                "not allowed to set DataFrame by boolean mask in the row position"
                "\n\n"
                "Consider using `DataFrame.with_columns`."
            )
            raise TypeError(msg)

        # Fetch the target column as a Series, by name or by position.
        selected_by_name = isinstance(col_selection, str)
        if selected_by_name:
            column = self.__getitem__(col_selection)
        elif isinstance(col_selection, int):
            column = self[:, col_selection]
        else:
            msg = f"unexpected column selection {col_selection!r}"
            raise TypeError(msg)

        # The Series' own `__setitem__` performs the element modification.
        column[row_selection] = value

        # Put the modified Series back where it came from.
        if selected_by_name:
            self._replace(col_selection, column)
        else:
            self.replace_column(col_selection, column)
1559

1560
    def __len__(self) -> int:
        """Return the `height` of the DataFrame."""
        n_rows = self.height
        return n_rows
1562

1563
    def __copy__(self) -> DataFrame:
        """Support `copy.copy` by returning `self.clone()`."""
        duplicate = self.clone()
        return duplicate
1565

1566
    def __deepcopy__(self, memo: None = None) -> DataFrame:
        """Support `copy.deepcopy` by returning `self.clone()`; `memo` is unused."""
        duplicate = self.clone()
        return duplicate
1568

1569
    def _ipython_key_completions_(self) -> list[str]:
        """Return the column names as key-completion candidates."""
        candidates = self.columns
        return candidates
1571

1572
    def __arrow_c_stream__(self, requested_schema: object | None = None) -> object:
        """
        Export the DataFrame through the Arrow PyCapsule Interface.

        The call is forwarded, together with `requested_schema`, to the
        underlying frame.

        https://arrow.apache.org/docs/dev/format/CDataInterface/PyCapsuleInterface.html
        """
        capsule = self._df.__arrow_c_stream__(requested_schema)
        return capsule
1579

1580
    def _repr_html_(self, *, _from_series: bool = False) -> str:
        """
        Render the frame as HTML for display in Jupyter Notebooks.

        The number of rows and columns shown is controlled by these ENVIRONMENT
        variables:

        * POLARS_FMT_MAX_COLS: set the number of columns
        * POLARS_FMT_MAX_ROWS: set the number of rows

        A negative value selects the full width or height of the frame.
        """
        env = os.environ

        max_cols = int(env.get("POLARS_FMT_MAX_COLS", default=75))
        if max_cols < 0:
            max_cols = self.width

        max_rows = int(env.get("POLARS_FMT_MAX_ROWS", default=10))
        if max_rows < 0:
            max_rows = self.height

        formatter = NotebookFormatter(
            self,
            max_cols=max_cols,
            max_rows=max_rows,
            from_series=_from_series,
        )
        # `render()` yields string fragments that are concatenated here.
        return "".join(formatter.render())
1606

1607
    def collect_schema(self) -> Schema:
        """
        Return the column names paired with their data types, in column order.

        This simply returns the :attr:`schema` property.

        See Also
        --------
        schema

        Notes
        -----
        The method exists so that code can be written generically for both
        DataFrame and LazyFrame.

        Examples
        --------
        Determine the schema.

        >>> df = pl.DataFrame(
        ...     {
        ...         "foo": [1, 2, 3],
        ...         "bar": [6.0, 7.0, 8.0],
        ...         "ham": ["a", "b", "c"],
        ...     }
        ... )
        >>> df.collect_schema()
        Schema({'foo': Int64, 'bar': Float64, 'ham': String})

        Access various properties of the schema using the :class:`Schema` object.

        >>> schema = df.collect_schema()
        >>> schema["bar"]
        Float64
        >>> schema.names()
        ['foo', 'bar', 'ham']
        >>> schema.dtypes()
        [Int64, Float64, String]
        >>> schema.len()
        3
        """
        frame_schema = self.schema
        return frame_schema
1649

1650
    def item(self, row: int | None = None, column: int | str | None = None) -> Any:
        """
        Return a single element: the only one in the frame, or the one at row/column.

        Parameters
        ----------
        row
            Optional row index.
        column
            Optional column index or name.

        Raises
        ------
        ValueError
            If neither `row` nor `column` is given and the frame is not of shape
            (1, 1), or if only one of the two is given.

        See Also
        --------
        row : Get the values of a single row, either by index or by predicate.

        Notes
        -----
        If row/col not provided, this is equivalent to `df[0,0]`, with a check that
        the shape is (1,1). With row/col, this is equivalent to `df[row,col]`.

        Examples
        --------
        >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
        >>> df.select((pl.col("a") * pl.col("b")).sum()).item()
        32
        >>> df.item(1, 1)
        5
        >>> df.item(2, "b")
        6
        """
        row_given = row is not None
        column_given = column is not None

        if not row_given and not column_given:
            if self.shape != (1, 1):
                msg = (
                    "can only call `.item()` if the dataframe is of shape (1, 1),"
                    " or if explicit row/col values are provided;"
                    f" frame has shape {self.shape!r}"
                )
                raise ValueError(msg)
            return self._df.to_series(0).get_index(0)

        if not (row_given and column_given):
            msg = "cannot call `.item()` with only one of `row` or `column`"
            raise ValueError(msg)

        # Integer columns are looked up by position, anything else by name.
        if isinstance(column, int):
            series = self._df.to_series(column)
        else:
            series = self._df.get_column(column)
        return series.get_index_signed(row)
1700

1701
    @deprecate_renamed_parameter("future", "compat_level", version="1.1")
    def to_arrow(self, *, compat_level: CompatLevel | None = None) -> pa.Table:
        """
        Collect the underlying arrow arrays in an Arrow Table.

        This operation is mostly zero copy.

        Data types that do copy:
            - CategoricalType

        .. versionchanged:: 1.1
            The `future` parameter was renamed `compat_level`.

        Parameters
        ----------
        compat_level
            Use a specific compatibility level
            when exporting Polars' internal data structures.

        Raises
        ------
        TypeError
            If the frame has at least one column and `compat_level` is neither
            `None` nor a `CompatLevel`.

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     {"foo": [1, 2, 3, 4, 5, 6], "bar": ["a", "b", "c", "d", "e", "f"]}
        ... )
        >>> df.to_arrow()
        pyarrow.Table
        foo: int64
        bar: large_string
        ----
        foo: [[1,2,3,4,5,6]]
        bar: [["a","b","c","d","e","f"]]
        """
        if not self.width:  # 0x0 dataframe, cannot infer schema from batches
            return pa.table({})

        compat_level_py: int | bool
        if compat_level is None:
            compat_level_py = False
        elif isinstance(compat_level, CompatLevel):
            compat_level_py = compat_level._version
        else:
            # Without this branch `compat_level_py` stays unassigned and the
            # call below fails with an unhelpful `UnboundLocalError`.
            msg = (
                "`compat_level` must be a `CompatLevel` or None,"
                f" got {type(compat_level).__name__!r}"
            )
            raise TypeError(msg)

        record_batches = self._df.to_arrow(compat_level_py)
        return pa.Table.from_batches(record_batches)
1744

1745
    @overload
    def to_dict(self, *, as_series: Literal[True] = ...) -> dict[str, Series]: ...

    @overload
    def to_dict(self, *, as_series: Literal[False]) -> dict[str, list[Any]]: ...

    @overload
    def to_dict(
        self, *, as_series: bool
    ) -> dict[str, Series] | dict[str, list[Any]]: ...

    def to_dict(
        self, *, as_series: bool = True
    ) -> dict[str, Series] | dict[str, list[Any]]:
        """
        Convert DataFrame to a dictionary mapping column name to values.

        Parameters
        ----------
        as_series
            True -> Values are Series
            False -> Values are List[Any]

        Returns
        -------
        dict
            One entry per column, keyed by column name, in column order.

        See Also
        --------
        rows_by_key
        to_dicts

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     {
        ...         "A": [1, 2, 3, 4, 5],
        ...         "fruits": ["banana", "banana", "apple", "apple", "banana"],
        ...         "B": [5, 4, 3, 2, 1],
        ...         "cars": ["beetle", "audi", "beetle", "beetle", "beetle"],
        ...         "optional": [28, 300, None, 2, -30],
        ...     }
        ... )
        >>> df
        shape: (5, 5)
        ┌─────┬────────┬─────┬────────┬──────────┐
        │ A   ┆ fruits ┆ B   ┆ cars   ┆ optional │
        │ --- ┆ ---    ┆ --- ┆ ---    ┆ ---      │
        │ i64 ┆ str    ┆ i64 ┆ str    ┆ i64      │
        ╞═════╪════════╪═════╪════════╪══════════╡
        │ 1   ┆ banana ┆ 5   ┆ beetle ┆ 28       │
        │ 2   ┆ banana ┆ 4   ┆ audi   ┆ 300      │
        │ 3   ┆ apple  ┆ 3   ┆ beetle ┆ null     │
        │ 4   ┆ apple  ┆ 2   ┆ beetle ┆ 2        │
        │ 5   ┆ banana ┆ 1   ┆ beetle ┆ -30      │
        └─────┴────────┴─────┴────────┴──────────┘
        >>> df.to_dict(as_series=False)
        {'A': [1, 2, 3, 4, 5],
        'fruits': ['banana', 'banana', 'apple', 'apple', 'banana'],
        'B': [5, 4, 3, 2, 1],
        'cars': ['beetle', 'audi', 'beetle', 'beetle', 'beetle'],
        'optional': [28, 300, None, 2, -30]}
        >>> df.to_dict(as_series=True)
        {'A': shape: (5,)
        Series: 'A' [i64]
        [
            1
            2
            3
            4
            5
        ], 'fruits': shape: (5,)
        Series: 'fruits' [str]
        [
            "banana"
            "banana"
            "apple"
            "apple"
            "banana"
        ], 'B': shape: (5,)
        Series: 'B' [i64]
        [
            5
            4
            3
            2
            1
        ], 'cars': shape: (5,)
        Series: 'cars' [str]
        [
            "beetle"
            "audi"
            "beetle"
            "beetle"
            "beetle"
        ], 'optional': shape: (5,)
        Series: 'optional' [i64]
        [
            28
            300
            null
            2
            -30
        ]}
        """
        # Iterating the frame yields its columns one by one.
        if as_series:
            return {s.name: s for s in self}
        return {s.name: s.to_list() for s in self}
1850

1851
    def to_dicts(self) -> list[dict[str, Any]]:
        """
        Convert every row to a dictionary of Python-native values.

        Returns
        -------
        list of dict
            One dictionary per row, as produced by `rows(named=True)`.

        Notes
        -----
        If you have `ns`-precision temporal values you should be aware that Python
        natively only supports up to `μs`-precision; `ns`-precision values will be
        truncated to microseconds on conversion to Python. If this matters to your
        use-case you should export to a different format (such as Arrow or NumPy).

        Examples
        --------
        >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]})
        >>> df.to_dicts()
        [{'foo': 1, 'bar': 4}, {'foo': 2, 'bar': 5}, {'foo': 3, 'bar': 6}]
        """
        return self.rows(named=True)
1869

1870
    def to_numpy(
        self,
        *,
        order: IndexOrder = "fortran",
        writable: bool = False,
        allow_copy: bool = True,
        structured: bool = False,
        use_pyarrow: bool | None = None,
    ) -> np.ndarray[Any, Any]:
        """
        Convert this DataFrame to a NumPy ndarray.

        This operation copies data only when necessary. The conversion is zero copy when
        all of the following hold:

        - The DataFrame is fully contiguous in memory, with all Series back-to-back and
          all Series consisting of a single chunk.
        - The data type is an integer or float.
        - The DataFrame contains no null values.
        - The `order` parameter is set to `fortran` (default).
        - The `writable` parameter is set to `False` (default).

        Parameters
        ----------
        order
            The index order of the returned NumPy array, either C-like or
            Fortran-like. In general, using the Fortran-like index order is faster.
            However, the C-like order might be more appropriate to use for downstream
            applications to prevent cloning data, e.g. when reshaping into a
            one-dimensional array.
        writable
            Ensure the resulting array is writable. This will force a copy of the data
            if the array was created without copy, as the underlying Arrow data is
            immutable.
        allow_copy
            Allow memory to be copied to perform the conversion. If set to `False`,
            causes conversions that are not zero-copy to fail.
        structured
            Return a `structured array`_ with a data type that corresponds to the
            DataFrame schema. If set to `False` (default), a 2D ndarray is
            returned instead.

            .. _structured array: https://numpy.org/doc/stable/user/basics.rec.html

        use_pyarrow
            Use `pyarrow.Array.to_numpy
            <https://arrow.apache.org/docs/python/generated/pyarrow.Array.html#pyarrow.Array.to_numpy>`_
            function for the conversion to NumPy if necessary.

            .. deprecated:: 0.20.28
                Polars now uses its native engine by default for conversion to NumPy.

        Examples
        --------
        Numeric data without nulls can be converted without copying data in some cases.
        The resulting array will not be writable.

        >>> df = pl.DataFrame({"a": [1, 2, 3]})
        >>> arr = df.to_numpy()
        >>> arr
        array([[1],
               [2],
               [3]])
        >>> arr.flags.writeable
        False

        Set `writable=True` to force data copy to make the array writable.

        >>> df.to_numpy(writable=True).flags.writeable
        True

        If the DataFrame contains different numeric data types, the resulting data type
        will be the supertype. This requires data to be copied. Integer types with
        nulls are cast to a float type with `nan` representing a null value.

        >>> df = pl.DataFrame({"a": [1, 2, None], "b": [4.0, 5.0, 6.0]})
        >>> df.to_numpy()
        array([[ 1.,  4.],
               [ 2.,  5.],
               [nan,  6.]])

        Set `allow_copy=False` to raise an error if data would be copied.

        >>> df.to_numpy(allow_copy=False)  # doctest: +SKIP
        Traceback (most recent call last):
        ...
        RuntimeError: copy not allowed: cannot convert to a NumPy array without copying data

        Polars defaults to F-contiguous order. Use `order="c"` to force the resulting
        array to be C-contiguous.

        >>> df.to_numpy(order="c").flags.c_contiguous
        True

        DataFrames with mixed types will result in an array with an object dtype.

        >>> df = pl.DataFrame(
        ...     {
        ...         "foo": [1, 2, 3],
        ...         "bar": [6.5, 7.0, 8.5],
        ...         "ham": ["a", "b", "c"],
        ...     },
        ...     schema_overrides={"foo": pl.UInt8, "bar": pl.Float32},
        ... )
        >>> df.to_numpy()
        array([[1, 6.5, 'a'],
               [2, 7.0, 'b'],
               [3, 8.5, 'c']], dtype=object)

        Set `structured=True` to convert to a structured array, which can better
        preserve individual column data such as name and data type.

        >>> df.to_numpy(structured=True)
        array([(1, 6.5, 'a'), (2, 7. , 'b'), (3, 8.5, 'c')],
              dtype=[('foo', 'u1'), ('bar', '<f4'), ('ham', '<U1')])
        """  # noqa: W505
        if use_pyarrow is not None:
            issue_deprecation_warning(
                "the `use_pyarrow` parameter for `DataFrame.to_numpy` is deprecated."
                " Polars now uses its native engine by default for conversion to NumPy.",
                version="0.20.28",
            )

        if structured:
            # A structured array is always assembled into a fresh buffer, so it
            # can only honour `allow_copy=False` when there is no data to copy.
            if not allow_copy and not self.is_empty():
                msg = "copy not allowed: cannot create structured array without copying data"
                raise RuntimeError(msg)

            arrays = []
            struct_dtype = []
            for s in self.iter_columns():
                if s.dtype == Struct:
                    # Nested structs become nested structured arrays.
                    arr = s.struct.unnest().to_numpy(
                        structured=True,
                        allow_copy=True,
                        use_pyarrow=use_pyarrow,
                    )
                else:
                    arr = s.to_numpy(use_pyarrow=use_pyarrow)

                # Null-free string columns get a NumPy unicode dtype rather
                # than `object`.
                if s.dtype == String and not s.has_nulls():
                    arr = arr.astype(str, copy=False)
                arrays.append(arr)
                # `arr.shape[1:]` carries any per-row sub-array shape of the field.
                struct_dtype.append((s.name, arr.dtype, arr.shape[1:]))

            out = np.empty(self.height, dtype=struct_dtype)
            for name, column_array in zip(self.columns, arrays):
                out[name] = column_array
            return out

        return self._df.to_numpy(order, writable=writable, allow_copy=allow_copy)
2022

2023
    @overload
    def to_jax(
        self,
        return_type: Literal["array"] = ...,
        *,
        device: jax.Device | str | None = ...,
        label: str | Expr | Sequence[str | Expr] | None = ...,
        features: str | Expr | Sequence[str | Expr] | None = ...,
        dtype: PolarsDataType | None = ...,
        order: IndexOrder = ...,
    ) -> jax.Array: ...

    @overload
    def to_jax(
        self,
        return_type: Literal["dict"],
        *,
        device: jax.Device | str | None = ...,
        label: str | Expr | Sequence[str | Expr] | None = ...,
        features: str | Expr | Sequence[str | Expr] | None = ...,
        dtype: PolarsDataType | None = ...,
        order: IndexOrder = ...,
    ) -> dict[str, jax.Array]: ...

    @unstable()
    def to_jax(
        self,
        return_type: JaxExportType = "array",
        *,
        device: jax.Device | str | None = None,
        label: str | Expr | Sequence[str | Expr] | None = None,
        features: str | Expr | Sequence[str | Expr] | None = None,
        dtype: PolarsDataType | None = None,
        order: IndexOrder = "fortran",
    ) -> jax.Array | dict[str, jax.Array]:
        """
        Convert DataFrame to a Jax Array, or dict of Jax Arrays.

        .. versionadded:: 0.20.27

        .. warning::
            This functionality is currently considered **unstable**. It may be
            changed at any point without it being considered a breaking change.

        Parameters
        ----------
        return_type : {"array", "dict"}
            Set return type; a Jax Array, or dict of Jax Arrays.
        device
            Specify the jax `Device` on which the array will be created; can provide
            a string (such as "cpu", "gpu", or "tpu") in which case the device is
            retrieved as `jax.devices(string)[0]`. For more specific control you
            can supply the instantiated `Device` directly. If None, arrays are
            created on the default device.
        label
            One or more column names, expressions, or selectors that label the feature
            data; results in a `{"label": ..., "features": ...}` dict being returned
            when `return_type` is "dict" instead of a `{"col": array, }` dict.
        features
            One or more column names, expressions, or selectors that contain the feature
            data; if omitted, all columns that are not designated as part of the label
            are used. Only applies when `return_type` is "dict".
        dtype
            Unify the dtype of all returned arrays; this casts any column that is
            not already of the required dtype before converting to Array. Note that
            export will be single-precision (32bit) unless the Jax config/environment
            directs otherwise (eg: "jax_enable_x64" was set True in the config object
            at startup, or "JAX_ENABLE_X64" is set to "1" in the environment).
        order : {"c", "fortran"}
            The index order of the returned Jax array, either C-like (row-major) or
            Fortran-like (column-major).

        Raises
        ------
        ValueError
            If `label`/`features` are given with a `return_type` other than "dict",
            if `features` is given without `label`, or if `return_type` is not a
            recognised value.

        See Also
        --------
        to_dummies
        to_numpy
        to_torch

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     {
        ...         "lbl": [0, 1, 2, 3],
        ...         "feat1": [1, 0, 0, 1],
        ...         "feat2": [1.5, -0.5, 0.0, -2.25],
        ...     }
        ... )

        Standard return type (2D Array), on the standard device:

        >>> df.to_jax()
        Array([[ 0.  ,  1.  ,  1.5 ],
               [ 1.  ,  0.  , -0.5 ],
               [ 2.  ,  0.  ,  0.  ],
               [ 3.  ,  1.  , -2.25]], dtype=float32)

        Create the Array on the default GPU device:

        >>> a = df.to_jax(device="gpu")  # doctest: +SKIP
        >>> a.device()  # doctest: +SKIP
        GpuDevice(id=0, process_index=0)

        Create the Array on a specific GPU device:

        >>> gpu_device = jax.devices("gpu")[1]  # doctest: +SKIP
        >>> a = df.to_jax(device=gpu_device)  # doctest: +SKIP
        >>> a.device()  # doctest: +SKIP
        GpuDevice(id=1, process_index=0)

        As a dictionary of individual Arrays:

        >>> df.to_jax("dict")
        {'lbl': Array([0, 1, 2, 3], dtype=int32),
         'feat1': Array([1, 0, 0, 1], dtype=int32),
         'feat2': Array([ 1.5 , -0.5 ,  0.  , -2.25], dtype=float32)}

        As a "label" and "features" dictionary; note that as "features" is not
        declared, it defaults to all the columns that are not in "label":

        >>> df.to_jax("dict", label="lbl")
        {'label': Array([[0],
                [1],
                [2],
                [3]], dtype=int32),
         'features': Array([[ 1.  ,  1.5 ],
                [ 0.  , -0.5 ],
                [ 0.  ,  0.  ],
                [ 1.  , -2.25]], dtype=float32)}

        As a "label" and "features" dictionary where each is designated using
        a col or selector expression (which can also be used to cast the data
        if the label and features are better-represented with different dtypes):

        >>> import polars.selectors as cs
        >>> df.to_jax(
        ...     return_type="dict",
        ...     features=cs.float(),
        ...     label=pl.col("lbl").cast(pl.UInt8),
        ... )
        {'label': Array([[0],
                [1],
                [2],
                [3]], dtype=uint8),
         'features': Array([[ 1.5 ],
                [-0.5 ],
                [ 0.  ],
                [-2.25]], dtype=float32)}
        """
        if return_type != "dict" and (label is not None or features is not None):
            msg = "`label` and `features` only apply when `return_type` is 'dict'"
            raise ValueError(msg)
        elif return_type == "dict" and label is None and features is not None:
            msg = "`label` is required if setting `features` when `return_type='dict'`"
            raise ValueError(msg)

        jx = import_optional(
            "jax",
            install_message="Please see `https://jax.readthedocs.io/en/latest/installation.html` "
            "for specific installation recommendations for the Jax package",
        )

        # Double precision is enabled by the Jax config flag or, failing that, by
        # the environment variable. The variable may hold an integer ("0"/"1") or
        # a boolean word ("true", "off", ...), so a non-integer value must not
        # crash the export.
        enabled_double_precision = bool(jx.config.jax_enable_x64)
        if not enabled_double_precision:
            env_x64 = os.environ.get("JAX_ENABLE_X64", "0").strip().lower()
            try:
                enabled_double_precision = bool(int(env_x64))
            except ValueError:
                enabled_double_precision = env_x64 in {"y", "yes", "t", "true", "on"}

        if dtype is not None:
            frame = self.cast(dtype)
        elif not enabled_double_precision:
            # enforce single-precision unless environment/config directs otherwise
            frame = self.cast({Float64: Float32, Int64: Int32, UInt64: UInt32})
        else:
            frame = self

        if isinstance(device, str):
            device = jx.devices(device)[0]

        # The device context also covers the nested `to_jax` calls below, which
        # is why they do not receive `device` explicitly.
        with contextlib.nullcontext() if device is None else jx.default_device(device):
            if return_type == "array":
                # note: jax arrays are immutable, so can avoid a copy (vs torch)
                from polars.ml.utilities import frame_to_numpy

                arr = frame_to_numpy(
                    df=frame,
                    order=order,
                    writable=False,
                    target="Jax Array",
                )
                return jx.numpy.asarray(a=arr, order="K")

            elif return_type == "dict":
                if label is not None:
                    # return a {"label": array(s), "features": array(s)} dict
                    label_frame = frame.select(label)
                    features_frame = (
                        frame.select(features)
                        if features is not None
                        else frame.drop(*label_frame.columns)
                    )
                    # forward `order` so the requested layout is honoured here too
                    return {
                        "label": label_frame.to_jax(order=order),
                        "features": features_frame.to_jax(order=order),
                    }
                else:
                    # return a {"col": array} dict
                    return {srs.name: srs.to_jax() for srs in frame}
            else:
                valid_jax_types = ", ".join(get_args(JaxExportType))
                msg = f"invalid `return_type`: {return_type!r}\nExpected one of: {valid_jax_types}"
                raise ValueError(msg)
2230

2231
    @overload
    def to_torch(
        self,
        return_type: Literal["tensor"] = ...,
        *,
        label: str | Expr | Sequence[str | Expr] | None = ...,
        features: str | Expr | Sequence[str | Expr] | None = ...,
        dtype: PolarsDataType | None = ...,
    ) -> torch.Tensor: ...

    @overload
    def to_torch(
        self,
        return_type: Literal["dataset"],
        *,
        label: str | Expr | Sequence[str | Expr] | None = ...,
        features: str | Expr | Sequence[str | Expr] | None = ...,
        dtype: PolarsDataType | None = ...,
    ) -> PolarsDataset: ...

    @overload
    def to_torch(
        self,
        return_type: Literal["dict"],
        *,
        label: str | Expr | Sequence[str | Expr] | None = ...,
        features: str | Expr | Sequence[str | Expr] | None = ...,
        dtype: PolarsDataType | None = ...,
    ) -> dict[str, torch.Tensor]: ...

    @unstable()
    def to_torch(
        self,
        return_type: TorchExportType = "tensor",
        *,
        label: str | Expr | Sequence[str | Expr] | None = None,
        features: str | Expr | Sequence[str | Expr] | None = None,
        dtype: PolarsDataType | None = None,
    ) -> torch.Tensor | dict[str, torch.Tensor] | PolarsDataset:
        """
        Convert DataFrame to a PyTorch Tensor, Dataset, or dict of Tensors.

        .. versionadded:: 0.20.23

        .. warning::
            This functionality is currently considered **unstable**. It may be
            changed at any point without it being considered a breaking change.

        Parameters
        ----------
        return_type : {"tensor", "dataset", "dict"}
            Set return type; a PyTorch Tensor, PolarsDataset (a frame-specialized
            TensorDataset), or dict of Tensors.
        label
            One or more column names, expressions, or selectors that label the feature
            data; when `return_type` is "dataset", the PolarsDataset will return
            `(features, label)` tensor tuples for each row. Otherwise, it returns
            `(features,)` tensor tuples where the feature contains all the row data.
        features
            One or more column names, expressions, or selectors that contain the feature
            data; if omitted, all columns that are not designated as part of the label
            are used.
        dtype
            Unify the dtype of all returned tensors; this casts any column that is
            not of the required dtype before converting to Tensor. This includes
            the label column *unless* the label is an expression (such as
            `pl.col("label_column").cast(pl.Int16)`).

        Raises
        ------
        ValueError
            If `label`/`features` are given with a `return_type` other than
            "dataset" or "dict", if `features` is given without `label` for a
            "dict" return, if `dtype` is an unsigned 16/32/64-bit integer type,
            or if `return_type` is not a recognised value.

        See Also
        --------
        to_dummies
        to_jax
        to_numpy

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     {
        ...         "lbl": [0, 1, 2, 3],
        ...         "feat1": [1, 0, 0, 1],
        ...         "feat2": [1.5, -0.5, 0.0, -2.25],
        ...     }
        ... )

        Standard return type (Tensor), with f32 supertype:

        >>> df.to_torch(dtype=pl.Float32)
        tensor([[ 0.0000,  1.0000,  1.5000],
                [ 1.0000,  0.0000, -0.5000],
                [ 2.0000,  0.0000,  0.0000],
                [ 3.0000,  1.0000, -2.2500]])

        As a dictionary of individual Tensors:

        >>> df.to_torch("dict")
        {'lbl': tensor([0, 1, 2, 3]),
         'feat1': tensor([1, 0, 0, 1]),
         'feat2': tensor([ 1.5000, -0.5000,  0.0000, -2.2500], dtype=torch.float64)}

        As a "label" and "features" dictionary; note that as "features" is not
        declared, it defaults to all the columns that are not in "label":

        >>> df.to_torch("dict", label="lbl", dtype=pl.Float32)
        {'label': tensor([[0.],
                 [1.],
                 [2.],
                 [3.]]),
         'features': tensor([[ 1.0000,  1.5000],
                 [ 0.0000, -0.5000],
                 [ 0.0000,  0.0000],
                 [ 1.0000, -2.2500]])}

        As a PolarsDataset, with f64 supertype:

        >>> ds = df.to_torch("dataset", dtype=pl.Float64)
        >>> ds[3]
        (tensor([ 3.0000,  1.0000, -2.2500], dtype=torch.float64),)
        >>> ds[:2]
        (tensor([[ 0.0000,  1.0000,  1.5000],
                 [ 1.0000,  0.0000, -0.5000]], dtype=torch.float64),)
        >>> ds[[0, 3]]
        (tensor([[ 0.0000,  1.0000,  1.5000],
                 [ 3.0000,  1.0000, -2.2500]], dtype=torch.float64),)

        As a convenience the PolarsDataset can opt in to half-precision data
        for experimentation (usually this would be set on the model/pipeline):

        >>> list(ds.half())
        [(tensor([0.0000, 1.0000, 1.5000], dtype=torch.float16),),
         (tensor([ 1.0000,  0.0000, -0.5000], dtype=torch.float16),),
         (tensor([2., 0., 0.], dtype=torch.float16),),
         (tensor([ 3.0000,  1.0000, -2.2500], dtype=torch.float16),)]

        Pass PolarsDataset to a DataLoader, designating the label:

        >>> from torch.utils.data import DataLoader
        >>> ds = df.to_torch("dataset", label="lbl")
        >>> dl = DataLoader(ds, batch_size=2)
        >>> batches = list(dl)
        >>> batches[0]
        [tensor([[ 1.0000,  1.5000],
                 [ 0.0000, -0.5000]], dtype=torch.float64), tensor([0, 1])]

        Note that labels can be given as expressions, allowing them to have
        a dtype independent of the feature columns (multi-column labels are
        supported).

        >>> ds = df.to_torch(
        ...     return_type="dataset",
        ...     dtype=pl.Float32,
        ...     label=pl.col("lbl").cast(pl.Int16),
        ... )
        >>> ds[:2]
        (tensor([[ 1.0000,  1.5000],
                 [ 0.0000, -0.5000]]), tensor([0, 1], dtype=torch.int16))

        Easily integrate with (for example) scikit-learn and other datasets:

        >>> from sklearn.datasets import fetch_california_housing  # doctest: +SKIP
        >>> housing = fetch_california_housing()  # doctest: +SKIP
        >>> df = pl.DataFrame(
        ...     data=housing.data,
        ...     schema=housing.feature_names,
        ... ).with_columns(
        ...     Target=housing.target,
        ... )  # doctest: +SKIP
        >>> train = df.to_torch("dataset", label="Target")  # doctest: +SKIP
        >>> loader = DataLoader(
        ...     train,
        ...     shuffle=True,
        ...     batch_size=64,
        ... )  # doctest: +SKIP
        """
        if return_type not in ("dataset", "dict") and (
            label is not None or features is not None
        ):
            msg = "`label` and `features` only apply when `return_type` is 'dataset' or 'dict'"
            raise ValueError(msg)
        elif return_type == "dict" and label is None and features is not None:
            msg = "`label` is required if setting `features` when `return_type='dict'`"
            raise ValueError(msg)

        torch = import_optional("torch")

        # Cast columns.
        if dtype in (UInt16, UInt32, UInt64):
            msg = f"PyTorch does not support u16, u32, or u64 dtypes; given {dtype}"
            raise ValueError(msg)

        # Without an explicit dtype, only the unsupported unsigned integer
        # columns are remapped, each to a signed type.
        to_dtype = (
            dtype
            if dtype is not None
            else {UInt16: Int32, UInt32: Int64, UInt64: Int64}
        )

        if label is not None:
            label_frame = self.select(label)
            # Avoid casting the label if it's an expression.
            if not isinstance(label, pl.Expr):
                label_frame = label_frame.cast(to_dtype)  # type: ignore[arg-type]
            features_frame = (
                self.select(features)
                if features is not None
                else self.drop(*label_frame.columns)
            ).cast(to_dtype)  # type: ignore[arg-type]
            frame = F.concat([label_frame, features_frame], how="horizontal")
        else:
            frame = (self.select(features) if features is not None else self).cast(
                to_dtype  # type: ignore[arg-type]
            )

        if return_type == "tensor":
            # note: torch tensors are not immutable, so we must consider them writable
            from polars.ml.utilities import frame_to_numpy

            arr = frame_to_numpy(frame, writable=True, target="Tensor")
            return torch.from_numpy(arr)

        elif return_type == "dict":
            if label is not None:
                # return a {"label": tensor(s), "features": tensor(s)} dict
                return {
                    "label": label_frame.to_torch(),
                    "features": features_frame.to_torch(),
                }
            else:
                # return a {"col": tensor} dict
                return {srs.name: srs.to_torch() for srs in frame}

        elif return_type == "dataset":
            # return a torch Dataset object
            from polars.ml.torch import PolarsDataset

            pds_label = None if label is None else label_frame.columns
            return PolarsDataset(frame, label=pds_label, features=features)
        else:
            valid_torch_types = ", ".join(get_args(TorchExportType))
            msg = f"invalid `return_type`: {return_type!r}\nExpected one of: {valid_torch_types}"
            raise ValueError(msg)
2466

2467
    def to_pandas(
        self,
        *,
        use_pyarrow_extension_array: bool = False,
        **kwargs: Any,
    ) -> pd.DataFrame:
        """
        Convert this DataFrame to a pandas DataFrame.

        This operation copies data if `use_pyarrow_extension_array` is not enabled.

        Parameters
        ----------
        use_pyarrow_extension_array
            Use PyArrow-backed extension arrays instead of NumPy arrays for the columns
            of the pandas DataFrame. This allows zero copy operations and preservation
            of null values. Subsequent operations on the resulting pandas DataFrame may
            trigger conversion to NumPy if those operations are not supported by PyArrow
            compute functions.
        **kwargs
            Additional keyword arguments to be passed to
            :meth:`pyarrow.Table.to_pandas`.

        Returns
        -------
        :class:`pandas.DataFrame`

        Raises
        ------
        ModuleUpgradeRequiredError
            If `use_pyarrow_extension_array=True` and the installed pandas is older
            than 1.5.0, or the installed pyarrow is older than 8.0.0.
        ModuleNotFoundError
            If `use_pyarrow_extension_array=True` and pyarrow is not available.

        Notes
        -----
        This operation requires that both :mod:`pandas` and :mod:`pyarrow` are
        installed.

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     {
        ...         "foo": [1, 2, 3],
        ...         "bar": [6.0, 7.0, 8.0],
        ...         "ham": ["a", "b", "c"],
        ...     }
        ... )
        >>> df.to_pandas()
           foo  bar ham
        0    1  6.0   a
        1    2  7.0   b
        2    3  8.0   c

        Null values in numeric columns are converted to `NaN`.

        >>> df = pl.DataFrame(
        ...     {
        ...         "foo": [1, 2, None],
        ...         "bar": [6.0, None, 8.0],
        ...         "ham": [None, "b", "c"],
        ...     }
        ... )
        >>> df.to_pandas()
           foo  bar   ham
        0  1.0  6.0  None
        1  2.0  NaN     b
        2  NaN  8.0     c

        Pass `use_pyarrow_extension_array=True` to get a pandas DataFrame with columns
        backed by PyArrow extension arrays. This will preserve null values.

        >>> df.to_pandas(use_pyarrow_extension_array=True)
            foo   bar   ham
        0     1   6.0  <NA>
        1     2  <NA>     b
        2  <NA>   8.0     c
        >>> _.dtypes
        foo           int64[pyarrow]
        bar          double[pyarrow]
        ham    large_string[pyarrow]
        dtype: object
        """
        if use_pyarrow_extension_array:
            # Extension arrays need minimum pandas/pyarrow versions; fail early
            # with an actionable message rather than deep inside the conversion.
            if parse_version(pd.__version__) < (1, 5):
                msg = f"pandas>=1.5.0 is required for `to_pandas(use_pyarrow_extension_array=True)`, found Pandas {pd.__version__!r}"
                raise ModuleUpgradeRequiredError(msg)
            # `pa.__version__` is only inspected when pyarrow is available
            # (short-circuit on `not _PYARROW_AVAILABLE`).
            if not _PYARROW_AVAILABLE or parse_version(pa.__version__) < (8, 0):
                msg = "pyarrow>=8.0.0 is required for `to_pandas(use_pyarrow_extension_array=True)`"
                if _PYARROW_AVAILABLE:
                    msg += f", found pyarrow {pa.__version__!r}."
                    raise ModuleUpgradeRequiredError(msg)
                else:
                    raise ModuleNotFoundError(msg)

        # handle Object columns separately (Arrow does not convert them correctly)
        if Object in self.dtypes:
            return self._to_pandas_with_object_columns(
                use_pyarrow_extension_array=use_pyarrow_extension_array, **kwargs
            )

        return self._to_pandas_without_object_columns(
            self, use_pyarrow_extension_array=use_pyarrow_extension_array, **kwargs
        )
2564

2565
    def _to_pandas_with_object_columns(
        self,
        *,
        use_pyarrow_extension_array: bool,
        **kwargs: Any,
    ) -> pd.DataFrame:
        """
        Convert a frame containing `Object` columns to a pandas DataFrame.

        Non-Object columns are converted together via
        `_to_pandas_without_object_columns`; each Object column is then converted
        through `Series.to_pandas` and inserted at its original position.
        """
        # Partition the column positions by whether the dtype is pl.Object.
        dtypes = self.dtypes
        object_positions = [pos for pos, dt in enumerate(dtypes) if dt.is_object()]
        regular_positions = [
            pos for pos, dt in enumerate(dtypes) if not dt.is_object()
        ]

        # Convert everything that is not pl.Object in one go, keeping the order.
        if regular_positions:
            result = self._to_pandas_without_object_columns(
                self[:, regular_positions],
                use_pyarrow_extension_array=use_pyarrow_extension_array,
                **kwargs,
            )
        else:
            result = pd.DataFrame()

        # Insert the pl.Object columns in ascending position. Because every
        # column to the left of the insertion point is already present, the
        # original column index is also the correct index in the partially
        # built pandas frame.
        for pos in object_positions:
            result.insert(pos, self.columns[pos], self.to_series(pos).to_pandas())

        return result
2601

2602
    def _to_pandas_without_object_columns(
        self,
        df: DataFrame,
        *,
        use_pyarrow_extension_array: bool,
        **kwargs: Any,
    ) -> pd.DataFrame:
        """
        Convert `df` to pandas by way of a PyArrow table.

        Parameters
        ----------
        df
            Frame to convert.
        use_pyarrow_extension_array
            If True, map every Arrow type to a `pandas.ArrowDtype`; otherwise use
            the default PyArrow conversion with `date_as_object` defaulting to
            False unless the caller supplied it.
        **kwargs
            Forwarded to :meth:`pyarrow.Table.to_pandas`.
        """
        # A zero-width frame yields no batches to infer a schema from.
        if df.width == 0:
            return pd.DataFrame()

        arrow_table = pa.Table.from_batches(df._df.to_pandas())

        if not use_pyarrow_extension_array:
            # Only supply the default when the caller did not pass the option.
            kwargs.setdefault("date_as_object", False)
            return arrow_table.to_pandas(**kwargs)

        return arrow_table.to_pandas(
            self_destruct=True,
            split_blocks=True,
            types_mapper=lambda arrow_dtype: pd.ArrowDtype(arrow_dtype),
            **kwargs,
        )
2624

2625
    def to_series(self, index: int = 0) -> Series:
        """
        Return the column at the given position as a Series.

        Parameters
        ----------
        index
            Position of the column to select (defaults to the first column).

        See Also
        --------
        get_column

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     {
        ...         "foo": [1, 2, 3],
        ...         "bar": [6, 7, 8],
        ...         "ham": ["a", "b", "c"],
        ...     }
        ... )
        >>> df.to_series(1)
        shape: (3,)
        Series: 'bar' [i64]
        [
                6
                7
                8
        ]
        """
        column = self._df.to_series(index)
        return wrap_s(column)
2657

2658
    def to_init_repr(self, n: int = 1000) -> str:
        """
        Build a string of Python source that re-creates this DataFrame.

        Parameters
        ----------
        n
            Only use first n rows.

        See Also
        --------
        polars.Series.to_init_repr
        polars.from_repr

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     [
        ...         pl.Series("foo", [1, 2, 3], dtype=pl.UInt8),
        ...         pl.Series("bar", [6.0, 7.0, 8.0], dtype=pl.Float32),
        ...         pl.Series("ham", ["a", "b", "c"], dtype=pl.String),
        ...     ]
        ... )
        >>> print(df.to_init_repr())
        pl.DataFrame(
            [
                pl.Series('foo', [1, 2, 3], dtype=pl.UInt8),
                pl.Series('bar', [6.0, 7.0, 8.0], dtype=pl.Float32),
                pl.Series('ham', ['a', 'b', 'c'], dtype=pl.String),
            ]
        )

        >>> df_from_str_repr = eval(df.to_init_repr())
        >>> df_from_str_repr
        shape: (3, 3)
        ┌─────┬─────┬─────┐
        │ foo ┆ bar ┆ ham │
        │ --- ┆ --- ┆ --- │
        │ u8  ┆ f32 ┆ str │
        ╞═════╪═════╪═════╡
        │ 1   ┆ 6.0 ┆ a   │
        │ 2   ┆ 7.0 ┆ b   │
        │ 3   ┆ 8.0 ┆ c   │
        └─────┴─────┴─────┘
        """
        # One indented, comma-terminated line per column, in column order.
        column_lines = [
            "        " + self.to_series(col_idx).to_init_repr(n) + ",\n"
            for col_idx in range(self.width)
        ]
        return "".join(["pl.DataFrame(\n    [\n", *column_lines, "    ]\n)\n"])
2714

2715
    @overload
    def serialize(
        self, file: None = ..., *, format: Literal["binary"] = ...
    ) -> bytes: ...

    @overload
    def serialize(self, file: None = ..., *, format: Literal["json"]) -> str: ...

    @overload
    def serialize(
        self, file: IOBase | str | Path, *, format: SerializationFormat = ...
    ) -> None: ...

    def serialize(
        self,
        file: IOBase | str | Path | None = None,
        *,
        format: SerializationFormat = "binary",
    ) -> bytes | str | None:
        r"""
        Serialize this DataFrame to a file, or return the serialized data.

        Parameters
        ----------
        file
            File path or writable file-like object to which the result will be written.
            If set to `None` (default), the output is returned instead: as `bytes`
            for the `"binary"` format and as a `str` for the `"json"` format.
        format
            The format in which to serialize. Options:

            - `"binary"`: Serialize to binary format (bytes). This is the default.
            - `"json"`: Serialize to JSON format (string).

        Raises
        ------
        ValueError
            If `format` is neither `"binary"` nor `"json"`.

        Notes
        -----
        Serialization is not stable across Polars versions: a DataFrame serialized
        in one Polars version may not be deserializable in another Polars version.

        Examples
        --------
        Serialize the DataFrame into a binary representation.

        >>> df = pl.DataFrame(
        ...     {
        ...         "foo": [1, 2, 3],
        ...         "bar": [6, 7, 8],
        ...     }
        ... )
        >>> data = df.serialize()
        >>> type(data)
        <class 'bytes'>

        The bytes can later be deserialized back into a DataFrame.

        >>> import io
        >>> pl.DataFrame.deserialize(io.BytesIO(data))
        shape: (3, 2)
        ┌─────┬─────┐
        │ foo ┆ bar │
        │ --- ┆ --- │
        │ i64 ┆ i64 │
        ╞═════╪═════╡
        │ 1   ┆ 6   │
        │ 2   ┆ 7   │
        │ 3   ┆ 8   │
        └─────┴─────┘
        """
        # Pick the native serializer for the requested format; the shared
        # helper then handles the `file` destination.
        if format == "binary":
            serializer = self._df.serialize_binary
        elif format == "json":
            serializer = self._df.serialize_json
        else:
            msg = f"`format` must be one of {{'binary', 'json'}}, got {format!r}"
            raise ValueError(msg)

        return serialize_polars_object(serializer, file, format)
2791

2792
    @overload
    def write_json(self, file: None = ...) -> str: ...

    @overload
    def write_json(self, file: IOBase | str | Path) -> None: ...

    def write_json(self, file: IOBase | str | Path | None = None) -> str | None:
        """
        Serialize to JSON representation.

        Parameters
        ----------
        file
            File path or writable file-like object to which the result will be written.
            If set to `None` (default), the output is returned as a string instead.

        Returns
        -------
        str or None
            The JSON string when `file` is `None`; otherwise `None`.

        See Also
        --------
        DataFrame.write_ndjson

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     {
        ...         "foo": [1, 2, 3],
        ...         "bar": [6, 7, 8],
        ...     }
        ... )
        >>> df.write_json()
        '[{"foo":1,"bar":6},{"foo":2,"bar":7},{"foo":3,"bar":8}]'
        """

        def write_json_to_string() -> str:
            # Write into an in-memory bytes buffer and decode the result.
            with BytesIO() as buf:
                self._df.write_json(buf)
                json_bytes = buf.getvalue()
            return json_bytes.decode("utf8")

        if file is None:
            return write_json_to_string()

        if isinstance(file, StringIO):
            # `StringIO` only accepts `str`, so hand it the decoded text.
            file.write(write_json_to_string())
            return None

        # Accept any path-like object (not only `pathlib.Path`), matching the
        # path handling of `write_ndjson` and `write_csv`.
        if isinstance(file, (str, os.PathLike)):
            file = normalize_filepath(file)

        self._df.write_json(file)
        return None
2843

2844
    @overload
    def write_ndjson(self, file: None = None) -> str: ...

    @overload
    def write_ndjson(self, file: str | Path | IO[bytes] | IO[str]) -> None: ...

    def write_ndjson(
        self, file: str | Path | IO[bytes] | IO[str] | None = None
    ) -> str | None:
        r"""
        Write the frame as newline-delimited JSON (one JSON object per row).

        Parameters
        ----------
        file
            File path or writable file-like object to which the result will be written.
            If set to `None` (default), the output is returned as a string instead.

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     {
        ...         "foo": [1, 2, 3],
        ...         "bar": [6, 7, 8],
        ...     }
        ... )
        >>> df.write_ndjson()
        '{"foo":1,"bar":6}\n{"foo":2,"bar":7}\n{"foo":3,"bar":8}\n'
        """
        # Resolve the sink: an in-memory buffer (returned as text afterwards),
        # a normalized file path, or the caller's file-like object as-is.
        sink: str | Path | IO[bytes] | IO[str]
        return_as_string = False
        if file is None:
            sink = cast("IO[bytes]", BytesIO())
            return_as_string = True
        elif isinstance(file, (str, os.PathLike)):
            sink = normalize_filepath(file)
        else:
            sink = file

        engine: EngineType = "in-memory"

        from polars.lazyframe.opt_flags import QueryOptFlags

        lazy_frame = self.lazy()
        lazy_frame.sink_ndjson(
            sink,
            optimizations=QueryOptFlags._eager(),
            engine=engine,
        )

        if not return_as_string:
            return None
        return sink.getvalue().decode("utf-8")  # type: ignore[union-attr]
2897

2898
    @overload
    def write_csv(
        self,
        file: None = None,
        *,
        include_bom: bool = ...,
        include_header: bool = ...,
        separator: str = ...,
        line_terminator: str = ...,
        quote_char: str = ...,
        batch_size: int = ...,
        datetime_format: str | None = ...,
        date_format: str | None = ...,
        time_format: str | None = ...,
        float_scientific: bool | None = ...,
        float_precision: int | None = ...,
        decimal_comma: bool = ...,
        null_value: str | None = ...,
        quote_style: CsvQuoteStyle | None = ...,
        storage_options: dict[str, Any] | None = ...,
        credential_provider: CredentialProviderFunction | Literal["auto"] | None = ...,
        retries: int = ...,
    ) -> str: ...

    @overload
    def write_csv(
        self,
        file: str | Path | IO[str] | IO[bytes],
        *,
        include_bom: bool = ...,
        include_header: bool = ...,
        separator: str = ...,
        line_terminator: str = ...,
        quote_char: str = ...,
        batch_size: int = ...,
        datetime_format: str | None = ...,
        date_format: str | None = ...,
        time_format: str | None = ...,
        float_scientific: bool | None = ...,
        float_precision: int | None = ...,
        decimal_comma: bool = ...,
        null_value: str | None = ...,
        quote_style: CsvQuoteStyle | None = ...,
        storage_options: dict[str, Any] | None = ...,
        credential_provider: CredentialProviderFunction | Literal["auto"] | None = ...,
        retries: int = ...,
    ) -> None: ...

    def write_csv(
        self,
        file: str | Path | IO[str] | IO[bytes] | None = None,
        *,
        include_bom: bool = False,
        include_header: bool = True,
        separator: str = ",",
        line_terminator: str = "\n",
        quote_char: str = '"',
        batch_size: int = 1024,
        datetime_format: str | None = None,
        date_format: str | None = None,
        time_format: str | None = None,
        float_scientific: bool | None = None,
        float_precision: int | None = None,
        decimal_comma: bool = False,
        null_value: str | None = None,
        quote_style: CsvQuoteStyle | None = None,
        storage_options: dict[str, Any] | None = None,
        credential_provider: (
            CredentialProviderFunction | Literal["auto"] | None
        ) = "auto",
        retries: int = 2,
    ) -> str | None:
        """
        Write the frame to a comma-separated values (CSV) file.

        Parameters
        ----------
        file
            File path or writable file-like object to which the result will be written.
            If set to `None` (default), the output is returned as a string instead.
        include_bom
            Whether to include UTF-8 BOM in the CSV output.
        include_header
            Whether to include header in the CSV output.
        separator
            Separate CSV fields with this symbol.
        line_terminator
            String used to end each row.
        quote_char
            Byte to use as quoting character.
        batch_size
            Number of rows that will be processed per thread.
        datetime_format
            A format string, with the specifiers defined by the
            `chrono <https://docs.rs/chrono/latest/chrono/format/strftime/index.html>`_
            Rust crate. If no format specified, the default fractional-second
            precision is inferred from the maximum timeunit found in the frame's
            Datetime cols (if any).
        date_format
            A format string, with the specifiers defined by the
            `chrono <https://docs.rs/chrono/latest/chrono/format/strftime/index.html>`_
            Rust crate.
        time_format
            A format string, with the specifiers defined by the
            `chrono <https://docs.rs/chrono/latest/chrono/format/strftime/index.html>`_
            Rust crate.
        float_scientific
            Whether to use scientific form always (true), never (false), or
            automatically (None) for `Float32` and `Float64` datatypes.
        float_precision
            Number of decimal places to write, applied to both `Float32` and
            `Float64` datatypes.
        decimal_comma
            Use a comma as the decimal separator instead of a point in standard
            notation. Floats will be encapsulated in quotes if necessary; set the
            field separator to override.
        null_value
            A string representing null values (defaulting to the empty string).
        quote_style : {'necessary', 'always', 'non_numeric', 'never'}
            Determines the quoting strategy used.

            - necessary (default): This puts quotes around fields only when necessary.
              They are necessary when fields contain a quote,
              separator or record terminator.
              Quotes are also necessary when writing an empty record
              (which is indistinguishable from a record with one empty field).
              This is the default.
            - always: This puts quotes around every field. Always.
            - never: This never puts quotes around fields, even if that results in
              invalid CSV data (e.g.: by not quoting strings containing the separator).
            - non_numeric: This puts quotes around all fields that are non-numeric.
              Namely, when writing a field that does not parse as a valid float
              or integer, then quotes will be used even if they aren't strictly
              necessary.
        storage_options
            Options that indicate how to connect to a cloud provider.

            The cloud providers currently supported are AWS, GCP, and Azure.
            See supported keys here:

            * `aws <https://docs.rs/object_store/latest/object_store/aws/enum.AmazonS3ConfigKey.html>`_
            * `gcp <https://docs.rs/object_store/latest/object_store/gcp/enum.GoogleConfigKey.html>`_
            * `azure <https://docs.rs/object_store/latest/object_store/azure/enum.AzureConfigKey.html>`_
            * Hugging Face (`hf://`): Accepts an API key under the `token` parameter:
              `{'token': '...'}`, or by setting the `HF_TOKEN` environment variable.

            If `storage_options` is not provided, Polars will try to infer the
            information from environment variables.
        credential_provider
            Provide a function that can be called to provide cloud storage
            credentials. The function is expected to return a dictionary of
            credential keys along with an optional credential expiry time.

            .. warning::
                This functionality is considered **unstable**. It may be changed
                at any point without it being considered a breaking change.
        retries
            Number of retries if accessing a cloud instance fails.

        Examples
        --------
        >>> import pathlib
        >>>
        >>> df = pl.DataFrame(
        ...     {
        ...         "foo": [1, 2, 3, 4, 5],
        ...         "bar": [6, 7, 8, 9, 10],
        ...         "ham": ["a", "b", "c", "d", "e"],
        ...     }
        ... )
        >>> path: pathlib.Path = dirpath / "new_file.csv"
        >>> df.write_csv(path, separator=",")
        """
        from polars.io.csv._utils import _check_arg_is_1byte

        _check_arg_is_1byte("separator", separator, can_be_empty=False)
        _check_arg_is_1byte("quote_char", quote_char, can_be_empty=True)

        # Treat an empty string the same as "not provided".
        null_value = null_value or None

        # Resolve the sink: an in-memory buffer (returned as text afterwards),
        # a normalized file path, or the caller's file-like object as-is.
        sink: str | Path | IO[bytes] | IO[str]
        return_as_string = False
        if file is None:
            sink = cast("IO[bytes]", BytesIO())
            return_as_string = True
        elif isinstance(file, (str, os.PathLike)):
            sink = normalize_filepath(file)
        else:
            sink = file

        engine: EngineType = "in-memory"

        from polars.lazyframe.opt_flags import QueryOptFlags

        lazy_frame = self.lazy()
        lazy_frame.sink_csv(
            sink,
            include_bom=include_bom,
            include_header=include_header,
            separator=separator,
            line_terminator=line_terminator,
            quote_char=quote_char,
            batch_size=batch_size,
            datetime_format=datetime_format,
            date_format=date_format,
            time_format=time_format,
            float_scientific=float_scientific,
            float_precision=float_precision,
            decimal_comma=decimal_comma,
            null_value=null_value,
            quote_style=quote_style,
            storage_options=storage_options,
            credential_provider=credential_provider,
            retries=retries,
            optimizations=QueryOptFlags._eager(),
            engine=engine,
        )

        if not return_as_string:
            return None
        return sink.getvalue().decode("utf-8")  # type: ignore[union-attr]
3119

3120
    def write_clipboard(self, *, separator: str = "\t", **kwargs: Any) -> None:
        """
        Place the frame on the system clipboard as CSV text (via `write_csv`).

        Useful for pasting into Excel or other similar spreadsheet software.

        Parameters
        ----------
        separator
            Separate CSV fields with this symbol.
        kwargs
            Additional arguments to pass to `write_csv`.

        See Also
        --------
        polars.read_clipboard: Read a DataFrame from the clipboard.
        write_csv: Write to comma-separated values (CSV) file.
        """
        # `file=None` makes `write_csv` return the CSV text instead of writing it.
        csv_text: str = self.write_csv(file=None, separator=separator, **kwargs)
        _write_clipboard_string(csv_text)
3140

3141
    def write_avro(
        self,
        file: str | Path | IO[bytes],
        compression: AvroCompression = "uncompressed",
        name: str = "",
    ) -> None:
        """
        Write to Apache Avro file.

        Parameters
        ----------
        file
            File path or writable file-like object to which the data will be written.
        compression : {'uncompressed', 'snappy', 'deflate'}
            Compression method. Defaults to "uncompressed".
        name
            Schema name. Defaults to empty string.

        Examples
        --------
        >>> import pathlib
        >>>
        >>> df = pl.DataFrame(
        ...     {
        ...         "foo": [1, 2, 3, 4, 5],
        ...         "bar": [6, 7, 8, 9, 10],
        ...         "ham": ["a", "b", "c", "d", "e"],
        ...     }
        ... )
        >>> path: pathlib.Path = dirpath / "new_file.avro"
        >>> df.write_avro(path)
        """
        # Tolerate an explicit `None` by falling back to the documented defaults.
        if compression is None:
            compression = "uncompressed"
        if name is None:
            name = ""

        # Accept any path-like object (not only `pathlib.Path`), matching the
        # path handling of `write_ndjson` and `write_csv`.
        if isinstance(file, (str, os.PathLike)):
            file = normalize_filepath(file)

        self._df.write_avro(file, compression, name)
3181

3182
    def write_excel(
3183
        self,
3184
        workbook: str | Workbook | IO[bytes] | Path | None = None,
3185
        worksheet: str | Worksheet | None = None,
3186
        *,
3187
        position: tuple[int, int] | str = "A1",
3188
        table_style: str | dict[str, Any] | None = None,
3189
        table_name: str | None = None,
3190
        column_formats: ColumnFormatDict | None = None,
3191
        dtype_formats: dict[OneOrMoreDataTypes, str] | None = None,
3192
        conditional_formats: ConditionalFormatDict | None = None,
3193
        header_format: dict[str, Any] | None = None,
3194
        column_totals: ColumnTotalsDefinition | None = None,
3195
        column_widths: ColumnWidthsDefinition | None = None,
3196
        row_totals: RowTotalsDefinition | None = None,
3197
        row_heights: dict[int | tuple[int, ...], int] | int | None = None,
3198
        sparklines: dict[str, Sequence[str] | dict[str, Any]] | None = None,
3199
        formulas: dict[str, str | dict[str, str]] | None = None,
3200
        float_precision: int = 3,
3201
        include_header: bool = True,
3202
        autofilter: bool = True,
3203
        autofit: bool = False,
3204
        hidden_columns: Sequence[str] | SelectorType | None = None,
3205
        hide_gridlines: bool = False,
3206
        sheet_zoom: int | None = None,
3207
        freeze_panes: (
3208
            str
3209
            | tuple[int, int]
3210
            | tuple[str, int, int]
3211
            | tuple[int, int, int, int]
3212
            | None
3213
        ) = None,
3214
    ) -> Workbook:
3215
        """
3216
        Write frame data to a table in an Excel workbook/worksheet.
3217

3218
        Parameters
3219
        ----------
3220
        workbook : {str, Workbook}
3221
            String name or path of the workbook to create, BytesIO object, file opened
3222
            in binary-mode, or an `xlsxwriter.Workbook` object that has not been closed.
3223
            If None, writes to a `dataframe.xlsx` workbook in the working directory.
3224
        worksheet : {str, Worksheet}
3225
            Name of target worksheet or an `xlsxwriter.Worksheet` object (in which
3226
            case `workbook` must be the parent `xlsxwriter.Workbook` object); if None,
3227
            writes to "Sheet1" when creating a new workbook (note that writing to an
3228
            existing workbook requires a valid existing -or new- worksheet name).
3229
        position : {str, tuple}
3230
            Table position in Excel notation (eg: "A1"), or a (row,col) integer tuple.
3231
        table_style : {str, dict}
3232
            A named Excel table style, such as "Table Style Medium 4", or a dictionary
3233
            of `{"key":value,}` options containing one or more of the following keys:
3234
            "style", "first_column", "last_column", "banded_columns, "banded_rows".
3235
        table_name : str
3236
            Name of the output table object in the worksheet; can then be referred to
3237
            in the sheet by formulae/charts, or by subsequent `xlsxwriter` operations.
3238
        column_formats : dict
3239
            A `{colname(s):str,}` or `{selector:str,}` dictionary for applying an
3240
            Excel format string to the given columns. Formats defined here (such as
3241
            "dd/mm/yyyy", "0.00%", etc) will override any defined in `dtype_formats`.
3242
        dtype_formats : dict
3243
            A `{dtype:str,}` dictionary that sets the default Excel format for the
3244
            given dtype. (This can be overridden on a per-column basis by the
3245
            `column_formats` param).
3246
        conditional_formats : dict
3247
            A dictionary of colname (or selector) keys to a format str, dict, or list
3248
            that defines conditional formatting options for the specified columns.
3249

3250
            * If supplying a string typename, should be one of the valid `xlsxwriter`
3251
              types such as "3_color_scale", "data_bar", etc.
3252
            * If supplying a dictionary you can make use of any/all `xlsxwriter`
3253
              supported options, including icon sets, formulae, etc.
3254
            * Supplying multiple columns as a tuple/key will apply a single format
3255
              across all columns - this is effective in creating a heatmap, as the
3256
              min/max values will be determined across the entire range, not per-column.
3257
            * Finally, you can also supply a list made up from the above options
3258
              in order to apply *more* than one conditional format to the same range.
3259
        header_format : dict
3260
            A `{key:value,}` dictionary of `xlsxwriter` format options to apply
3261
            to the table header row, such as `{"bold":True, "font_color":"#702963"}`.
3262
        column_totals : {bool, list, dict}
3263
            Add a column-total row to the exported table.
3264

3265
            * If True, all numeric columns will have an associated total using "sum".
3266
            * If passing a string, it must be one of the valid total function names
3267
              and all numeric columns will have an associated total using that function.
3268
            * If passing a list of colnames, only those given will have a total.
3269
            * For more control, pass a `{colname:funcname,}` dict.
3270

3271
            Valid column-total function names are "average", "count_nums", "count",
3272
            "max", "min", "std_dev", "sum", and "var".
3273
        column_widths : {dict, int}
3274
            A `{colname:int,}` or `{selector:int,}` dict or a single integer that
3275
            sets (or overrides if autofitting) table column widths, in integer pixel
3276
            units. If given as an integer the same value is used for all table columns.
3277
        row_totals : {dict, list, bool}
3278
            Add a row-total column to the right-hand side of the exported table.
3279

3280
            * If True, a column called "total" will be added at the end of the table
3281
              that applies a "sum" function row-wise across all numeric columns.
3282
            * If passing a list/sequence of column names, only the matching columns
3283
              will participate in the sum.
3284
            * Can also pass a `{colname:columns,}` dictionary to create one or
3285
              more total columns with distinct names, referencing different columns.
3286
        row_heights : {dict, int}
3287
            An int or `{row_index:int,}` dictionary that sets the height of the given
3288
            rows (if providing a dictionary) or all rows (if providing an integer) that
3289
            intersect with the table body (including any header and total row) in
3290
            integer pixel units. Note that `row_index` starts at zero and will be
3291
            the header row (unless `include_header` is False).
3292
        sparklines : dict
3293
            A `{colname:list,}` or `{colname:dict,}` dictionary defining one or more
3294
            sparklines to be written into a new column in the table.
3295

3296
            * If passing a list of colnames (used as the source of the sparkline data)
3297
              the default sparkline settings are used (eg: line chart with no markers).
3298
            * For more control an `xlsxwriter`-compliant options dict can be supplied,
3299
              in which case three additional polars-specific keys are available:
3300
              "columns", "insert_before", and "insert_after". These allow you to define
3301
              the source columns and position the sparkline(s) with respect to other
3302
              table columns. If no position directive is given, sparklines are added to
3303
              the end of the table (eg: to the far right) in the order they are given.
3304
        formulas : dict
3305
            A `{colname:formula,}` or `{colname:dict,}` dictionary defining one or
3306
            more formulas to be written into a new column in the table. Note that you
3307
            are strongly advised to use structured references in your formulae wherever
3308
            possible to make it simple to reference columns by name.
3309

3310
            * If providing a string formula (such as "=[@colx]*[@coly]") the column will
3311
              be added to the end of the table (eg: to the far right), after any default
3312
              sparklines and before any row_totals.
3313
            * For the most control supply an options dictionary with the following keys:
3314
              "formula" (mandatory), one of "insert_before" or "insert_after", and
3315
              optionally "return_dtype". The latter is used to appropriately format the
3316
              output of the formula and allow it to participate in row/column totals.
3317
        float_precision : int
3318
            Default number of decimals displayed for floating point columns (note that
3319
            this is purely a formatting directive; the actual values are not rounded).
3320
        include_header : bool
3321
            Indicate if the table should be created with a header row.
3322
        autofilter : bool
3323
            If the table has headers, provide autofilter capability.
3324
        autofit : bool
3325
            Calculate individual column widths from the data.
3326
        hidden_columns : str | list
3327
             A column name, list of column names, or a selector representing table
3328
             columns to mark as hidden in the output worksheet.
3329
        hide_gridlines : bool
3330
            Do not display any gridlines on the output worksheet.
3331
        sheet_zoom : int
3332
            Set the default zoom level of the output worksheet.
3333
        freeze_panes : str | (str, int, int) | (int, int) | (int, int, int, int)
3334
            Freeze workbook panes.
3335

3336
            * If (row, col) is supplied, panes are split at the top-left corner of the
3337
              specified cell, which are 0-indexed. Thus, to freeze only the top row,
3338
              supply (1, 0).
3339
            * Alternatively, cell notation can be used to supply the cell. For example,
3340
              "A2" indicates the split occurs at the top-left of cell A2, which is the
3341
              equivalent of (1, 0).
3342
            * If (row, col, top_row, top_col) are supplied, the panes are split based on
3343
              the `row` and `col`, and the scrolling region is initialized to begin at
3344
              the `top_row` and `top_col`. Thus, to freeze only the top row and have the
3345
              scrolling region begin at row 10, column D (5th col), supply (1, 0, 9, 4).
3346
              Using cell notation for (row, col), supplying ("A2", 9, 4) is equivalent.
3347

3348
        Notes
3349
        -----
3350
        * A list of compatible `xlsxwriter` format property names can be found here:
3351
          https://xlsxwriter.readthedocs.io/format.html#format-methods-and-format-properties
3352

3353
        * Conditional formatting dictionaries should provide xlsxwriter-compatible
3354
          definitions; polars will take care of how they are applied on the worksheet
3355
          with respect to the relative sheet/column position. For supported options,
3356
          see: https://xlsxwriter.readthedocs.io/working_with_conditional_formats.html
3357

3358
        * Similarly, sparkline option dictionaries should contain xlsxwriter-compatible
3359
          key/values, as well as a mandatory polars "columns" key that defines the
3360
          sparkline source data; these source columns should all be adjacent. Two other
3361
          polars-specific keys are available to help define where the sparkline appears
3362
          in the table: "insert_after", and "insert_before". The value associated with
3363
          these keys should be the name of a column in the exported table.
3364
          https://xlsxwriter.readthedocs.io/working_with_sparklines.html
3365

3366
        * Formula dictionaries *must* contain a key called "formula", and then optional
3367
          "insert_after", "insert_before", and/or "return_dtype" keys. These additional
3368
          keys allow the column to be injected into the table at a specific location,
3369
          and/or to define the return type of the formula (eg: "Int64", "Float64", etc).
3370
          Formulas that refer to table columns should use Excel's structured references
3371
          syntax to ensure the formula is applied correctly and is table-relative.
3372
          https://support.microsoft.com/en-us/office/using-structured-references-with-excel-tables-f5ed2452-2337-4f71-bed3-c8ae6d2b276e
3373

3374
        Examples
3375
        --------
3376
        Instantiate a basic DataFrame:
3377

3378
        >>> from random import uniform
3379
        >>> from datetime import date
3380
        >>>
3381
        >>> df = pl.DataFrame(
3382
        ...     {
3383
        ...         "dtm": [date(2023, 1, 1), date(2023, 1, 2), date(2023, 1, 3)],
3384
        ...         "num": [uniform(-500, 500), uniform(-500, 500), uniform(-500, 500)],
3385
        ...         "val": [10_000, 20_000, 30_000],
3386
        ...     }
3387
        ... )
3388

3389
        Export to "dataframe.xlsx" (the default workbook name, if not specified) in the
3390
        working directory, add column totals ("sum" by default) on all numeric columns,
3391
        then autofit:
3392

3393
        >>> df.write_excel(column_totals=True, autofit=True)  # doctest: +SKIP
3394

3395
        Write frame to a specific location on the sheet, set a named table style,
3396
        apply US-style date formatting, increase default float precision, apply a
3397
        non-default total function to a single column, autofit:
3398

3399
        >>> df.write_excel(  # doctest: +SKIP
3400
        ...     position="B4",
3401
        ...     table_style="Table Style Light 16",
3402
        ...     dtype_formats={pl.Date: "mm/dd/yyyy"},
3403
        ...     column_totals={"num": "average"},
3404
        ...     float_precision=6,
3405
        ...     autofit=True,
3406
        ... )
3407

3408
        Write the same frame to a named worksheet twice, applying different styles
3409
        and conditional formatting to each table, adding table titles using explicit
3410
        xlsxwriter integration:
3411

3412
        >>> from xlsxwriter import Workbook
3413
        >>> with Workbook("multi_frame.xlsx") as wb:  # doctest: +SKIP
3414
        ...     # basic/default conditional formatting
3415
        ...     df.write_excel(
3416
        ...         workbook=wb,
3417
        ...         worksheet="data",
3418
        ...         position=(3, 1),  # specify position as (row,col) coordinates
3419
        ...         conditional_formats={"num": "3_color_scale", "val": "data_bar"},
3420
        ...         table_style="Table Style Medium 4",
3421
        ...     )
3422
        ...
3423
        ...     # advanced conditional formatting, custom styles
3424
        ...     df.write_excel(
3425
        ...         workbook=wb,
3426
        ...         worksheet="data",
3427
        ...         position=(df.height + 7, 1),
3428
        ...         table_style={
3429
        ...             "style": "Table Style Light 4",
3430
        ...             "first_column": True,
3431
        ...         },
3432
        ...         conditional_formats={
3433
        ...             "num": {
3434
        ...                 "type": "3_color_scale",
3435
        ...                 "min_color": "#76933c",
3436
        ...                 "mid_color": "#c4d79b",
3437
        ...                 "max_color": "#ebf1de",
3438
        ...             },
3439
        ...             "val": {
3440
        ...                 "type": "data_bar",
3441
        ...                 "data_bar_2010": True,
3442
        ...                 "bar_color": "#9bbb59",
3443
        ...                 "bar_negative_color_same": True,
3444
        ...                 "bar_negative_border_color_same": True,
3445
        ...             },
3446
        ...         },
3447
        ...         column_formats={"num": "#,##0.000;[White]-#,##0.000"},
3448
        ...         column_widths={"val": 125},
3449
        ...         autofit=True,
3450
        ...     )
3451
        ...
3452
        ...     # add some table titles (with a custom format)
3453
        ...     ws = wb.get_worksheet_by_name("data")
3454
        ...     fmt_title = wb.add_format(
3455
        ...         {
3456
        ...             "font_color": "#4f6228",
3457
        ...             "font_size": 12,
3458
        ...             "italic": True,
3459
        ...             "bold": True,
3460
        ...         }
3461
        ...     )
3462
        ...     ws.write(2, 1, "Basic/default conditional formatting", fmt_title)
3463
        ...     ws.write(
3464
        ...         df.height + 6, 1, "Customised conditional formatting", fmt_title
3465
        ...     )
3466

3467
        Export a table containing two different types of sparklines. Use default
3468
        options for the "trend" sparkline and customized options (and positioning)
3469
        for the "+/-" win_loss sparkline, with non-default integer dtype formatting,
3470
        column totals, a subtle two-tone heatmap and hidden worksheet gridlines:
3471

3472
        >>> df = pl.DataFrame(
3473
        ...     {
3474
        ...         "id": ["aaa", "bbb", "ccc", "ddd", "eee"],
3475
        ...         "q1": [100, 55, -20, 0, 35],
3476
        ...         "q2": [30, -10, 15, 60, 20],
3477
        ...         "q3": [-50, 0, 40, 80, 80],
3478
        ...         "q4": [75, 55, 25, -10, -55],
3479
        ...     }
3480
        ... )
3481
        >>> df.write_excel(  # doctest: +SKIP
3482
        ...     table_style="Table Style Light 2",
3483
        ...     # apply accounting format to all flavours of integer
3484
        ...     dtype_formats={dt: "#,##0_);(#,##0)" for dt in [pl.Int32, pl.Int64]},
3485
        ...     sparklines={
3486
        ...         # default options; just provide source cols
3487
        ...         "trend": ["q1", "q2", "q3", "q4"],
3488
        ...         # customized sparkline type, with positioning directive
3489
        ...         "+/-": {
3490
        ...             "columns": ["q1", "q2", "q3", "q4"],
3491
        ...             "insert_after": "id",
3492
        ...             "type": "win_loss",
3493
        ...         },
3494
        ...     },
3495
        ...     conditional_formats={
3496
        ...         # create a unified multi-column heatmap
3497
        ...         ("q1", "q2", "q3", "q4"): {
3498
        ...             "type": "2_color_scale",
3499
        ...             "min_color": "#95b3d7",
3500
        ...             "max_color": "#ffffff",
3501
        ...         },
3502
        ...     },
3503
        ...     column_totals=["q1", "q2", "q3", "q4"],
3504
        ...     row_totals=True,
3505
        ...     hide_gridlines=True,
3506
        ... )
3507

3508
        Export a table containing an Excel formula-based column that calculates a
3509
        standardised Z-score, showing use of structured references in conjunction
3510
        with positioning directives, column totals, and custom formatting.
3511

3512
        >>> df = pl.DataFrame(
3513
        ...     {
3514
        ...         "id": ["a123", "b345", "c567", "d789", "e101"],
3515
        ...         "points": [99, 45, 50, 85, 35],
3516
        ...     }
3517
        ... )
3518
        >>> df.write_excel(  # doctest: +SKIP
3519
        ...     table_style={
3520
        ...         "style": "Table Style Medium 15",
3521
        ...         "first_column": True,
3522
        ...     },
3523
        ...     column_formats={
3524
        ...         "id": {"font": "Consolas"},
3525
        ...         "points": {"align": "center"},
3526
        ...         "z-score": {"align": "center"},
3527
        ...     },
3528
        ...     column_totals="average",
3529
        ...     formulas={
3530
        ...         "z-score": {
3531
        ...             # use structured references to refer to the table columns and 'totals' row
3532
        ...             "formula": "=STANDARDIZE([@points], [[#Totals],[points]], STDEV([points]))",
3533
        ...             "insert_after": "points",
3534
        ...             "return_dtype": pl.Float64,
3535
        ...         }
3536
        ...     },
3537
        ...     hide_gridlines=True,
3538
        ...     sheet_zoom=125,
3539
        ... )
3540

3541
        Create and reference a Worksheet object directly, adding a basic chart.
3542
        Taking advantage of structured references to set chart series values and
3543
        categories is strongly recommended so that you do not have to calculate
3544
        cell positions with respect to the frame data and worksheet:
3545

3546
        >>> with Workbook("basic_chart.xlsx") as wb:  # doctest: +SKIP
3547
        ...     # create worksheet object and write frame data to it
3548
        ...     ws = wb.add_worksheet("demo")
3549
        ...     df.write_excel(
3550
        ...         workbook=wb,
3551
        ...         worksheet=ws,
3552
        ...         table_name="DataTable",
3553
        ...         table_style="Table Style Medium 26",
3554
        ...         hide_gridlines=True,
3555
        ...     )
3556
        ...     # create chart object, point to the written table
3557
        ...     # data using structured references, and style it
3558
        ...     chart = wb.add_chart({"type": "column"})
3559
        ...     chart.set_title({"name": "Example Chart"})
3560
        ...     chart.set_legend({"none": True})
3561
        ...     chart.set_style(38)
3562
        ...     chart.add_series(
3563
        ...         {  # note the use of structured references
3564
        ...             "values": "=DataTable[points]",
3565
        ...             "categories": "=DataTable[id]",
3566
        ...             "data_labels": {"value": True},
3567
        ...         }
3568
        ...     )
3569
        ...     # add chart to the worksheet
3570
        ...     ws.insert_chart("D1", chart)
3571
        """  # noqa: W505
3572
        from polars.io.spreadsheet._write_utils import (
3573
            _unpack_multi_column_dict,
3574
            _xl_apply_conditional_formats,
3575
            _xl_inject_sparklines,
3576
            _xl_setup_table_columns,
3577
            _xl_setup_table_options,
3578
            _xl_setup_workbook,
3579
            _xl_unique_table_name,
3580
            _XLFormatCache,
3581
        )
3582

3583
        xlsxwriter = import_optional("xlsxwriter", err_prefix="Excel export requires")
3584
        from xlsxwriter.utility import xl_cell_to_rowcol
3585

3586
        # setup workbook/worksheet
3587
        wb, ws, can_close = _xl_setup_workbook(workbook, worksheet)
3588
        df, is_empty = self, self.is_empty()
3589

3590
        # note: `_xl_setup_table_columns` converts nested data (List, Struct, etc.) to
3591
        # string, so we keep a reference to the original so that column selection with
3592
        # selectors that target such types remains correct
3593
        df_original = df
3594

3595
        # setup table format/columns
3596
        fmt_cache = _XLFormatCache(wb)
3597
        column_formats = column_formats or {}
3598
        table_style, table_options = _xl_setup_table_options(table_style)
3599
        table_name = table_name or _xl_unique_table_name(wb)
3600
        table_columns, column_formats, df = _xl_setup_table_columns(  # type: ignore[assignment]
3601
            df=df,
3602
            format_cache=fmt_cache,
3603
            column_formats=column_formats,
3604
            column_totals=column_totals,
3605
            dtype_formats=dtype_formats,
3606
            header_format=header_format,
3607
            float_precision=float_precision,
3608
            table_style=table_style,
3609
            row_totals=row_totals,
3610
            sparklines=sparklines,
3611
            formulas=formulas,
3612
        )
3613

3614
        # normalise cell refs (eg: "B3" => (2,1)) and establish table start/finish,
3615
        # accounting for potential presence/absence of headers and a totals row.
3616
        table_start = (
3617
            xl_cell_to_rowcol(position) if isinstance(position, str) else position
3618
        )
3619
        table_finish = (
3620
            table_start[0]
3621
            + df.height
3622
            + int(is_empty)
3623
            - int(not include_header)
3624
            + int(bool(column_totals)),
3625
            table_start[1] + df.width - 1,
3626
        )
3627

3628
        excel_max_valid_rows = 1048575
3629
        excel_max_valid_cols = 16384
3630

3631
        if (
3632
            table_finish[0] > excel_max_valid_rows
3633
            or table_finish[1] > excel_max_valid_cols
3634
        ):
3635
            msg = f"writing {df.height}x{df.width} frame at {position!r} does not fit worksheet dimensions of {excel_max_valid_rows} rows and {excel_max_valid_cols} columns"
3636
            raise InvalidOperationError(msg)
3637

3638
        # write table structure and formats into the target sheet
3639
        if not is_empty or include_header:
3640
            ws.add_table(
3641
                *table_start,
3642
                *table_finish,
3643
                {
3644
                    "data": df.rows(),
3645
                    "style": table_style,
3646
                    "columns": table_columns,
3647
                    "header_row": include_header,
3648
                    "autofilter": autofilter,
3649
                    "total_row": bool(column_totals) and not is_empty,
3650
                    "name": table_name,
3651
                    **table_options,
3652
                },
3653
            )
3654

3655
            # apply conditional formats
3656
            if conditional_formats:
3657
                _xl_apply_conditional_formats(
3658
                    df=df,
3659
                    ws=ws,
3660
                    conditional_formats=conditional_formats,
3661
                    table_start=table_start,
3662
                    include_header=include_header,
3663
                    format_cache=fmt_cache,
3664
                )
3665

3666
        # additional column-level properties
3667
        if hidden_columns is None:
3668
            hidden = set()
3669
        elif isinstance(hidden_columns, str):
3670
            hidden = {hidden_columns}
3671
        else:
3672
            hidden = set(_expand_selectors(df_original, hidden_columns))
3673

3674
        # Autofit section needs to be present above column_widths section
3675
        # to ensure that parameters provided in the column_widths section
3676
        # are not overwritten by autofit
3677
        #
3678
        # table/rows all written; apply (optional) autofit
3679
        if autofit and not is_empty:
3680
            xlv = xlsxwriter.__version__
3681
            if parse_version(xlv) < (3, 0, 8):
3682
                msg = f"`autofit=True` requires xlsxwriter 3.0.8 or higher, found {xlv}"
3683
                raise ModuleUpgradeRequiredError(msg)
3684
            ws.autofit()
3685

3686
        if isinstance(column_widths, int):
3687
            column_widths = dict.fromkeys(df.columns, column_widths)
3688
        else:
3689
            column_widths = _expand_selector_dicts(  # type: ignore[assignment]
3690
                df_original, column_widths, expand_keys=True, expand_values=False
3691
            )
3692
        column_widths = _unpack_multi_column_dict(column_widths or {})  # type: ignore[assignment]
3693

3694
        for column in df.columns:
3695
            options = {"hidden": True} if column in hidden else {}
3696
            col_idx = table_start[1] + df.get_column_index(column)
3697
            if column in column_widths:  # type: ignore[operator]
3698
                ws.set_column_pixels(
3699
                    col_idx,
3700
                    col_idx,
3701
                    column_widths[column],  # type: ignore[index]
3702
                    None,
3703
                    options,
3704
                )
3705
            elif options:
3706
                ws.set_column(col_idx, col_idx, None, None, options)
3707

3708
        # finally, inject any sparklines into the table
3709
        for column, params in (sparklines or {}).items():
3710
            _xl_inject_sparklines(
3711
                ws,
3712
                df,
3713
                table_start,
3714
                column,
3715
                include_header=include_header,
3716
                params=params,
3717
            )
3718

3719
        # worksheet options
3720
        if hide_gridlines:
3721
            ws.hide_gridlines(2)
3722
        if sheet_zoom:
3723
            ws.set_zoom(sheet_zoom)
3724
        if row_heights:
3725
            if isinstance(row_heights, int):
3726
                for idx in range(table_start[0], table_finish[0] + 1):
3727
                    ws.set_row_pixels(idx, row_heights)
3728
            elif isinstance(row_heights, dict):
3729
                for idx, height in _unpack_multi_column_dict(row_heights).items():  # type: ignore[assignment]
3730
                    ws.set_row_pixels(idx, height)
3731

3732
        if freeze_panes:
3733
            if isinstance(freeze_panes, str):
3734
                ws.freeze_panes(freeze_panes)
3735
            else:
3736
                ws.freeze_panes(*freeze_panes)
3737

3738
        if can_close:
3739
            wb.close()
3740
        return wb
3741

3742
    # Typing overload: with `file=None` the serialized data is handed back
    # to the caller as an in-memory `BytesIO`.
    @overload
    def write_ipc(
        self,
        file: None,
        *,
        compression: IpcCompression = "uncompressed",
        compat_level: CompatLevel | None = None,
        storage_options: dict[str, Any] | None = None,
        credential_provider: (
            CredentialProviderFunction | Literal["auto"] | None
        ) = "auto",
        retries: int = 2,
    ) -> BytesIO: ...

    # Typing overload: with an explicit path / file object nothing is returned.
    @overload
    def write_ipc(
        self,
        file: str | Path | IO[bytes],
        *,
        compression: IpcCompression = "uncompressed",
        compat_level: CompatLevel | None = None,
        storage_options: dict[str, Any] | None = None,
        credential_provider: (
            CredentialProviderFunction | Literal["auto"] | None
        ) = "auto",
        retries: int = 2,
    ) -> None: ...

    @deprecate_renamed_parameter("future", "compat_level", version="1.1")
    def write_ipc(
        self,
        file: str | Path | IO[bytes] | None,
        *,
        compression: IpcCompression = "uncompressed",
        compat_level: CompatLevel | None = None,
        storage_options: dict[str, Any] | None = None,
        credential_provider: (
            CredentialProviderFunction | Literal["auto"] | None
        ) = "auto",
        retries: int = 2,
    ) -> BytesIO | None:
        """
        Write to Arrow IPC binary stream or Feather file.

        See "File or Random Access format" in https://arrow.apache.org/docs/python/ipc.html.

        .. versionchanged:: 1.1
            The `future` parameter was renamed `compat_level`.

        Parameters
        ----------
        file
            Path or writable file-like object that receives the IPC data.
            Pass `None` to have the output returned as a BytesIO object instead.
        compression : {'uncompressed', 'lz4', 'zstd'}
            Compression method. Defaults to "uncompressed".
        compat_level
            Use a specific compatibility level
            when exporting Polars' internal data structures.
        storage_options
            Options that indicate how to connect to a cloud provider.

            The cloud providers currently supported are AWS, GCP, and Azure.
            See supported keys here:

            * `aws <https://docs.rs/object_store/latest/object_store/aws/enum.AmazonS3ConfigKey.html>`_
            * `gcp <https://docs.rs/object_store/latest/object_store/gcp/enum.GoogleConfigKey.html>`_
            * `azure <https://docs.rs/object_store/latest/object_store/azure/enum.AzureConfigKey.html>`_
            * Hugging Face (`hf://`): Accepts an API key under the `token` parameter: \
            `{'token': '...'}`, or by setting the `HF_TOKEN` environment variable.

            If `storage_options` is not provided, Polars will try to infer the
            information from environment variables.
        credential_provider
            Provide a function that can be called to provide cloud storage
            credentials. The function is expected to return a dictionary of
            credential keys along with an optional credential expiry time.

            .. warning::
                This functionality is considered **unstable**. It may be changed
                at any point without it being considered a breaking change.
        retries
            Number of retries if accessing a cloud instance fails.

        Returns
        -------
        BytesIO or None
            The buffer that was written to when `file` is `None`; otherwise `None`.

        Examples
        --------
        >>> import pathlib
        >>>
        >>> df = pl.DataFrame(
        ...     {
        ...         "foo": [1, 2, 3, 4, 5],
        ...         "bar": [6, 7, 8, 9, 10],
        ...         "ham": ["a", "b", "c", "d", "e"],
        ...     }
        ... )
        >>> path: pathlib.Path = dirpath / "new_file.arrow"
        >>> df.write_ipc(path)
        """
        from polars.lazyframe.opt_flags import QueryOptFlags

        # no destination supplied: write into a fresh in-memory buffer
        target: str | Path | IO[bytes] = BytesIO() if file is None else file

        # the eager write is delegated to the lazy IPC sink, run on the
        # in-memory engine with the eager optimization flags
        self.lazy().sink_ipc(
            target,
            compression=compression,
            compat_level=compat_level,
            storage_options=storage_options,
            credential_provider=credential_provider,
            retries=retries,
            optimizations=QueryOptFlags._eager(),
            engine="in-memory",
        )

        # only hand the buffer back when we created it ourselves
        if file is None:
            return target  # type: ignore[return-value]
        return None
3860

3861
    # Typing overload: with `file=None` the serialized stream is handed back
    # to the caller as an in-memory `BytesIO`.
    @overload
    def write_ipc_stream(
        self,
        file: None,
        *,
        compression: IpcCompression = "uncompressed",
        compat_level: CompatLevel | None = None,
    ) -> BytesIO: ...

    # Typing overload: with an explicit path / file object nothing is returned.
    @overload
    def write_ipc_stream(
        self,
        file: str | Path | IO[bytes],
        *,
        compression: IpcCompression = "uncompressed",
        compat_level: CompatLevel | None = None,
    ) -> None: ...

    @deprecate_renamed_parameter("future", "compat_level", version="1.1")
    def write_ipc_stream(
        self,
        file: str | Path | IO[bytes] | None,
        *,
        compression: IpcCompression = "uncompressed",
        compat_level: CompatLevel | None = None,
    ) -> BytesIO | None:
        """
        Write to Arrow IPC record batch stream.

        See "Streaming format" in https://arrow.apache.org/docs/python/ipc.html.

        .. versionchanged:: 1.1
            The `future` parameter was renamed `compat_level`.

        Parameters
        ----------
        file
            Path or writable file-like object to which the IPC record batch data will
            be written. If set to `None`, the output is returned as a BytesIO object.
        compression : {'uncompressed', 'lz4', 'zstd'}
            Compression method. Defaults to "uncompressed".
        compat_level
            Use a specific compatibility level
            when exporting Polars' internal data structures.

        Returns
        -------
        BytesIO or None
            The buffer that was written to when `file` is `None`; otherwise `None`.

        Raises
        ------
        TypeError
            If `compat_level` is not `None`, a `CompatLevel`, or a boolean.

        Examples
        --------
        >>> import pathlib
        >>>
        >>> df = pl.DataFrame(
        ...     {
        ...         "foo": [1, 2, 3, 4, 5],
        ...         "bar": [6, 7, 8, 9, 10],
        ...         "ham": ["a", "b", "c", "d", "e"],
        ...     }
        ... )
        >>> path: pathlib.Path = dirpath / "new_file.arrow"
        >>> df.write_ipc_stream(path)
        """
        return_bytes = file is None
        if return_bytes:
            # no destination supplied: write into a fresh in-memory buffer
            file = BytesIO()
        elif isinstance(file, (str, Path)):
            file = normalize_filepath(file)

        # translate the public `compat_level` argument into the plain value
        # that is forwarded to the underlying frame
        compat_level_py: int | bool
        if compat_level is None:
            compat_level_py = True
        elif isinstance(compat_level, CompatLevel):
            compat_level_py = compat_level._version
        elif isinstance(compat_level, bool):
            # A boolean can arrive here through the renamed `future` keyword
            # (presumably a bool flag in older releases -- TODO confirm). The
            # underlying writer already receives a bool in the default case
            # above, so forward it unchanged instead of leaving
            # `compat_level_py` unbound.
            compat_level_py = compat_level
        else:
            # previously this fell through and failed later with an opaque
            # UnboundLocalError; fail early with an actionable message instead
            msg = (
                "`compat_level` must be a `CompatLevel`, a bool, or None; "
                f"got {type(compat_level).__name__!r}"
            )
            raise TypeError(msg)

        # tolerate an explicit `compression=None` by falling back to the default
        if compression is None:
            compression = "uncompressed"

        self._df.write_ipc_stream(file, compression, compat_level_py)
        return file if return_bytes else None  # type: ignore[return-value]
3937

3938
    def write_parquet(
        self,
        file: str | Path | IO[bytes],
        *,
        compression: ParquetCompression = "zstd",
        compression_level: int | None = None,
        statistics: bool | str | dict[str, bool] = True,
        row_group_size: int | None = None,
        data_page_size: int | None = None,
        use_pyarrow: bool = False,
        pyarrow_options: dict[str, Any] | None = None,
        partition_by: str | Sequence[str] | None = None,
        partition_chunk_size_bytes: int = 4_294_967_296,
        storage_options: dict[str, Any] | None = None,
        credential_provider: (
            CredentialProviderFunction | Literal["auto"] | None
        ) = "auto",
        retries: int = 2,
        metadata: ParquetMetadata | None = None,
        mkdir: bool = False,
    ) -> None:
        """
        Write to Apache Parquet file.

        Parameters
        ----------
        file
            File path or writable file-like object to which the result will be written.
            This should be a path to a directory if writing a partitioned dataset.
        compression : {'lz4', 'uncompressed', 'snappy', 'gzip', 'lzo', 'brotli', 'zstd'}
            Choose "zstd" for good compression performance.
            Choose "lz4" for fast compression/decompression.
            Choose "snappy" for more backwards compatibility guarantees
            when you deal with older parquet readers.
        compression_level
            The level of compression to use. Higher compression means smaller files on
            disk.

            - "gzip" : min-level: 0, max-level: 9.
            - "brotli" : min-level: 0, max-level: 11.
            - "zstd" : min-level: 1, max-level: 22.

        statistics
            Write statistics to the parquet headers. This is the default behavior.

            Possible values:

            - `True`: enable default set of statistics (default). Some
              statistics may be disabled.
            - `False`: disable all statistics
            - "full": calculate and write all available statistics. Cannot be
              combined with `use_pyarrow`.
            - `{ "statistic-key": True / False, ... }`. Cannot be combined with
              `use_pyarrow`. Available keys:

              - "min": column minimum value (default: `True`)
              - "max": column maximum value (default: `True`)
              - "distinct_count": number of unique column values (default: `False`)
              - "null_count": number of null values in column (default: `True`)
        row_group_size
            Size of the row groups in number of rows. Defaults to 512^2 rows.
        data_page_size
            Size of the data page in bytes. Defaults to 1024^2 bytes.
        use_pyarrow
            Use C++ parquet implementation vs Rust parquet implementation.
            At the moment C++ supports more features.
        pyarrow_options
            Arguments passed to `pyarrow.parquet.write_table`.

            If you pass `partition_cols` here, the dataset will be written
            using `pyarrow.parquet.write_to_dataset`.
            The `partition_cols` parameter leads to write the dataset to a directory.
            Similar to Spark's partitioned datasets.

            The dictionary passed in is not modified.
        partition_by
            Column(s) to partition by. A partitioned dataset will be written if this is
            specified. This parameter is considered unstable and is subject to change.
            Partitioning by this parameter is only performed when `use_pyarrow` is
            `False`.
        partition_chunk_size_bytes
            Approximate size to split DataFrames within a single partition when
            writing. Note this is calculated using the size of the DataFrame in
            memory - the size of the output file may differ depending on the
            file format / compression.
        storage_options
            Options that indicate how to connect to a cloud provider.

            The cloud providers currently supported are AWS, GCP, and Azure.
            See supported keys here:

            * `aws <https://docs.rs/object_store/latest/object_store/aws/enum.AmazonS3ConfigKey.html>`_
            * `gcp <https://docs.rs/object_store/latest/object_store/gcp/enum.GoogleConfigKey.html>`_
            * `azure <https://docs.rs/object_store/latest/object_store/azure/enum.AzureConfigKey.html>`_
            * Hugging Face (`hf://`): Accepts an API key under the `token` parameter: \
            `{'token': '...'}`, or by setting the `HF_TOKEN` environment variable.

            If `storage_options` is not provided, Polars will try to infer the
            information from environment variables.
        credential_provider
            Provide a function that can be called to provide cloud storage
            credentials. The function is expected to return a dictionary of
            credential keys along with an optional credential expiry time.

            .. warning::
                This functionality is considered **unstable**. It may be changed
                at any point without it being considered a breaking change.
        retries
            Number of retries if accessing a cloud instance fails.
        metadata
            A dictionary or callback to add key-values to the file-level Parquet
            metadata.

            .. warning::
                This functionality is considered **experimental**. It may be removed or
                changed at any point without it being considered a breaking change.
        mkdir
            Recursively create all the directories in the path.

            .. warning::
                This functionality is considered **unstable**. It may be changed at any
                point without it being considered a breaking change.

        Raises
        ------
        ValueError
            If `use_pyarrow=True` is combined with non-boolean `statistics`,
            with `metadata`, or with `mkdir`.
        TypeError
            If `partition_by` is set (without `use_pyarrow`) and `file` is not a
            path.

        Examples
        --------
        >>> import pathlib
        >>>
        >>> df = pl.DataFrame(
        ...     {
        ...         "foo": [1, 2, 3, 4, 5],
        ...         "bar": [6, 7, 8, 9, 10],
        ...         "ham": ["a", "b", "c", "d", "e"],
        ...     }
        ... )
        >>> path: pathlib.Path = dirpath / "new_file.parquet"
        >>> df.write_parquet(path)

        We can use pyarrow (`use_pyarrow=True`) together with the
        `partition_cols` entry of `pyarrow_options` to write partitioned
        datasets. The following example will write the first row to
        ../watermark=1/*.parquet and the other rows to
        ../watermark=2/*.parquet.

        >>> df = pl.DataFrame({"a": [1, 2, 3], "watermark": [1, 2, 2]})
        >>> path: pathlib.Path = dirpath / "partitioned_object"
        >>> df.write_parquet(
        ...     path,
        ...     use_pyarrow=True,
        ...     pyarrow_options={"partition_cols": ["watermark"]},
        ... )
        """
        # tolerate an explicit `None` even though the annotation does not allow it
        if compression is None:
            compression = "uncompressed"
        if isinstance(file, (str, Path)):
            # a partitioned write targets a directory, so the "not a directory"
            # check must be skipped in that case
            if partition_by is not None or (
                pyarrow_options is not None and pyarrow_options.get("partition_cols")
            ):
                file = normalize_filepath(file, check_not_directory=False)
            else:
                file = normalize_filepath(file)

        if use_pyarrow:
            if statistics == "full" or isinstance(statistics, dict):
                msg = "write_parquet with `use_pyarrow=True` allows only boolean values for `statistics`"
                raise ValueError(msg)
            if metadata is not None:
                msg = "write_parquet with `use_pyarrow=True` cannot be combined with `metadata`"
                raise ValueError(msg)
            if mkdir:
                msg = "write_parquet with `use_pyarrow=True` cannot be combined with `mkdir`"
                raise ValueError(msg)

            tbl = self.to_arrow()
            data = {}

            for i, column in enumerate(tbl):
                # extract the name before casting
                name = f"column_{i}" if column._name is None else column._name

                data[name] = column

            tbl = pa.table(data)

            # do not remove this import!
            # needed below
            import pyarrow.parquet  # noqa: F401

            # Build a private options dict rather than writing into the caller's
            # dictionary; the explicit keyword arguments take precedence over any
            # same-named entries supplied via `pyarrow_options`.
            pyarrow_options = {
                **(pyarrow_options or {}),
                "compression": None if compression == "uncompressed" else compression,
                "compression_level": compression_level,
                "write_statistics": statistics,
                "row_group_size": row_group_size,
                "data_page_size": data_page_size,
            }

            if pyarrow_options.get("partition_cols"):
                pa.parquet.write_to_dataset(
                    table=tbl,
                    root_path=file,
                    **pyarrow_options,
                )
            else:
                pa.parquet.write_table(
                    table=tbl,
                    where=file,
                    **pyarrow_options,
                )

            return

        target: str | Path | IO[bytes] | PartitioningScheme = file
        engine: EngineType = "in-memory"
        if partition_by is not None:
            # path-like inputs were normalised above; anything that is not a
            # string at this point cannot be used as a partition root
            if not isinstance(file, str):
                msg = "expected file to be a `str` since partition-by is set"
                raise TypeError(msg)

            from polars.io import PartitionByKey

            target = PartitionByKey(file, by=partition_by)
            mkdir = True
            engine = "streaming"

        from polars.lazyframe.opt_flags import QueryOptFlags

        self.lazy().sink_parquet(
            target,
            compression=compression,
            compression_level=compression_level,
            statistics=statistics,
            row_group_size=row_group_size,
            data_page_size=data_page_size,
            storage_options=storage_options,
            credential_provider=credential_provider,
            retries=retries,
            metadata=metadata,
            engine=engine,
            mkdir=mkdir,
            optimizations=QueryOptFlags._eager(),
        )
4175

4176
    def write_database(
        self,
        table_name: str,
        connection: ConnectionOrCursor | str,
        *,
        if_table_exists: DbWriteMode = "fail",
        engine: DbWriteEngine | None = None,
        engine_options: dict[str, Any] | None = None,
    ) -> int:
        """
        Write the data in a Polars DataFrame to a database.

        .. versionadded:: 0.20.26
            Support for instantiated connection objects in addition to URI strings, and
            a new `engine_options` parameter.

        Parameters
        ----------
        table_name
            Schema-qualified name of the table to create or append to in the target
            SQL database. If your table name contains special characters, it should
            be quoted.
        connection
            An existing SQLAlchemy or ADBC connection against the target database, or
            a URI string that will be used to instantiate such a connection, such as:

            * "postgresql://user:pass@server:port/database"
            * "sqlite:////path/to/database.db"
        if_table_exists : {'append', 'replace', 'fail'}
            The insert mode:

            * 'replace' will create a new database table, overwriting an existing one.
            * 'append' will append to an existing table.
            * 'fail' will fail if table already exists.
        engine : {'sqlalchemy', 'adbc'}
            Select the engine to use for writing frame data; only necessary when
            supplying a URI string (defaults to 'sqlalchemy' if unset)
        engine_options
            Additional options to pass to the insert method associated with the engine
            specified by the option `engine`.

            * Setting `engine` to "sqlalchemy" currently inserts using Pandas' `to_sql`
              method (though this will eventually be phased out in favor of a native
              solution).
            * Setting `engine` to "adbc" inserts using the ADBC cursor's `adbc_ingest`
              method.

        Returns
        -------
        int
            The number of rows affected, if the driver provides this information.
            Otherwise, returns -1.

        Examples
        --------
        Insert into a temporary table using a PostgreSQL URI and the ADBC engine:

        >>> df.write_database(
        ...     table_name="target_table",
        ...     connection="postgresql://user:pass@server:port/database",
        ...     engine="adbc",
        ...     engine_options={"temporary": True},
        ... )  # doctest: +SKIP

        Insert into a table using a `pyodbc` SQLAlchemy connection to SQL Server
        that was instantiated with "fast_executemany=True" to improve performance:

        >>> pyodbc_uri = (
        ...     "mssql+pyodbc://user:pass@server:1433/test?"
        ...     "driver=ODBC+Driver+18+for+SQL+Server"
        ... )
        >>> engine = create_engine(pyodbc_uri, fast_executemany=True)  # doctest: +SKIP
        >>> df.write_database(
        ...     table_name="target_table",
        ...     connection=engine,
        ... )  # doctest: +SKIP
        """
        valid_write_modes = get_args(DbWriteMode)
        if if_table_exists not in valid_write_modes:
            allowed = ", ".join(repr(m) for m in valid_write_modes)
            msg = f"write_database `if_table_exists` must be one of {{{allowed}}}, got {if_table_exists!r}"
            raise ValueError(msg)

        # top-level package that the connection object's type was defined in
        connection_module_root = type(connection).__module__.partition(".")[0]

        # infer the engine from the connection when it was not given explicitly
        if engine is None:
            if isinstance(connection, str) or connection_module_root == "sqlalchemy":
                engine = "sqlalchemy"
            elif connection_module_root.startswith("adbc"):
                engine = "adbc"

        def unpack_table_name(name: str) -> tuple[str | None, str | None, str]:
            """Split a dotted (optionally quoted) name into catalog/schema/table."""
            from csv import reader as delimited_read

            # the csv reader takes care of quoted components that contain dots
            parts: list[str | None] = next(delimited_read([name], delimiter="."))  # type: ignore[arg-type]
            if len(parts) > 3:
                msg = f"`table_name` appears to be invalid: '{name}'"
                raise ValueError(msg)
            # left-pad with None so that there are always exactly three entries
            padding: list[str | None] = [None] * (3 - len(parts))
            catalog, schema, tbl = padding + parts
            return catalog, schema, tbl  # type: ignore[return-value]

        if engine == "adbc":
            from polars.io.database._utils import (
                _get_adbc_module_name_from_uri,
                _import_optional_adbc_driver,
                _open_adbc_connection,
            )

            # only connections that are opened here are closed again afterwards
            if isinstance(connection, str):
                conn = _open_adbc_connection(connection)
                can_close_conn = True
            else:
                conn = connection
                can_close_conn = False

            driver_manager = import_optional("adbc_driver_manager")

            # base class for ADBC connections
            if not isinstance(conn, driver_manager.dbapi.Connection):
                msg = f"unrecognised connection type {connection!r}"
                raise TypeError(msg)

            driver_manager_str_version = getattr(driver_manager, "__version__", "0.0")
            driver_manager_version = parse_version(driver_manager_str_version)

            # ADBC ingest mode for each `if_table_exists` value; if the table
            # exists, 'create' will raise an error, resulting in behaviour
            # equivalent to 'fail'
            adbc_modes = {"fail": "create", "replace": "replace", "append": "append"}
            if if_table_exists not in adbc_modes:
                msg = (
                    f"unexpected value for `if_table_exists`: {if_table_exists!r}"
                    f"\n\nChoose one of {{'fail', 'replace', 'append'}}"
                )
                raise ValueError(msg)
            if if_table_exists == "replace" and driver_manager_version < (0, 7):
                msg = (
                    "`if_table_exists = 'replace'` requires ADBC version >= 0.7, "
                    f"found {driver_manager_str_version}"
                )
                raise ModuleUpgradeRequiredError(msg)
            mode = adbc_modes[if_table_exists]

            with (
                conn if can_close_conn else contextlib.nullcontext(),
                conn.cursor() as cursor,
            ):
                catalog, db_schema, unpacked_table_name = unpack_table_name(table_name)
                n_rows: int

                if isinstance(connection, str):
                    adbc_module_name = _get_adbc_module_name_from_uri(connection)
                else:
                    adbc_module_name = connection_module_root

                adbc_driver = _import_optional_adbc_driver(
                    adbc_module_name, dbapi_submodule=False
                )
                adbc_driver_str_version = getattr(adbc_driver, "__version__", "0.0")
                adbc_driver_version = parse_version(adbc_driver_str_version)

                if adbc_module_name.rpartition("_")[2] == "sqlite":
                    catalog, db_schema = db_schema, None

                    # note: ADBC didn't support 'replace' until adbc-driver-sqlite
                    # version 0.11 (it was released for other drivers in version 0.7)
                    needs_manual_replace = (
                        driver_manager_version >= (0, 7)
                        and adbc_driver_version < (0, 11)
                        and if_table_exists == "replace"
                    )
                    if needs_manual_replace:
                        cursor.execute(f"DROP TABLE IF EXISTS {table_name}")
                        mode = "create"

                # As of adbc_driver_manager 1.6.0, adbc_ingest can take a Polars
                # DataFrame via the PyCapsule interface
                data = self if driver_manager_version >= (1, 6) else self.to_arrow()

                # use of schema-qualified table names was released in
                # adbc-driver-manager 0.7.0 and is working without bugs from driver
                # version (e.g., adbc-driver-postgresql) version 0.8.0
                supports_qualified_names = driver_manager_version >= (
                    0,
                    7,
                ) and adbc_driver_version >= (0, 8)

                if not supports_qualified_names and db_schema is not None:
                    adbc_driver_pypi_name = adbc_module_name.replace("_", "-")
                    msg = (
                        "use of schema-qualified table names requires "
                        "adbc-driver-manager version >= 0.7.0, found "
                        f"{driver_manager_str_version} and {adbc_driver_pypi_name} "
                        f"version >= 0.8.0, found {adbc_driver_str_version}"
                    )
                    # https://github.com/apache/arrow-adbc/issues/1000
                    # https://github.com/apache/arrow-adbc/issues/1109
                    raise ModuleUpgradeRequiredError(msg)

                extra_options = engine_options or {}
                if supports_qualified_names:
                    n_rows = cursor.adbc_ingest(
                        unpacked_table_name,
                        data=data,
                        mode=mode,
                        catalog_name=catalog,
                        db_schema_name=db_schema,
                        **extra_options,
                    )
                else:
                    n_rows = cursor.adbc_ingest(
                        table_name=unpacked_table_name,
                        data=data,
                        mode=mode,
                        **extra_options,
                    )
                conn.commit()
            return n_rows

        if engine == "sqlalchemy":
            if not _PANDAS_AVAILABLE:
                msg = "writing with 'sqlalchemy' engine currently requires pandas.\n\nInstall with: pip install pandas"
                raise ModuleNotFoundError(msg)

            pd_version = parse_version(pd.__version__)
            if pd_version < (1, 5):
                msg = f"writing with 'sqlalchemy' engine requires pandas >= 1.5; found {pd.__version__!r}"
                raise ModuleUpgradeRequiredError(msg)

            min_sqlalchemy_version = "2.0" if pd_version >= (2, 2) else "1.4"
            import_optional(
                module_name="sqlalchemy",
                min_version=min_sqlalchemy_version,
                min_err_prefix="pandas >= 2.2 requires",
            )
            # note: the catalog (database) should be a part of the connection string
            from sqlalchemy.engine import Connectable, create_engine
            from sqlalchemy.orm import Session

            sa_object: Connectable
            if isinstance(connection, str):
                sa_object = create_engine(connection)
            elif isinstance(connection, Session):
                sa_object = connection.connection()
            elif isinstance(connection, Connectable):
                sa_object = connection
            else:
                msg = f"unrecognised connection type {connection!r}"
                raise TypeError(msg)

            catalog, db_schema, unpacked_table_name = unpack_table_name(table_name)
            if catalog:
                msg = f"Unexpected three-part table name; provide the database/catalog ({catalog!r}) on the connection URI"
                raise ValueError(msg)

            # ensure conversion to pandas uses the pyarrow extension array option
            # so that we can make use of the sql/db export *without* copying data
            pandas_frame = self.to_pandas(use_pyarrow_extension_array=True)
            res: int | None = pandas_frame.to_sql(
                name=unpacked_table_name,
                schema=db_schema,
                con=sa_object,
                if_exists=if_table_exists,
                index=False,
                **(engine_options or {}),
            )
            if res is None:
                return -1
            return res

        # neither supported engine matched: an unknown engine name was given, or
        # no engine could be inferred from the connection object
        if isinstance(engine, str):
            msg = f"engine {engine!r} is not supported"
            raise ValueError(msg)

        msg = f"unrecognised connection type {connection!r}"
        raise TypeError(msg)
4444

4445
    @unstable()
    def write_iceberg(
        self,
        target: str | pyiceberg.table.Table,
        mode: Literal["append", "overwrite"],
    ) -> None:
        """
        Write DataFrame to an Iceberg table.

        .. warning::
            This functionality is currently considered **unstable**. It may be
            changed at any point without it being considered a breaking change.

        Parameters
        ----------
        target
            Name of the table or the Table object representing an Iceberg table.
        mode : {'append', 'overwrite'}
            How to handle existing data.

            - If 'append', will add new data.
            - If 'overwrite', will replace table with new data.

        Raises
        ------
        ValueError
            If `mode` is not one of 'append' or 'overwrite'.

        """
        # Reject unknown modes up front: falling through to `overwrite` on a
        # misspelled mode would silently replace the existing table contents.
        if mode not in ("append", "overwrite"):
            msg = f"write_iceberg `mode` must be one of {{'append', 'overwrite'}}, got {mode!r}"
            raise ValueError(msg)

        from pyiceberg.catalog import load_catalog

        if isinstance(target, str):
            # a table name is resolved through the default catalog
            catalog = load_catalog()
            table = catalog.load_table(target)
        else:
            table = target

        # export using the oldest compat level before handing the data to pyiceberg
        data = self.to_arrow(compat_level=CompatLevel.oldest())

        if mode == "append":
            table.append(data)
        else:
            table.overwrite(data)
4483

4484
    # Typing-only overload: for the modes "error", "append", "overwrite" and
    # "ignore" the call accepts `delta_write_options` and is annotated as
    # returning None.
    @overload
    def write_delta(
        self,
        target: str | Path | deltalake.DeltaTable,
        *,
        mode: Literal["error", "append", "overwrite", "ignore"] = ...,
        overwrite_schema: bool | None = ...,
        storage_options: dict[str, str] | None = ...,
        credential_provider: CredentialProviderFunction | Literal["auto"] | None = ...,
        delta_write_options: dict[str, Any] | None = ...,
    ) -> None: ...
4495

4496
    # Typing-only overload: with `mode="merge"` (which must be passed
    # explicitly) `delta_merge_options` is required and the call is annotated
    # as returning a `deltalake.table.TableMerger`.
    @overload
    def write_delta(
        self,
        target: str | Path | deltalake.DeltaTable,
        *,
        mode: Literal["merge"],
        overwrite_schema: bool | None = ...,
        storage_options: dict[str, str] | None = ...,
        credential_provider: CredentialProviderFunction | Literal["auto"] | None = ...,
        delta_merge_options: dict[str, Any],
    ) -> deltalake.table.TableMerger: ...
4507

4508
    def write_delta(
4509
        self,
4510
        target: str | Path | deltalake.DeltaTable,
4511
        *,
4512
        mode: Literal["error", "append", "overwrite", "ignore", "merge"] = "error",
4513
        overwrite_schema: bool | None = None,
4514
        storage_options: dict[str, str] | None = None,
4515
        credential_provider: CredentialProviderFunction
4516
        | Literal["auto"]
4517
        | None = "auto",
4518
        delta_write_options: dict[str, Any] | None = None,
4519
        delta_merge_options: dict[str, Any] | None = None,
4520
    ) -> deltalake.table.TableMerger | None:
4521
        """
4522
        Write DataFrame as delta table.
4523

4524
        Parameters
4525
        ----------
4526
        target
4527
            URI of a table or a DeltaTable object.
4528
        mode : {'error', 'append', 'overwrite', 'ignore', 'merge'}
4529
            How to handle existing data.
4530

4531
            - If 'error', throw an error if the table already exists (default).
4532
            - If 'append', will add new data.
4533
            - If 'overwrite', will replace table with new data.
4534
            - If 'ignore', will not write anything if table already exists.
4535
            - If 'merge', return a `TableMerger` object to merge data from the DataFrame
4536
              with the existing data.
4537
        overwrite_schema
4538
            If True, allows updating the schema of the table.
4539

4540
            .. deprecated:: 0.20.14
4541
                Use the parameter `delta_write_options` instead and pass
4542
                `{"schema_mode": "overwrite"}`.
4543
        storage_options
4544
            Extra options for the storage backends supported by `deltalake`.
4545
            For cloud storages, this may include configurations for authentication etc.
4546

4547
            - See a list of supported storage options for S3 `here <https://docs.rs/object_store/latest/object_store/aws/enum.AmazonS3ConfigKey.html#variants>`__.
4548
            - See a list of supported storage options for GCS `here <https://docs.rs/object_store/latest/object_store/gcp/enum.GoogleConfigKey.html#variants>`__.
4549
            - See a list of supported storage options for Azure `here <https://docs.rs/object_store/latest/object_store/azure/enum.AzureConfigKey.html#variants>`__.
4550
        credential_provider
4551
            Provide a function that can be called to provide cloud storage
4552
            credentials. The function is expected to return a dictionary of
4553
            credential keys along with an optional credential expiry time.
4554

4555
            .. warning::
4556
                This functionality is considered **unstable**. It may be changed
4557
                at any point without it being considered a breaking change.
4558
        delta_write_options
4559
            Additional keyword arguments while writing a Delta lake Table.
4560
            See a list of supported write options `here <https://delta-io.github.io/delta-rs/api/delta_writer/#deltalake.write_deltalake>`__.
4561
        delta_merge_options
4562
            Keyword arguments which are required to `MERGE` a Delta lake Table.
4563
            See a list of supported merge options `here <https://delta-io.github.io/delta-rs/api/delta_table/#deltalake.DeltaTable.merge>`__.
4564

4565
        Raises
4566
        ------
4567
        TypeError
4568
            If the DataFrame contains unsupported data types.
4569
        ArrowInvalidError
4570
            If the DataFrame contains data types that could not be cast to their
4571
            primitive type.
4572
        TableNotFoundError
4573
            If the delta table doesn't exist and MERGE action is triggered
4574

4575
        Notes
4576
        -----
4577
        The Polars data types :class:`Null` and :class:`Time` are not supported
4578
        by the delta protocol specification and will raise a TypeError. Columns
4579
        using The :class:`Categorical` data type will be converted to
4580
        normal (non-categorical) strings when written.
4581

4582
        Polars columns are always nullable. To write data to a delta table with
4583
        non-nullable columns, a custom pyarrow schema has to be passed to the
4584
        `delta_write_options`. See the last example below.
4585

4586
        Examples
4587
        --------
4588
        Write a dataframe to the local filesystem as a Delta Lake table.
4589

4590
        >>> df = pl.DataFrame(
4591
        ...     {
4592
        ...         "foo": [1, 2, 3, 4, 5],
4593
        ...         "bar": [6, 7, 8, 9, 10],
4594
        ...         "ham": ["a", "b", "c", "d", "e"],
4595
        ...     }
4596
        ... )
4597
        >>> table_path = "/path/to/delta-table/"
4598
        >>> df.write_delta(table_path)  # doctest: +SKIP
4599

4600
        Append data to an existing Delta Lake table on the local filesystem.
4601
        Note that this will fail if the schema of the new data does not match the
4602
        schema of the existing table.
4603

4604
        >>> df.write_delta(table_path, mode="append")  # doctest: +SKIP
4605

4606
        Overwrite a Delta Lake table as a new version.
4607
        If the schemas of the new and old data are the same, specifying the
4608
        `schema_mode` is not required.
4609

4610
        >>> existing_table_path = "/path/to/delta-table/"
4611
        >>> df.write_delta(
4612
        ...     existing_table_path,
4613
        ...     mode="overwrite",
4614
        ...     delta_write_options={"schema_mode": "overwrite"},
4615
        ... )  # doctest: +SKIP
4616

4617
        Write a DataFrame as a Delta Lake table to a cloud object store like S3.
4618

4619
        >>> table_path = "s3://bucket/prefix/to/delta-table/"
4620
        >>> df.write_delta(
4621
        ...     table_path,
4622
        ...     storage_options={
4623
        ...         "AWS_REGION": "THE_AWS_REGION",
4624
        ...         "AWS_ACCESS_KEY_ID": "THE_AWS_ACCESS_KEY_ID",
4625
        ...         "AWS_SECRET_ACCESS_KEY": "THE_AWS_SECRET_ACCESS_KEY",
4626
        ...     },
4627
        ... )  # doctest: +SKIP
4628

4629
        Write DataFrame as a Delta Lake table with non-nullable columns.
4630

4631
        >>> import pyarrow as pa
4632
        >>> existing_table_path = "/path/to/delta-table/"
4633
        >>> df.write_delta(
4634
        ...     existing_table_path,
4635
        ...     delta_write_options={
4636
        ...         "schema": pa.schema([pa.field("foo", pa.int64(), nullable=False)])
4637
        ...     },
4638
        ... )  # doctest: +SKIP
4639

4640
        Write DataFrame as a Delta Lake table with zstd compression.
4641
        For all `delta_write_options` keyword arguments, check the deltalake docs
4642
        `here
4643
        <https://delta-io.github.io/delta-rs/api/delta_writer/#deltalake.write_deltalake>`__,
4644
        and for Writer Properties in particular `here
4645
        <https://delta-io.github.io/delta-rs/api/delta_writer/#deltalake.WriterProperties>`__.
4646

4647
        >>> import deltalake
4648
        >>> df.write_delta(
4649
        ...     table_path,
4650
        ...     delta_write_options={
4651
        ...         "writer_properties": deltalake.WriterProperties(compression="zstd"),
4652
        ...     },
4653
        ... )  # doctest: +SKIP
4654

4655
        Merge the DataFrame with an existing Delta Lake table.
4656
        For all `TableMerger` methods, check the deltalake docs
4657
        `here <https://delta-io.github.io/delta-rs/api/delta_table/delta_table_merger/>`__.
4658

4659
        >>> df = pl.DataFrame(
4660
        ...     {
4661
        ...         "foo": [1, 2, 3, 4, 5],
4662
        ...         "bar": [6, 7, 8, 9, 10],
4663
        ...         "ham": ["a", "b", "c", "d", "e"],
4664
        ...     }
4665
        ... )
4666
        >>> table_path = "/path/to/delta-table/"
4667
        >>> (
4668
        ...     df.write_delta(
4669
        ...         "table_path",
4670
        ...         mode="merge",
4671
        ...         delta_merge_options={
4672
        ...             "predicate": "s.foo = t.foo",
4673
        ...             "source_alias": "s",
4674
        ...             "target_alias": "t",
4675
        ...         },
4676
        ...     )
4677
        ...     .when_matched_update_all()
4678
        ...     .when_not_matched_insert_all()
4679
        ...     .execute()
4680
        ... )  # doctest: +SKIP
4681
        """
4682
        if overwrite_schema is not None:
4683
            issue_deprecation_warning(
4684
                "the parameter `overwrite_schema` for `write_delta` is deprecated."
4685
                ' Use the parameter `delta_write_options` instead and pass `{"schema_mode": "overwrite"}`.',
4686
                version="0.20.14",
4687
            )
4688

4689
        from polars.io.delta import (
4690
            _check_for_unsupported_types,
4691
            _check_if_delta_available,
4692
            _resolve_delta_lake_uri,
4693
        )
4694

4695
        _check_if_delta_available()
4696

4697
        from deltalake import DeltaTable, write_deltalake
4698

4699
        _check_for_unsupported_types(self.dtypes)
4700

4701
        if isinstance(target, (str, Path)):
4702
            target = _resolve_delta_lake_uri(str(target), strict=False)
4703

4704
        from polars.io.cloud.credential_provider._builder import (
4705
            _init_credential_provider_builder,
4706
        )
4707
        from polars.io.cloud.credential_provider._providers import (
4708
            _get_credentials_from_provider_expiry_aware,
4709
        )
4710

4711
        if not isinstance(target, DeltaTable):
4712
            credential_provider_builder = _init_credential_provider_builder(
4713
                credential_provider, target, storage_options, "write_delta"
4714
            )
4715
        elif credential_provider is not None and credential_provider != "auto":
4716
            msg = "cannot use credential_provider when passing a DeltaTable object"
4717
            raise ValueError(msg)
4718
        else:
4719
            credential_provider_builder = None
4720

4721
        del credential_provider
4722

4723
        credential_provider_creds = {}
4724

4725
        if credential_provider_builder and (
4726
            provider := credential_provider_builder.build_credential_provider()
4727
        ):
4728
            credential_provider_creds = (
4729
                _get_credentials_from_provider_expiry_aware(provider) or {}
4730
            )
4731

4732
        # We aren't calling into polars-native write functions so we just update
4733
        # the storage_options here.
4734
        storage_options = (
4735
            {**(storage_options or {}), **credential_provider_creds}
4736
            if storage_options is not None or credential_provider_builder is not None
4737
            else None
4738
        )
4739

4740
        if mode == "merge":
4741
            if delta_merge_options is None:
4742
                msg = "you need to pass delta_merge_options with at least a given predicate for `MERGE` to work."
4743
                raise ValueError(msg)
4744
            if isinstance(target, str):
4745
                dt = DeltaTable(table_uri=target, storage_options=storage_options)
4746
            else:
4747
                dt = target
4748

4749
            return dt.merge(self, **delta_merge_options)
4750

4751
        else:
4752
            if delta_write_options is None:
4753
                delta_write_options = {}
4754

4755
            if overwrite_schema:
4756
                delta_write_options["schema_mode"] = "overwrite"
4757

4758
            write_deltalake(
4759
                table_or_uri=target,
4760
                data=self,
4761
                mode=mode,
4762
                storage_options=storage_options,
4763
                **delta_write_options,
4764
            )
4765
            return None
4766

4767
    def estimated_size(self, unit: SizeUnit = "b") -> int | float:
        """
        Estimate the total (heap) allocated size of the `DataFrame`.

        The result is expressed in the requested unit (bytes by default).

        The estimate adds up the size of the frame's buffers and validity
        bitmaps, nested arrays included. Several arrays may share the same
        buffers and bitmaps, so the size of 2 arrays is not the sum of the sizes
        reported by this function for each of them. In particular, the size
        reported for a [`StructArray`] is an upper bound.

        Slicing an array leaves its allocated size constant, because the
        underlying buffer is unchanged. This function will nevertheless report a
        smaller number for the slice, since it measures the visible size of the
        buffer rather than its total capacity.

        FFI buffers are counted in this estimate.

        Parameters
        ----------
        unit : {'b', 'kb', 'mb', 'gb', 'tb'}
            Unit that the returned size is scaled to.

        Notes
        -----
        For data with Object dtype only the pointer size is counted, which is a
        huge underestimation of the real size.

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     {
        ...         "x": list(reversed(range(1_000_000))),
        ...         "y": [v / 1000 for v in range(1_000_000)],
        ...         "z": [str(v) for v in range(1_000_000)],
        ...     },
        ...     schema=[("x", pl.UInt32), ("y", pl.Float64), ("z", pl.String)],
        ... )
        >>> df.estimated_size()
        17888890
        >>> df.estimated_size("mb")
        17.0601749420166
        """
        # The native frame reports bytes; scaling to `unit` happens on this side.
        return scale_bytes(self._df.estimated_size(), unit)
4811

4812
    def transpose(
        self,
        *,
        include_header: bool = False,
        header_name: str = "column",
        column_names: str | Iterable[str] | None = None,
    ) -> DataFrame:
        """
        Flip the DataFrame over its diagonal, turning rows into columns.

        Parameters
        ----------
        include_header
            When set, the original column names are added as the first column.
        header_name
            Name given to that inserted column; only used when `include_header`
            is set.
        column_names
            Either a string naming an existing column, or an optional iterable
            yielding strings. These name the value (non-header) columns of the
            transposed data.

        Returns
        -------
        DataFrame

        Notes
        -----
        This is a very expensive operation. Perhaps you can do it differently.

        Examples
        --------
        >>> df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
        >>> df.transpose(include_header=True)
        shape: (2, 4)
        ┌────────┬──────────┬──────────┬──────────┐
        │ column ┆ column_0 ┆ column_1 ┆ column_2 │
        │ ---    ┆ ---      ┆ ---      ┆ ---      │
        │ str    ┆ i64      ┆ i64      ┆ i64      │
        ╞════════╪══════════╪══════════╪══════════╡
        │ a      ┆ 1        ┆ 2        ┆ 3        │
        │ b      ┆ 4        ┆ 5        ┆ 6        │
        └────────┴──────────┴──────────┴──────────┘

        Replace the auto-generated column names with a list

        >>> df.transpose(include_header=False, column_names=["x", "y", "z"])
        shape: (2, 3)
        ┌─────┬─────┬─────┐
        │ x   ┆ y   ┆ z   │
        │ --- ┆ --- ┆ --- │
        │ i64 ┆ i64 ┆ i64 │
        ╞═════╪═════╪═════╡
        │ 1   ┆ 2   ┆ 3   │
        │ 4   ┆ 5   ┆ 6   │
        └─────┴─────┴─────┘

        Include the header as a separate column

        >>> df.transpose(
        ...     include_header=True, header_name="foo", column_names=["x", "y", "z"]
        ... )
        shape: (2, 4)
        ┌─────┬─────┬─────┬─────┐
        │ foo ┆ x   ┆ y   ┆ z   │
        │ --- ┆ --- ┆ --- ┆ --- │
        │ str ┆ i64 ┆ i64 ┆ i64 │
        ╞═════╪═════╪═════╪═════╡
        │ a   ┆ 1   ┆ 2   ┆ 3   │
        │ b   ┆ 4   ┆ 5   ┆ 6   │
        └─────┴─────┴─────┴─────┘

        Replace the auto-generated column with column names from a generator function

        >>> def name_generator():
        ...     base_name = "my_column_"
        ...     count = 0
        ...     while True:
        ...         yield f"{base_name}{count}"
        ...         count += 1
        >>> df.transpose(include_header=False, column_names=name_generator())
        shape: (2, 3)
        ┌─────────────┬─────────────┬─────────────┐
        │ my_column_0 ┆ my_column_1 ┆ my_column_2 │
        │ ---         ┆ ---         ┆ ---         │
        │ i64         ┆ i64         ┆ i64         │
        ╞═════════════╪═════════════╪═════════════╡
        │ 1           ┆ 2           ┆ 3           │
        │ 4           ┆ 5           ┆ 6           │
        └─────────────┴─────────────┴─────────────┘

        Use an existing column as the new column names

        >>> df = pl.DataFrame(dict(id=["i", "j", "k"], a=[1, 2, 3], b=[4, 5, 6]))
        >>> df.transpose(column_names="id")
        shape: (2, 3)
        ┌─────┬─────┬─────┐
        │ i   ┆ j   ┆ k   │
        │ --- ┆ --- ┆ --- │
        │ i64 ┆ i64 ┆ i64 │
        ╞═════╪═════╪═════╡
        │ 1   ┆ 2   ┆ 3   │
        │ 4   ┆ 5   ┆ 6   │
        └─────┴─────┴─────┘
        >>> df.transpose(include_header=True, header_name="new_id", column_names="id")
        shape: (2, 4)
        ┌────────┬─────┬─────┬─────┐
        │ new_id ┆ i   ┆ j   ┆ k   │
        │ ---    ┆ --- ┆ --- ┆ --- │
        │ str    ┆ i64 ┆ i64 ┆ i64 │
        ╞════════╪═════╪═════╪═════╡
        │ a      ┆ 1   ┆ 2   ┆ 3   │
        │ b      ┆ 4   ┆ 5   ┆ 6   │
        └────────┴─────┴─────┴─────┘
        """
        # A header column is only requested from the native layer when asked for.
        header_column = header_name if include_header else None

        resolved_names: Sequence[str] | None
        if not isinstance(column_names, Generator):
            # Anything that is not a generator (string, list, None, ...) is
            # forwarded to the native transpose untouched.
            resolved_names = column_names  # type: ignore[assignment]
        else:
            # A generator may be endless, so draw exactly `self.height` names
            # from it: one per row of this frame.
            n_names = self.height
            resolved_names = [next(column_names) for _ in range(n_names)]

        transposed = self._df.transpose(header_column, resolved_names)
        return self._from_pydf(transposed)
4933

4934
    def reverse(self) -> DataFrame:
        """
        Return the DataFrame with its rows in reversed order.

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     {
        ...         "key": ["a", "b", "c"],
        ...         "val": [1, 2, 3],
        ...     }
        ... )
        >>> df.reverse()
        shape: (3, 2)
        ┌─────┬─────┐
        │ key ┆ val │
        │ --- ┆ --- │
        │ str ┆ i64 │
        ╞═════╪═════╡
        │ c   ┆ 3   │
        │ b   ┆ 2   │
        │ a   ┆ 1   │
        └─────┴─────┘
        """
        # Select every column ("*") with the reverse expression applied to it.
        every_column_reversed = F.col("*").reverse()
        return self.select(every_column_reversed)
4959

4960
    def rename(
        self, mapping: Mapping[str, str] | Callable[[str], str], *, strict: bool = True
    ) -> DataFrame:
        """
        Give columns new names.

        Parameters
        ----------
        mapping
            Either key value pairs mapping each old name to its new name, or a
            function that receives the old name and returns the new one.
        strict
            Check that every column name exists in the current schema and throw
            an exception if any does not. (Note that this parameter is a no-op
            when a function is passed to `mapping`).

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]}
        ... )
        >>> df.rename({"foo": "apple"})
        shape: (3, 3)
        ┌───────┬─────┬─────┐
        │ apple ┆ bar ┆ ham │
        │ ---   ┆ --- ┆ --- │
        │ i64   ┆ i64 ┆ str │
        ╞═══════╪═════╪═════╡
        │ 1     ┆ 6   ┆ a   │
        │ 2     ┆ 7   ┆ b   │
        │ 3     ┆ 8   ┆ c   │
        └───────┴─────┴─────┘
        >>> df.rename(lambda column_name: "c" + column_name[1:])
        shape: (3, 3)
        ┌─────┬─────┬─────┐
        │ coo ┆ car ┆ cam │
        │ --- ┆ --- ┆ --- │
        │ i64 ┆ i64 ┆ str │
        ╞═════╪═════╪═════╡
        │ 1   ┆ 6   ┆ a   │
        │ 2   ┆ 7   ┆ b   │
        │ 3   ┆ 8   ┆ c   │
        └─────┴─────┴─────┘
        """
        from polars.lazyframe.opt_flags import QueryOptFlags

        # Delegate to the lazy implementation, then collect straight away
        # using the eager optimization flags.
        renamed_plan = self.lazy().rename(mapping, strict=strict)
        return renamed_plan.collect(optimizations=QueryOptFlags._eager())
5011

5012
    def insert_column(self, index: int, column: IntoExprColumn) -> DataFrame:
        """
        Place a Series (or expression) at the given column position.

        The frame is modified in place and then returned.

        Parameters
        ----------
        index
            Position at which the new column is inserted. A negative value is
            offset by the number of columns in the frame.
        column
            The `Series` or expression to insert. A string is treated as the
            name of a column.

        Raises
        ------
        IndexError
            If `index` lies outside the range of valid insert positions.
        TypeError
            If `column` is neither a Series, an expression, nor a string.

        Examples
        --------
        Insert a new Series column at the given index:

        >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]})
        >>> s = pl.Series("baz", [97, 98, 99])
        >>> df.insert_column(1, s)
        shape: (3, 3)
        ┌─────┬─────┬─────┐
        │ foo ┆ baz ┆ bar │
        │ --- ┆ --- ┆ --- │
        │ i64 ┆ i64 ┆ i64 │
        ╞═════╪═════╪═════╡
        │ 1   ┆ 97  ┆ 4   │
        │ 2   ┆ 98  ┆ 5   │
        │ 3   ┆ 99  ┆ 6   │
        └─────┴─────┴─────┘

        Insert a new expression column at the given index:

        >>> df = pl.DataFrame(
        ...     {"a": [2, 4, 2], "b": [0.5, 4, 10], "c": ["xx", "yy", "zz"]}
        ... )
        >>> expr = (pl.col("b") / pl.col("a")).alias("b_div_a")
        >>> df.insert_column(2, expr)
        shape: (3, 4)
        ┌─────┬──────┬─────────┬─────┐
        │ a   ┆ b    ┆ b_div_a ┆ c   │
        │ --- ┆ ---  ┆ ---     ┆ --- │
        │ i64 ┆ f64  ┆ f64     ┆ str │
        ╞═════╪══════╪═════════╪═════╡
        │ 2   ┆ 0.5  ┆ 0.25    ┆ xx  │
        │ 4   ┆ 4.0  ┆ 1.0     ┆ yy  │
        │ 2   ┆ 10.0 ┆ 5.0     ┆ zz  │
        └─────┴──────┴─────────┴─────┘
        """
        n_columns = self.width
        requested_index = index
        if index < 0:
            # Negative positions are offset by the current number of columns.
            index += n_columns
        # After the offset, valid insert positions are 0..n_columns inclusive
        # (inserting at `n_columns` is allowed).
        if index < 0 or index > n_columns:
            msg = f"column index {requested_index} is out of range (frame has {n_columns} columns)"
            raise IndexError(msg)

        # A Series is handed directly to the underlying frame.
        if isinstance(column, pl.Series):
            self._df.insert_column(index, column._s)
            return self

        # A bare string is shorthand for a column expression.
        expr = F.col(column) if isinstance(column, str) else column
        if not isinstance(expr, pl.Expr):
            msg = f"column must be a Series or Expr, got {column!r} (type={qualified_type_name(column)})"
            raise TypeError(msg)

        # Rebuild the frame by selecting the existing column names with the
        # expression spliced in at the requested position.
        selection = self.columns
        selection.insert(index, expr)  # type: ignore[arg-type]
        self._df = self.select(selection)._df
        return self
5083

5084
    def filter(
        self,
        *predicates: (
            IntoExprColumn
            | Iterable[IntoExprColumn]
            | bool
            | list[bool]
            | np.ndarray[Any, Any]
        ),
        **constraints: Any,
    ) -> DataFrame:
        """
        Keep only the rows that match the given predicate expression(s).

        The rows that remain keep their original order.

        A row is retained only when the predicate resolves as True for it; rows
        for which the predicate result is False (or null) are discarded.

        Parameters
        ----------
        predicates
            Expression(s) that evaluate to a boolean Series.
        constraints
            Column filters, written as `name = value`. Each constraint behaves
            the same as `pl.col(name).eq(value)` and is implicitly joined with
            the other filter conditions using `&`.

        Notes
        -----
        If you are coming from Pandas and filter on a comparison of two or more
        columns, be aware that in Polars any comparison involving `null` values
        yields a `null` result, *not* boolean True or False. Such rows are
        therefore not retained. Make sure null values are handled appropriately
        to avoid unexpected behaviour (see examples below).

        See Also
        --------
        remove

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     {
        ...         "foo": [1, 2, 3, None, 4, None, 0],
        ...         "bar": [6, 7, 8, None, None, 9, 0],
        ...         "ham": ["a", "b", "c", None, "d", "e", "f"],
        ...     }
        ... )

        Filter rows matching a condition:

        >>> df.filter(pl.col("foo") > 1)
        shape: (3, 3)
        ┌─────┬──────┬─────┐
        │ foo ┆ bar  ┆ ham │
        │ --- ┆ ---  ┆ --- │
        │ i64 ┆ i64  ┆ str │
        ╞═════╪══════╪═════╡
        │ 2   ┆ 7    ┆ b   │
        │ 3   ┆ 8    ┆ c   │
        │ 4   ┆ null ┆ d   │
        └─────┴──────┴─────┘

        Filter on multiple conditions, combined with and/or operators:

        >>> df.filter(
        ...     (pl.col("foo") < 3) & (pl.col("ham") == "a"),
        ... )
        shape: (1, 3)
        ┌─────┬─────┬─────┐
        │ foo ┆ bar ┆ ham │
        │ --- ┆ --- ┆ --- │
        │ i64 ┆ i64 ┆ str │
        ╞═════╪═════╪═════╡
        │ 1   ┆ 6   ┆ a   │
        └─────┴─────┴─────┘

        >>> df.filter(
        ...     (pl.col("foo") == 1) | (pl.col("ham") == "c"),
        ... )
        shape: (2, 3)
        ┌─────┬─────┬─────┐
        │ foo ┆ bar ┆ ham │
        │ --- ┆ --- ┆ --- │
        │ i64 ┆ i64 ┆ str │
        ╞═════╪═════╪═════╡
        │ 1   ┆ 6   ┆ a   │
        │ 3   ┆ 8   ┆ c   │
        └─────┴─────┴─────┘

        Provide multiple filters using `*args` syntax:

        >>> df.filter(
        ...     pl.col("foo") <= 2,
        ...     ~pl.col("ham").is_in(["b", "c"]),
        ... )
        shape: (2, 3)
        ┌─────┬─────┬─────┐
        │ foo ┆ bar ┆ ham │
        │ --- ┆ --- ┆ --- │
        │ i64 ┆ i64 ┆ str │
        ╞═════╪═════╪═════╡
        │ 1   ┆ 6   ┆ a   │
        │ 0   ┆ 0   ┆ f   │
        └─────┴─────┴─────┘

        Provide multiple filters using `**kwargs` syntax:

        >>> df.filter(foo=2, ham="b")
        shape: (1, 3)
        ┌─────┬─────┬─────┐
        │ foo ┆ bar ┆ ham │
        │ --- ┆ --- ┆ --- │
        │ i64 ┆ i64 ┆ str │
        ╞═════╪═════╪═════╡
        │ 2   ┆ 7   ┆ b   │
        └─────┴─────┴─────┘

        Filter by comparing two columns against each other:

        >>> df.filter(
        ...     pl.col("foo") == pl.col("bar"),
        ... )
        shape: (1, 3)
        ┌─────┬─────┬─────┐
        │ foo ┆ bar ┆ ham │
        │ --- ┆ --- ┆ --- │
        │ i64 ┆ i64 ┆ str │
        ╞═════╪═════╪═════╡
        │ 0   ┆ 0   ┆ f   │
        └─────┴─────┴─────┘

        >>> df.filter(
        ...     pl.col("foo") != pl.col("bar"),
        ... )
        shape: (3, 3)
        ┌─────┬─────┬─────┐
        │ foo ┆ bar ┆ ham │
        │ --- ┆ --- ┆ --- │
        │ i64 ┆ i64 ┆ str │
        ╞═════╪═════╪═════╡
        │ 1   ┆ 6   ┆ a   │
        │ 2   ┆ 7   ┆ b   │
        │ 3   ┆ 8   ┆ c   │
        └─────┴─────┴─────┘

        Notice how the row with `None` values is filtered out. In order to keep the
        same behavior as pandas, use:

        >>> df.filter(
        ...     pl.col("foo").ne_missing(pl.col("bar")),
        ... )
        shape: (5, 3)
        ┌──────┬──────┬─────┐
        │ foo  ┆ bar  ┆ ham │
        │ ---  ┆ ---  ┆ --- │
        │ i64  ┆ i64  ┆ str │
        ╞══════╪══════╪═════╡
        │ 1    ┆ 6    ┆ a   │
        │ 2    ┆ 7    ┆ b   │
        │ 3    ┆ 8    ┆ c   │
        │ 4    ┆ null ┆ d   │
        │ null ┆ 9    ┆ e   │
        └──────┴──────┴─────┘
        """
        from polars.lazyframe.opt_flags import QueryOptFlags

        # Delegate to the lazy implementation, then collect straight away
        # using the eager optimization flags.
        filtered_plan = self.lazy().filter(*predicates, **constraints)
        return filtered_plan.collect(optimizations=QueryOptFlags._eager())
5257

5258
    def remove(
        self,
        *predicates: (
            IntoExprColumn
            | Iterable[IntoExprColumn]
            | bool
            | list[bool]
            | np.ndarray[Any, Any]
        ),
        **constraints: Any,
    ) -> DataFrame:
        """
        Drop the rows that match the given predicate expression(s).

        The rows that remain keep their original order.

        Every row for which the filter predicate does not evaluate to True is
        retained (this includes rows where the predicate evaluates as `null`).

        Parameters
        ----------
        predicates
            Expression that evaluates to a boolean Series.
        constraints
            Column filters, written as `name = value`. Each constraint behaves
            the same as `pl.col(name).eq(value)` and is implicitly joined with
            the other filter conditions using `&`.

        Notes
        -----
        If you are coming from Pandas and filter on a comparison of two or more
        columns, be aware that in Polars any comparison involving `null` values
        yields a `null` result, *not* boolean True or False. Such rows are
        therefore not removed. Make sure null values are handled appropriately
        to avoid unexpected behaviour (see examples below).

        See Also
        --------
        filter

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     {
        ...         "foo": [2, 3, None, 4, 0],
        ...         "bar": [5, 6, None, None, 0],
        ...         "ham": ["a", "b", None, "c", "d"],
        ...     }
        ... )

        Remove rows matching a condition:

        >>> df.remove(pl.col("bar") >= 5)
        shape: (3, 3)
        ┌──────┬──────┬──────┐
        │ foo  ┆ bar  ┆ ham  │
        │ ---  ┆ ---  ┆ ---  │
        │ i64  ┆ i64  ┆ str  │
        ╞══════╪══════╪══════╡
        │ null ┆ null ┆ null │
        │ 4    ┆ null ┆ c    │
        │ 0    ┆ 0    ┆ d    │
        └──────┴──────┴──────┘

        Discard rows based on multiple conditions, combined with and/or operators:

        >>> df.remove(
        ...     (pl.col("foo") >= 0) & (pl.col("bar") >= 0),
        ... )
        shape: (2, 3)
        ┌──────┬──────┬──────┐
        │ foo  ┆ bar  ┆ ham  │
        │ ---  ┆ ---  ┆ ---  │
        │ i64  ┆ i64  ┆ str  │
        ╞══════╪══════╪══════╡
        │ null ┆ null ┆ null │
        │ 4    ┆ null ┆ c    │
        └──────┴──────┴──────┘

        >>> df.remove(
        ...     (pl.col("foo") >= 0) | (pl.col("bar") >= 0),
        ... )
        shape: (1, 3)
        ┌──────┬──────┬──────┐
        │ foo  ┆ bar  ┆ ham  │
        │ ---  ┆ ---  ┆ ---  │
        │ i64  ┆ i64  ┆ str  │
        ╞══════╪══════╪══════╡
        │ null ┆ null ┆ null │
        └──────┴──────┴──────┘

        Provide multiple constraints using `*args` syntax:

        >>> df.remove(
        ...     pl.col("ham").is_not_null(),
        ...     pl.col("bar") >= 0,
        ... )
        shape: (2, 3)
        ┌──────┬──────┬──────┐
        │ foo  ┆ bar  ┆ ham  │
        │ ---  ┆ ---  ┆ ---  │
        │ i64  ┆ i64  ┆ str  │
        ╞══════╪══════╪══════╡
        │ null ┆ null ┆ null │
        │ 4    ┆ null ┆ c    │
        └──────┴──────┴──────┘

        Provide constraint(s) using `**kwargs` syntax:

        >>> df.remove(foo=0, bar=0)
        shape: (4, 3)
        ┌──────┬──────┬──────┐
        │ foo  ┆ bar  ┆ ham  │
        │ ---  ┆ ---  ┆ ---  │
        │ i64  ┆ i64  ┆ str  │
        ╞══════╪══════╪══════╡
        │ 2    ┆ 5    ┆ a    │
        │ 3    ┆ 6    ┆ b    │
        │ null ┆ null ┆ null │
        │ 4    ┆ null ┆ c    │
        └──────┴──────┴──────┘

        Remove rows by comparing two columns against each other:

        >>> df.remove(
        ...     pl.col("foo").ne_missing(pl.col("bar")),
        ... )
        shape: (2, 3)
        ┌──────┬──────┬──────┐
        │ foo  ┆ bar  ┆ ham  │
        │ ---  ┆ ---  ┆ ---  │
        │ i64  ┆ i64  ┆ str  │
        ╞══════╪══════╪══════╡
        │ null ┆ null ┆ null │
        │ 0    ┆ 0    ┆ d    │
        └──────┴──────┴──────┘
        """
        from polars.lazyframe.opt_flags import QueryOptFlags

        # Delegate to the lazy implementation, then collect straight away
        # using the eager optimization flags.
        pruned_plan = self.lazy().remove(*predicates, **constraints)
        return pruned_plan.collect(optimizations=QueryOptFlags._eager())
5402

5403
    # Typing overloads: the static return type follows `return_as_string`.
    @overload
    def glimpse(
        self,
        *,
        max_items_per_column: int = ...,
        max_colname_length: int = ...,
        return_as_string: Literal[False] = ...,
    ) -> None: ...

    @overload
    def glimpse(
        self,
        *,
        max_items_per_column: int = ...,
        max_colname_length: int = ...,
        return_as_string: Literal[True],
    ) -> str: ...

    @overload
    def glimpse(
        self,
        *,
        max_items_per_column: int = ...,
        max_colname_length: int = ...,
        return_as_string: bool,
    ) -> str | None: ...

    def glimpse(
        self,
        *,
        max_items_per_column: int = 10,
        max_colname_length: int = 50,
        return_as_string: bool = False,
    ) -> str | None:
        """
        Return a dense preview of the DataFrame.

        The formatting shows one line per column so that wide dataframes display
        cleanly. Each line shows the column name, the data type, and the first
        few values. A frame without any columns yields only the two header
        lines (row and column counts).

        Parameters
        ----------
        max_items_per_column
            Maximum number of items to show per column.
        max_colname_length
            Maximum length of the displayed column names; values that exceed this
            value are truncated with a trailing ellipsis.
        return_as_string
            If True, return the preview as a string instead of printing to stdout.

        Returns
        -------
        str or None
            The preview text if `return_as_string` is True, otherwise None (the
            preview is printed instead).

        See Also
        --------
        describe, head, tail

        Examples
        --------
        >>> from datetime import date
        >>> df = pl.DataFrame(
        ...     {
        ...         "a": [1.0, 2.8, 3.0],
        ...         "b": [4, 5, None],
        ...         "c": [True, False, True],
        ...         "d": [None, "b", "c"],
        ...         "e": ["usd", "eur", None],
        ...         "f": [date(2020, 1, 1), date(2021, 1, 2), date(2022, 1, 1)],
        ...     }
        ... )
        >>> df.glimpse()
        Rows: 3
        Columns: 6
        $ a  <f64> 1.0, 2.8, 3.0
        $ b  <i64> 4, 5, None
        $ c <bool> True, False, True
        $ d  <str> None, 'b', 'c'
        $ e  <str> 'usd', 'eur', None
        $ f <date> 2020-01-01, 2021-01-02, 2022-01-01
        """
        # always print at most this number of values (mainly ensures that
        # we do not cast long arrays to strings, which would be slow)
        max_n_values = min(max_items_per_column, self.height)
        schema = self.schema

        def _parse_column(col_name: str, dtype: PolarsDataType) -> tuple[str, str, str]:
            # string values are rendered with `repr` so they appear quoted
            fn = repr if dtype == String else str
            values = self[:max_n_values, col_name].to_list()
            val_str = ", ".join(fn(v) for v in values)
            if len(col_name) > max_colname_length:
                col_name = col_name[: (max_colname_length - 1)] + "…"
            return col_name, f"<{_dtype_str_repr(dtype)}>", val_str

        data = [_parse_column(name, dtype) for name, dtype in schema.items()]

        # determine column layout widths; `default=0` keeps a frame that has
        # no columns from raising "max() arg is an empty sequence"
        max_col_name = max((len(col_name) for col_name, _, _ in data), default=0)
        max_col_dtype = max((len(dtype_str) for _, dtype_str, _ in data), default=0)

        # print header
        output = StringIO()
        output.write(f"Rows: {self.height}\nColumns: {self.width}\n")

        # print individual columns: one row per column
        for col_name, dtype_str, val_str in data:
            output.write(
                f"$ {col_name:<{max_col_name}} {dtype_str:>{max_col_dtype}} {val_str}\n"
            )

        s = output.getvalue()
        if return_as_string:
            return s

        # NOTE: `end=None` is the same as print's default newline terminator, so
        # the printed preview is followed by one extra newline.
        print(s, end=None)
        return None
5516

5517
    def describe(
        self,
        percentiles: Sequence[float] | float | None = (0.25, 0.50, 0.75),
        *,
        interpolation: QuantileMethod = "nearest",
    ) -> DataFrame:
        """
        Compute summary statistics for each column of the DataFrame.

        Parameters
        ----------
        percentiles
            A single percentile, or a sequence of percentiles, to add to the
            summary. Every value has to lie within `[0, 1]`.

        interpolation : {'nearest', 'higher', 'lower', 'midpoint', 'linear', 'equiprobable'}
            How to interpolate when computing the percentiles.

        Raises
        ------
        TypeError
            If the DataFrame has no columns.

        Notes
        -----
        By default the median is part of the output, as the 50% percentile.

        Warnings
        --------
        The output of `describe` is not guaranteed to be stable. It shows the
        statistics that are deemed informative, and this selection may change
        in the future. For that reason, relying on `describe` programmatically
        (as opposed to interactive exploration) is not recommended.

        See Also
        --------
        glimpse

        Examples
        --------
        >>> from datetime import date, time
        >>> df = pl.DataFrame(
        ...     {
        ...         "float": [1.0, 2.8, 3.0],
        ...         "int": [40, 50, None],
        ...         "bool": [True, False, True],
        ...         "str": ["zz", "xx", "yy"],
        ...         "date": [date(2020, 1, 1), date(2021, 7, 5), date(2022, 12, 31)],
        ...         "time": [time(10, 20, 30), time(14, 45, 50), time(23, 15, 10)],
        ...     }
        ... )

        Show default frame statistics:

        >>> df.describe()
        shape: (9, 7)
        ┌────────────┬──────────┬──────────┬──────────┬──────┬─────────────────────┬──────────┐
        │ statistic  ┆ float    ┆ int      ┆ bool     ┆ str  ┆ date                ┆ time     │
        │ ---        ┆ ---      ┆ ---      ┆ ---      ┆ ---  ┆ ---                 ┆ ---      │
        │ str        ┆ f64      ┆ f64      ┆ f64      ┆ str  ┆ str                 ┆ str      │
        ╞════════════╪══════════╪══════════╪══════════╪══════╪═════════════════════╪══════════╡
        │ count      ┆ 3.0      ┆ 2.0      ┆ 3.0      ┆ 3    ┆ 3                   ┆ 3        │
        │ null_count ┆ 0.0      ┆ 1.0      ┆ 0.0      ┆ 0    ┆ 0                   ┆ 0        │
        │ mean       ┆ 2.266667 ┆ 45.0     ┆ 0.666667 ┆ null ┆ 2021-07-02 16:00:00 ┆ 16:07:10 │
        │ std        ┆ 1.101514 ┆ 7.071068 ┆ null     ┆ null ┆ null                ┆ null     │
        │ min        ┆ 1.0      ┆ 40.0     ┆ 0.0      ┆ xx   ┆ 2020-01-01          ┆ 10:20:30 │
        │ 25%        ┆ 2.8      ┆ 40.0     ┆ null     ┆ null ┆ 2021-07-05          ┆ 14:45:50 │
        │ 50%        ┆ 2.8      ┆ 50.0     ┆ null     ┆ null ┆ 2021-07-05          ┆ 14:45:50 │
        │ 75%        ┆ 3.0      ┆ 50.0     ┆ null     ┆ null ┆ 2022-12-31          ┆ 23:15:10 │
        │ max        ┆ 3.0      ┆ 50.0     ┆ 1.0      ┆ zz   ┆ 2022-12-31          ┆ 23:15:10 │
        └────────────┴──────────┴──────────┴──────────┴──────┴─────────────────────┴──────────┘

        Customize which percentiles are displayed, applying linear interpolation:

        >>> with pl.Config(tbl_rows=12):
        ...     df.describe(
        ...         percentiles=[0.1, 0.3, 0.5, 0.7, 0.9],
        ...         interpolation="linear",
        ...     )
        shape: (11, 7)
        ┌────────────┬──────────┬──────────┬──────────┬──────┬─────────────────────┬──────────┐
        │ statistic  ┆ float    ┆ int      ┆ bool     ┆ str  ┆ date                ┆ time     │
        │ ---        ┆ ---      ┆ ---      ┆ ---      ┆ ---  ┆ ---                 ┆ ---      │
        │ str        ┆ f64      ┆ f64      ┆ f64      ┆ str  ┆ str                 ┆ str      │
        ╞════════════╪══════════╪══════════╪══════════╪══════╪═════════════════════╪══════════╡
        │ count      ┆ 3.0      ┆ 2.0      ┆ 3.0      ┆ 3    ┆ 3                   ┆ 3        │
        │ null_count ┆ 0.0      ┆ 1.0      ┆ 0.0      ┆ 0    ┆ 0                   ┆ 0        │
        │ mean       ┆ 2.266667 ┆ 45.0     ┆ 0.666667 ┆ null ┆ 2021-07-02 16:00:00 ┆ 16:07:10 │
        │ std        ┆ 1.101514 ┆ 7.071068 ┆ null     ┆ null ┆ null                ┆ null     │
        │ min        ┆ 1.0      ┆ 40.0     ┆ 0.0      ┆ xx   ┆ 2020-01-01          ┆ 10:20:30 │
        │ 10%        ┆ 1.36     ┆ 41.0     ┆ null     ┆ null ┆ 2020-04-20          ┆ 11:13:34 │
        │ 30%        ┆ 2.08     ┆ 43.0     ┆ null     ┆ null ┆ 2020-11-26          ┆ 12:59:42 │
        │ 50%        ┆ 2.8      ┆ 45.0     ┆ null     ┆ null ┆ 2021-07-05          ┆ 14:45:50 │
        │ 70%        ┆ 2.88     ┆ 47.0     ┆ null     ┆ null ┆ 2022-02-07          ┆ 18:09:34 │
        │ 90%        ┆ 2.96     ┆ 49.0     ┆ null     ┆ null ┆ 2022-09-13          ┆ 21:33:18 │
        │ max        ┆ 3.0      ┆ 50.0     ┆ 1.0      ┆ zz   ┆ 2022-12-31          ┆ 23:15:10 │
        └────────────┴──────────┴──────────┴──────────┴──────┴─────────────────────┴──────────┘
        """  # noqa: W505
        if self.columns:
            # the statistics themselves are computed by the lazy frame
            return self.lazy().describe(
                percentiles=percentiles,
                interpolation=interpolation,
            )

        msg = "cannot describe a DataFrame that has no columns"
        raise TypeError(msg)
5617

5618
    def get_column_index(self, name: str) -> int:
        """
        Look up the positional index of the column with the given name.

        Parameters
        ----------
        name
            The column name to search for.

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]}
        ... )
        >>> df.get_column_index("ham")
        2
        >>> df.get_column_index("sandwich")  # doctest: +SKIP
        ColumnNotFoundError: sandwich
        """
        # the lookup is delegated to the underlying frame object
        column_position = self._df.get_column_index(name)
        return column_position
5638

5639
    def replace_column(self, index: int, column: Series) -> DataFrame:
        """
        Swap out the column at the given index position for another Series.

        The DataFrame is modified in place, and the same DataFrame is returned.

        Parameters
        ----------
        index
            Position of the column to replace. A negative value is counted
            from the end (the frame width is added to it).
        column
            The Series to put at that position.

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     {
        ...         "foo": [1, 2, 3],
        ...         "bar": [6, 7, 8],
        ...         "ham": ["a", "b", "c"],
        ...     }
        ... )
        >>> s = pl.Series("apple", [10, 20, 30])
        >>> df.replace_column(0, s)
        shape: (3, 3)
        ┌───────┬─────┬─────┐
        │ apple ┆ bar ┆ ham │
        │ ---   ┆ --- ┆ --- │
        │ i64   ┆ i64 ┆ str │
        ╞═══════╪═════╪═════╡
        │ 10    ┆ 6   ┆ a   │
        │ 20    ┆ 7   ┆ b   │
        │ 30    ┆ 8   ┆ c   │
        └───────┴─────┴─────┘
        """
        # translate a negative index into its position counted from the start
        position = index if index >= 0 else self.width + index
        self._df.replace_column(position, column._s)
        return self
5678

5679
    def sort(
        self,
        by: IntoExpr | Iterable[IntoExpr],
        *more_by: IntoExpr,
        descending: bool | Sequence[bool] = False,
        nulls_last: bool | Sequence[bool] = False,
        multithreaded: bool = True,
        maintain_order: bool = False,
    ) -> DataFrame:
        """
        Return the dataframe sorted by the given columns.

        Parameters
        ----------
        by
            The column(s) to sort by. Expression input is accepted, including
            selectors. Strings are interpreted as column names.
        *more_by
            Further columns to sort by, given as positional arguments.
        descending
            Whether to sort in descending order. Pass a sequence of booleans to
            set this per column when sorting by multiple columns.
        nulls_last
            Whether null values go last; either one boolean that applies to all
            columns, or a sequence of booleans to control this per column.
        multithreaded
            Whether to sort using multiple threads.
        maintain_order
            Whether the existing order of equal elements should be kept.

        Examples
        --------
        Pass a single column name to sort by that column.

        >>> df = pl.DataFrame(
        ...     {
        ...         "a": [1, 2, None],
        ...         "b": [6.0, 5.0, 4.0],
        ...         "c": ["a", "c", "b"],
        ...     }
        ... )
        >>> df.sort("a")
        shape: (3, 3)
        ┌──────┬─────┬─────┐
        │ a    ┆ b   ┆ c   │
        │ ---  ┆ --- ┆ --- │
        │ i64  ┆ f64 ┆ str │
        ╞══════╪═════╪═════╡
        │ null ┆ 4.0 ┆ b   │
        │ 1    ┆ 6.0 ┆ a   │
        │ 2    ┆ 5.0 ┆ c   │
        └──────┴─────┴─────┘

        Sorting by expressions is also supported.

        >>> df.sort(pl.col("a") + pl.col("b") * 2, nulls_last=True)
        shape: (3, 3)
        ┌──────┬─────┬─────┐
        │ a    ┆ b   ┆ c   │
        │ ---  ┆ --- ┆ --- │
        │ i64  ┆ f64 ┆ str │
        ╞══════╪═════╪═════╡
        │ 2    ┆ 5.0 ┆ c   │
        │ 1    ┆ 6.0 ┆ a   │
        │ null ┆ 4.0 ┆ b   │
        └──────┴─────┴─────┘

        Sort by multiple columns by passing a list of columns.

        >>> df.sort(["c", "a"], descending=True)
        shape: (3, 3)
        ┌──────┬─────┬─────┐
        │ a    ┆ b   ┆ c   │
        │ ---  ┆ --- ┆ --- │
        │ i64  ┆ f64 ┆ str │
        ╞══════╪═════╪═════╡
        │ 2    ┆ 5.0 ┆ c   │
        │ null ┆ 4.0 ┆ b   │
        │ 1    ┆ 6.0 ┆ a   │
        └──────┴─────┴─────┘

        Or use positional arguments to sort by multiple columns in the same way.

        >>> df.sort("c", "a", descending=[False, True])
        shape: (3, 3)
        ┌──────┬─────┬─────┐
        │ a    ┆ b   ┆ c   │
        │ ---  ┆ --- ┆ --- │
        │ i64  ┆ f64 ┆ str │
        ╞══════╪═════╪═════╡
        │ 1    ┆ 6.0 ┆ a   │
        │ null ┆ 4.0 ┆ b   │
        │ 2    ┆ 5.0 ┆ c   │
        └──────┴─────┴─────┘
        """
        from polars.lazyframe import QueryOptFlags

        # build the sort as a lazy query, then collect it right away
        sorted_lf = self.lazy().sort(
            by,
            *more_by,
            descending=descending,
            nulls_last=nulls_last,
            multithreaded=multithreaded,
            maintain_order=maintain_order,
        )
        return sorted_lf.collect(optimizations=QueryOptFlags._eager())
5788

5789
    def sql(self, query: str, *, table_name: str = "self") -> DataFrame:
        """
        Run a SQL query against this DataFrame.

        .. versionadded:: 0.20.24

        .. warning::
            This functionality is considered **unstable**, although it is close to
            being considered stable. It may be changed at any point without it being
            considered a breaking change.

        Parameters
        ----------
        query
            The SQL query to run.
        table_name
            Explicit name under which the calling frame is made available to the
            query (defaults to "self").

        Notes
        -----
        * The calling frame is automatically registered as a table in the SQL context
          under the name "self". If you want access to the DataFrames and LazyFrames
          found in the current globals, use the top-level :meth:`pl.sql <polars.sql>`.
        * More control over registration and execution behaviour is available by
          using the :class:`SQLContext` object.
        * The SQL query executes in lazy mode before being collected and returned
          as a DataFrame.

        See Also
        --------
        SQLContext

        Examples
        --------
        >>> from datetime import date
        >>> df1 = pl.DataFrame(
        ...     {
        ...         "a": [1, 2, 3],
        ...         "b": ["zz", "yy", "xx"],
        ...         "c": [date(1999, 12, 31), date(2010, 10, 10), date(2077, 8, 8)],
        ...     }
        ... )

        Query the DataFrame using SQL:

        >>> df1.sql("SELECT c, b FROM self WHERE a > 1")
        shape: (2, 2)
        ┌────────────┬─────┐
        │ c          ┆ b   │
        │ ---        ┆ --- │
        │ date       ┆ str │
        ╞════════════╪═════╡
        │ 2010-10-10 ┆ yy  │
        │ 2077-08-08 ┆ xx  │
        └────────────┴─────┘

        Apply transformations to a DataFrame using SQL, aliasing "self" to "frame".

        >>> df1.sql(
        ...     query='''
        ...         SELECT
        ...             a,
        ...             (a % 2 == 0) AS a_is_even,
        ...             CONCAT_WS(':', b, b) AS b_b,
        ...             EXTRACT(year FROM c) AS year,
        ...             0::float4 AS "zero",
        ...         FROM frame
        ...     ''',
        ...     table_name="frame",
        ... )
        shape: (3, 5)
        ┌─────┬───────────┬───────┬──────┬──────┐
        │ a   ┆ a_is_even ┆ b_b   ┆ year ┆ zero │
        │ --- ┆ ---       ┆ ---   ┆ ---  ┆ ---  │
        │ i64 ┆ bool      ┆ str   ┆ i32  ┆ f32  │
        ╞═════╪═══════════╪═══════╪══════╪══════╡
        │ 1   ┆ false     ┆ zz:zz ┆ 1999 ┆ 0.0  │
        │ 2   ┆ true      ┆ yy:yy ┆ 2010 ┆ 0.0  │
        │ 3   ┆ false     ┆ xx:xx ┆ 2077 ┆ 0.0  │
        └─────┴───────────┴───────┴──────┴──────┘
        """
        from polars.sql import SQLContext

        issue_unstable_warning(
            "`sql` is considered **unstable** (although it is close to being considered stable)."
        )
        with SQLContext(register_globals=False, eager=True) as ctx:
            # an empty table name falls back to the default "self"
            ctx.register(name=table_name or "self", frame=self)
            return ctx.execute(query)
5880

5881
    @deprecate_renamed_parameter("descending", "reverse", version="1.0.0")
    def top_k(
        self,
        k: int,
        *,
        by: IntoExpr | Iterable[IntoExpr],
        reverse: bool | Sequence[bool] = False,
    ) -> DataFrame:
        """
        Return the `k` largest rows.

        Null elements are only picked after non-null elements, whatever the
        value of `reverse` is. No particular order of the output is guaranteed;
        call :func:`sort` on the result if you need it sorted.

        .. versionchanged:: 1.0.0
            The `descending` parameter was renamed `reverse`.

        Parameters
        ----------
        k
            How many rows to return.
        by
            The column(s) that determine which rows are the top rows.
            Expression input is accepted. Strings are interpreted as column names.
        reverse
            Take the `k` smallest elements of the `by` column(s) rather than the
            `k` largest. Pass a sequence of booleans to set this per column.

        See Also
        --------
        bottom_k

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     {
        ...         "a": ["a", "b", "a", "b", "b", "c"],
        ...         "b": [2, 1, 1, 3, 2, 1],
        ...     }
        ... )

        Get the rows which contain the 4 largest values in column b.

        >>> df.top_k(4, by="b")
        shape: (4, 2)
        ┌─────┬─────┐
        │ a   ┆ b   │
        │ --- ┆ --- │
        │ str ┆ i64 │
        ╞═════╪═════╡
        │ b   ┆ 3   │
        │ a   ┆ 2   │
        │ b   ┆ 2   │
        │ b   ┆ 1   │
        └─────┴─────┘

        Get the rows which contain the 4 largest values when sorting on column b and a.

        >>> df.top_k(4, by=["b", "a"])
        shape: (4, 2)
        ┌─────┬─────┐
        │ a   ┆ b   │
        │ --- ┆ --- │
        │ str ┆ i64 │
        ╞═════╪═════╡
        │ b   ┆ 3   │
        │ b   ┆ 2   │
        │ a   ┆ 2   │
        │ c   ┆ 1   │
        └─────┴─────┘
        """
        from polars.lazyframe.opt_flags import QueryOptFlags

        top_rows_lf = self.lazy().top_k(k, by=by, reverse=reverse)
        # explicit optimization flags for this query: only slice pushdown is on
        opt_flags = QueryOptFlags(
            projection_pushdown=False,
            predicate_pushdown=False,
            comm_subplan_elim=False,
            slice_pushdown=True,
        )
        return top_rows_lf.collect(optimizations=opt_flags)
5969

5970
    @deprecate_renamed_parameter("descending", "reverse", version="1.0.0")
    def bottom_k(
        self,
        k: int,
        *,
        by: IntoExpr | Iterable[IntoExpr],
        reverse: bool | Sequence[bool] = False,
    ) -> DataFrame:
        """
        Return the `k` smallest rows.

        Null elements are only picked after non-null elements, whatever the
        value of `reverse` is. No particular order of the output is guaranteed;
        call :func:`sort` on the result if you need it sorted.

        .. versionchanged:: 1.0.0
            The `descending` parameter was renamed `reverse`.

        Parameters
        ----------
        k
            How many rows to return.
        by
            The column(s) that determine which rows are the bottom rows.
            Expression input is accepted. Strings are interpreted as column names.
        reverse
            Take the `k` largest elements of the `by` column(s) rather than the
            `k` smallest. Pass a sequence of booleans to set this per column.

        See Also
        --------
        top_k

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     {
        ...         "a": ["a", "b", "a", "b", "b", "c"],
        ...         "b": [2, 1, 1, 3, 2, 1],
        ...     }
        ... )

        Get the rows which contain the 4 smallest values in column b.

        >>> df.bottom_k(4, by="b")
        shape: (4, 2)
        ┌─────┬─────┐
        │ a   ┆ b   │
        │ --- ┆ --- │
        │ str ┆ i64 │
        ╞═════╪═════╡
        │ b   ┆ 1   │
        │ a   ┆ 1   │
        │ c   ┆ 1   │
        │ a   ┆ 2   │
        └─────┴─────┘

        Get the rows which contain the 4 smallest values when sorting on column a and b.

        >>> df.bottom_k(4, by=["a", "b"])
        shape: (4, 2)
        ┌─────┬─────┐
        │ a   ┆ b   │
        │ --- ┆ --- │
        │ str ┆ i64 │
        ╞═════╪═════╡
        │ a   ┆ 1   │
        │ a   ┆ 2   │
        │ b   ┆ 1   │
        │ b   ┆ 2   │
        └─────┴─────┘
        """
        from polars.lazyframe.opt_flags import QueryOptFlags

        bottom_rows_lf = self.lazy().bottom_k(k, by=by, reverse=reverse)
        # explicit optimization flags for this query: only slice pushdown is on
        opt_flags = QueryOptFlags(
            projection_pushdown=False,
            predicate_pushdown=False,
            comm_subplan_elim=False,
            slice_pushdown=True,
        )
        return bottom_rows_lf.collect(optimizations=opt_flags)
6058

6059
    def equals(self, other: DataFrame, *, null_equal: bool = True) -> bool:
        """
        Test whether this DataFrame and another DataFrame are equal.

        Parameters
        ----------
        other
            The DataFrame to compare against.
        null_equal
            Whether null values count as equal to each other.

        See Also
        --------
        polars.testing.assert_frame_equal

        Examples
        --------
        >>> df1 = pl.DataFrame(
        ...     {
        ...         "foo": [1, 2, 3],
        ...         "bar": [6.0, 7.0, 8.0],
        ...         "ham": ["a", "b", "c"],
        ...     }
        ... )
        >>> df2 = pl.DataFrame(
        ...     {
        ...         "foo": [3, 2, 1],
        ...         "bar": [8.0, 7.0, 6.0],
        ...         "ham": ["c", "b", "a"],
        ...     }
        ... )
        >>> df1.equals(df1)
        True
        >>> df1.equals(df2)
        False
        """
        # type check first, before touching the underlying frame of `other`
        require_same_type(self, other)
        other_pydf = other._df
        return self._df.equals(other_pydf, null_equal=null_equal)
6097

6098
    def slice(self, offset: int, length: int | None = None) -> DataFrame:
        """
        Get a slice of this DataFrame.

        Parameters
        ----------
        offset
            Start index. Negative indexing is supported.
        length
            Length of the slice. If set to `None`, all rows starting at the offset
            will be selected. A negative value ends the slice `abs(length)` rows
            before the end of the frame.

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     {
        ...         "foo": [1, 2, 3],
        ...         "bar": [6.0, 7.0, 8.0],
        ...         "ham": ["a", "b", "c"],
        ...     }
        ... )
        >>> df.slice(1, 2)
        shape: (2, 3)
        ┌─────┬─────┬─────┐
        │ foo ┆ bar ┆ ham │
        │ --- ┆ --- ┆ --- │
        │ i64 ┆ f64 ┆ str │
        ╞═════╪═════╪═════╡
        │ 2   ┆ 7.0 ┆ b   │
        │ 3   ┆ 8.0 ┆ c   │
        └─────┴─────┴─────┘

        A negative length stops the slice that many rows before the end.

        >>> df.slice(1, -1)
        shape: (1, 3)
        ┌─────┬─────┬─────┐
        │ foo ┆ bar ┆ ham │
        │ --- ┆ --- ┆ --- │
        │ i64 ┆ f64 ┆ str │
        ╞═════╪═════╪═════╡
        │ 2   ┆ 7.0 ┆ b   │
        └─────┴─────┴─────┘
        """
        if (length is not None) and length < 0:
            height = self.height
            # Resolve a negative offset to its absolute start row first: using
            # the raw (negative) offset in the arithmetic below would inflate
            # the computed length.
            if offset < 0:
                offset = max(height + offset, 0)
            # Rows between the start and the stop position; clamp at zero so an
            # empty range never turns into a negative length.
            length = max(height - offset + length, 0)
        return self._from_pydf(self._df.slice(offset, length))
6133

6134
    def head(self, n: int = 5) -> DataFrame:
        """
        Return the first `n` rows of the DataFrame.

        Parameters
        ----------
        n
            How many rows to return. With a negative value, every row except
            the last `abs(n)` is returned.

        See Also
        --------
        tail, glimpse, slice

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     {
        ...         "foo": [1, 2, 3, 4, 5],
        ...         "bar": [6, 7, 8, 9, 10],
        ...         "ham": ["a", "b", "c", "d", "e"],
        ...     }
        ... )
        >>> df.head(3)
        shape: (3, 3)
        ┌─────┬─────┬─────┐
        │ foo ┆ bar ┆ ham │
        │ --- ┆ --- ┆ --- │
        │ i64 ┆ i64 ┆ str │
        ╞═════╪═════╪═════╡
        │ 1   ┆ 6   ┆ a   │
        │ 2   ┆ 7   ┆ b   │
        │ 3   ┆ 8   ┆ c   │
        └─────┴─────┴─────┘

        Pass a negative value to get all rows `except` the last `abs(n)`.

        >>> df.head(-3)
        shape: (2, 3)
        ┌─────┬─────┬─────┐
        │ foo ┆ bar ┆ ham │
        │ --- ┆ --- ┆ --- │
        │ i64 ┆ i64 ┆ str │
        ╞═════╪═════╪═════╡
        │ 1   ┆ 6   ┆ a   │
        │ 2   ┆ 7   ┆ b   │
        └─────┴─────┘
        """
        # a negative `n` means "all but the last abs(n) rows", floored at zero
        n_rows = n if n >= 0 else max(0, self.height + n)
        first_rows = self._df.head(n_rows)
        return self._from_pydf(first_rows)
6185

6186
    def tail(self, n: int = 5) -> DataFrame:
        """
        Return the last `n` rows of the DataFrame.

        Parameters
        ----------
        n
            How many rows to return. With a negative value, every row except
            the first `abs(n)` is returned.

        See Also
        --------
        head, slice

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     {
        ...         "foo": [1, 2, 3, 4, 5],
        ...         "bar": [6, 7, 8, 9, 10],
        ...         "ham": ["a", "b", "c", "d", "e"],
        ...     }
        ... )
        >>> df.tail(3)
        shape: (3, 3)
        ┌─────┬─────┬─────┐
        │ foo ┆ bar ┆ ham │
        │ --- ┆ --- ┆ --- │
        │ i64 ┆ i64 ┆ str │
        ╞═════╪═════╪═════╡
        │ 3   ┆ 8   ┆ c   │
        │ 4   ┆ 9   ┆ d   │
        │ 5   ┆ 10  ┆ e   │
        └─────┴─────┴─────┘

        Pass a negative value to get all rows `except` the first `abs(n)`.

        >>> df.tail(-3)
        shape: (2, 3)
        ┌─────┬─────┬─────┐
        │ foo ┆ bar ┆ ham │
        │ --- ┆ --- ┆ --- │
        │ i64 ┆ i64 ┆ str │
        ╞═════╪═════╪═════╡
        │ 4   ┆ 9   ┆ d   │
        │ 5   ┆ 10  ┆ e   │
        └─────┴─────┴─────┘
        """
        # a negative `n` means "all but the first abs(n) rows", floored at zero
        n_rows = n if n >= 0 else max(0, self.height + n)
        last_rows = self._df.tail(n_rows)
        return self._from_pydf(last_rows)
6237

6238
    def limit(self, n: int = 5) -> DataFrame:
        """
        Return the first `n` rows of the DataFrame.

        This is an alias for :func:`DataFrame.head`.

        Parameters
        ----------
        n
            How many rows to return. With a negative value, every row except
            the last `abs(n)` is returned.

        See Also
        --------
        head

        Examples
        --------
        Get the first 3 rows of a DataFrame.

        >>> df = pl.DataFrame(
        ...     {
        ...         "foo": [1, 2, 3, 4, 5],
        ...         "bar": [6, 7, 8, 9, 10],
        ...         "ham": ["a", "b", "c", "d", "e"],
        ...     }
        ... )
        >>> df.limit(3)
        shape: (3, 3)
        ┌─────┬─────┬─────┐
        │ foo ┆ bar ┆ ham │
        │ --- ┆ --- ┆ --- │
        │ i64 ┆ i64 ┆ str │
        ╞═════╪═════╪═════╡
        │ 1   ┆ 6   ┆ a   │
        │ 2   ┆ 7   ┆ b   │
        │ 3   ┆ 8   ┆ c   │
        └─────┴─────┴─────┘
        """
        # pure delegation: `head` implements the behaviour
        first_rows = self.head(n)
        return first_rows
6278

6279
    def drop_nans(
        self,
        subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = None,
    ) -> DataFrame:
        """
        Remove every row that holds at least one NaN value.

        Rows that are kept retain their original relative order.

        Parameters
        ----------
        subset
            Column name(s) for which NaN values are considered; if set to `None`
            (default), use all columns (note that only floating-point columns
            can contain NaNs).

        See Also
        --------
        drop_nulls

        Notes
        -----
        A NaN value is not the same as a null value.
        To drop null values, use :func:`drop_nulls`.

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     {
        ...         "foo": [-20.5, float("nan"), 80.0],
        ...         "bar": [float("nan"), 110.0, 25.5],
        ...         "ham": ["xxx", "yyy", None],
        ...     }
        ... )

        By default a row is dropped as soon as any single value in it is NaN:

        >>> df.drop_nans()
        shape: (1, 3)
        ┌──────┬──────┬──────┐
        │ foo  ┆ bar  ┆ ham  │
        │ ---  ┆ ---  ┆ ---  │
        │ f64  ┆ f64  ┆ str  │
        ╞══════╪══════╪══════╡
        │ 80.0 ┆ 25.5 ┆ null │
        └──────┴──────┴──────┘

        The check can be restricted to a subset of columns, given by name or
        with a selector. For example, dropping rows only if there is a NaN in
        the "bar" column:

        >>> df.drop_nans(subset=["bar"])
        shape: (2, 3)
        ┌──────┬───────┬──────┐
        │ foo  ┆ bar   ┆ ham  │
        │ ---  ┆ ---   ┆ ---  │
        │ f64  ┆ f64   ┆ str  │
        ╞══════╪═══════╪══════╡
        │ NaN  ┆ 110.0 ┆ yyy  │
        │ 80.0 ┆ 25.5  ┆ null │
        └──────┴───────┴──────┘

        Dropping a row only if *all* values are NaN requires a different formulation:

        >>> df = pl.DataFrame(
        ...     {
        ...         "a": [float("nan"), float("nan"), float("nan"), float("nan")],
        ...         "b": [10.0, 2.5, float("nan"), 5.25],
        ...         "c": [65.75, float("nan"), float("nan"), 10.5],
        ...     }
        ... )
        >>> df.filter(~pl.all_horizontal(pl.all().is_nan()))
        shape: (3, 3)
        ┌─────┬──────┬───────┐
        │ a   ┆ b    ┆ c     │
        │ --- ┆ ---  ┆ ---   │
        │ f64 ┆ f64  ┆ f64   │
        ╞═════╪══════╪═══════╡
        │ NaN ┆ 10.0 ┆ 65.75 │
        │ NaN ┆ 2.5  ┆ NaN   │
        │ NaN ┆ 5.25 ┆ 10.5  │
        └─────┴──────┴───────┘
        """
        from polars.lazyframe.opt_flags import QueryOptFlags

        # Build the query on the lazy frame, then collect it straight away
        # using the eager optimization flags.
        lazy_query = self.lazy().drop_nans(subset)
        return lazy_query.collect(optimizations=QueryOptFlags._eager())
6368

6369
    def drop_nulls(
        self,
        subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = None,
    ) -> DataFrame:
        """
        Remove every row that holds at least one null value.

        Rows that are kept retain their original relative order.

        Parameters
        ----------
        subset
            Column name(s) for which null values are considered.
            If set to `None` (default), use all columns.

        See Also
        --------
        drop_nans

        Notes
        -----
        A null value is not the same as a NaN value.
        To drop NaN values, use :func:`drop_nans`.

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     {
        ...         "foo": [1, 2, 3],
        ...         "bar": [6, None, 8],
        ...         "ham": ["a", "b", None],
        ...     }
        ... )

        By default a row is dropped as soon as any single value in it is null.

        >>> df.drop_nulls()
        shape: (1, 3)
        ┌─────┬─────┬─────┐
        │ foo ┆ bar ┆ ham │
        │ --- ┆ --- ┆ --- │
        │ i64 ┆ i64 ┆ str │
        ╞═════╪═════╪═════╡
        │ 1   ┆ 6   ┆ a   │
        └─────┴─────┴─────┘

        The check can be restricted to a subset of columns, given by name or
        with a selector. For example, dropping rows if there is a null in any
        of the integer columns:

        >>> import polars.selectors as cs
        >>> df.drop_nulls(subset=cs.integer())
        shape: (2, 3)
        ┌─────┬─────┬──────┐
        │ foo ┆ bar ┆ ham  │
        │ --- ┆ --- ┆ ---  │
        │ i64 ┆ i64 ┆ str  │
        ╞═════╪═════╪══════╡
        │ 1   ┆ 6   ┆ a    │
        │ 3   ┆ 8   ┆ null │
        └─────┴─────┴──────┘

        Below are some additional examples that show how to drop null
        values based on other conditions.

        >>> df = pl.DataFrame(
        ...     {
        ...         "a": [None, None, None, None],
        ...         "b": [1, 2, None, 1],
        ...         "c": [1, None, None, 1],
        ...     }
        ... )
        >>> df
        shape: (4, 3)
        ┌──────┬──────┬──────┐
        │ a    ┆ b    ┆ c    │
        │ ---  ┆ ---  ┆ ---  │
        │ null ┆ i64  ┆ i64  │
        ╞══════╪══════╪══════╡
        │ null ┆ 1    ┆ 1    │
        │ null ┆ 2    ┆ null │
        │ null ┆ null ┆ null │
        │ null ┆ 1    ┆ 1    │
        └──────┴──────┴──────┘

        Drop a row only if all values are null:

        >>> df.filter(~pl.all_horizontal(pl.all().is_null()))
        shape: (3, 3)
        ┌──────┬─────┬──────┐
        │ a    ┆ b   ┆ c    │
        │ ---  ┆ --- ┆ ---  │
        │ null ┆ i64 ┆ i64  │
        ╞══════╪═════╪══════╡
        │ null ┆ 1   ┆ 1    │
        │ null ┆ 2   ┆ null │
        │ null ┆ 1   ┆ 1    │
        └──────┴─────┴──────┘

        Drop a column if all values are null:

        >>> df[[s.name for s in df if not (s.null_count() == df.height)]]
        shape: (4, 2)
        ┌──────┬──────┐
        │ b    ┆ c    │
        │ ---  ┆ ---  │
        │ i64  ┆ i64  │
        ╞══════╪══════╡
        │ 1    ┆ 1    │
        │ 2    ┆ null │
        │ null ┆ null │
        │ 1    ┆ 1    │
        └──────┴──────┘
        """
        from polars.lazyframe.opt_flags import QueryOptFlags

        # Build the query on the lazy frame, then collect it straight away
        # using the eager optimization flags.
        lazy_query = self.lazy().drop_nulls(subset)
        return lazy_query.collect(optimizations=QueryOptFlags._eager())
6489

6490
    def pipe(
        self,
        function: Callable[Concatenate[DataFrame, P], T],
        *args: P.args,
        **kwargs: P.kwargs,
    ) -> T:
        """
        Call a user-defined function (UDF) with this frame as its first argument.

        This offers a structured way to chain a sequence of UDFs.

        Parameters
        ----------
        function
            Callable; will receive the frame as the first parameter,
            followed by any given args/kwargs.
        *args
            Arguments to pass to the UDF.
        **kwargs
            Keyword arguments to pass to the UDF.

        Returns
        -------
        Whatever `function` returns.

        Notes
        -----
        It is recommended to use LazyFrame when piping operations, in order
        to fully take advantage of query optimization and parallelization.
        See :meth:`df.lazy() <polars.DataFrame.lazy>`.

        Examples
        --------
        >>> def cast_str_to_int(data, col_name):
        ...     return data.with_columns(pl.col(col_name).cast(pl.Int64))
        >>> df = pl.DataFrame({"a": [1, 2, 3, 4], "b": ["10", "20", "30", "40"]})
        >>> df.pipe(cast_str_to_int, col_name="b")
        shape: (4, 2)
        ┌─────┬─────┐
        │ a   ┆ b   │
        │ --- ┆ --- │
        │ i64 ┆ i64 │
        ╞═════╪═════╡
        │ 1   ┆ 10  │
        │ 2   ┆ 20  │
        │ 3   ┆ 30  │
        │ 4   ┆ 40  │
        └─────┴─────┘

        >>> df = pl.DataFrame({"b": [1, 2], "a": [3, 4]})
        >>> df
        shape: (2, 2)
        ┌─────┬─────┐
        │ b   ┆ a   │
        │ --- ┆ --- │
        │ i64 ┆ i64 │
        ╞═════╪═════╡
        │ 1   ┆ 3   │
        │ 2   ┆ 4   │
        └─────┴─────┘
        >>> df.pipe(lambda tdf: tdf.select(sorted(tdf.columns)))
        shape: (2, 2)
        ┌─────┬─────┐
        │ a   ┆ b   │
        │ --- ┆ --- │
        │ i64 ┆ i64 │
        ╞═════╪═════╡
        │ 3   ┆ 1   │
        │ 4   ┆ 2   │
        └─────┴─────┘
        """
        # The frame is always passed first; everything else is forwarded untouched.
        outcome = function(self, *args, **kwargs)
        return outcome
6556

6557
    def map_columns(
        self,
        column_names: str | Sequence[str] | pl.Selector,
        function: Callable[Concatenate[Series, P], Series],
        *args: P.args,
        **kwargs: P.kwargs,
    ) -> DataFrame:
        """
        Apply eager functions to columns of a DataFrame.

        Users should always prefer :meth:`with_columns` unless they are using
        expressions that are only possible on `Series` and not on `Expr`. This is almost
        never the case, except for a very select few functions that cannot know the
        output datatype without looking at the data.

        Parameters
        ----------
        column_names
            The columns to apply the UDF to. May be a single column name, a
            sequence of column names, or a selector (which is expanded against
            this frame).
        function
            Callable; will receive a column series as the first parameter,
            followed by any given args/kwargs.
        *args
            Arguments to pass to the UDF.
        **kwargs
            Keyword arguments to pass to the UDF.

        Returns
        -------
        DataFrame
            The result of :meth:`with_columns`, where each selected column is
            replaced by the output of `function` for that column.

        Examples
        --------
        >>> df = pl.DataFrame({"a": [1, 2, 3, 4], "b": ["10", "20", "30", "40"]})
        >>> df.map_columns("a", lambda s: s.shrink_dtype())
        shape: (4, 2)
        ┌─────┬─────┐
        │ a   ┆ b   │
        │ --- ┆ --- │
        │ i8  ┆ str │
        ╞═════╪═════╡
        │ 1   ┆ 10  │
        │ 2   ┆ 20  │
        │ 3   ┆ 30  │
        │ 4   ┆ 40  │
        └─────┴─────┘

        >>> df = pl.DataFrame(
        ...     {
        ...         "a": ['{"x":"a"}', None, '{"x":"b"}', None],
        ...         "b": ['{"a":1, "b": true}', None, '{"a":2, "b": false}', None],
        ...     }
        ... )
        >>> df.map_columns(["a", "b"], lambda s: s.str.json_decode())
        shape: (4, 2)
        ┌───────────┬───────────┐
        │ a         ┆ b         │
        │ ---       ┆ ---       │
        │ struct[1] ┆ struct[2] │
        ╞═══════════╪═══════════╡
        │ {"a"}     ┆ {1,true}  │
        │ null      ┆ null      │
        │ {"b"}     ┆ {2,false} │
        │ null      ┆ null      │
        └───────────┴───────────┘
        >>> import polars.selectors as cs
        >>> df.map_columns(cs.all(), lambda s: s.str.json_decode())
        shape: (4, 2)
        ┌───────────┬───────────┐
        │ a         ┆ b         │
        │ ---       ┆ ---       │
        │ struct[1] ┆ struct[2] │
        ╞═══════════╪═══════════╡
        │ {"a"}     ┆ {1,true}  │
        │ null      ┆ null      │
        │ {"b"}     ┆ {2,false} │
        │ null      ┆ null      │
        └───────────┴───────────┘

        See Also
        --------
        with_columns
        """
        # Normalise the three accepted input shapes into a flat list of names.
        c_names: list[str]
        if isinstance(column_names, (pl.Selector, pl.Expr)):
            from polars.selectors import expand_selector

            c_names = list(expand_selector(self, column_names))
        elif isinstance(column_names, str):
            # Must be checked before the generic branch: `list("ab")` would
            # otherwise split a single name into characters.
            c_names = [column_names]
        else:
            c_names = list(column_names)

        # Each UDF result is passed to `with_columns` under its source column name.
        return self.with_columns(
            **{c: function(self[c], *args, **kwargs) for c in c_names}
        )
6649

6650
    def with_row_index(self, name: str = "index", offset: int = 0) -> DataFrame:
        """
        Prepend a row index column to the DataFrame.

        The index is inserted as the first column.

        Parameters
        ----------
        name
            Name of the index column.
        offset
            Start the index at this offset. Cannot be negative.

        Raises
        ------
        ValueError
            If the underlying call raises `OverflowError`, which is reported as
            `offset` being either negative or greater than the maximum index value.

        Notes
        -----
        The resulting column does not have any special properties. It is a regular
        column of type `UInt32` (or `UInt64` in `polars-u64-idx`).

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     {
        ...         "a": [1, 3, 5],
        ...         "b": [2, 4, 6],
        ...     }
        ... )
        >>> df.with_row_index()
        shape: (3, 3)
        ┌───────┬─────┬─────┐
        │ index ┆ a   ┆ b   │
        │ ---   ┆ --- ┆ --- │
        │ u32   ┆ i64 ┆ i64 │
        ╞═══════╪═════╪═════╡
        │ 0     ┆ 1   ┆ 2   │
        │ 1     ┆ 3   ┆ 4   │
        │ 2     ┆ 5   ┆ 6   │
        └───────┴─────┴─────┘
        >>> df.with_row_index("id", offset=1000)
        shape: (3, 3)
        ┌──────┬─────┬─────┐
        │ id   ┆ a   ┆ b   │
        │ ---  ┆ --- ┆ --- │
        │ u32  ┆ i64 ┆ i64 │
        ╞══════╪═════╪═════╡
        │ 1000 ┆ 1   ┆ 2   │
        │ 1001 ┆ 3   ┆ 4   │
        │ 1002 ┆ 5   ┆ 6   │
        └──────┴─────┴─────┘

        An index column can also be created using the expressions :func:`int_range`
        and :func:`len`.

        >>> df.select(
        ...     pl.int_range(pl.len(), dtype=pl.UInt32).alias("index"),
        ...     pl.all(),
        ... )
        shape: (3, 3)
        ┌───────┬─────┬─────┐
        │ index ┆ a   ┆ b   │
        │ ---   ┆ --- ┆ --- │
        │ u32   ┆ i64 ┆ i64 │
        ╞═══════╪═════╪═════╡
        │ 0     ┆ 1   ┆ 2   │
        │ 1     ┆ 3   ┆ 4   │
        │ 2     ┆ 5   ┆ 6   │
        └───────┴─────┴─────┘
        """
        try:
            indexed = self._df.with_row_index(name, offset)
            return self._from_pydf(indexed)
        except OverflowError:
            # Translate the low-level overflow into a message about `offset`.
            if offset < 0:
                issue = "negative"
            else:
                issue = "greater than the maximum index value"
            msg = f"`offset` input for `with_row_index` cannot be {issue}, got {offset}"
            raise ValueError(msg) from None
6721

6722
    @deprecated(
        "`DataFrame.with_row_count` is deprecated; use `with_row_index` instead."
        " Note that the default column name has changed from 'row_nr' to 'index'."
    )
    def with_row_count(self, name: str = "row_nr", offset: int = 0) -> DataFrame:
        """
        Insert a row-counting column at index 0.

        .. deprecated:: 0.20.4
            Use the :meth:`with_row_index` method instead.
            Note that the default column name has changed from 'row_nr' to 'index'.

        Parameters
        ----------
        name
            Name of the column to add.
        offset
            Start the row count at this offset. Default = 0

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     {
        ...         "a": [1, 3, 5],
        ...         "b": [2, 4, 6],
        ...     }
        ... )
        >>> df.with_row_count()  # doctest: +SKIP
        shape: (3, 3)
        ┌────────┬─────┬─────┐
        │ row_nr ┆ a   ┆ b   │
        │ ---    ┆ --- ┆ --- │
        │ u32    ┆ i64 ┆ i64 │
        ╞════════╪═════╪═════╡
        │ 0      ┆ 1   ┆ 2   │
        │ 1      ┆ 3   ┆ 4   │
        │ 2      ┆ 5   ┆ 6   │
        └────────┴─────┴─────┘
        """
        # Legacy entry point: forwards to `with_row_index`; only the default
        # column name ('row_nr') differs.
        counted = self.with_row_index(name, offset)
        return counted
6762

6763
    def group_by(
        self,
        *by: IntoExpr | Iterable[IntoExpr],
        maintain_order: bool = False,
        **named_by: IntoExpr,
    ) -> GroupBy:
        """
        Start a group by operation.

        Parameters
        ----------
        *by
            Column(s) to group by. Accepts expression input. Strings are parsed as
            column names.
        maintain_order
            Ensure that the order of the groups is consistent with the input data.
            This is slower than a default group by.
            Setting this to `True` blocks the possibility
            to run on the streaming engine.

            .. note::
                Within each group, the order of rows is always preserved, regardless
                of this argument.
        **named_by
            Additional columns to group by, specified as keyword arguments.
            The columns will be renamed to the keyword used.

        Returns
        -------
        GroupBy
            Object which can be used to perform aggregations.

        Raises
        ------
        TypeError
            If any value passed via `**named_by` is not a string, an expression,
            or a Series (for instance a list passed as `by=[...]`).

        Examples
        --------
        Group by one column and call `agg` to compute the grouped sum of another
        column.

        >>> df = pl.DataFrame(
        ...     {
        ...         "a": ["a", "b", "a", "b", "c"],
        ...         "b": [1, 2, 1, 3, 3],
        ...         "c": [5, 4, 3, 2, 1],
        ...     }
        ... )
        >>> df.group_by("a").agg(pl.col("b").sum())  # doctest: +IGNORE_RESULT
        shape: (3, 2)
        ┌─────┬─────┐
        │ a   ┆ b   │
        │ --- ┆ --- │
        │ str ┆ i64 │
        ╞═════╪═════╡
        │ a   ┆ 2   │
        │ b   ┆ 5   │
        │ c   ┆ 3   │
        └─────┴─────┘

        Set `maintain_order=True` to ensure the order of the groups is consistent with
        the input.

        >>> df.group_by("a", maintain_order=True).agg(pl.col("c"))
        shape: (3, 2)
        ┌─────┬───────────┐
        │ a   ┆ c         │
        │ --- ┆ ---       │
        │ str ┆ list[i64] │
        ╞═════╪═══════════╡
        │ a   ┆ [5, 3]    │
        │ b   ┆ [4, 2]    │
        │ c   ┆ [1]       │
        └─────┴───────────┘

        Group by multiple columns by passing a list of column names.

        >>> df.group_by(["a", "b"]).agg(pl.max("c"))  # doctest: +IGNORE_RESULT
        shape: (4, 3)
        ┌─────┬─────┬─────┐
        │ a   ┆ b   ┆ c   │
        │ --- ┆ --- ┆ --- │
        │ str ┆ i64 ┆ i64 │
        ╞═════╪═════╪═════╡
        │ a   ┆ 1   ┆ 5   │
        │ b   ┆ 2   ┆ 4   │
        │ b   ┆ 3   ┆ 2   │
        │ c   ┆ 3   ┆ 1   │
        └─────┴─────┴─────┘

        Or use positional arguments to group by multiple columns in the same way.
        Expressions are also accepted.

        >>> df.group_by("a", pl.col("b") // 2).agg(pl.col("c").mean())  # doctest: +SKIP
        shape: (3, 3)
        ┌─────┬─────┬─────┐
        │ a   ┆ b   ┆ c   │
        │ --- ┆ --- ┆ --- │
        │ str ┆ i64 ┆ f64 │
        ╞═════╪═════╪═════╡
        │ a   ┆ 0   ┆ 4.0 │
        │ b   ┆ 1   ┆ 3.0 │
        │ c   ┆ 1   ┆ 1.0 │
        └─────┴─────┴─────┘

        The `GroupBy` object returned by this method is iterable, returning the name
        and data of each group.

        >>> for name, data in df.group_by("a"):  # doctest: +SKIP
        ...     print(name)
        ...     print(data)
        ('a',)
        shape: (2, 3)
        ┌─────┬─────┬─────┐
        │ a   ┆ b   ┆ c   │
        │ --- ┆ --- ┆ --- │
        │ str ┆ i64 ┆ i64 │
        ╞═════╪═════╪═════╡
        │ a   ┆ 1   ┆ 5   │
        │ a   ┆ 1   ┆ 3   │
        └─────┴─────┴─────┘
        ('b',)
        shape: (2, 3)
        ┌─────┬─────┬─────┐
        │ a   ┆ b   ┆ c   │
        │ --- ┆ --- ┆ --- │
        │ str ┆ i64 ┆ i64 │
        ╞═════╪═════╪═════╡
        │ b   ┆ 2   ┆ 4   │
        │ b   ┆ 3   ┆ 2   │
        └─────┴─────┴─────┘
        ('c',)
        shape: (1, 3)
        ┌─────┬─────┬─────┐
        │ a   ┆ b   ┆ c   │
        │ --- ┆ --- ┆ --- │
        │ str ┆ i64 ┆ i64 │
        ╞═════╪═════╪═════╡
        │ c   ┆ 3   ┆ 1   │
        └─────┴─────┴─────┘
        """
        # Reject keyword values that are not a str, Expr or Series up front.
        # A common cause is passing `by=[...]`, which lands in `**named_by`
        # instead of `*by`. The hint echoes the keyword that was actually
        # used, so it matches the call the user wrote.
        for key, value in named_by.items():
            if not isinstance(value, (str, pl.Expr, pl.Series)):
                msg = (
                    f"Expected Polars expression or object convertible to one, got {type(value)}.\n\n"
                    "Hint: if you tried\n"
                    f"    group_by({key}={value!r})\n"
                    "then you probably want to use this instead:\n"
                    f"    group_by({value!r})"
                )
                raise TypeError(msg)
        return GroupBy(self, *by, **named_by, maintain_order=maintain_order)
6911

6912
    @deprecate_renamed_parameter("by", "group_by", version="0.20.14")
    def rolling(
        self,
        index_column: IntoExpr,
        *,
        period: str | timedelta,
        offset: str | timedelta | None = None,
        closed: ClosedInterval = "right",
        group_by: IntoExpr | Iterable[IntoExpr] | None = None,
    ) -> RollingGroupBy:
        """
        Build rolling groups over a temporal or integer column.

        Unlike `group_by_dynamic`, the windows here are determined by the
        individual values and are not of constant intervals. For constant intervals
        use :func:`DataFrame.group_by_dynamic`.

        If you have a time series `<t_0, t_1, ..., t_n>`, then by default the
        windows created will be

            * (t_0 - period, t_0]
            * (t_1 - period, t_1]
            * ...
            * (t_n - period, t_n]

        whereas if you pass a non-default `offset`, then the windows will be

            * (t_0 + offset, t_0 + offset + period]
            * (t_1 + offset, t_1 + offset + period]
            * ...
            * (t_n + offset, t_n + offset + period]

        The `period` and `offset` arguments are created either from a timedelta, or
        by using the following string language:

        - 1ns   (1 nanosecond)
        - 1us   (1 microsecond)
        - 1ms   (1 millisecond)
        - 1s    (1 second)
        - 1m    (1 minute)
        - 1h    (1 hour)
        - 1d    (1 calendar day)
        - 1w    (1 calendar week)
        - 1mo   (1 calendar month)
        - 1q    (1 calendar quarter)
        - 1y    (1 calendar year)
        - 1i    (1 index count)

        Or combine them:
        "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds

        By "calendar day", we mean the corresponding time on the next day (which may
        not be 24 hours, due to daylight savings). Similarly for "calendar week",
        "calendar month", "calendar quarter", and "calendar year".

        .. versionchanged:: 0.20.14
            The `by` parameter was renamed `group_by`.

        Parameters
        ----------
        index_column
            Column used to group based on the time window.
            Often of type Date/Datetime.
            This column must be sorted in ascending order (or, if `group_by` is
            specified, then it must be sorted in ascending order within each group).

            In case of a rolling operation on indices, dtype needs to be one of
            {UInt32, UInt64, Int32, Int64}. Note that the first three get temporarily
            cast to Int64, so if performance matters use an Int64 column.
        period
            Length of the window - must be non-negative.
        offset
            Offset of the window. Default is `-period`.
        closed : {'right', 'left', 'both', 'none'}
            Define which sides of the temporal interval are closed (inclusive).
        group_by
            Also group by this column/these columns

        Returns
        -------
        RollingGroupBy
            Object you can call `.agg` on to aggregate by groups, the result
            of which will be sorted by `index_column` (but note that if `group_by`
            columns are passed, it will only be sorted within each group).

        See Also
        --------
        group_by_dynamic

        Examples
        --------
        >>> dates = [
        ...     "2020-01-01 13:45:48",
        ...     "2020-01-01 16:42:13",
        ...     "2020-01-01 16:45:09",
        ...     "2020-01-02 18:12:48",
        ...     "2020-01-03 19:45:32",
        ...     "2020-01-08 23:16:43",
        ... ]
        >>> df = pl.DataFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).with_columns(
        ...     pl.col("dt").str.strptime(pl.Datetime).set_sorted()
        ... )
        >>> out = df.rolling(index_column="dt", period="2d").agg(
        ...     [
        ...         pl.sum("a").alias("sum_a"),
        ...         pl.min("a").alias("min_a"),
        ...         pl.max("a").alias("max_a"),
        ...     ]
        ... )
        >>> assert out["sum_a"].to_list() == [3, 10, 15, 24, 11, 1]
        >>> assert out["max_a"].to_list() == [3, 7, 7, 9, 9, 1]
        >>> assert out["min_a"].to_list() == [3, 3, 3, 3, 2, 1]
        >>> out
        shape: (6, 4)
        ┌─────────────────────┬───────┬───────┬───────┐
        │ dt                  ┆ sum_a ┆ min_a ┆ max_a │
        │ ---                 ┆ ---   ┆ ---   ┆ ---   │
        │ datetime[μs]        ┆ i64   ┆ i64   ┆ i64   │
        ╞═════════════════════╪═══════╪═══════╪═══════╡
        │ 2020-01-01 13:45:48 ┆ 3     ┆ 3     ┆ 3     │
        │ 2020-01-01 16:42:13 ┆ 10    ┆ 3     ┆ 7     │
        │ 2020-01-01 16:45:09 ┆ 15    ┆ 3     ┆ 7     │
        │ 2020-01-02 18:12:48 ┆ 24    ┆ 3     ┆ 9     │
        │ 2020-01-03 19:45:32 ┆ 11    ┆ 2     ┆ 9     │
        │ 2020-01-08 23:16:43 ┆ 1     ┆ 1     ┆ 1     │
        └─────────────────────┴───────┴───────┴───────┘

        If you use an index count in `period` or `offset`, then it's based on the
        values in `index_column`:

        >>> df = pl.DataFrame({"int": [0, 4, 5, 6, 8], "value": [1, 4, 2, 4, 1]})
        >>> df.rolling("int", period="3i").agg(pl.col("int").alias("aggregated"))
        shape: (5, 2)
        ┌─────┬────────────┐
        │ int ┆ aggregated │
        │ --- ┆ ---        │
        │ i64 ┆ list[i64]  │
        ╞═════╪════════════╡
        │ 0   ┆ [0]        │
        │ 4   ┆ [4]        │
        │ 5   ┆ [4, 5]     │
        │ 6   ┆ [4, 5, 6]  │
        │ 8   ┆ [6, 8]     │
        └─────┴────────────┘

        If you want the index count to be based on row number, then you may want to
        combine `rolling` with :meth:`.with_row_index`.
        """
        # All window settings are handed to `RollingGroupBy` unchanged, by keyword.
        window_options = {
            "index_column": index_column,
            "period": period,
            "offset": offset,
            "closed": closed,
            "group_by": group_by,
        }
        return RollingGroupBy(self, **window_options)
7068

7069
    @deprecate_renamed_parameter("by", "group_by", version="0.20.14")
7070
    def group_by_dynamic(
7071
        self,
7072
        index_column: IntoExpr,
7073
        *,
7074
        every: str | timedelta,
7075
        period: str | timedelta | None = None,
7076
        offset: str | timedelta | None = None,
7077
        include_boundaries: bool = False,
7078
        closed: ClosedInterval = "left",
7079
        label: Label = "left",
7080
        group_by: IntoExpr | Iterable[IntoExpr] | None = None,
7081
        start_by: StartBy = "window",
7082
    ) -> DynamicGroupBy:
7083
        """
7084
        Group based on a time value (or index value of type Int32, Int64).
7085

7086
        Time windows are calculated and rows are assigned to windows. Different from a
7087
        normal group by is that a row can be member of multiple groups.
7088
        By default, the windows look like:
7089

7090
        - [start, start + period)
7091
        - [start + every, start + every + period)
7092
        - [start + 2*every, start + 2*every + period)
7093
        - ...
7094

7095
        where `start` is determined by `start_by`, `offset`, `every`, and the earliest
7096
        datapoint. See the `start_by` argument description for details.
7097

7098
        .. warning::
7099
            The index column must be sorted in ascending order. If `group_by` is passed, then
7100
            the index column must be sorted in ascending order within each group.
7101

7102
        .. versionchanged:: 0.20.14
7103
            The `by` parameter was renamed `group_by`.
7104

7105
        Parameters
7106
        ----------
7107
        index_column
7108
            Column used to group based on the time window.
7109
            Often of type Date/Datetime.
7110
            This column must be sorted in ascending order (or, if `group_by` is specified,
7111
            then it must be sorted in ascending order within each group).
7112

7113
            In case of a dynamic group by on indices, dtype needs to be one of
7114
            {Int32, Int64}. Note that Int32 gets temporarily cast to Int64, so if
7115
            performance matters use an Int64 column.
7116
        every
7117
            interval of the window
7118
        period
7119
            length of the window, if None it will equal 'every'
7120
        offset
7121
            offset of the window, does not take effect if `start_by` is 'datapoint'.
7122
            Defaults to zero.
7123
        include_boundaries
7124
            Add the lower and upper bound of the window to the "_lower_boundary" and
7125
            "_upper_boundary" columns. This will impact performance because it's harder to
7126
            parallelize
7127
        closed : {'left', 'right', 'both', 'none'}
7128
            Define which sides of the temporal interval are closed (inclusive).
7129
        label : {'left', 'right', 'datapoint'}
7130
            Define which label to use for the window:
7131

7132
            - 'left': lower boundary of the window
7133
            - 'right': upper boundary of the window
7134
            - 'datapoint': the first value of the index column in the given window.
7135
              If you don't need the label to be at one of the boundaries, choose this
7136
              option for maximum performance
7137
        group_by
7138
            Also group by this column/these columns
7139
        start_by : {'window', 'datapoint', 'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday'}
7140
            The strategy to determine the start of the first window by.
7141

7142
            * 'window': Start by taking the earliest timestamp, truncating it with
7143
              `every`, and then adding `offset`.
7144
              Note that weekly windows start on Monday.
7145
            * 'datapoint': Start from the first encountered data point.
7146
            * a day of the week (only takes effect if `every` contains `'w'`):
7147

7148
              * 'monday': Start the window on the Monday before the first data point.
7149
              * 'tuesday': Start the window on the Tuesday before the first data point.
7150
              * ...
7151
              * 'sunday': Start the window on the Sunday before the first data point.
7152

7153
              The resulting window is then shifted back until the earliest datapoint
7154
              is in or in front of it.
7155

7156
        Returns
7157
        -------
7158
        DynamicGroupBy
7159
            Object you can call `.agg` on to aggregate by groups, the result
7160
            of which will be sorted by `index_column` (but note that if `group_by` columns are
7161
            passed, it will only be sorted within each group).
7162

7163
        See Also
7164
        --------
7165
        rolling
7166

7167
        Notes
7168
        -----
7169
        1) If you're coming from pandas, then
7170

7171
           .. code-block:: python
7172

7173
               # polars
7174
               df.group_by_dynamic("ts", every="1d").agg(pl.col("value").sum())
7175

7176
           is equivalent to
7177

7178
           .. code-block:: python
7179

7180
               # pandas
7181
               df.set_index("ts").resample("D")["value"].sum().reset_index()
7182

7183
           though note that, unlike pandas, polars doesn't add extra rows for empty
7184
           windows. If you need `index_column` to be evenly spaced, then please combine
7185
           with :func:`DataFrame.upsample`.
7186

7187
        2) The `every`, `period` and `offset` arguments are created with
7188
           the following string language:
7189

7190
           - 1ns   (1 nanosecond)
7191
           - 1us   (1 microsecond)
7192
           - 1ms   (1 millisecond)
7193
           - 1s    (1 second)
7194
           - 1m    (1 minute)
7195
           - 1h    (1 hour)
7196
           - 1d    (1 calendar day)
7197
           - 1w    (1 calendar week)
7198
           - 1mo   (1 calendar month)
7199
           - 1q    (1 calendar quarter)
7200
           - 1y    (1 calendar year)
7201
           - 1i    (1 index count)
7202

7203
           Or combine them (except in `every`):
7204
           "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
7205

7206
           By "calendar day", we mean the corresponding time on the next day (which may
7207
           not be 24 hours, due to daylight savings). Similarly for "calendar week",
7208
           "calendar month", "calendar quarter", and "calendar year".
7209

7210
           In case of a group_by_dynamic on an integer column, the windows are defined by:
7211

7212
           - "1i"      # length 1
7213
           - "10i"     # length 10
7214

7215
        Examples
7216
        --------
7217
        >>> from datetime import datetime
7218
        >>> df = pl.DataFrame(
7219
        ...     {
7220
        ...         "time": pl.datetime_range(
7221
        ...             start=datetime(2021, 12, 16),
7222
        ...             end=datetime(2021, 12, 16, 3),
7223
        ...             interval="30m",
7224
        ...             eager=True,
7225
        ...         ),
7226
        ...         "n": range(7),
7227
        ...     }
7228
        ... )
7229
        >>> df
7230
        shape: (7, 2)
7231
        ┌─────────────────────┬─────┐
7232
        │ time                ┆ n   │
7233
        │ ---                 ┆ --- │
7234
        │ datetime[μs]        ┆ i64 │
7235
        ╞═════════════════════╪═════╡
7236
        │ 2021-12-16 00:00:00 ┆ 0   │
7237
        │ 2021-12-16 00:30:00 ┆ 1   │
7238
        │ 2021-12-16 01:00:00 ┆ 2   │
7239
        │ 2021-12-16 01:30:00 ┆ 3   │
7240
        │ 2021-12-16 02:00:00 ┆ 4   │
7241
        │ 2021-12-16 02:30:00 ┆ 5   │
7242
        │ 2021-12-16 03:00:00 ┆ 6   │
7243
        └─────────────────────┴─────┘
7244

7245
        Group by windows of 1 hour.
7246

7247
        >>> df.group_by_dynamic("time", every="1h", closed="right").agg(pl.col("n"))
7248
        shape: (4, 2)
7249
        ┌─────────────────────┬───────────┐
7250
        │ time                ┆ n         │
7251
        │ ---                 ┆ ---       │
7252
        │ datetime[μs]        ┆ list[i64] │
7253
        ╞═════════════════════╪═══════════╡
7254
        │ 2021-12-15 23:00:00 ┆ [0]       │
7255
        │ 2021-12-16 00:00:00 ┆ [1, 2]    │
7256
        │ 2021-12-16 01:00:00 ┆ [3, 4]    │
7257
        │ 2021-12-16 02:00:00 ┆ [5, 6]    │
7258
        └─────────────────────┴───────────┘
7259

7260
        The window boundaries can also be added to the aggregation result
7261

7262
        >>> df.group_by_dynamic(
7263
        ...     "time", every="1h", include_boundaries=True, closed="right"
7264
        ... ).agg(pl.col("n").mean())
7265
        shape: (4, 4)
7266
        ┌─────────────────────┬─────────────────────┬─────────────────────┬─────┐
7267
        │ _lower_boundary     ┆ _upper_boundary     ┆ time                ┆ n   │
7268
        │ ---                 ┆ ---                 ┆ ---                 ┆ --- │
7269
        │ datetime[μs]        ┆ datetime[μs]        ┆ datetime[μs]        ┆ f64 │
7270
        ╞═════════════════════╪═════════════════════╪═════════════════════╪═════╡
7271
        │ 2021-12-15 23:00:00 ┆ 2021-12-16 00:00:00 ┆ 2021-12-15 23:00:00 ┆ 0.0 │
7272
        │ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ 1.5 │
7273
        │ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ 3.5 │
7274
        │ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ 5.5 │
7275
        └─────────────────────┴─────────────────────┴─────────────────────┴─────┘
7276

7277
        When closed="left", the window excludes the right end of interval:
7278
        [lower_bound, upper_bound)
7279

7280
        >>> df.group_by_dynamic("time", every="1h", closed="left").agg(pl.col("n"))
7281
        shape: (4, 2)
7282
        ┌─────────────────────┬───────────┐
7283
        │ time                ┆ n         │
7284
        │ ---                 ┆ ---       │
7285
        │ datetime[μs]        ┆ list[i64] │
7286
        ╞═════════════════════╪═══════════╡
7287
        │ 2021-12-16 00:00:00 ┆ [0, 1]    │
7288
        │ 2021-12-16 01:00:00 ┆ [2, 3]    │
7289
        │ 2021-12-16 02:00:00 ┆ [4, 5]    │
7290
        │ 2021-12-16 03:00:00 ┆ [6]       │
7291
        └─────────────────────┴───────────┘
7292

7293
        When closed="both" the time values at the window boundaries belong to 2 groups.
7294

7295
        >>> df.group_by_dynamic("time", every="1h", closed="both").agg(pl.col("n"))
7296
        shape: (4, 2)
7297
        ┌─────────────────────┬───────────┐
7298
        │ time                ┆ n         │
7299
        │ ---                 ┆ ---       │
7300
        │ datetime[μs]        ┆ list[i64] │
7301
        ╞═════════════════════╪═══════════╡
7302
        │ 2021-12-16 00:00:00 ┆ [0, 1, 2] │
7303
        │ 2021-12-16 01:00:00 ┆ [2, 3, 4] │
7304
        │ 2021-12-16 02:00:00 ┆ [4, 5, 6] │
7305
        │ 2021-12-16 03:00:00 ┆ [6]       │
7306
        └─────────────────────┴───────────┘
7307

7308
        Dynamic group bys can also be combined with grouping on normal keys
7309

7310
        >>> df = df.with_columns(groups=pl.Series(["a", "a", "a", "b", "b", "a", "a"]))
7311
        >>> df
7312
        shape: (7, 3)
7313
        ┌─────────────────────┬─────┬────────┐
7314
        │ time                ┆ n   ┆ groups │
7315
        │ ---                 ┆ --- ┆ ---    │
7316
        │ datetime[μs]        ┆ i64 ┆ str    │
7317
        ╞═════════════════════╪═════╪════════╡
7318
        │ 2021-12-16 00:00:00 ┆ 0   ┆ a      │
7319
        │ 2021-12-16 00:30:00 ┆ 1   ┆ a      │
7320
        │ 2021-12-16 01:00:00 ┆ 2   ┆ a      │
7321
        │ 2021-12-16 01:30:00 ┆ 3   ┆ b      │
7322
        │ 2021-12-16 02:00:00 ┆ 4   ┆ b      │
7323
        │ 2021-12-16 02:30:00 ┆ 5   ┆ a      │
7324
        │ 2021-12-16 03:00:00 ┆ 6   ┆ a      │
7325
        └─────────────────────┴─────┴────────┘
7326
        >>> df.group_by_dynamic(
7327
        ...     "time",
7328
        ...     every="1h",
7329
        ...     closed="both",
7330
        ...     group_by="groups",
7331
        ...     include_boundaries=True,
7332
        ... ).agg(pl.col("n"))
7333
        shape: (6, 5)
7334
        ┌────────┬─────────────────────┬─────────────────────┬─────────────────────┬───────────┐
7335
        │ groups ┆ _lower_boundary     ┆ _upper_boundary     ┆ time                ┆ n         │
7336
        │ ---    ┆ ---                 ┆ ---                 ┆ ---                 ┆ ---       │
7337
        │ str    ┆ datetime[μs]        ┆ datetime[μs]        ┆ datetime[μs]        ┆ list[i64] │
7338
        ╞════════╪═════════════════════╪═════════════════════╪═════════════════════╪═══════════╡
7339
        │ a      ┆ 2021-12-16 00:00:00 ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 00:00:00 ┆ [0, 1, 2] │
7340
        │ a      ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [2]       │
7341
        │ a      ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [5, 6]    │
7342
        │ a      ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 04:00:00 ┆ 2021-12-16 03:00:00 ┆ [6]       │
7343
        │ b      ┆ 2021-12-16 01:00:00 ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 01:00:00 ┆ [3, 4]    │
7344
        │ b      ┆ 2021-12-16 02:00:00 ┆ 2021-12-16 03:00:00 ┆ 2021-12-16 02:00:00 ┆ [4]       │
7345
        └────────┴─────────────────────┴─────────────────────┴─────────────────────┴───────────┘
7346

7347
        Dynamic group by on an index column
7348

7349
        >>> df = pl.DataFrame(
7350
        ...     {
7351
        ...         "idx": pl.int_range(0, 6, eager=True),
7352
        ...         "A": ["A", "A", "B", "B", "B", "C"],
7353
        ...     }
7354
        ... )
7355
        >>> (
7356
        ...     df.group_by_dynamic(
7357
        ...         "idx",
7358
        ...         every="2i",
7359
        ...         period="3i",
7360
        ...         include_boundaries=True,
7361
        ...         closed="right",
7362
        ...     ).agg(pl.col("A").alias("A_agg_list"))
7363
        ... )
7364
        shape: (4, 4)
7365
        ┌─────────────────┬─────────────────┬─────┬─────────────────┐
7366
        │ _lower_boundary ┆ _upper_boundary ┆ idx ┆ A_agg_list      │
7367
        │ ---             ┆ ---             ┆ --- ┆ ---             │
7368
        │ i64             ┆ i64             ┆ i64 ┆ list[str]       │
7369
        ╞═════════════════╪═════════════════╪═════╪═════════════════╡
7370
        │ -2              ┆ 1               ┆ -2  ┆ ["A", "A"]      │
7371
        │ 0               ┆ 3               ┆ 0   ┆ ["A", "B", "B"] │
7372
        │ 2               ┆ 5               ┆ 2   ┆ ["B", "B", "C"] │
7373
        │ 4               ┆ 7               ┆ 4   ┆ ["C"]           │
7374
        └─────────────────┴─────────────────┴─────┴─────────────────┘
7375
        """  # noqa: W505
7376
        return DynamicGroupBy(
7377
            self,
7378
            index_column=index_column,
7379
            every=every,
7380
            period=period,
7381
            offset=offset,
7382
            label=label,
7383
            include_boundaries=include_boundaries,
7384
            closed=closed,
7385
            group_by=group_by,
7386
            start_by=start_by,
7387
        )
7388

7389
    @deprecate_renamed_parameter("by", "group_by", version="0.20.14")
    def upsample(
        self,
        time_column: str,
        *,
        every: str | timedelta,
        group_by: str | Sequence[str] | None = None,
        maintain_order: bool = False,
    ) -> DataFrame:
        """
        Upsample a DataFrame at a regular frequency.

        The `every` interval is expressed in the following string language:

        - 1ns   (1 nanosecond)
        - 1us   (1 microsecond)
        - 1ms   (1 millisecond)
        - 1s    (1 second)
        - 1m    (1 minute)
        - 1h    (1 hour)
        - 1d    (1 calendar day)
        - 1w    (1 calendar week)
        - 1mo   (1 calendar month)
        - 1q    (1 calendar quarter)
        - 1y    (1 calendar year)
        - 1i    (1 index count)

        Units can also be combined:

        - "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds

        A "calendar day" is the corresponding time on the next day (which may
        not be 24 hours, due to daylight savings). The same holds for "calendar
        week", "calendar month", "calendar quarter", and "calendar year".

        .. versionchanged:: 0.20.14
            The `by` parameter was renamed `group_by`.

        Parameters
        ----------
        time_column
            Time column will be used to determine a date_range.
            Note that this column has to be sorted for the output to make sense.
        every
            Interval will start 'every' duration. Accepts a duration string in the
            language described above, or a `timedelta`.
        group_by
            First group by these columns and then upsample for every group.
            May be a single column name or a sequence of column names.
        maintain_order
            Keep the ordering predictable. This is slower.

        Returns
        -------
        DataFrame
            Result will be sorted by `time_column` (but note that if `group_by` columns
            are passed, it will only be sorted within each group).

        Examples
        --------
        Upsample a DataFrame by a certain interval.

        >>> from datetime import datetime
        >>> df = pl.DataFrame(
        ...     {
        ...         "time": [
        ...             datetime(2021, 2, 1),
        ...             datetime(2021, 4, 1),
        ...             datetime(2021, 5, 1),
        ...             datetime(2021, 6, 1),
        ...         ],
        ...         "groups": ["A", "B", "A", "B"],
        ...         "values": [0, 1, 2, 3],
        ...     }
        ... ).set_sorted("time")
        >>> df.upsample(
        ...     time_column="time", every="1mo", group_by="groups", maintain_order=True
        ... ).select(pl.all().fill_null(strategy="forward"))
        shape: (7, 3)
        ┌─────────────────────┬────────┬────────┐
        │ time                ┆ groups ┆ values │
        │ ---                 ┆ ---    ┆ ---    │
        │ datetime[μs]        ┆ str    ┆ i64    │
        ╞═════════════════════╪════════╪════════╡
        │ 2021-02-01 00:00:00 ┆ A      ┆ 0      │
        │ 2021-03-01 00:00:00 ┆ A      ┆ 0      │
        │ 2021-04-01 00:00:00 ┆ A      ┆ 0      │
        │ 2021-05-01 00:00:00 ┆ A      ┆ 2      │
        │ 2021-04-01 00:00:00 ┆ B      ┆ 1      │
        │ 2021-05-01 00:00:00 ┆ B      ┆ 1      │
        │ 2021-06-01 00:00:00 ┆ B      ┆ 3      │
        └─────────────────────┴────────┴────────┘
        """
        # Normalize `group_by`: no grouping becomes an empty list, a lone column
        # name becomes a one-element list, anything else is passed through as-is.
        group_columns: Sequence[str]
        if group_by is None:
            group_columns = []
        elif isinstance(group_by, str):
            group_columns = [group_by]
        else:
            group_columns = group_by

        # Convert `every` (str or timedelta) into a duration string.
        interval = parse_as_duration_string(every)

        upsampled = self._df.upsample(
            group_columns, time_column, interval, maintain_order
        )
        return self._from_pydf(upsampled)
7490

7491
    def join_asof(
        self,
        other: DataFrame,
        *,
        left_on: str | None | Expr = None,
        right_on: str | None | Expr = None,
        on: str | None | Expr = None,
        by_left: str | Sequence[str] | None = None,
        by_right: str | Sequence[str] | None = None,
        by: str | Sequence[str] | None = None,
        strategy: AsofJoinStrategy = "backward",
        suffix: str = "_right",
        tolerance: str | int | float | timedelta | None = None,
        allow_parallel: bool = True,
        force_parallel: bool = False,
        coalesce: bool = True,
        allow_exact_matches: bool = True,
        check_sortedness: bool = True,
    ) -> DataFrame:
        """
        Perform an asof join.

        This is similar to a left-join except that we match on nearest key rather than
        equal keys.

        Both DataFrames must be sorted by the `on` key (within each `by` group, if
        specified).

        For each row in the left DataFrame:

          - A "backward" search selects the last row in the right DataFrame whose
            'on' key is less than or equal to the left's key.

          - A "forward" search selects the first row in the right DataFrame whose
            'on' key is greater than or equal to the left's key.

          - A "nearest" search selects the last row in the right DataFrame whose value
            is nearest to the left's key. String keys are not currently supported for a
            nearest search.

        The default is "backward".

        Parameters
        ----------
        other
            DataFrame to join with.
        left_on
            Join column of the left DataFrame.
        right_on
            Join column of the right DataFrame.
        on
            Join column of both DataFrames. If set, `left_on` and `right_on` should be
            None.
        by
            Join on these columns before doing asof join
        by_left
            Join on these columns before doing asof join
        by_right
            Join on these columns before doing asof join
        strategy : {'backward', 'forward', 'nearest'}
            Join strategy.
        suffix
            Suffix to append to columns with a duplicate name.
        tolerance
            Numeric tolerance. By setting this the join will only be done if the near
            keys are within this distance. If an asof join is done on columns of dtype
            "Date", "Datetime", "Duration" or "Time", use either a datetime.timedelta
            object or the following string language:

                - 1ns   (1 nanosecond)
                - 1us   (1 microsecond)
                - 1ms   (1 millisecond)
                - 1s    (1 second)
                - 1m    (1 minute)
                - 1h    (1 hour)
                - 1d    (1 calendar day)
                - 1w    (1 calendar week)
                - 1mo   (1 calendar month)
                - 1q    (1 calendar quarter)
                - 1y    (1 calendar year)

                Or combine them:
                "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds

                By "calendar day", we mean the corresponding time on the next day
                (which may not be 24 hours, due to daylight savings). Similarly for
                "calendar week", "calendar month", "calendar quarter", and
                "calendar year".

        allow_parallel
            Allow the physical plan to optionally evaluate the computation of both
            DataFrames up to the join in parallel.
        force_parallel
            Force the physical plan to evaluate the computation of both DataFrames up to
            the join in parallel.
        coalesce
            Coalescing behavior (merging of `on` / `left_on` / `right_on` columns):

            - *True*: Always coalesce join columns.
            - *False*: Never coalesce join columns.

            Note that joining on any other expressions than `col`
            will turn off coalescing.
        allow_exact_matches
            Whether exact matches are valid join predicates.

            - If True, allow matching with the same ``on`` value
              (i.e. less-than-or-equal-to / greater-than-or-equal-to)
            - If False, don't match the same ``on`` value
              (i.e., strictly less-than / strictly greater-than).
        check_sortedness
            Check the sortedness of the asof keys. If the keys are not sorted Polars
            will error. Currently, sortedness cannot be checked if 'by' groups are
            provided.

        Raises
        ------
        TypeError
            If `on` is given but is not a str or Expr, or if `on` is None and
            either `left_on` or `right_on` is not a str or Expr.

        Examples
        --------
        >>> from datetime import date
        >>> gdp = pl.DataFrame(
        ...     {
        ...         "date": pl.date_range(
        ...             date(2016, 1, 1),
        ...             date(2020, 1, 1),
        ...             "1y",
        ...             eager=True,
        ...         ),
        ...         "gdp": [4164, 4411, 4566, 4696, 4827],
        ...     }
        ... )
        >>> gdp
        shape: (5, 2)
        ┌────────────┬──────┐
        │ date       ┆ gdp  │
        │ ---        ┆ ---  │
        │ date       ┆ i64  │
        ╞════════════╪══════╡
        │ 2016-01-01 ┆ 4164 │
        │ 2017-01-01 ┆ 4411 │
        │ 2018-01-01 ┆ 4566 │
        │ 2019-01-01 ┆ 4696 │
        │ 2020-01-01 ┆ 4827 │
        └────────────┴──────┘

        >>> population = pl.DataFrame(
        ...     {
        ...         "date": [date(2016, 3, 1), date(2018, 8, 1), date(2019, 1, 1)],
        ...         "population": [82.19, 82.66, 83.12],
        ...     }
        ... ).sort("date")
        >>> population
        shape: (3, 2)
        ┌────────────┬────────────┐
        │ date       ┆ population │
        │ ---        ┆ ---        │
        │ date       ┆ f64        │
        ╞════════════╪════════════╡
        │ 2016-03-01 ┆ 82.19      │
        │ 2018-08-01 ┆ 82.66      │
        │ 2019-01-01 ┆ 83.12      │
        └────────────┴────────────┘

        Note how the dates don't quite match. If we join them using `join_asof` and
        `strategy='backward'`, then each date from `population` which doesn't have an
        exact match is matched with the closest earlier date from `gdp`:

        >>> population.join_asof(gdp, on="date", strategy="backward")
        shape: (3, 3)
        ┌────────────┬────────────┬──────┐
        │ date       ┆ population ┆ gdp  │
        │ ---        ┆ ---        ┆ ---  │
        │ date       ┆ f64        ┆ i64  │
        ╞════════════╪════════════╪══════╡
        │ 2016-03-01 ┆ 82.19      ┆ 4164 │
        │ 2018-08-01 ┆ 82.66      ┆ 4566 │
        │ 2019-01-01 ┆ 83.12      ┆ 4696 │
        └────────────┴────────────┴──────┘

        Note how:

        - date `2016-03-01` from `population` is matched with `2016-01-01` from `gdp`;
        - date `2018-08-01` from `population` is matched with `2018-01-01` from `gdp`.

        You can verify this by passing `coalesce=False`:

        >>> population.join_asof(gdp, on="date", strategy="backward", coalesce=False)
        shape: (3, 4)
        ┌────────────┬────────────┬────────────┬──────┐
        │ date       ┆ population ┆ date_right ┆ gdp  │
        │ ---        ┆ ---        ┆ ---        ┆ ---  │
        │ date       ┆ f64        ┆ date       ┆ i64  │
        ╞════════════╪════════════╪════════════╪══════╡
        │ 2016-03-01 ┆ 82.19      ┆ 2016-01-01 ┆ 4164 │
        │ 2018-08-01 ┆ 82.66      ┆ 2018-01-01 ┆ 4566 │
        │ 2019-01-01 ┆ 83.12      ┆ 2019-01-01 ┆ 4696 │
        └────────────┴────────────┴────────────┴──────┘

        If we instead use `strategy='forward'`, then each date from `population` which
        doesn't have an exact match is matched with the closest later date from `gdp`:

        >>> population.join_asof(gdp, on="date", strategy="forward")
        shape: (3, 3)
        ┌────────────┬────────────┬──────┐
        │ date       ┆ population ┆ gdp  │
        │ ---        ┆ ---        ┆ ---  │
        │ date       ┆ f64        ┆ i64  │
        ╞════════════╪════════════╪══════╡
        │ 2016-03-01 ┆ 82.19      ┆ 4411 │
        │ 2018-08-01 ┆ 82.66      ┆ 4696 │
        │ 2019-01-01 ┆ 83.12      ┆ 4696 │
        └────────────┴────────────┴──────┘

        Note how:

        - date `2016-03-01` from `population` is matched with `2017-01-01` from `gdp`;
        - date `2018-08-01` from `population` is matched with `2019-01-01` from `gdp`.

        Finally, `strategy='nearest'` gives us a mix of the two results above, as each
        date from `population` which doesn't have an exact match is matched with the
        closest date from `gdp`, regardless of whether it's earlier or later:

        >>> population.join_asof(gdp, on="date", strategy="nearest")
        shape: (3, 3)
        ┌────────────┬────────────┬──────┐
        │ date       ┆ population ┆ gdp  │
        │ ---        ┆ ---        ┆ ---  │
        │ date       ┆ f64        ┆ i64  │
        ╞════════════╪════════════╪══════╡
        │ 2016-03-01 ┆ 82.19      ┆ 4164 │
        │ 2018-08-01 ┆ 82.66      ┆ 4696 │
        │ 2019-01-01 ┆ 83.12      ┆ 4696 │
        └────────────┴────────────┴──────┘

        Note how:

        - date `2016-03-01` from `population` is matched with `2016-01-01` from `gdp`;
        - date `2018-08-01` from `population` is matched with `2019-01-01` from `gdp`.

        The `by` argument allows joining on another column first, before the asof join.
        In this example we join by `country` first, then asof join by date, as above.

        >>> gdp_dates = pl.date_range(  # fmt: skip
        ...     date(2016, 1, 1), date(2020, 1, 1), "1y", eager=True
        ... )
        >>> gdp2 = pl.DataFrame(
        ...     {
        ...         "country": ["Germany"] * 5 + ["Netherlands"] * 5,
        ...         "date": pl.concat([gdp_dates, gdp_dates]),
        ...         "gdp": [4164, 4411, 4566, 4696, 4827, 784, 833, 914, 910, 909],
        ...     }
        ... ).sort("country", "date")
        >>>
        >>> gdp2
        shape: (10, 3)
        ┌─────────────┬────────────┬──────┐
        │ country     ┆ date       ┆ gdp  │
        │ ---         ┆ ---        ┆ ---  │
        │ str         ┆ date       ┆ i64  │
        ╞═════════════╪════════════╪══════╡
        │ Germany     ┆ 2016-01-01 ┆ 4164 │
        │ Germany     ┆ 2017-01-01 ┆ 4411 │
        │ Germany     ┆ 2018-01-01 ┆ 4566 │
        │ Germany     ┆ 2019-01-01 ┆ 4696 │
        │ Germany     ┆ 2020-01-01 ┆ 4827 │
        │ Netherlands ┆ 2016-01-01 ┆ 784  │
        │ Netherlands ┆ 2017-01-01 ┆ 833  │
        │ Netherlands ┆ 2018-01-01 ┆ 914  │
        │ Netherlands ┆ 2019-01-01 ┆ 910  │
        │ Netherlands ┆ 2020-01-01 ┆ 909  │
        └─────────────┴────────────┴──────┘
        >>> pop2 = pl.DataFrame(
        ...     {
        ...         "country": ["Germany"] * 3 + ["Netherlands"] * 3,
        ...         "date": [
        ...             date(2016, 3, 1),
        ...             date(2018, 8, 1),
        ...             date(2019, 1, 1),
        ...             date(2016, 3, 1),
        ...             date(2018, 8, 1),
        ...             date(2019, 1, 1),
        ...         ],
        ...         "population": [82.19, 82.66, 83.12, 17.11, 17.32, 17.40],
        ...     }
        ... ).sort("country", "date")
        >>>
        >>> pop2
        shape: (6, 3)
        ┌─────────────┬────────────┬────────────┐
        │ country     ┆ date       ┆ population │
        │ ---         ┆ ---        ┆ ---        │
        │ str         ┆ date       ┆ f64        │
        ╞═════════════╪════════════╪════════════╡
        │ Germany     ┆ 2016-03-01 ┆ 82.19      │
        │ Germany     ┆ 2018-08-01 ┆ 82.66      │
        │ Germany     ┆ 2019-01-01 ┆ 83.12      │
        │ Netherlands ┆ 2016-03-01 ┆ 17.11      │
        │ Netherlands ┆ 2018-08-01 ┆ 17.32      │
        │ Netherlands ┆ 2019-01-01 ┆ 17.4       │
        └─────────────┴────────────┴────────────┘
        >>> pop2.join_asof(gdp2, by="country", on="date", strategy="nearest")
        shape: (6, 4)
        ┌─────────────┬────────────┬────────────┬──────┐
        │ country     ┆ date       ┆ population ┆ gdp  │
        │ ---         ┆ ---        ┆ ---        ┆ ---  │
        │ str         ┆ date       ┆ f64        ┆ i64  │
        ╞═════════════╪════════════╪════════════╪══════╡
        │ Germany     ┆ 2016-03-01 ┆ 82.19      ┆ 4164 │
        │ Germany     ┆ 2018-08-01 ┆ 82.66      ┆ 4696 │
        │ Germany     ┆ 2019-01-01 ┆ 83.12      ┆ 4696 │
        │ Netherlands ┆ 2016-03-01 ┆ 17.11      ┆ 784  │
        │ Netherlands ┆ 2018-08-01 ┆ 17.32      ┆ 910  │
        │ Netherlands ┆ 2019-01-01 ┆ 17.4       ┆ 910  │
        └─────────────┴────────────┴────────────┴──────┘
        """
        require_same_type(self, other)

        # Validate the asof key(s) up front: when `on` is given only it is
        # checked; otherwise both `left_on` and `right_on` must be str or Expr.
        key_types = (str, pl.Expr)
        if on is None:
            if not isinstance(left_on, key_types):
                msg = f"expected `left_on` to be str or Expr, got {qualified_type_name(left_on)!r}"
                raise TypeError(msg)
            if not isinstance(right_on, key_types):
                msg = f"expected `right_on` to be str or Expr, got {qualified_type_name(right_on)!r}"
                raise TypeError(msg)
        elif not isinstance(on, key_types):
            msg = f"expected `on` to be str or Expr, got {qualified_type_name(on)!r}"
            raise TypeError(msg)

        from polars.lazyframe.opt_flags import QueryOptFlags

        # Delegate to the lazy implementation, then collect with the eager
        # optimization flags.
        joined = self.lazy().join_asof(
            other.lazy(),
            left_on=left_on,
            right_on=right_on,
            on=on,
            by_left=by_left,
            by_right=by_right,
            by=by,
            strategy=strategy,
            suffix=suffix,
            tolerance=tolerance,
            allow_parallel=allow_parallel,
            force_parallel=force_parallel,
            coalesce=coalesce,
            allow_exact_matches=allow_exact_matches,
            check_sortedness=check_sortedness,
        )
        return joined.collect(optimizations=QueryOptFlags._eager())
7843

7844
    @deprecate_renamed_parameter("join_nulls", "nulls_equal", version="1.24")
    def join(
        self,
        other: DataFrame,
        on: str | Expr | Sequence[str | Expr] | None = None,
        how: JoinStrategy = "inner",
        *,
        left_on: str | Expr | Sequence[str | Expr] | None = None,
        right_on: str | Expr | Sequence[str | Expr] | None = None,
        suffix: str = "_right",
        validate: JoinValidation = "m:m",
        nulls_equal: bool = False,
        coalesce: bool | None = None,
        maintain_order: MaintainOrderJoin | None = None,
    ) -> DataFrame:
        """
        Join this DataFrame with another one in SQL-like fashion.

        .. versionchanged:: 1.24
            The `join_nulls` parameter was renamed `nulls_equal`.

        Parameters
        ----------
        other
            DataFrame to join with.
        on
            Name(s) of the join columns present in both DataFrames. When this is
            set, `left_on` and `right_on` should be None. Do not specify this
            when `how='cross'`.
        how : {'inner', 'left', 'right', 'full', 'semi', 'anti', 'cross'}
            Join strategy.

            .. list-table ::
               :header-rows: 0

               * - **inner**
                 - *(Default)* Returns rows that have matching values in both tables.
               * - **left**
                 - Returns all rows from the left table, and the matched rows from
                   the right table.
               * - **right**
                 - Returns all rows from the right table, and the matched rows from
                   the left table.
               * - **full**
                 - Returns all rows when there is a match in either left or right.
               * - **cross**
                 - Returns the Cartesian product of rows from both tables.
               * - **semi**
                 - Returns rows from the left table that have a match in the right
                   table.
               * - **anti**
                 - Returns rows from the left table that have no match in the right
                   table.

        left_on
            Name(s) of the left join column(s).
        right_on
            Name(s) of the right join column(s).
        suffix
            Suffix to append to columns with a duplicate name.
        validate: {'m:m', 'm:1', '1:m', '1:1'}
            Checks if join is of specified type.

            .. list-table ::
               :header-rows: 0

               * - **m:m**
                 - *(Default)* Many-to-many. Does not result in checks.
               * - **1:1**
                 - One-to-one. Checks if join keys are unique in both left and
                   right datasets.
               * - **1:m**
                 - One-to-many. Checks if join keys are unique in left dataset.
               * - **m:1**
                 - Many-to-one. Checks if join keys are unique in right dataset.

            .. note::
                This is currently not supported by the streaming engine.

        nulls_equal
            Join on null values. By default null values will never produce matches.
        coalesce
            Coalescing behavior (merging of join columns).

            .. list-table ::
               :header-rows: 0

               * - **None**
                 - *(Default)* Coalesce unless `how='full'` is specified.
               * - **True**
                 - Always coalesce join columns.
               * - **False**
                 - Never coalesce join columns.

            .. note::
                Joining on any other expressions than `col`
                will turn off coalescing.
        maintain_order : {'none', 'left', 'right', 'left_right', 'right_left'}
            Which DataFrame row order to preserve, if any.
            Do not rely on any observed ordering without explicitly setting this
            parameter, as your code may break in a future release.
            Not specifying any ordering can improve performance.
            Supported for inner, left, right and full joins.

            .. list-table ::
               :header-rows: 0

               * - **none**
                 - *(Default)* No specific ordering is desired. The ordering might
                   differ across Polars versions or even between different runs.
               * - **left**
                 - Preserves the order of the left DataFrame.
               * - **right**
                 - Preserves the order of the right DataFrame.
               * - **left_right**
                 - First preserves the order of the left DataFrame, then the right.
               * - **right_left**
                 - First preserves the order of the right DataFrame, then the left.

        See Also
        --------
        join_asof

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     {
        ...         "foo": [1, 2, 3],
        ...         "bar": [6.0, 7.0, 8.0],
        ...         "ham": ["a", "b", "c"],
        ...     }
        ... )
        >>> other_df = pl.DataFrame(
        ...     {
        ...         "apple": ["x", "y", "z"],
        ...         "ham": ["a", "b", "d"],
        ...     }
        ... )
        >>> df.join(other_df, on="ham")
        shape: (2, 4)
        ┌─────┬─────┬─────┬───────┐
        │ foo ┆ bar ┆ ham ┆ apple │
        │ --- ┆ --- ┆ --- ┆ ---   │
        │ i64 ┆ f64 ┆ str ┆ str   │
        ╞═════╪═════╪═════╪═══════╡
        │ 1   ┆ 6.0 ┆ a   ┆ x     │
        │ 2   ┆ 7.0 ┆ b   ┆ y     │
        └─────┴─────┴─────┴───────┘

        >>> df.join(other_df, on="ham", how="full")
        shape: (4, 5)
        ┌──────┬──────┬──────┬───────┬───────────┐
        │ foo  ┆ bar  ┆ ham  ┆ apple ┆ ham_right │
        │ ---  ┆ ---  ┆ ---  ┆ ---   ┆ ---       │
        │ i64  ┆ f64  ┆ str  ┆ str   ┆ str       │
        ╞══════╪══════╪══════╪═══════╪═══════════╡
        │ 1    ┆ 6.0  ┆ a    ┆ x     ┆ a         │
        │ 2    ┆ 7.0  ┆ b    ┆ y     ┆ b         │
        │ null ┆ null ┆ null ┆ z     ┆ d         │
        │ 3    ┆ 8.0  ┆ c    ┆ null  ┆ null      │
        └──────┴──────┴──────┴───────┴───────────┘

        >>> df.join(other_df, on="ham", how="full", coalesce=True)
        shape: (4, 4)
        ┌──────┬──────┬─────┬───────┐
        │ foo  ┆ bar  ┆ ham ┆ apple │
        │ ---  ┆ ---  ┆ --- ┆ ---   │
        │ i64  ┆ f64  ┆ str ┆ str   │
        ╞══════╪══════╪═════╪═══════╡
        │ 1    ┆ 6.0  ┆ a   ┆ x     │
        │ 2    ┆ 7.0  ┆ b   ┆ y     │
        │ null ┆ null ┆ d   ┆ z     │
        │ 3    ┆ 8.0  ┆ c   ┆ null  │
        └──────┴──────┴─────┴───────┘

        >>> df.join(other_df, on="ham", how="left")
        shape: (3, 4)
        ┌─────┬─────┬─────┬───────┐
        │ foo ┆ bar ┆ ham ┆ apple │
        │ --- ┆ --- ┆ --- ┆ ---   │
        │ i64 ┆ f64 ┆ str ┆ str   │
        ╞═════╪═════╪═════╪═══════╡
        │ 1   ┆ 6.0 ┆ a   ┆ x     │
        │ 2   ┆ 7.0 ┆ b   ┆ y     │
        │ 3   ┆ 8.0 ┆ c   ┆ null  │
        └─────┴─────┴─────┴───────┘

        >>> df.join(other_df, on="ham", how="semi")
        shape: (2, 3)
        ┌─────┬─────┬─────┐
        │ foo ┆ bar ┆ ham │
        │ --- ┆ --- ┆ --- │
        │ i64 ┆ f64 ┆ str │
        ╞═════╪═════╪═════╡
        │ 1   ┆ 6.0 ┆ a   │
        │ 2   ┆ 7.0 ┆ b   │
        └─────┴─────┴─────┘

        >>> df.join(other_df, on="ham", how="anti")
        shape: (1, 3)
        ┌─────┬─────┬─────┐
        │ foo ┆ bar ┆ ham │
        │ --- ┆ --- ┆ --- │
        │ i64 ┆ f64 ┆ str │
        ╞═════╪═════╪═════╡
        │ 3   ┆ 8.0 ┆ c   │
        └─────┴─────┴─────┘

        >>> df.join(other_df, how="cross")
        shape: (9, 5)
        ┌─────┬─────┬─────┬───────┬───────────┐
        │ foo ┆ bar ┆ ham ┆ apple ┆ ham_right │
        │ --- ┆ --- ┆ --- ┆ ---   ┆ ---       │
        │ i64 ┆ f64 ┆ str ┆ str   ┆ str       │
        ╞═════╪═════╪═════╪═══════╪═══════════╡
        │ 1   ┆ 6.0 ┆ a   ┆ x     ┆ a         │
        │ 1   ┆ 6.0 ┆ a   ┆ y     ┆ b         │
        │ 1   ┆ 6.0 ┆ a   ┆ z     ┆ d         │
        │ 2   ┆ 7.0 ┆ b   ┆ x     ┆ a         │
        │ 2   ┆ 7.0 ┆ b   ┆ y     ┆ b         │
        │ 2   ┆ 7.0 ┆ b   ┆ z     ┆ d         │
        │ 3   ┆ 8.0 ┆ c   ┆ x     ┆ a         │
        │ 3   ┆ 8.0 ┆ c   ┆ y     ┆ b         │
        │ 3   ┆ 8.0 ┆ c   ┆ z     ┆ d         │
        └─────┴─────┴─────┴───────┴───────────┘

        Notes
        -----
        For joining on columns with categorical data, see :class:`polars.StringCache`.
        """
        require_same_type(self, other)

        from polars.lazyframe.opt_flags import QueryOptFlags

        # The eager join is expressed as a lazy query on both frames, which is
        # then collected immediately with the eager optimization flags.
        lazy_joined = self.lazy().join(
            other=other.lazy(),
            on=on,
            how=how,
            left_on=left_on,
            right_on=right_on,
            suffix=suffix,
            validate=validate,
            nulls_equal=nulls_equal,
            coalesce=coalesce,
            maintain_order=maintain_order,
        )
        return lazy_joined.collect(optimizations=QueryOptFlags._eager())
8090

8091
    @unstable()
    def join_where(
        self,
        other: DataFrame,
        *predicates: Expr | Iterable[Expr],
        suffix: str = "_right",
    ) -> DataFrame:
        """
        Join two DataFrames on one or more (in)equality predicates.

        The join is an inner join: only rows for which every predicate is true
        end up in the result, and a row from either DataFrame may appear
        multiple times in the result.

        .. note::
            The row order of the input DataFrames is not preserved.

        .. warning::
            This functionality is experimental. It may be
            changed at any point without it being considered a breaking change.

        Parameters
        ----------
        other
            DataFrame to join with.
        *predicates
            (In)Equality condition to join the two tables on.
            When a column name occurs in both tables, the proper suffix must
            be applied in the predicate.
        suffix
            Suffix to append to columns with a duplicate name.

        Examples
        --------
        Join two dataframes together based on two predicates which get AND-ed together.

        >>> east = pl.DataFrame(
        ...     {
        ...         "id": [100, 101, 102],
        ...         "dur": [120, 140, 160],
        ...         "rev": [12, 14, 16],
        ...         "cores": [2, 8, 4],
        ...     }
        ... )
        >>> west = pl.DataFrame(
        ...     {
        ...         "t_id": [404, 498, 676, 742],
        ...         "time": [90, 130, 150, 170],
        ...         "cost": [9, 13, 15, 16],
        ...         "cores": [4, 2, 1, 4],
        ...     }
        ... )
        >>> east.join_where(
        ...     west,
        ...     pl.col("dur") < pl.col("time"),
        ...     pl.col("rev") < pl.col("cost"),
        ... )
        shape: (5, 8)
        ┌─────┬─────┬─────┬───────┬──────┬──────┬──────┬─────────────┐
        │ id  ┆ dur ┆ rev ┆ cores ┆ t_id ┆ time ┆ cost ┆ cores_right │
        │ --- ┆ --- ┆ --- ┆ ---   ┆ ---  ┆ ---  ┆ ---  ┆ ---         │
        │ i64 ┆ i64 ┆ i64 ┆ i64   ┆ i64  ┆ i64  ┆ i64  ┆ i64         │
        ╞═════╪═════╪═════╪═══════╪══════╪══════╪══════╪═════════════╡
        │ 100 ┆ 120 ┆ 12  ┆ 2     ┆ 498  ┆ 130  ┆ 13   ┆ 2           │
        │ 100 ┆ 120 ┆ 12  ┆ 2     ┆ 676  ┆ 150  ┆ 15   ┆ 1           │
        │ 100 ┆ 120 ┆ 12  ┆ 2     ┆ 742  ┆ 170  ┆ 16   ┆ 4           │
        │ 101 ┆ 140 ┆ 14  ┆ 8     ┆ 676  ┆ 150  ┆ 15   ┆ 1           │
        │ 101 ┆ 140 ┆ 14  ┆ 8     ┆ 742  ┆ 170  ┆ 16   ┆ 4           │
        └─────┴─────┴─────┴───────┴──────┴──────┴──────┴─────────────┘

        To OR them together, use a single expression and the `|` operator.

        >>> east.join_where(
        ...     west,
        ...     (pl.col("dur") < pl.col("time")) | (pl.col("rev") < pl.col("cost")),
        ... )
        shape: (6, 8)
        ┌─────┬─────┬─────┬───────┬──────┬──────┬──────┬─────────────┐
        │ id  ┆ dur ┆ rev ┆ cores ┆ t_id ┆ time ┆ cost ┆ cores_right │
        │ --- ┆ --- ┆ --- ┆ ---   ┆ ---  ┆ ---  ┆ ---  ┆ ---         │
        │ i64 ┆ i64 ┆ i64 ┆ i64   ┆ i64  ┆ i64  ┆ i64  ┆ i64         │
        ╞═════╪═════╪═════╪═══════╪══════╪══════╪══════╪═════════════╡
        │ 100 ┆ 120 ┆ 12  ┆ 2     ┆ 498  ┆ 130  ┆ 13   ┆ 2           │
        │ 100 ┆ 120 ┆ 12  ┆ 2     ┆ 676  ┆ 150  ┆ 15   ┆ 1           │
        │ 100 ┆ 120 ┆ 12  ┆ 2     ┆ 742  ┆ 170  ┆ 16   ┆ 4           │
        │ 101 ┆ 140 ┆ 14  ┆ 8     ┆ 676  ┆ 150  ┆ 15   ┆ 1           │
        │ 101 ┆ 140 ┆ 14  ┆ 8     ┆ 742  ┆ 170  ┆ 16   ┆ 4           │
        │ 102 ┆ 160 ┆ 16  ┆ 4     ┆ 742  ┆ 170  ┆ 16   ┆ 4           │
        └─────┴─────┴─────┴───────┴──────┴──────┴──────┴─────────────┘
        """
        require_same_type(self, other)

        from polars.lazyframe.opt_flags import QueryOptFlags

        # Build the lazy equivalent of the join, then collect it right away
        # with the eager optimization flags.
        lazy_joined = self.lazy().join_where(other.lazy(), *predicates, suffix=suffix)
        return lazy_joined.collect(optimizations=QueryOptFlags._eager())
8194

8195
    def map_rows(
        self,
        function: Callable[[tuple[Any, ...]], Any],
        return_dtype: PolarsDataType | None = None,
        *,
        inference_size: int = 256,
    ) -> DataFrame:
        """
        Apply a custom/user-defined function (UDF) over the rows of the DataFrame.

        .. warning::
            This method is much slower than the native expressions API.
            Only use it if you cannot implement your logic otherwise.

        Each row is handed to the UDF as a tuple of values: `udf(row)`.

        Logic written as a Python function is almost always *significantly*
        slower and more memory intensive than the same logic written with
        the native expression API because:

        - The native expression engine runs in Rust; UDFs run in Python.
        - Use of Python UDFs forces the DataFrame to be materialized in memory.
        - Polars-native expressions can be parallelised (UDFs typically cannot).
        - Polars-native expressions can be logically optimised (UDFs cannot).

        Wherever possible you should strongly prefer the native expression API
        to achieve the best performance.

        Parameters
        ----------
        function
            Custom function or lambda.
        return_dtype
            Output type of the operation. If none given, Polars tries to infer the type.
        inference_size
            Only used in the case when the custom function returns rows.
            This uses the first `n` rows to determine the output schema.

        Notes
        -----
        * The frame-level `map_rows` cannot track column names (as the UDF is a
          black-box that may arbitrarily drop, rearrange, transform, or add new
          columns); if you want to apply a UDF such that column names are preserved,
          you should use the expression-level `map_elements` syntax instead.

        * If your function is expensive and you don't want it to be called more than
          once for a given input, consider applying an `@lru_cache` decorator to it.
          If your data is suitable you may achieve *significant* speedups.

        Examples
        --------
        >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [-1, 5, 8]})

        Return a DataFrame by mapping each row to a tuple:

        >>> df.map_rows(lambda t: (t[0] * 2, t[1] * 3))
        shape: (3, 2)
        ┌──────────┬──────────┐
        │ column_0 ┆ column_1 │
        │ ---      ┆ ---      │
        │ i64      ┆ i64      │
        ╞══════════╪══════════╡
        │ 2        ┆ -3       │
        │ 4        ┆ 15       │
        │ 6        ┆ 24       │
        └──────────┴──────────┘

        However, it is much better to implement this with a native expression:

        >>> df.select(
        ...     pl.col("foo") * 2,
        ...     pl.col("bar") * 3,
        ... )  # doctest: +IGNORE_RESULT

        Return a DataFrame with a single column by mapping each row to a scalar:

        >>> df.map_rows(lambda t: (t[0] * 2 + t[1]))
        shape: (3, 1)
        ┌─────┐
        │ map │
        │ --- │
        │ i64 │
        ╞═════╡
        │ 1   │
        │ 9   │
        │ 14  │
        └─────┘

        In this case it is better to use the following native expression:

        >>> df.select(pl.col("foo") * 2 + pl.col("bar"))  # doctest: +IGNORE_RESULT
        """
        # TODO: Enable the inefficient-map warning for frame-level maps
        # (`warn_on_inefficient_map` from `polars._utils.udfs`, called with
        # `columns=self.columns` and `map_target="frame"`).

        # The backend returns the mapped data together with a flag telling
        # whether that data is a frame or a single series.
        mapped, is_frame = self._df.map_rows(function, return_dtype, inference_size)
        if not is_frame:
            # A series result is wrapped and promoted to a one-column frame.
            return wrap_s(mapped).to_frame()
        return self._from_pydf(mapped)
8296

8297
    def hstack(
        self, columns: list[Series] | DataFrame, *, in_place: bool = False
    ) -> DataFrame:
        """
        Return a new DataFrame grown horizontally by stacking multiple Series to it.

        Parameters
        ----------
        columns
            List of Series to stack, or a DataFrame whose columns are stacked.
        in_place
            If True, add the columns to this DataFrame and return it; otherwise
            return a new DataFrame.

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     {
        ...         "foo": [1, 2, 3],
        ...         "bar": [6, 7, 8],
        ...         "ham": ["a", "b", "c"],
        ...     }
        ... )
        >>> x = pl.Series("apple", [10, 20, 30])
        >>> df.hstack([x])
        shape: (3, 4)
        ┌─────┬─────┬─────┬───────┐
        │ foo ┆ bar ┆ ham ┆ apple │
        │ --- ┆ --- ┆ --- ┆ ---   │
        │ i64 ┆ i64 ┆ str ┆ i64   │
        ╞═════╪═════╪═════╪═══════╡
        │ 1   ┆ 6   ┆ a   ┆ 10    │
        │ 2   ┆ 7   ┆ b   ┆ 20    │
        │ 3   ┆ 8   ┆ c   ┆ 30    │
        └─────┴─────┴─────┴───────┘
        """
        # Anything that is not a list is asked for its columns.
        series_list = columns if isinstance(columns, list) else columns.get_columns()
        # The backend works on the underlying series objects, not the wrappers.
        inner_series = [s._s for s in series_list]

        if not in_place:
            return self._from_pydf(self._df.hstack(inner_series))

        self._df.hstack_mut(inner_series)
        return self
8339

8340
    def vstack(self, other: DataFrame, *, in_place: bool = False) -> DataFrame:
        """
        Grow this DataFrame vertically by stacking a DataFrame to it.

        Parameters
        ----------
        other
            DataFrame to stack.
        in_place
            If True, stack onto this DataFrame and return it; otherwise return
            a new DataFrame.

        See Also
        --------
        extend

        Examples
        --------
        >>> df1 = pl.DataFrame(
        ...     {
        ...         "foo": [1, 2],
        ...         "bar": [6, 7],
        ...         "ham": ["a", "b"],
        ...     }
        ... )
        >>> df2 = pl.DataFrame(
        ...     {
        ...         "foo": [3, 4],
        ...         "bar": [8, 9],
        ...         "ham": ["c", "d"],
        ...     }
        ... )
        >>> df1.vstack(df2)
        shape: (4, 3)
        ┌─────┬─────┬─────┐
        │ foo ┆ bar ┆ ham │
        │ --- ┆ --- ┆ --- │
        │ i64 ┆ i64 ┆ str │
        ╞═════╪═════╪═════╡
        │ 1   ┆ 6   ┆ a   │
        │ 2   ┆ 7   ┆ b   │
        │ 3   ┆ 8   ┆ c   │
        │ 4   ┆ 9   ┆ d   │
        └─────┴─────┴─────┘
        """
        require_same_type(self, other)

        if not in_place:
            return self._from_pydf(self._df.vstack(other._df))

        self._df.vstack_mut(other._df)
        return self
8390

8391
    def extend(self, other: DataFrame) -> DataFrame:
        """
        Extend the memory backed by this `DataFrame` with the values from `other`.

        Unlike `vstack`, which adds the chunks from `other` to the chunks of
        this `DataFrame`, `extend` appends the data from `other` to the underlying
        memory locations and thus may cause a reallocation.

        If this does not cause a reallocation, the resulting data structure will not
        have any extra chunks and thus will yield faster queries.

        Prefer `extend` over `vstack` when you want to do a query after a single
        append. For instance, during online operations where you add `n` rows and rerun
        a query.

        Prefer `vstack` over `extend` when you want to append many times before
        doing a query. For instance, when you read in multiple files and want to store
        them in a single `DataFrame`. In the latter case, finish the sequence of
        `vstack` operations with a `rechunk`.

        Parameters
        ----------
        other
            DataFrame to vertically add.

        Warnings
        --------
        This method modifies the dataframe in-place. The dataframe is returned for
        convenience only.

        See Also
        --------
        vstack

        Examples
        --------
        >>> df1 = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]})
        >>> df2 = pl.DataFrame({"foo": [10, 20, 30], "bar": [40, 50, 60]})
        >>> df1.extend(df2)
        shape: (6, 2)
        ┌─────┬─────┐
        │ foo ┆ bar │
        │ --- ┆ --- │
        │ i64 ┆ i64 │
        ╞═════╪═════╡
        │ 1   ┆ 4   │
        │ 2   ┆ 5   │
        │ 3   ┆ 6   │
        │ 10  ┆ 40  │
        │ 20  ┆ 50  │
        │ 30  ┆ 60  │
        └─────┴─────┘
        """
        require_same_type(self, other)

        # The underlying frame is extended in place; `self` is returned only so
        # that calls can be chained.
        own_frame = self._df
        own_frame.extend(other._df)
        return self
8447

8448
    def drop(
        self,
        *columns: ColumnNameOrSelector | Iterable[ColumnNameOrSelector],
        strict: bool = True,
    ) -> DataFrame:
        """
        Remove columns from the dataframe.

        Parameters
        ----------
        *columns
            Names of the columns that should be removed from the dataframe.
            Accepts column selector input.
        strict
            Validate that all column names exist in the current schema,
            and throw an exception if any do not.

        Examples
        --------
        Drop a single column by passing the name of that column.

        >>> df = pl.DataFrame(
        ...     {
        ...         "foo": [1, 2, 3],
        ...         "bar": [6.0, 7.0, 8.0],
        ...         "ham": ["a", "b", "c"],
        ...     }
        ... )
        >>> df.drop("ham")
        shape: (3, 2)
        ┌─────┬─────┐
        │ foo ┆ bar │
        │ --- ┆ --- │
        │ i64 ┆ f64 │
        ╞═════╪═════╡
        │ 1   ┆ 6.0 │
        │ 2   ┆ 7.0 │
        │ 3   ┆ 8.0 │
        └─────┴─────┘

        Drop multiple columns by passing a list of column names.

        >>> df.drop(["bar", "ham"])
        shape: (3, 1)
        ┌─────┐
        │ foo │
        │ --- │
        │ i64 │
        ╞═════╡
        │ 1   │
        │ 2   │
        │ 3   │
        └─────┘

        Drop multiple columns by passing a selector.

        >>> import polars.selectors as cs
        >>> df.drop(cs.numeric())
        shape: (3, 1)
        ┌─────┐
        │ ham │
        │ --- │
        │ str │
        ╞═════╡
        │ a   │
        │ b   │
        │ c   │
        └─────┘

        Use positional arguments to drop multiple columns.

        >>> df.drop("foo", "ham")
        shape: (3, 1)
        ┌─────┐
        │ bar │
        │ --- │
        │ f64 │
        ╞═════╡
        │ 6.0 │
        │ 7.0 │
        │ 8.0 │
        └─────┘
        """
        from polars.lazyframe.opt_flags import QueryOptFlags

        # Express the drop as a lazy query and collect it immediately with the
        # eager optimization flags.
        lazy_dropped = self.lazy().drop(*columns, strict=strict)
        return lazy_dropped.collect(optimizations=QueryOptFlags._eager())
8538

8539
    def drop_in_place(self, name: str) -> Series:
        """
        Drop a single column in-place and return the dropped column.

        Parameters
        ----------
        name
            Name of the column to drop.

        Returns
        -------
        Series
            The dropped column.

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     {
        ...         "foo": [1, 2, 3],
        ...         "bar": [6, 7, 8],
        ...         "ham": ["a", "b", "c"],
        ...     }
        ... )
        >>> df.drop_in_place("ham")
        shape: (3,)
        Series: 'ham' [str]
        [
            "a"
            "b"
            "c"
        ]
        """
        # The underlying frame hands back the removed column, which is wrapped
        # before being returned to the caller.
        removed = self._df.drop_in_place(name)
        return wrap_s(removed)
8572

8573
    def cast(
        self,
        dtypes: (
            Mapping[
                ColumnNameOrSelector | PolarsDataType, PolarsDataType | PythonDataType
            ]
            | PolarsDataType
        ),
        *,
        strict: bool = True,
    ) -> DataFrame:
        """
        Cast DataFrame column(s) to the specified dtype(s).

        Parameters
        ----------
        dtypes
            Mapping of column names (or selector) to dtypes, or a single dtype
            to which all columns will be cast.
        strict
            Raise if cast is invalid on rows after predicates are pushed down.
            If `False`, invalid casts will produce null values.

        Examples
        --------
        >>> from datetime import date
        >>> df = pl.DataFrame(
        ...     {
        ...         "foo": [1, 2, 3],
        ...         "bar": [6.0, 7.0, 8.0],
        ...         "ham": [date(2020, 1, 2), date(2021, 3, 4), date(2022, 5, 6)],
        ...     }
        ... )

        Cast specific frame columns to the specified dtypes:

        >>> df.cast({"foo": pl.Float32, "bar": pl.UInt8})
        shape: (3, 3)
        ┌─────┬─────┬────────────┐
        │ foo ┆ bar ┆ ham        │
        │ --- ┆ --- ┆ ---        │
        │ f32 ┆ u8  ┆ date       │
        ╞═════╪═════╪════════════╡
        │ 1.0 ┆ 6   ┆ 2020-01-02 │
        │ 2.0 ┆ 7   ┆ 2021-03-04 │
        │ 3.0 ┆ 8   ┆ 2022-05-06 │
        └─────┴─────┴────────────┘

        Cast all frame columns matching one dtype (or dtype group) to another dtype:

        >>> df.cast({pl.Date: pl.Datetime})
        shape: (3, 3)
        ┌─────┬─────┬─────────────────────┐
        │ foo ┆ bar ┆ ham                 │
        │ --- ┆ --- ┆ ---                 │
        │ i64 ┆ f64 ┆ datetime[μs]        │
        ╞═════╪═════╪═════════════════════╡
        │ 1   ┆ 6.0 ┆ 2020-01-02 00:00:00 │
        │ 2   ┆ 7.0 ┆ 2021-03-04 00:00:00 │
        │ 3   ┆ 8.0 ┆ 2022-05-06 00:00:00 │
        └─────┴─────┴─────────────────────┘

        Use selectors to define the columns being cast:

        >>> import polars.selectors as cs
        >>> df.cast({cs.numeric(): pl.UInt32, cs.temporal(): pl.String})
        shape: (3, 3)
        ┌─────┬─────┬────────────┐
        │ foo ┆ bar ┆ ham        │
        │ --- ┆ --- ┆ ---        │
        │ u32 ┆ u32 ┆ str        │
        ╞═════╪═════╪════════════╡
        │ 1   ┆ 6   ┆ 2020-01-02 │
        │ 2   ┆ 7   ┆ 2021-03-04 │
        │ 3   ┆ 8   ┆ 2022-05-06 │
        └─────┴─────┴────────────┘

        Cast all frame columns to the specified dtype:

        >>> df.cast(pl.String).to_dict(as_series=False)
        {'foo': ['1', '2', '3'],
         'bar': ['6.0', '7.0', '8.0'],
         'ham': ['2020-01-02', '2021-03-04', '2022-05-06']}
        """
        from polars.lazyframe.opt_flags import QueryOptFlags

        # Express the cast as a lazy query and collect it immediately with the
        # eager optimization flags.
        lazy_casted = self.lazy().cast(dtypes, strict=strict)
        return lazy_casted.collect(optimizations=QueryOptFlags._eager())
8664

8665
    def clear(self, n: int = 0) -> DataFrame:
        """
        Create an empty (n=0) or `n`-row null-filled (n>0) copy of the DataFrame.

        The result is an `n`-row null-filled DataFrame with an identical schema.
        `n` can be greater than the current number of rows in the DataFrame.

        Parameters
        ----------
        n
            Number of (null-filled) rows to return in the cleared frame.

        Raises
        ------
        ValueError
            If `n` is negative.

        See Also
        --------
        clone : Cheap deepcopy/clone.

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     {
        ...         "a": [None, 2, 3, 4],
        ...         "b": [0.5, None, 2.5, 13],
        ...         "c": [True, True, False, None],
        ...     }
        ... )
        >>> df.clear()
        shape: (0, 3)
        ┌─────┬─────┬──────┐
        │ a   ┆ b   ┆ c    │
        │ --- ┆ --- ┆ ---  │
        │ i64 ┆ f64 ┆ bool │
        ╞═════╪═════╪══════╡
        └─────┴─────┴──────┘

        >>> df.clear(n=2)
        shape: (2, 3)
        ┌──────┬──────┬──────┐
        │ a    ┆ b    ┆ c    │
        │ ---  ┆ ---  ┆ ---  │
        │ i64  ┆ f64  ┆ bool │
        ╞══════╪══════╪══════╡
        │ null ┆ null ┆ null │
        │ null ┆ null ┆ null │
        └──────┴──────┴──────┘
        """
        if n < 0:
            msg = f"`n` should be greater than or equal to 0, got {n}"
            raise ValueError(msg)

        if n == 0:
            # Faster path: let the underlying frame produce its empty copy.
            return self._from_pydf(self._df.clear())

        # One empty Series per schema entry, padded with `n` nulls each.
        null_columns = {}
        for col_name, col_dtype in self.schema.items():
            empty = pl.Series(name=col_name, dtype=col_dtype)
            null_columns[col_name] = empty.extend_constant(None, n)
        return self.__class__(null_columns)
8722

8723
    def clone(self) -> DataFrame:
        """
        Create a copy of this DataFrame.

        This is a cheap operation that does not copy data.

        See Also
        --------
        clear : Create an empty copy of the current DataFrame, with identical
            schema but no data.

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     {
        ...         "a": [1, 2, 3, 4],
        ...         "b": [0.5, 4, 10, 13],
        ...         "c": [True, True, False, True],
        ...     }
        ... )
        >>> df.clone()
        shape: (4, 3)
        ┌─────┬──────┬───────┐
        │ a   ┆ b    ┆ c     │
        │ --- ┆ ---  ┆ ---   │
        │ i64 ┆ f64  ┆ bool  │
        ╞═════╪══════╪═══════╡
        │ 1   ┆ 0.5  ┆ true  │
        │ 2   ┆ 4.0  ┆ true  │
        │ 3   ┆ 10.0 ┆ false │
        │ 4   ┆ 13.0 ┆ true  │
        └─────┴──────┴───────┘
        """
        # Clone the underlying frame object and wrap it in a new DataFrame.
        cloned_pydf = self._df.clone()
        return self._from_pydf(cloned_pydf)
8757

8758
    def get_columns(self) -> list[Series]:
        """
        Get the DataFrame as a List of Series.

        Returns
        -------
        list of Series
            One Series per column of the frame.

        Examples
        --------
        >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]})
        >>> df.get_columns()
        [shape: (3,)
        Series: 'foo' [i64]
        [
            1
            2
            3
        ], shape: (3,)
        Series: 'bar' [i64]
        [
            4
            5
            6
        ]]

        >>> df = pl.DataFrame(
        ...     {
        ...         "a": [1, 2, 3, 4],
        ...         "b": [0.5, 4, 10, 13],
        ...         "c": [True, True, False, True],
        ...     }
        ... )
        >>> df.get_columns()
        [shape: (4,)
        Series: 'a' [i64]
        [
            1
            2
            3
            4
        ], shape: (4,)
        Series: 'b' [f64]
        [
            0.5
            4.0
            10.0
            13.0
        ], shape: (4,)
        Series: 'c' [bool]
        [
            true
            true
            false
            true
        ]]
        """
        # Wrap each column object returned by the underlying frame.
        return list(map(wrap_s, self._df.get_columns()))
8812

8813
    @overload
    def get_column(self, name: str, *, default: Series | NoDefault = ...) -> Series: ...

    @overload
    def get_column(self, name: str, *, default: Any) -> Any: ...

    def get_column(
        self, name: str, *, default: Any | NoDefault = no_default
    ) -> Series | Any:
        """
        Get a single column by name.

        Parameters
        ----------
        name
            String name of the column to retrieve.
        default
            Value to return if the column does not exist; if not explicitly set and
            the column is not present a `ColumnNotFoundError` exception is raised.

        Returns
        -------
        Series (or arbitrary default value, if specified).

        Raises
        ------
        ColumnNotFoundError
            If the column does not exist and no `default` was given.

        See Also
        --------
        to_series

        Examples
        --------
        >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]})
        >>> df.get_column("foo")
        shape: (3,)
        Series: 'foo' [i64]
        [
            1
            2
            3
        ]

        Missing column handling; can optionally provide an arbitrary default value
        to the method (otherwise a `ColumnNotFoundError` exception is raised).

        >>> df.get_column("baz", default=pl.Series("baz", ["?", "?", "?"]))
        shape: (3,)
        Series: 'baz' [str]
        [
            "?"
            "?"
            "?"
        ]
        >>> res = df.get_column("baz", default=None)
        >>> res is None
        True
        """
        try:
            column = wrap_s(self._df.get_column(name))
        except ColumnNotFoundError:
            # `no_default` is a sentinel, so an explicit `default=None` is
            # still honoured as a real fallback value.
            if default is not no_default:
                return default
            raise
        else:
            return column
8874

8875
    def fill_null(
        self,
        value: Any | Expr | None = None,
        strategy: FillNullStrategy | None = None,
        limit: int | None = None,
        *,
        matches_supertype: bool = True,
    ) -> DataFrame:
        """
        Fill null values using the specified value or strategy.

        Parameters
        ----------
        value
            Value used to fill null values.
        strategy : {None, 'forward', 'backward', 'min', 'max', 'mean', 'zero', 'one'}
            Strategy used to fill null values.
        limit
            Number of consecutive null values to fill when using the 'forward' or
            'backward' strategy.
        matches_supertype
            Fill all matching supertypes of the fill `value`.

        Returns
        -------
        DataFrame
            DataFrame with null values replaced according to the given value
            or filling strategy.

        See Also
        --------
        fill_nan

        Notes
        -----
        A null value is not the same as a NaN value.
        To fill NaN values, use :func:`fill_nan`.

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     {
        ...         "a": [1, 2, None, 4],
        ...         "b": [0.5, 4, None, 13],
        ...     }
        ... )
        >>> df.fill_null(99)
        shape: (4, 2)
        ┌─────┬──────┐
        │ a   ┆ b    │
        │ --- ┆ ---  │
        │ i64 ┆ f64  │
        ╞═════╪══════╡
        │ 1   ┆ 0.5  │
        │ 2   ┆ 4.0  │
        │ 99  ┆ 99.0 │
        │ 4   ┆ 13.0 │
        └─────┴──────┘
        >>> df.fill_null(strategy="forward")
        shape: (4, 2)
        ┌─────┬──────┐
        │ a   ┆ b    │
        │ --- ┆ ---  │
        │ i64 ┆ f64  │
        ╞═════╪══════╡
        │ 1   ┆ 0.5  │
        │ 2   ┆ 4.0  │
        │ 2   ┆ 4.0  │
        │ 4   ┆ 13.0 │
        └─────┴──────┘

        >>> df.fill_null(strategy="max")
        shape: (4, 2)
        ┌─────┬──────┐
        │ a   ┆ b    │
        │ --- ┆ ---  │
        │ i64 ┆ f64  │
        ╞═════╪══════╡
        │ 1   ┆ 0.5  │
        │ 2   ┆ 4.0  │
        │ 4   ┆ 13.0 │
        │ 4   ┆ 13.0 │
        └─────┴──────┘

        >>> df.fill_null(strategy="zero")
        shape: (4, 2)
        ┌─────┬──────┐
        │ a   ┆ b    │
        │ --- ┆ ---  │
        │ i64 ┆ f64  │
        ╞═════╪══════╡
        │ 1   ┆ 0.5  │
        │ 2   ┆ 4.0  │
        │ 0   ┆ 0.0  │
        │ 4   ┆ 13.0 │
        └─────┴──────┘
        """
        from polars.lazyframe.opt_flags import QueryOptFlags

        # Delegate to the LazyFrame implementation and collect right away.
        lazy_filled = self.lazy().fill_null(
            value, strategy, limit, matches_supertype=matches_supertype
        )
        return lazy_filled.collect(optimizations=QueryOptFlags._eager())
8978

8979
    def fill_nan(self, value: Expr | int | float | None) -> DataFrame:
        """
        Fill floating point NaN values by an Expression evaluation.

        Parameters
        ----------
        value
            Value used to fill NaN values.

        Returns
        -------
        DataFrame
            DataFrame with NaN values replaced by the given value.

        See Also
        --------
        fill_null

        Notes
        -----
        A NaN value is not the same as a null value.
        To fill null values, use :func:`fill_null`.

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     {
        ...         "a": [1.5, 2, float("nan"), 4],
        ...         "b": [0.5, 4, float("nan"), 13],
        ...     }
        ... )
        >>> df.fill_nan(99)
        shape: (4, 2)
        ┌──────┬──────┐
        │ a    ┆ b    │
        │ ---  ┆ ---  │
        │ f64  ┆ f64  │
        ╞══════╪══════╡
        │ 1.5  ┆ 0.5  │
        │ 2.0  ┆ 4.0  │
        │ 99.0 ┆ 99.0 │
        │ 4.0  ┆ 13.0 │
        └──────┴──────┘
        """
        from polars.lazyframe.opt_flags import QueryOptFlags

        # Delegate to the LazyFrame implementation and collect right away.
        lazy_filled = self.lazy().fill_nan(value)
        return lazy_filled.collect(optimizations=QueryOptFlags._eager())
9026

9027
    def explode(
        self,
        columns: ColumnNameOrSelector | Iterable[ColumnNameOrSelector],
        *more_columns: ColumnNameOrSelector,
    ) -> DataFrame:
        """
        Explode the dataframe to long format by exploding the given columns.

        Parameters
        ----------
        columns
            Column names, expressions, or a selector defining them. The underlying
            columns being exploded must be of the `List` or `Array` data type.
        *more_columns
            Additional names of columns to explode, specified as positional arguments.

        Returns
        -------
        DataFrame

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     {
        ...         "letters": ["a", "a", "b", "c"],
        ...         "numbers": [[1], [2, 3], [4, 5], [6, 7, 8]],
        ...     }
        ... )
        >>> df
        shape: (4, 2)
        ┌─────────┬───────────┐
        │ letters ┆ numbers   │
        │ ---     ┆ ---       │
        │ str     ┆ list[i64] │
        ╞═════════╪═══════════╡
        │ a       ┆ [1]       │
        │ a       ┆ [2, 3]    │
        │ b       ┆ [4, 5]    │
        │ c       ┆ [6, 7, 8] │
        └─────────┴───────────┘
        >>> df.explode("numbers")
        shape: (8, 2)
        ┌─────────┬─────────┐
        │ letters ┆ numbers │
        │ ---     ┆ ---     │
        │ str     ┆ i64     │
        ╞═════════╪═════════╡
        │ a       ┆ 1       │
        │ a       ┆ 2       │
        │ a       ┆ 3       │
        │ b       ┆ 4       │
        │ b       ┆ 5       │
        │ c       ┆ 6       │
        │ c       ┆ 7       │
        │ c       ┆ 8       │
        └─────────┴─────────┘
        """
        from polars.lazyframe.opt_flags import QueryOptFlags

        # Delegate to the LazyFrame implementation and collect right away.
        lazy_exploded = self.lazy().explode(columns, *more_columns)
        return lazy_exploded.collect(optimizations=QueryOptFlags._eager())
9091

9092
    @deprecate_renamed_parameter("columns", "on", version="1.0.0")
    def pivot(
        self,
        on: ColumnNameOrSelector | Sequence[ColumnNameOrSelector],
        *,
        index: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = None,
        values: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = None,
        aggregate_function: PivotAgg | Expr | None = None,
        maintain_order: bool = True,
        sort_columns: bool = False,
        separator: str = "_",
    ) -> DataFrame:
        """
        Create a spreadsheet-style pivot table as a DataFrame.

        Only available in eager mode. See "Examples" section below for how to do a
        "lazy pivot" if you know the unique column values in advance.

        .. versionchanged:: 1.0.0
            The `columns` parameter was renamed `on`.

        Parameters
        ----------
        on
            The column(s) whose values will be used as the new columns of the output
            DataFrame.
        index
            The column(s) that remain from the input to the output. The output DataFrame will have one row
            for each unique combination of the `index`'s values.
            If None, all remaining columns not specified on `on` and `values` will be used. At least one
            of `index` and `values` must be specified.
        values
            The existing column(s) of values which will be moved under the new columns from index. If an
            aggregation is specified, these are the values on which the aggregation will be computed.
            If None, all remaining columns not specified on `on` and `index` will be used.
            At least one of `index` and `values` must be specified.
        aggregate_function
            Choose from:

            - None: no aggregation takes place, will raise error if multiple values are in group.
            - A predefined aggregate function string, one of
              {'min', 'max', 'first', 'last', 'sum', 'mean', 'median', 'len'}
            - An expression to do the aggregation. The expression can only access data from the respective
              'values' columns as generated by pivot, through `pl.element()`.
        maintain_order
            Ensure the values of `index` are sorted by discovery order.
        sort_columns
            Sort the transposed columns by name. Default is by order of discovery.
        separator
            Used as separator/delimiter in generated column names in case of multiple
            `values` columns.

        Returns
        -------
        DataFrame

        Raises
        ------
        ValueError
            If `aggregate_function` is a string that is not one of the
            supported names.

        Notes
        -----
        In some other frameworks, you might know this operation as `pivot_wider`.

        Examples
        --------
        You can use `pivot` to reshape a dataframe from "long" to "wide" format.

        For example, suppose we have a dataframe of test scores achieved by some
        students, where each row represents a distinct test.

        >>> df = pl.DataFrame(
        ...     {
        ...         "name": ["Cady", "Cady", "Karen", "Karen"],
        ...         "subject": ["maths", "physics", "maths", "physics"],
        ...         "test_1": [98, 99, 61, 58],
        ...         "test_2": [100, 100, 60, 60],
        ...     }
        ... )
        >>> df
        shape: (4, 4)
        ┌───────┬─────────┬────────┬────────┐
        │ name  ┆ subject ┆ test_1 ┆ test_2 │
        │ ---   ┆ ---     ┆ ---    ┆ ---    │
        │ str   ┆ str     ┆ i64    ┆ i64    │
        ╞═══════╪═════════╪════════╪════════╡
        │ Cady  ┆ maths   ┆ 98     ┆ 100    │
        │ Cady  ┆ physics ┆ 99     ┆ 100    │
        │ Karen ┆ maths   ┆ 61     ┆ 60     │
        │ Karen ┆ physics ┆ 58     ┆ 60     │
        └───────┴─────────┴────────┴────────┘

        Using `pivot`, we can reshape so we have one row per student, with different
        subjects as columns, and their `test_1` scores as values:

        >>> df.pivot("subject", index="name", values="test_1")
        shape: (2, 3)
        ┌───────┬───────┬─────────┐
        │ name  ┆ maths ┆ physics │
        │ ---   ┆ ---   ┆ ---     │
        │ str   ┆ i64   ┆ i64     │
        ╞═══════╪═══════╪═════════╡
        │ Cady  ┆ 98    ┆ 99      │
        │ Karen ┆ 61    ┆ 58      │
        └───────┴───────┴─────────┘

        You can use selectors too - here we include all test scores in the pivoted table:

        >>> import polars.selectors as cs
        >>> df.pivot("subject", values=cs.starts_with("test"))
        shape: (2, 5)
        ┌───────┬──────────────┬────────────────┬──────────────┬────────────────┐
        │ name  ┆ test_1_maths ┆ test_1_physics ┆ test_2_maths ┆ test_2_physics │
        │ ---   ┆ ---          ┆ ---            ┆ ---          ┆ ---            │
        │ str   ┆ i64          ┆ i64            ┆ i64          ┆ i64            │
        ╞═══════╪══════════════╪════════════════╪══════════════╪════════════════╡
        │ Cady  ┆ 98           ┆ 99             ┆ 100          ┆ 100            │
        │ Karen ┆ 61           ┆ 58             ┆ 60           ┆ 60             │
        └───────┴──────────────┴────────────────┴──────────────┴────────────────┘

        If you end up with multiple values per cell, you can specify how to aggregate
        them with `aggregate_function`:

        >>> df = pl.DataFrame(
        ...     {
        ...         "ix": [1, 1, 2, 2, 1, 2],
        ...         "col": ["a", "a", "a", "a", "b", "b"],
        ...         "foo": [0, 1, 2, 2, 7, 1],
        ...         "bar": [0, 2, 0, 0, 9, 4],
        ...     }
        ... )
        >>> df.pivot("col", index="ix", aggregate_function="sum")
        shape: (2, 5)
        ┌─────┬───────┬───────┬───────┬───────┐
        │ ix  ┆ foo_a ┆ foo_b ┆ bar_a ┆ bar_b │
        │ --- ┆ ---   ┆ ---   ┆ ---   ┆ ---   │
        │ i64 ┆ i64   ┆ i64   ┆ i64   ┆ i64   │
        ╞═════╪═══════╪═══════╪═══════╪═══════╡
        │ 1   ┆ 1     ┆ 7     ┆ 2     ┆ 9     │
        │ 2   ┆ 4     ┆ 1     ┆ 0     ┆ 4     │
        └─────┴───────┴───────┴───────┴───────┘

        You can also pass a custom aggregation function using
        :meth:`polars.element`:

        >>> df = pl.DataFrame(
        ...     {
        ...         "col1": ["a", "a", "a", "b", "b", "b"],
        ...         "col2": ["x", "x", "x", "x", "y", "y"],
        ...         "col3": [6, 7, 3, 2, 5, 7],
        ...     }
        ... )
        >>> df.pivot(
        ...     "col2",
        ...     index="col1",
        ...     values="col3",
        ...     aggregate_function=pl.element().tanh().mean(),
        ... )
        shape: (2, 3)
        ┌──────┬──────────┬──────────┐
        │ col1 ┆ x        ┆ y        │
        │ ---  ┆ ---      ┆ ---      │
        │ str  ┆ f64      ┆ f64      │
        ╞══════╪══════════╪══════════╡
        │ a    ┆ 0.998347 ┆ null     │
        │ b    ┆ 0.964028 ┆ 0.999954 │
        └──────┴──────────┴──────────┘

        Note that `pivot` is only available in eager mode. If you know the unique
        column values in advance, you can use :meth:`polars.LazyFrame.group_by` to
        get the same result as above in lazy mode:

        >>> index = pl.col("col1")
        >>> on = pl.col("col2")
        >>> values = pl.col("col3")
        >>> unique_column_values = ["x", "y"]
        >>> aggregate_function = lambda col: col.tanh().mean()
        >>> df.lazy().group_by(index).agg(
        ...     aggregate_function(values.filter(on == value)).alias(value)
        ...     for value in unique_column_values
        ... ).collect()  # doctest: +IGNORE_RESULT
        shape: (2, 3)
        ┌──────┬──────────┬──────────┐
        │ col1 ┆ x        ┆ y        │
        │ ---  ┆ ---      ┆ ---      │
        │ str  ┆ f64      ┆ f64      │
        ╞══════╪══════════╪══════════╡
        │ a    ┆ 0.998347 ┆ null     │
        │ b    ┆ 0.964028 ┆ 0.999954 │
        └──────┴──────────┴──────────┘
        """  # noqa: W505
        # Resolve selectors into concrete column names; `values` and `index`
        # stay None when the caller did not supply them.
        on = _expand_selectors(self, on)
        if values is not None:
            values = _expand_selectors(self, values)
        if index is not None:
            index = _expand_selectors(self, index)

        # Names that map one-to-one onto a method of `F.element()`.
        element_aggregations = ("first", "sum", "max", "min", "mean", "median", "last")

        if isinstance(aggregate_function, str):
            if aggregate_function in element_aggregations:
                aggregate_expr = getattr(F.element(), aggregate_function)()._pyexpr
            elif aggregate_function in ("len", "count"):
                # "count" is the deprecated spelling of "len".
                if aggregate_function == "count":
                    issue_deprecation_warning(
                        "`aggregate_function='count'` input for `pivot` is deprecated."
                        " Please use `aggregate_function='len'`.",
                        version="0.20.5",
                    )
                aggregate_expr = F.len()._pyexpr
            else:
                msg = f"invalid input for `aggregate_function` argument: {aggregate_function!r}"
                raise ValueError(msg)
        elif aggregate_function is None:
            aggregate_expr = None
        else:
            # Anything else is treated as an expression object.
            aggregate_expr = aggregate_function._pyexpr

        pivoted = self._df.pivot_expr(
            on,
            index,
            values,
            maintain_order,
            sort_columns,
            aggregate_expr,
            separator,
        )
        return self._from_pydf(pivoted)
9328

9329
    def unpivot(
        self,
        on: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = None,
        *,
        index: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = None,
        variable_name: str | None = None,
        value_name: str | None = None,
    ) -> DataFrame:
        """
        Unpivot a DataFrame from wide to long format.

        Optionally leaves identifiers set.

        This function is useful to massage a DataFrame into a format where one or more
        columns are identifier variables (index) while all other columns, considered
        measured variables (on), are "unpivoted" to the row axis leaving just
        two non-identifier columns, 'variable' and 'value'.

        Parameters
        ----------
        on
            Column(s) or selector(s) to use as values variables; if `on`
            is empty all columns that are not in `index` will be used.
        index
            Column(s) or selector(s) to use as identifier variables.
        variable_name
            Name to give to the `variable` column. Defaults to "variable"
        value_name
            Name to give to the `value` column. Defaults to "value"

        Returns
        -------
        DataFrame

        Notes
        -----
        If you're coming from pandas, this is similar to `pandas.DataFrame.melt`,
        but with `index` replacing `id_vars` and `on` replacing `value_vars`.
        In other frameworks, you might know this operation as `pivot_longer`.

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     {
        ...         "a": ["x", "y", "z"],
        ...         "b": [1, 3, 5],
        ...         "c": [2, 4, 6],
        ...     }
        ... )
        >>> import polars.selectors as cs
        >>> df.unpivot(cs.numeric(), index="a")
        shape: (6, 3)
        ┌─────┬──────────┬───────┐
        │ a   ┆ variable ┆ value │
        │ --- ┆ ---      ┆ ---   │
        │ str ┆ str      ┆ i64   │
        ╞═════╪══════════╪═══════╡
        │ x   ┆ b        ┆ 1     │
        │ y   ┆ b        ┆ 3     │
        │ z   ┆ b        ┆ 5     │
        │ x   ┆ c        ┆ 2     │
        │ y   ┆ c        ┆ 4     │
        │ z   ┆ c        ┆ 6     │
        └─────┴──────────┴───────┘
        """
        # A missing `on`/`index` is passed down as an empty list.
        on_columns = _expand_selectors(self, on) if on is not None else []
        index_columns = _expand_selectors(self, index) if index is not None else []

        # Note the argument order of the underlying call: value name first,
        # then variable name.
        unpivoted = self._df.unpivot(
            on_columns, index_columns, value_name, variable_name
        )
        return self._from_pydf(unpivoted)
9394

9395
    def unstack(
        self,
        *,
        step: int,
        how: UnstackDirection = "vertical",
        columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = None,
        fill_values: Any | list[Any] | None = None,
    ) -> DataFrame:
        """
        Unstack a long table to a wide form without doing an aggregation.

        This can be much faster than a pivot, because it can skip the grouping phase.

        Parameters
        ----------
        step
            Size of the unstacked frame along the chosen direction: the number
            of rows when `how="vertical"`, or the number of output columns
            produced per input column when `how="horizontal"`. Must be >= 1.
        how : { 'vertical', 'horizontal' }
            Direction of the unstack.
        columns
            Column name(s) or selector(s) to include in the operation.
            If set to `None` (default), use all columns.
        fill_values
            Fill values that don't fit the new size with this value. A single
            value is used for every column; a list supplies one fill value per
            column, in column order.

        Returns
        -------
        DataFrame

        Raises
        ------
        ValueError
            If `step` is smaller than 1, if `how` is not one of the supported
            directions, or if padding is required and `fill_values` is a list
            with fewer entries than there are columns.

        Examples
        --------
        >>> from string import ascii_uppercase
        >>> df = pl.DataFrame(
        ...     {
        ...         "x": list(ascii_uppercase[0:8]),
        ...         "y": pl.int_range(1, 9, eager=True),
        ...     }
        ... ).with_columns(
        ...     z=pl.int_ranges(pl.col("y"), pl.col("y") + 2, dtype=pl.UInt8),
        ... )
        >>> df
        shape: (8, 3)
        ┌─────┬─────┬──────────┐
        │ x   ┆ y   ┆ z        │
        │ --- ┆ --- ┆ ---      │
        │ str ┆ i64 ┆ list[u8] │
        ╞═════╪═════╪══════════╡
        │ A   ┆ 1   ┆ [1, 2]   │
        │ B   ┆ 2   ┆ [2, 3]   │
        │ C   ┆ 3   ┆ [3, 4]   │
        │ D   ┆ 4   ┆ [4, 5]   │
        │ E   ┆ 5   ┆ [5, 6]   │
        │ F   ┆ 6   ┆ [6, 7]   │
        │ G   ┆ 7   ┆ [7, 8]   │
        │ H   ┆ 8   ┆ [8, 9]   │
        └─────┴─────┴──────────┘
        >>> df.unstack(step=4, how="vertical")
        shape: (4, 6)
        ┌─────┬─────┬─────┬─────┬──────────┬──────────┐
        │ x_0 ┆ x_1 ┆ y_0 ┆ y_1 ┆ z_0      ┆ z_1      │
        │ --- ┆ --- ┆ --- ┆ --- ┆ ---      ┆ ---      │
        │ str ┆ str ┆ i64 ┆ i64 ┆ list[u8] ┆ list[u8] │
        ╞═════╪═════╪═════╪═════╪══════════╪══════════╡
        │ A   ┆ E   ┆ 1   ┆ 5   ┆ [1, 2]   ┆ [5, 6]   │
        │ B   ┆ F   ┆ 2   ┆ 6   ┆ [2, 3]   ┆ [6, 7]   │
        │ C   ┆ G   ┆ 3   ┆ 7   ┆ [3, 4]   ┆ [7, 8]   │
        │ D   ┆ H   ┆ 4   ┆ 8   ┆ [4, 5]   ┆ [8, 9]   │
        └─────┴─────┴─────┴─────┴──────────┴──────────┘
        >>> df.unstack(step=2, how="horizontal")
        shape: (4, 6)
        ┌─────┬─────┬─────┬─────┬──────────┬──────────┐
        │ x_0 ┆ x_1 ┆ y_0 ┆ y_1 ┆ z_0      ┆ z_1      │
        │ --- ┆ --- ┆ --- ┆ --- ┆ ---      ┆ ---      │
        │ str ┆ str ┆ i64 ┆ i64 ┆ list[u8] ┆ list[u8] │
        ╞═════╪═════╪═════╪═════╪══════════╪══════════╡
        │ A   ┆ B   ┆ 1   ┆ 2   ┆ [1, 2]   ┆ [2, 3]   │
        │ C   ┆ D   ┆ 3   ┆ 4   ┆ [3, 4]   ┆ [4, 5]   │
        │ E   ┆ F   ┆ 5   ┆ 6   ┆ [5, 6]   ┆ [6, 7]   │
        │ G   ┆ H   ┆ 7   ┆ 8   ┆ [7, 8]   ┆ [8, 9]   │
        └─────┴─────┴─────┴─────┴──────────┴──────────┘
        >>> import polars.selectors as cs
        >>> df.unstack(step=5, columns=cs.numeric(), fill_values=0)
        shape: (5, 2)
        ┌─────┬─────┐
        │ y_0 ┆ y_1 │
        │ --- ┆ --- │
        │ i64 ┆ i64 │
        ╞═════╪═════╡
        │ 1   ┆ 6   │
        │ 2   ┆ 7   │
        │ 3   ┆ 8   │
        │ 4   ┆ 0   │
        │ 5   ┆ 0   │
        └─────┴─────┘
        """
        import math

        # Validate up front: a non-positive step would otherwise surface as a
        # ZeroDivisionError or as nonsensical negative sizes further down.
        if step < 1:
            msg = f"`step` should be greater than or equal to 1, got {step}"
            raise ValueError(msg)
        # An unknown direction used to be sized like "horizontal" but never
        # reordered, which silently produced a wrong layout.
        if how not in ("vertical", "horizontal"):
            msg = f"invalid input for `how` argument: {how!r}"
            raise ValueError(msg)

        df = self.select(columns) if columns is not None else self

        height = df.height
        if how == "vertical":
            n_rows = step
            n_cols = math.ceil(height / n_rows)
        else:
            n_cols = step
            n_rows = math.ceil(height / n_cols)

        # Pad every column so that its length is exactly n_cols * n_rows.
        if n_fill := n_cols * n_rows - height:
            if not isinstance(fill_values, list):
                fill_values = [fill_values] * df.width
            elif len(fill_values) < df.width:
                # `zip` below stops at the shorter input, so a short list
                # would silently drop the trailing columns.
                msg = (
                    f"`fill_values` has {len(fill_values)} entries,"
                    f" but {df.width} columns need to be filled"
                )
                raise ValueError(msg)

            df = df.select(
                s.extend_constant(next_fill, n_fill)
                for s, next_fill in zip(df, fill_values)
            )

        if how == "horizontal":
            # Reorder rows so that each output column's values become
            # contiguous and can be cut out with a plain slice below.
            df = (
                df.with_columns(
                    (F.int_range(0, n_cols * n_rows, eager=True) % n_cols).alias(
                        "__sort_order"
                    ),
                )
                .sort("__sort_order")
                .drop("__sort_order")
            )

        # Width of the zero-padded suffix: the number of decimal digits of
        # n_cols. Counting digits of the string form is defined for
        # n_cols == 0 (empty frame, vertical), where log10 would raise.
        zfill_val = len(str(n_cols))
        slices = [
            s.slice(slice_nbr * n_rows, n_rows).alias(
                s.name + "_" + str(slice_nbr).zfill(zfill_val)
            )
            for s in df
            for slice_nbr in range(n_cols)
        ]

        return DataFrame(slices)
9528

9529
    @overload
    def partition_by(
        self,
        by: ColumnNameOrSelector | Sequence[ColumnNameOrSelector],
        *more_by: ColumnNameOrSelector,
        maintain_order: bool = ...,
        include_key: bool = ...,
        as_dict: Literal[False] = ...,
    ) -> list[DataFrame]: ...

    @overload
    def partition_by(
        self,
        by: ColumnNameOrSelector | Sequence[ColumnNameOrSelector],
        *more_by: ColumnNameOrSelector,
        maintain_order: bool = ...,
        include_key: bool = ...,
        as_dict: Literal[True],
    ) -> dict[tuple[Any, ...], DataFrame]: ...

    @overload
    def partition_by(
        self,
        by: ColumnNameOrSelector | Sequence[ColumnNameOrSelector],
        *more_by: ColumnNameOrSelector,
        maintain_order: bool = ...,
        include_key: bool = ...,
        as_dict: bool,
    ) -> list[DataFrame] | dict[tuple[Any, ...], DataFrame]: ...

    def partition_by(
        self,
        by: ColumnNameOrSelector | Sequence[ColumnNameOrSelector],
        *more_by: ColumnNameOrSelector,
        maintain_order: bool = True,
        include_key: bool = True,
        as_dict: bool = False,
    ) -> list[DataFrame] | dict[tuple[Any, ...], DataFrame]:
        """
        Split this DataFrame into one DataFrame per group of the given columns.

        Parameters
        ----------
        by
            Column name(s) or selector(s) to group by.
        *more_by
            Additional names of columns to group by, specified as positional arguments.
        maintain_order
            Ensure that the order of the groups is consistent with the input data.
            This is slower than a default partition by operation.
        include_key
            Include the columns used to partition the DataFrame in the output.
        as_dict
            Return a dictionary instead of a list. The dictionary keys are tuples of
            the distinct group values that identify each group.

        Returns
        -------
        list of DataFrame or dict of tuple to DataFrame
            A list of partitions, or (when `as_dict=True`) a dictionary mapping
            each group key tuple to its partition.

        Raises
        ------
        ValueError
            If `as_dict=True` is combined with both `include_key=False` and
            `maintain_order=False`, because the group keys can then no longer be
            matched to the partitions.

        Examples
        --------
        Pass a single column name to partition by that column.

        >>> df = pl.DataFrame(
        ...     {
        ...         "a": ["a", "b", "a", "b", "c"],
        ...         "b": [1, 2, 1, 3, 3],
        ...         "c": [5, 4, 3, 2, 1],
        ...     }
        ... )
        >>> df.partition_by("a")  # doctest: +IGNORE_RESULT
        [shape: (2, 3)
        ┌─────┬─────┬─────┐
        │ a   ┆ b   ┆ c   │
        │ --- ┆ --- ┆ --- │
        │ str ┆ i64 ┆ i64 │
        ╞═════╪═════╪═════╡
        │ a   ┆ 1   ┆ 5   │
        │ a   ┆ 1   ┆ 3   │
        └─────┴─────┴─────┘,
        shape: (2, 3)
        ┌─────┬─────┬─────┐
        │ a   ┆ b   ┆ c   │
        │ --- ┆ --- ┆ --- │
        │ str ┆ i64 ┆ i64 │
        ╞═════╪═════╪═════╡
        │ b   ┆ 2   ┆ 4   │
        │ b   ┆ 3   ┆ 2   │
        └─────┴─────┴─────┘,
        shape: (1, 3)
        ┌─────┬─────┬─────┐
        │ a   ┆ b   ┆ c   │
        │ --- ┆ --- ┆ --- │
        │ str ┆ i64 ┆ i64 │
        ╞═════╪═════╪═════╡
        │ c   ┆ 3   ┆ 1   │
        └─────┴─────┴─────┘]

        Partition by multiple columns by either passing a list of column names, or by
        specifying each column name as a positional argument.

        >>> df.partition_by("a", "b")  # doctest: +IGNORE_RESULT
        [shape: (2, 3)
        ┌─────┬─────┬─────┐
        │ a   ┆ b   ┆ c   │
        │ --- ┆ --- ┆ --- │
        │ str ┆ i64 ┆ i64 │
        ╞═════╪═════╪═════╡
        │ a   ┆ 1   ┆ 5   │
        │ a   ┆ 1   ┆ 3   │
        └─────┴─────┴─────┘,
        shape: (1, 3)
        ┌─────┬─────┬─────┐
        │ a   ┆ b   ┆ c   │
        │ --- ┆ --- ┆ --- │
        │ str ┆ i64 ┆ i64 │
        ╞═════╪═════╪═════╡
        │ b   ┆ 2   ┆ 4   │
        └─────┴─────┴─────┘,
        shape: (1, 3)
        ┌─────┬─────┬─────┐
        │ a   ┆ b   ┆ c   │
        │ --- ┆ --- ┆ --- │
        │ str ┆ i64 ┆ i64 │
        ╞═════╪═════╪═════╡
        │ b   ┆ 3   ┆ 2   │
        └─────┴─────┴─────┘,
        shape: (1, 3)
        ┌─────┬─────┬─────┐
        │ a   ┆ b   ┆ c   │
        │ --- ┆ --- ┆ --- │
        │ str ┆ i64 ┆ i64 │
        ╞═════╪═════╪═════╡
        │ c   ┆ 3   ┆ 1   │
        └─────┴─────┴─────┘]

        Return the partitions as a dictionary by specifying `as_dict=True`.

        >>> import polars.selectors as cs
        >>> df.partition_by(cs.string(), as_dict=True)  # doctest: +IGNORE_RESULT
        {('a',): shape: (2, 3)
        ┌─────┬─────┬─────┐
        │ a   ┆ b   ┆ c   │
        │ --- ┆ --- ┆ --- │
        │ str ┆ i64 ┆ i64 │
        ╞═════╪═════╪═════╡
        │ a   ┆ 1   ┆ 5   │
        │ a   ┆ 1   ┆ 3   │
        └─────┴─────┴─────┘,
        ('b',): shape: (2, 3)
        ┌─────┬─────┬─────┐
        │ a   ┆ b   ┆ c   │
        │ --- ┆ --- ┆ --- │
        │ str ┆ i64 ┆ i64 │
        ╞═════╪═════╪═════╡
        │ b   ┆ 2   ┆ 4   │
        │ b   ┆ 3   ┆ 2   │
        └─────┴─────┴─────┘,
        ('c',): shape: (1, 3)
        ┌─────┬─────┬─────┐
        │ a   ┆ b   ┆ c   │
        │ --- ┆ --- ┆ --- │
        │ str ┆ i64 ┆ i64 │
        ╞═════╪═════╪═════╡
        │ c   ┆ 3   ┆ 1   │
        └─────┴─────┴─────┘}
        """
        key_columns = _expand_selectors(self, by, *more_by)

        raw_parts = self._df.partition_by(key_columns, maintain_order, include_key)
        frames = [self._from_pydf(part) for part in raw_parts]

        if not as_dict:
            return frames

        if include_key:
            # Each partition still carries its key columns: read the key from
            # the first row of every partition.
            keys = [frame.select(key_columns).row(0) for frame in frames]
        elif maintain_order:
            # Keys were dropped from the partitions; recover them from the
            # source frame in order of first appearance.
            keys = self.select(key_columns).unique(maintain_order=True).rows()
        else:
            # Group keys cannot be matched to partitions
            msg = "cannot use `partition_by` with `maintain_order=False, include_key=False, as_dict=True`"
            raise ValueError(msg)

        return dict(zip(keys, frames))
9712

9713
    def shift(self, n: int = 1, *, fill_value: IntoExpr | None = None) -> DataFrame:
        """
        Shift all column values by the given number of indices.

        Parameters
        ----------
        n
            Number of indices to shift forward. If a negative value is passed, values
            are shifted in the opposite direction instead.
        fill_value
            Fill the resulting null values with this value. Accepts scalar expression
            input. Non-expression inputs are parsed as literals.

        Returns
        -------
        DataFrame

        Notes
        -----
        This method is similar to the `LAG` operation in SQL when the value for `n`
        is positive. With a negative value for `n`, it is similar to `LEAD`.

        Examples
        --------
        By default, values are shifted forward by one index.

        >>> df = pl.DataFrame(
        ...     {
        ...         "a": [1, 2, 3, 4],
        ...         "b": [5, 6, 7, 8],
        ...     }
        ... )
        >>> df.shift()
        shape: (4, 2)
        ┌──────┬──────┐
        │ a    ┆ b    │
        │ ---  ┆ ---  │
        │ i64  ┆ i64  │
        ╞══════╪══════╡
        │ null ┆ null │
        │ 1    ┆ 5    │
        │ 2    ┆ 6    │
        │ 3    ┆ 7    │
        └──────┴──────┘

        Pass a negative value to shift in the opposite direction instead.

        >>> df.shift(-2)
        shape: (4, 2)
        ┌──────┬──────┐
        │ a    ┆ b    │
        │ ---  ┆ ---  │
        │ i64  ┆ i64  │
        ╞══════╪══════╡
        │ 3    ┆ 7    │
        │ 4    ┆ 8    │
        │ null ┆ null │
        │ null ┆ null │
        └──────┴──────┘

        Specify `fill_value` to fill the resulting null values.

        >>> df.shift(-2, fill_value=100)
        shape: (4, 2)
        ┌─────┬─────┐
        │ a   ┆ b   │
        │ --- ┆ --- │
        │ i64 ┆ i64 │
        ╞═════╪═════╡
        │ 3   ┆ 7   │
        │ 4   ┆ 8   │
        │ 100 ┆ 100 │
        │ 100 ┆ 100 │
        └─────┴─────┘
        """
        from polars.lazyframe.opt_flags import QueryOptFlags

        # Delegate to the LazyFrame implementation and materialize immediately.
        shifted = self.lazy().shift(n, fill_value=fill_value)
        return shifted.collect(optimizations=QueryOptFlags._eager())
9791

9792
    def is_duplicated(self) -> Series:
        """
        Return a boolean mask marking every row that is duplicated in this DataFrame.

        Returns
        -------
        Series

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     {
        ...         "a": [1, 2, 3, 1],
        ...         "b": ["x", "y", "z", "x"],
        ...     }
        ... )
        >>> df.is_duplicated()
        shape: (4,)
        Series: '' [bool]
        [
                true
                false
                false
                true
        ]

        This mask can be used to visualize the duplicated lines like this:

        >>> df.filter(df.is_duplicated())
        shape: (2, 2)
        ┌─────┬─────┐
        │ a   ┆ b   │
        │ --- ┆ --- │
        │ i64 ┆ str │
        ╞═════╪═════╡
        │ 1   ┆ x   │
        │ 1   ┆ x   │
        └─────┴─────┘
        """
        duplicated_mask = self._df.is_duplicated()
        return wrap_s(duplicated_mask)
9828

9829
    def is_unique(self) -> Series:
        """
        Return a boolean mask marking every row that is unique in this DataFrame.

        Returns
        -------
        Series

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     {
        ...         "a": [1, 2, 3, 1],
        ...         "b": ["x", "y", "z", "x"],
        ...     }
        ... )
        >>> df.is_unique()
        shape: (4,)
        Series: '' [bool]
        [
                false
                true
                true
                false
        ]

        This mask can be used to visualize the unique lines like this:

        >>> df.filter(df.is_unique())
        shape: (2, 2)
        ┌─────┬─────┐
        │ a   ┆ b   │
        │ --- ┆ --- │
        │ i64 ┆ str │
        ╞═════╪═════╡
        │ 2   ┆ y   │
        │ 3   ┆ z   │
        └─────┴─────┘
        """
        unique_mask = self._df.is_unique()
        return wrap_s(unique_mask)
9865

9866
    def lazy(self) -> LazyFrame:
        """
        Start a lazy query from this point. This returns a `LazyFrame` object.

        Operations on a `LazyFrame` are deferred; they only run once execution is
        triggered by calling one of:

        * :meth:`.collect() <polars.LazyFrame.collect>`
            (run on all data)
        * :meth:`.explain() <polars.LazyFrame.explain>`
            (print the query plan)
        * :meth:`.show_graph() <polars.LazyFrame.show_graph>`
            (show the query plan as graphviz graph)
        * :meth:`.collect_schema() <polars.LazyFrame.collect_schema>`
            (return the final frame schema)

        Lazy operations are recommended because they allow for query optimization and
        additional parallelism.

        Returns
        -------
        LazyFrame

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     {
        ...         "a": [None, 2, 3, 4],
        ...         "b": [0.5, None, 2.5, 13],
        ...         "c": [True, True, False, None],
        ...     }
        ... )
        >>> df.lazy()
        <LazyFrame at ...>
        """
        lazy_pydf = self._df.lazy()
        return wrap_ldf(lazy_pydf)
9902

9903
    def select(
        self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr
    ) -> DataFrame:
        """
        Select columns from this DataFrame.

        Parameters
        ----------
        *exprs
            Column(s) to select, specified as positional arguments.
            Accepts expression input. Strings are parsed as column names,
            other non-expression inputs are parsed as literals.
        **named_exprs
            Additional columns to select, specified as keyword arguments.
            The columns will be renamed to the keyword used.

        Returns
        -------
        DataFrame

        Examples
        --------
        Pass the name of a column to select that column.

        >>> df = pl.DataFrame(
        ...     {
        ...         "foo": [1, 2, 3],
        ...         "bar": [6, 7, 8],
        ...         "ham": ["a", "b", "c"],
        ...     }
        ... )
        >>> df.select("foo")
        shape: (3, 1)
        ┌─────┐
        │ foo │
        │ --- │
        │ i64 │
        ╞═════╡
        │ 1   │
        │ 2   │
        │ 3   │
        └─────┘

        Multiple columns can be selected by passing a list of column names.

        >>> df.select(["foo", "bar"])
        shape: (3, 2)
        ┌─────┬─────┐
        │ foo ┆ bar │
        │ --- ┆ --- │
        │ i64 ┆ i64 │
        ╞═════╪═════╡
        │ 1   ┆ 6   │
        │ 2   ┆ 7   │
        │ 3   ┆ 8   │
        └─────┴─────┘

        Multiple columns can also be selected using positional arguments instead of a
        list. Expressions are also accepted.

        >>> df.select(pl.col("foo"), pl.col("bar") + 1)
        shape: (3, 2)
        ┌─────┬─────┐
        │ foo ┆ bar │
        │ --- ┆ --- │
        │ i64 ┆ i64 │
        ╞═════╪═════╡
        │ 1   ┆ 7   │
        │ 2   ┆ 8   │
        │ 3   ┆ 9   │
        └─────┴─────┘

        Use keyword arguments to easily name your expression inputs.

        >>> df.select(threshold=pl.when(pl.col("foo") > 2).then(10).otherwise(0))
        shape: (3, 1)
        ┌───────────┐
        │ threshold │
        │ ---       │
        │ i32       │
        ╞═══════════╡
        │ 0         │
        │ 0         │
        │ 10        │
        └───────────┘
        """
        from polars.lazyframe.opt_flags import QueryOptFlags

        # Delegate to the LazyFrame implementation and materialize immediately.
        selection = self.lazy().select(*exprs, **named_exprs)
        return selection.collect(optimizations=QueryOptFlags._eager())
9992

9993
    def select_seq(
        self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr
    ) -> DataFrame:
        """
        Select columns from this DataFrame.

        This will run all expressions sequentially instead of in parallel.
        Use this when the work per expression is cheap.

        Parameters
        ----------
        *exprs
            Column(s) to select, specified as positional arguments.
            Accepts expression input. Strings are parsed as column names,
            other non-expression inputs are parsed as literals.
        **named_exprs
            Additional columns to select, specified as keyword arguments.
            The columns will be renamed to the keyword used.

        Returns
        -------
        DataFrame

        See Also
        --------
        select
        """
        from polars.lazyframe.opt_flags import QueryOptFlags

        # Delegate to the LazyFrame implementation and materialize immediately.
        selection = self.lazy().select_seq(*exprs, **named_exprs)
        return selection.collect(optimizations=QueryOptFlags._eager())
10023

10024
    def with_columns(
        self,
        *exprs: IntoExpr | Iterable[IntoExpr],
        **named_exprs: IntoExpr,
    ) -> DataFrame:
        """
        Add columns to this DataFrame.

        Added columns will replace existing columns with the same name.

        Parameters
        ----------
        *exprs
            Column(s) to add, specified as positional arguments.
            Accepts expression input. Strings are parsed as column names, other
            non-expression inputs are parsed as literals.
        **named_exprs
            Additional columns to add, specified as keyword arguments.
            The columns will be renamed to the keyword used.

        Returns
        -------
        DataFrame
            A new DataFrame with the columns added.

        Notes
        -----
        Creating a new DataFrame using this method does not create a new copy of
        existing data.

        Examples
        --------
        Pass an expression to add it as a new column.

        >>> df = pl.DataFrame(
        ...     {
        ...         "a": [1, 2, 3, 4],
        ...         "b": [0.5, 4, 10, 13],
        ...         "c": [True, True, False, True],
        ...     }
        ... )
        >>> df.with_columns((pl.col("a") ** 2).alias("a^2"))
        shape: (4, 4)
        ┌─────┬──────┬───────┬─────┐
        │ a   ┆ b    ┆ c     ┆ a^2 │
        │ --- ┆ ---  ┆ ---   ┆ --- │
        │ i64 ┆ f64  ┆ bool  ┆ i64 │
        ╞═════╪══════╪═══════╪═════╡
        │ 1   ┆ 0.5  ┆ true  ┆ 1   │
        │ 2   ┆ 4.0  ┆ true  ┆ 4   │
        │ 3   ┆ 10.0 ┆ false ┆ 9   │
        │ 4   ┆ 13.0 ┆ true  ┆ 16  │
        └─────┴──────┴───────┴─────┘

        Added columns will replace existing columns with the same name.

        >>> df.with_columns(pl.col("a").cast(pl.Float64))
        shape: (4, 3)
        ┌─────┬──────┬───────┐
        │ a   ┆ b    ┆ c     │
        │ --- ┆ ---  ┆ ---   │
        │ f64 ┆ f64  ┆ bool  │
        ╞═════╪══════╪═══════╡
        │ 1.0 ┆ 0.5  ┆ true  │
        │ 2.0 ┆ 4.0  ┆ true  │
        │ 3.0 ┆ 10.0 ┆ false │
        │ 4.0 ┆ 13.0 ┆ true  │
        └─────┴──────┴───────┘

        Multiple columns can be added using positional arguments.

        >>> df.with_columns(
        ...     (pl.col("a") ** 2).alias("a^2"),
        ...     (pl.col("b") / 2).alias("b/2"),
        ...     (pl.col("c").not_()).alias("not c"),
        ... )
        shape: (4, 6)
        ┌─────┬──────┬───────┬─────┬──────┬───────┐
        │ a   ┆ b    ┆ c     ┆ a^2 ┆ b/2  ┆ not c │
        │ --- ┆ ---  ┆ ---   ┆ --- ┆ ---  ┆ ---   │
        │ i64 ┆ f64  ┆ bool  ┆ i64 ┆ f64  ┆ bool  │
        ╞═════╪══════╪═══════╪═════╪══════╪═══════╡
        │ 1   ┆ 0.5  ┆ true  ┆ 1   ┆ 0.25 ┆ false │
        │ 2   ┆ 4.0  ┆ true  ┆ 4   ┆ 2.0  ┆ false │
        │ 3   ┆ 10.0 ┆ false ┆ 9   ┆ 5.0  ┆ true  │
        │ 4   ┆ 13.0 ┆ true  ┆ 16  ┆ 6.5  ┆ false │
        └─────┴──────┴───────┴─────┴──────┴───────┘

        Multiple columns can also be added by passing a list of expressions.

        >>> df.with_columns(
        ...     [
        ...         (pl.col("a") ** 2).alias("a^2"),
        ...         (pl.col("b") / 2).alias("b/2"),
        ...         (pl.col("c").not_()).alias("not c"),
        ...     ]
        ... )
        shape: (4, 6)
        ┌─────┬──────┬───────┬─────┬──────┬───────┐
        │ a   ┆ b    ┆ c     ┆ a^2 ┆ b/2  ┆ not c │
        │ --- ┆ ---  ┆ ---   ┆ --- ┆ ---  ┆ ---   │
        │ i64 ┆ f64  ┆ bool  ┆ i64 ┆ f64  ┆ bool  │
        ╞═════╪══════╪═══════╪═════╪══════╪═══════╡
        │ 1   ┆ 0.5  ┆ true  ┆ 1   ┆ 0.25 ┆ false │
        │ 2   ┆ 4.0  ┆ true  ┆ 4   ┆ 2.0  ┆ false │
        │ 3   ┆ 10.0 ┆ false ┆ 9   ┆ 5.0  ┆ true  │
        │ 4   ┆ 13.0 ┆ true  ┆ 16  ┆ 6.5  ┆ false │
        └─────┴──────┴───────┴─────┴──────┴───────┘

        Use keyword arguments to easily name your expression inputs.

        >>> df.with_columns(
        ...     ab=pl.col("a") * pl.col("b"),
        ...     not_c=pl.col("c").not_(),
        ... )
        shape: (4, 5)
        ┌─────┬──────┬───────┬──────┬───────┐
        │ a   ┆ b    ┆ c     ┆ ab   ┆ not_c │
        │ --- ┆ ---  ┆ ---   ┆ ---  ┆ ---   │
        │ i64 ┆ f64  ┆ bool  ┆ f64  ┆ bool  │
        ╞═════╪══════╪═══════╪══════╪═══════╡
        │ 1   ┆ 0.5  ┆ true  ┆ 0.5  ┆ false │
        │ 2   ┆ 4.0  ┆ true  ┆ 8.0  ┆ false │
        │ 3   ┆ 10.0 ┆ false ┆ 30.0 ┆ true  │
        │ 4   ┆ 13.0 ┆ true  ┆ 52.0 ┆ false │
        └─────┴──────┴───────┴──────┴───────┘
        """
        from polars.lazyframe.opt_flags import QueryOptFlags

        # Delegate to the LazyFrame implementation and materialize immediately.
        extended = self.lazy().with_columns(*exprs, **named_exprs)
        return extended.collect(optimizations=QueryOptFlags._eager())
10158

10159
    def with_columns_seq(
        self,
        *exprs: IntoExpr | Iterable[IntoExpr],
        **named_exprs: IntoExpr,
    ) -> DataFrame:
        """
        Add columns to this DataFrame.

        Added columns will replace existing columns with the same name.

        This will run all expressions sequentially instead of in parallel.
        Use this when the work per expression is cheap.

        Parameters
        ----------
        *exprs
            Column(s) to add, specified as positional arguments.
            Accepts expression input. Strings are parsed as column names, other
            non-expression inputs are parsed as literals.
        **named_exprs
            Additional columns to add, specified as keyword arguments.
            The columns will be renamed to the keyword used.

        Returns
        -------
        DataFrame
            A new DataFrame with the columns added.

        See Also
        --------
        with_columns
        """
        from polars.lazyframe.opt_flags import QueryOptFlags

        # Delegate to the LazyFrame implementation and materialize immediately.
        extended = self.lazy().with_columns_seq(*exprs, **named_exprs)
        return extended.collect(optimizations=QueryOptFlags._eager())
10198

10199
    @overload
    def n_chunks(self, strategy: Literal["first"] = ...) -> int: ...

    @overload
    def n_chunks(self, strategy: Literal["all"]) -> list[int]: ...

    def n_chunks(self, strategy: Literal["first", "all"] = "first") -> int | list[int]:
        """
        Get number of chunks used by the ChunkedArrays of this DataFrame.

        Parameters
        ----------
        strategy : {'first', 'all'}
            Return the number of chunks of the 'first' column,
            or 'all' columns in this DataFrame.

        Returns
        -------
        int or list of int
            A single chunk count for `strategy="first"`, or one chunk count
            per column for `strategy="all"`.

        Raises
        ------
        ValueError
            If `strategy` is neither `"first"` nor `"all"`.

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     {
        ...         "a": [1, 2, 3, 4],
        ...         "b": [0.5, 4, 10, 13],
        ...         "c": [True, True, False, True],
        ...     }
        ... )
        >>> df.n_chunks()
        1
        >>> df.n_chunks(strategy="all")
        [1, 1, 1]
        """
        if strategy == "first":
            return self._df.n_chunks()
        if strategy == "all":
            # Iterate the frame directly rather than calling `__iter__` by hand.
            return [s.n_chunks() for s in self]

        msg = (
            f"unexpected input for `strategy`: {strategy!r}"
            f"\n\nChoose one of {{'first', 'all'}}"
        )
        raise ValueError(msg)
10240

10241
    def max(self) -> DataFrame:
        """
        Reduce each column of this DataFrame to its maximum value.

        Returns
        -------
        DataFrame

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     {
        ...         "foo": [1, 2, 3],
        ...         "bar": [6, 7, 8],
        ...         "ham": ["a", "b", "c"],
        ...     }
        ... )
        >>> df.max()
        shape: (1, 3)
        ┌─────┬─────┬─────┐
        │ foo ┆ bar ┆ ham │
        │ --- ┆ --- ┆ --- │
        │ i64 ┆ i64 ┆ str │
        ╞═════╪═════╪═════╡
        │ 3   ┆ 8   ┆ c   │
        └─────┴─────┴─────┘
        """
        from polars.lazyframe.opt_flags import QueryOptFlags

        # Delegate to the LazyFrame implementation and materialize immediately.
        aggregated = self.lazy().max()
        return aggregated.collect(optimizations=QueryOptFlags._eager())
10267

10268
    def max_horizontal(self) -> Series:
        """
        Compute the row-wise maximum across all columns.

        Returns
        -------
        Series
            A Series named `"max"`.

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     {
        ...         "foo": [1, 2, 3],
        ...         "bar": [4.0, 5.0, 6.0],
        ...     }
        ... )
        >>> df.max_horizontal()
        shape: (3,)
        Series: 'max' [f64]
        [
                4.0
                5.0
                6.0
        ]
        """
        row_max = F.max_horizontal(F.all())
        # The keyword name becomes the name of the resulting column.
        return self.select(max=row_max).to_series()
10295

10296
    def min(self) -> DataFrame:
        """
        Reduce each column of this DataFrame to its minimum value.

        Returns
        -------
        DataFrame

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     {
        ...         "foo": [1, 2, 3],
        ...         "bar": [6, 7, 8],
        ...         "ham": ["a", "b", "c"],
        ...     }
        ... )
        >>> df.min()
        shape: (1, 3)
        ┌─────┬─────┬─────┐
        │ foo ┆ bar ┆ ham │
        │ --- ┆ --- ┆ --- │
        │ i64 ┆ i64 ┆ str │
        ╞═════╪═════╪═════╡
        │ 1   ┆ 6   ┆ a   │
        └─────┴─────┴─────┘
        """
        from polars.lazyframe.opt_flags import QueryOptFlags

        # Delegate to the LazyFrame implementation and materialize immediately.
        aggregated = self.lazy().min()
        return aggregated.collect(optimizations=QueryOptFlags._eager())
10322

10323
    def min_horizontal(self) -> Series:
        """
        Compute the row-wise minimum across all columns.

        Returns
        -------
        Series
            A Series named `"min"`.

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     {
        ...         "foo": [1, 2, 3],
        ...         "bar": [4.0, 5.0, 6.0],
        ...     }
        ... )
        >>> df.min_horizontal()
        shape: (3,)
        Series: 'min' [f64]
        [
                1.0
                2.0
                3.0
        ]
        """
        row_min = F.min_horizontal(F.all())
        # The keyword name becomes the name of the resulting column.
        return self.select(min=row_min).to_series()
10350

10351
    def sum(self) -> DataFrame:
        """
        Reduce each column of this DataFrame to the sum of its values.

        Returns
        -------
        DataFrame

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     {
        ...         "foo": [1, 2, 3],
        ...         "bar": [6, 7, 8],
        ...         "ham": ["a", "b", "c"],
        ...     }
        ... )
        >>> df.sum()
        shape: (1, 3)
        ┌─────┬─────┬──────┐
        │ foo ┆ bar ┆ ham  │
        │ --- ┆ --- ┆ ---  │
        │ i64 ┆ i64 ┆ str  │
        ╞═════╪═════╪══════╡
        │ 6   ┆ 21  ┆ null │
        └─────┴─────┴──────┘
        """
        from polars.lazyframe.opt_flags import QueryOptFlags

        # Delegate to the LazyFrame implementation and materialize immediately.
        aggregated = self.lazy().sum()
        return aggregated.collect(optimizations=QueryOptFlags._eager())
10377

10378
    def sum_horizontal(self, *, ignore_nulls: bool = True) -> Series:
        """
        Compute the row-wise sum across all columns.

        Parameters
        ----------
        ignore_nulls
            Ignore null values (default).
            If set to `False`, any null value in the input will lead to a null output.

        Returns
        -------
        Series
            A Series named `"sum"`.

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     {
        ...         "foo": [1, 2, 3],
        ...         "bar": [4.0, 5.0, 6.0],
        ...     }
        ... )
        >>> df.sum_horizontal()
        shape: (3,)
        Series: 'sum' [f64]
        [
                5.0
                7.0
                9.0
        ]
        """
        row_sum = F.sum_horizontal(F.all(), ignore_nulls=ignore_nulls)
        # The keyword name becomes the name of the resulting column.
        return self.select(sum=row_sum).to_series()
10413

10414
    def mean(self) -> DataFrame:
        """
        Reduce each column of this DataFrame to its mean value.

        Returns
        -------
        DataFrame

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     {
        ...         "foo": [1, 2, 3],
        ...         "bar": [6, 7, 8],
        ...         "ham": ["a", "b", "c"],
        ...         "spam": [True, False, None],
        ...     }
        ... )
        >>> df.mean()
        shape: (1, 4)
        ┌─────┬─────┬──────┬──────┐
        │ foo ┆ bar ┆ ham  ┆ spam │
        │ --- ┆ --- ┆ ---  ┆ ---  │
        │ f64 ┆ f64 ┆ str  ┆ f64  │
        ╞═════╪═════╪══════╪══════╡
        │ 2.0 ┆ 7.0 ┆ null ┆ 0.5  │
        └─────┴─────┴──────┴──────┘
        """
        from polars.lazyframe.opt_flags import QueryOptFlags

        # Delegate to the LazyFrame implementation and materialize immediately.
        aggregated = self.lazy().mean()
        return aggregated.collect(optimizations=QueryOptFlags._eager())
10441

10442
    def mean_horizontal(self, *, ignore_nulls: bool = True) -> Series:
        """
        Compute the row-wise mean across all columns.

        Parameters
        ----------
        ignore_nulls
            Ignore null values (default).
            If set to `False`, any null value in the input will lead to a null output.

        Returns
        -------
        Series
            A Series named `"mean"`.

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     {
        ...         "foo": [1, 2, 3],
        ...         "bar": [4.0, 5.0, 6.0],
        ...     }
        ... )
        >>> df.mean_horizontal()
        shape: (3,)
        Series: 'mean' [f64]
        [
                2.5
                3.5
                4.5
        ]
        """
        row_mean = F.mean_horizontal(F.all(), ignore_nulls=ignore_nulls)
        # The keyword name becomes the name of the resulting column.
        return self.select(mean=row_mean).to_series()
10477

10478
    def std(self, ddof: int = 1) -> DataFrame:
        """
        Aggregate the columns of this DataFrame to their standard deviation value.

        Parameters
        ----------
        ddof
            “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof,
            where N represents the number of elements.
            By default ddof is 1.

        Returns
        -------
        DataFrame
            DataFrame holding the aggregated value of each column.

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     {
        ...         "foo": [1, 2, 3],
        ...         "bar": [6, 7, 8],
        ...         "ham": ["a", "b", "c"],
        ...     }
        ... )
        >>> df.std()
        shape: (1, 3)
        ┌─────┬─────┬──────┐
        │ foo ┆ bar ┆ ham  │
        │ --- ┆ --- ┆ ---  │
        │ f64 ┆ f64 ┆ str  │
        ╞═════╪═════╪══════╡
        │ 1.0 ┆ 1.0 ┆ null │
        └─────┴─────┴──────┘
        >>> df.std(ddof=0)
        shape: (1, 3)
        ┌──────────┬──────────┬──────┐
        │ foo      ┆ bar      ┆ ham  │
        │ ---      ┆ ---      ┆ ---  │
        │ f64      ┆ f64      ┆ str  │
        ╞══════════╪══════════╪══════╡
        │ 0.816497 ┆ 0.816497 ┆ null │
        └──────────┴──────────┴──────┘
        """
        from polars.lazyframe.opt_flags import QueryOptFlags

        # Delegate to the LazyFrame implementation and collect it immediately.
        return self.lazy().std(ddof).collect(optimizations=QueryOptFlags._eager())
10520

10521
    def var(self, ddof: int = 1) -> DataFrame:
        """
        Aggregate the columns of this DataFrame to their variance value.

        Parameters
        ----------
        ddof
            “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof,
            where N represents the number of elements.
            By default ddof is 1.

        Returns
        -------
        DataFrame
            DataFrame holding the aggregated value of each column.

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     {
        ...         "foo": [1, 2, 3],
        ...         "bar": [6, 7, 8],
        ...         "ham": ["a", "b", "c"],
        ...     }
        ... )
        >>> df.var()
        shape: (1, 3)
        ┌─────┬─────┬──────┐
        │ foo ┆ bar ┆ ham  │
        │ --- ┆ --- ┆ ---  │
        │ f64 ┆ f64 ┆ str  │
        ╞═════╪═════╪══════╡
        │ 1.0 ┆ 1.0 ┆ null │
        └─────┴─────┴──────┘
        >>> df.var(ddof=0)
        shape: (1, 3)
        ┌──────────┬──────────┬──────┐
        │ foo      ┆ bar      ┆ ham  │
        │ ---      ┆ ---      ┆ ---  │
        │ f64      ┆ f64      ┆ str  │
        ╞══════════╪══════════╪══════╡
        │ 0.666667 ┆ 0.666667 ┆ null │
        └──────────┴──────────┴──────┘
        """
        from polars.lazyframe.opt_flags import QueryOptFlags

        # Delegate to the LazyFrame implementation and collect it immediately.
        return self.lazy().var(ddof).collect(optimizations=QueryOptFlags._eager())
10563

10564
    def median(self) -> DataFrame:
        """
        Aggregate the columns of this DataFrame to their median value.

        Returns
        -------
        DataFrame
            DataFrame holding the aggregated value of each column.

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     {
        ...         "foo": [1, 2, 3],
        ...         "bar": [6, 7, 8],
        ...         "ham": ["a", "b", "c"],
        ...     }
        ... )
        >>> df.median()
        shape: (1, 3)
        ┌─────┬─────┬──────┐
        │ foo ┆ bar ┆ ham  │
        │ --- ┆ --- ┆ ---  │
        │ f64 ┆ f64 ┆ str  │
        ╞═════╪═════╪══════╡
        │ 2.0 ┆ 7.0 ┆ null │
        └─────┴─────┴──────┘
        """
        from polars.lazyframe.opt_flags import QueryOptFlags

        # Delegate to the LazyFrame implementation and collect it immediately.
        return self.lazy().median().collect(optimizations=QueryOptFlags._eager())
10590

10591
    def product(self) -> DataFrame:
        """
        Aggregate the columns of this DataFrame to their product values.

        Returns
        -------
        DataFrame
            DataFrame holding the product of each numeric or boolean column;
            columns of any other dtype are replaced by a null literal.

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     {
        ...         "a": [1, 2, 3],
        ...         "b": [0.5, 4, 10],
        ...         "c": [True, True, False],
        ...     }
        ... )

        >>> df.product()
        shape: (1, 3)
        ┌─────┬──────┬─────┐
        │ a   ┆ b    ┆ c   │
        │ --- ┆ ---  ┆ --- │
        │ i64 ┆ f64  ┆ i64 │
        ╞═════╪══════╪═════╡
        │ 6   ┆ 20.0 ┆ 0   │
        └─────┴──────┴─────┘
        """
        # Only numeric and boolean columns have a product; every other column is
        # kept in the output under its own name, but as a null literal.
        exprs = [
            F.col(name).product()
            if dt.is_numeric() or isinstance(dt, Boolean)
            else F.lit(None).alias(name)
            for name, dt in self.schema.items()
        ]

        return self.select(exprs)
10623

10624
    def quantile(
        self, quantile: float, interpolation: QuantileMethod = "nearest"
    ) -> DataFrame:
        """
        Aggregate the columns of this DataFrame to their quantile value.

        Parameters
        ----------
        quantile
            Quantile between 0.0 and 1.0.
        interpolation : {'nearest', 'higher', 'lower', 'midpoint', 'linear', 'equiprobable'}
            Interpolation method.

        Returns
        -------
        DataFrame
            DataFrame holding the aggregated value of each column.

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     {
        ...         "foo": [1, 2, 3],
        ...         "bar": [6, 7, 8],
        ...         "ham": ["a", "b", "c"],
        ...     }
        ... )
        >>> df.quantile(0.5, "nearest")
        shape: (1, 3)
        ┌─────┬─────┬──────┐
        │ foo ┆ bar ┆ ham  │
        │ --- ┆ --- ┆ ---  │
        │ f64 ┆ f64 ┆ str  │
        ╞═════╪═════╪══════╡
        │ 2.0 ┆ 7.0 ┆ null │
        └─────┴─────┴──────┘
        """  # noqa: W505
        from polars.lazyframe.opt_flags import QueryOptFlags

        # Delegate to the LazyFrame implementation and collect it immediately.
        return (
            self.lazy()
            .quantile(quantile, interpolation)
            .collect(optimizations=QueryOptFlags._eager())
        )
10663

10664
    def to_dummies(
        self,
        columns: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = None,
        *,
        separator: str = "_",
        drop_first: bool = False,
        drop_nulls: bool = False,
    ) -> DataFrame:
        """
        Convert categorical variables into dummy/indicator variables.

        Parameters
        ----------
        columns
            Column name(s) or selector(s) that should be converted to dummy
            variables. If set to `None` (default), convert all columns.
        separator
            Separator/delimiter used when generating column names.
        drop_first
            Remove the first category from the variables being encoded.
        drop_nulls
            If set to `True`, no `null` column is generated for `None` values
            in the series.

        Returns
        -------
        DataFrame
            DataFrame in which the selected columns are replaced by their
            dummy/indicator columns.

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     {
        ...         "foo": [1, 2],
        ...         "bar": [3, 4],
        ...         "ham": ["a", "b"],
        ...     }
        ... )
        >>> df.to_dummies()
        shape: (2, 6)
        ┌───────┬───────┬───────┬───────┬───────┬───────┐
        │ foo_1 ┆ foo_2 ┆ bar_3 ┆ bar_4 ┆ ham_a ┆ ham_b │
        │ ---   ┆ ---   ┆ ---   ┆ ---   ┆ ---   ┆ ---   │
        │ u8    ┆ u8    ┆ u8    ┆ u8    ┆ u8    ┆ u8    │
        ╞═══════╪═══════╪═══════╪═══════╪═══════╪═══════╡
        │ 1     ┆ 0     ┆ 1     ┆ 0     ┆ 1     ┆ 0     │
        │ 0     ┆ 1     ┆ 0     ┆ 1     ┆ 0     ┆ 1     │
        └───────┴───────┴───────┴───────┴───────┴───────┘

        >>> df.to_dummies(drop_first=True)
        shape: (2, 3)
        ┌───────┬───────┬───────┐
        │ foo_2 ┆ bar_4 ┆ ham_b │
        │ ---   ┆ ---   ┆ ---   │
        │ u8    ┆ u8    ┆ u8    │
        ╞═══════╪═══════╪═══════╡
        │ 0     ┆ 0     ┆ 0     │
        │ 1     ┆ 1     ┆ 1     │
        └───────┴───────┴───────┘

        >>> import polars.selectors as cs
        >>> df.to_dummies(cs.integer(), separator=":")
        shape: (2, 5)
        ┌───────┬───────┬───────┬───────┬─────┐
        │ foo:1 ┆ foo:2 ┆ bar:3 ┆ bar:4 ┆ ham │
        │ ---   ┆ ---   ┆ ---   ┆ ---   ┆ --- │
        │ u8    ┆ u8    ┆ u8    ┆ u8    ┆ str │
        ╞═══════╪═══════╪═══════╪═══════╪═════╡
        │ 1     ┆ 0     ┆ 1     ┆ 0     ┆ a   │
        │ 0     ┆ 1     ┆ 0     ┆ 1     ┆ b   │
        └───────┴───────┴───────┴───────┴─────┘

        >>> df.to_dummies(cs.integer(), drop_first=True, separator=":")
        shape: (2, 3)
        ┌───────┬───────┬─────┐
        │ foo:2 ┆ bar:4 ┆ ham │
        │ ---   ┆ ---   ┆ --- │
        │ u8    ┆ u8    ┆ str │
        ╞═══════╪═══════╪═════╡
        │ 0     ┆ 0     ┆ a   │
        │ 1     ┆ 1     ┆ b   │
        └───────┴───────┴─────┘
        """
        # `None` is forwarded untouched (meaning "all columns"); anything else is
        # first resolved via `_expand_selectors`.
        if columns is not None:
            columns = _expand_selectors(self, columns)
        return self._from_pydf(
            self._df.to_dummies(columns, separator, drop_first, drop_nulls)
        )
10746

10747
    def unique(
        self,
        subset: ColumnNameOrSelector | Collection[ColumnNameOrSelector] | None = None,
        *,
        keep: UniqueKeepStrategy = "any",
        maintain_order: bool = False,
    ) -> DataFrame:
        """
        Drop duplicate rows from this dataframe.

        Parameters
        ----------
        subset
            Column name(s) or selector(s), to consider when identifying
            duplicate rows. If set to `None` (default), use all columns.
        keep : {'first', 'last', 'any', 'none'}
            Which of the duplicate rows to keep.

            * 'any': Does not give any guarantee of which row is kept.
                     This allows more optimizations.
            * 'none': Don't keep duplicate rows.
            * 'first': Keep first unique row.
            * 'last': Keep last unique row.
        maintain_order
            Keep the same order as the original DataFrame. This is more expensive to
            compute.
            Setting this to `True` blocks the possibility
            to run on the streaming engine.

        Returns
        -------
        DataFrame
            DataFrame with unique rows.

        Warnings
        --------
        This method will fail if there is a column of type `List` in the DataFrame or
        subset.

        Notes
        -----
        If you're coming from pandas, this is similar to
        `pandas.DataFrame.drop_duplicates`.

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     {
        ...         "foo": [1, 2, 3, 1],
        ...         "bar": ["a", "a", "a", "a"],
        ...         "ham": ["b", "b", "b", "b"],
        ...     }
        ... )
        >>> df.unique(maintain_order=True)
        shape: (3, 3)
        ┌─────┬─────┬─────┐
        │ foo ┆ bar ┆ ham │
        │ --- ┆ --- ┆ --- │
        │ i64 ┆ str ┆ str │
        ╞═════╪═════╪═════╡
        │ 1   ┆ a   ┆ b   │
        │ 2   ┆ a   ┆ b   │
        │ 3   ┆ a   ┆ b   │
        └─────┴─────┴─────┘
        >>> df.unique(subset=["bar", "ham"], maintain_order=True)
        shape: (1, 3)
        ┌─────┬─────┬─────┐
        │ foo ┆ bar ┆ ham │
        │ --- ┆ --- ┆ --- │
        │ i64 ┆ str ┆ str │
        ╞═════╪═════╪═════╡
        │ 1   ┆ a   ┆ b   │
        └─────┴─────┴─────┘
        >>> df.unique(keep="last", maintain_order=True)
        shape: (3, 3)
        ┌─────┬─────┬─────┐
        │ foo ┆ bar ┆ ham │
        │ --- ┆ --- ┆ --- │
        │ i64 ┆ str ┆ str │
        ╞═════╪═════╪═════╡
        │ 2   ┆ a   ┆ b   │
        │ 3   ┆ a   ┆ b   │
        │ 1   ┆ a   ┆ b   │
        └─────┴─────┴─────┘
        """
        from polars.lazyframe.opt_flags import QueryOptFlags

        # Delegate to the LazyFrame implementation and collect it immediately.
        return (
            self.lazy()
            .unique(subset=subset, keep=keep, maintain_order=maintain_order)
            .collect(optimizations=QueryOptFlags._eager())
        )
10839

10840
    def n_unique(self, subset: str | Expr | Sequence[str | Expr] | None = None) -> int:
        """
        Return the number of unique rows, or the number of unique row-subsets.

        Parameters
        ----------
        subset
            One or more columns/expressions that define what to count;
            omit to return the count of unique rows.

        Returns
        -------
        int
            The number of unique rows (or row-subsets); `0` if the computed
            result frame is empty.

        Notes
        -----
        This method operates at the `DataFrame` level; to operate on subsets at the
        expression level you can make use of struct-packing instead, for example:

        >>> expr_unique_subset = pl.struct("a", "b").n_unique()

        If instead you want to count the number of unique values per-column, you can
        also use expression-level syntax to return a new frame containing that result:

        >>> df = pl.DataFrame(
        ...     [[1, 2, 3], [1, 2, 4]], schema=["a", "b", "c"], orient="row"
        ... )
        >>> df_nunique = df.select(pl.all().n_unique())

        In aggregate context there is also an equivalent method for returning the
        unique values per-group:

        >>> df_agg_nunique = df.group_by("a").n_unique()

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     {
        ...         "a": [1, 1, 2, 3, 4, 5],
        ...         "b": [0.5, 0.5, 1.0, 2.0, 3.0, 3.0],
        ...         "c": [True, True, True, False, True, True],
        ...     }
        ... )
        >>> df.n_unique()
        5

        Simple columns subset.

        >>> df.n_unique(subset=["b", "c"])
        4

        Expression subset.

        >>> df.n_unique(
        ...     subset=[
        ...         (pl.col("a") // 2),
        ...         (pl.col("c") | (pl.col("b") >= 2)),
        ...     ],
        ... )
        3
        """
        from polars.lazyframe.opt_flags import QueryOptFlags

        if isinstance(subset, str):
            # A single column name.
            expr = F.col(subset)
        elif isinstance(subset, pl.Expr):
            # A single expression is counted directly.
            expr = subset
        elif isinstance(subset, Sequence) and len(subset) == 1:
            # A one-element sequence needs no struct wrapper.
            expr = wrap_expr(parse_into_expression(subset[0]))
        else:
            # Several columns/expressions (or all columns when `subset` is None)
            # are packed into one struct so uniqueness is evaluated per row.
            struct_fields = F.all() if (subset is None) else subset
            expr = F.struct(struct_fields)

        df = (
            self.lazy()
            .select(expr.n_unique())
            .collect(optimizations=QueryOptFlags._eager())
        )
        return 0 if df.is_empty() else df.row(0)[0]
10915

10916
    @deprecated(
        "`DataFrame.approx_n_unique` is deprecated; "
        "use `select(pl.all().approx_n_unique())` instead."
    )
    def approx_n_unique(self) -> DataFrame:
        """
        Approximate count of unique values.

        .. deprecated:: 0.20.11
            Use the `select(pl.all().approx_n_unique())` method instead.

        This is done using the HyperLogLog++ algorithm for cardinality estimation.

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     {
        ...         "a": [1, 2, 3, 4],
        ...         "b": [1, 2, 1, 1],
        ...     }
        ... )
        >>> df.approx_n_unique()  # doctest: +SKIP
        shape: (1, 2)
        ┌─────┬─────┐
        │ a   ┆ b   │
        │ --- ┆ --- │
        │ u32 ┆ u32 │
        ╞═════╪═════╡
        │ 4   ┆ 2   │
        └─────┴─────┘
        """
        from polars.lazyframe.opt_flags import QueryOptFlags

        # Delegate to the LazyFrame implementation and collect it immediately.
        return (
            self.lazy().approx_n_unique().collect(optimizations=QueryOptFlags._eager())
        )
10952

10953
    def rechunk(self) -> DataFrame:
        """
        Rechunk the data in this DataFrame to a contiguous allocation.

        This will make sure all subsequent operations have optimal and predictable
        performance.

        Returns
        -------
        DataFrame
            A new DataFrame wrapping the rechunked data.
        """
        return self._from_pydf(self._df.rechunk())
10961

10962
    def null_count(self) -> DataFrame:
        """
        Create a new DataFrame that shows the null counts per column.

        Returns
        -------
        DataFrame
            DataFrame holding the null count of each column.

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     {
        ...         "foo": [1, None, 3],
        ...         "bar": [6, 7, None],
        ...         "ham": ["a", "b", "c"],
        ...     }
        ... )
        >>> df.null_count()
        shape: (1, 3)
        ┌─────┬─────┬─────┐
        │ foo ┆ bar ┆ ham │
        │ --- ┆ --- ┆ --- │
        │ u32 ┆ u32 ┆ u32 │
        ╞═════╪═════╪═════╡
        │ 1   ┆ 1   ┆ 0   │
        └─────┴─────┴─────┘
        """
        return self._from_pydf(self._df.null_count())
10986

10987
    def sample(
        self,
        n: int | Series | None = None,
        *,
        fraction: float | Series | None = None,
        with_replacement: bool = False,
        shuffle: bool = False,
        seed: int | None = None,
    ) -> DataFrame:
        """
        Sample from this DataFrame.

        Parameters
        ----------
        n
            Number of items to return. Cannot be used with `fraction`. Defaults to 1 if
            `fraction` is None.
        fraction
            Fraction of items to return. Cannot be used with `n`.
        with_replacement
            Allow values to be sampled more than once.
        shuffle
            If set to True, the order of the sampled rows will be shuffled. If
            set to False (default), the order of the returned rows will be
            neither stable nor fully random.
        seed
            Seed for the random number generator. If set to None (default), a
            random seed is generated for each sample operation.

        Returns
        -------
        DataFrame
            DataFrame containing the sampled rows.

        Raises
        ------
        ValueError
            If both `n` and `fraction` are specified.

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     {
        ...         "foo": [1, 2, 3],
        ...         "bar": [6, 7, 8],
        ...         "ham": ["a", "b", "c"],
        ...     }
        ... )
        >>> df.sample(n=2, seed=0)  # doctest: +IGNORE_RESULT
        shape: (2, 3)
        ┌─────┬─────┬─────┐
        │ foo ┆ bar ┆ ham │
        │ --- ┆ --- ┆ --- │
        │ i64 ┆ i64 ┆ str │
        ╞═════╪═════╪═════╡
        │ 3   ┆ 8   ┆ c   │
        │ 2   ┆ 7   ┆ b   │
        └─────┴─────┴─────┘
        """
        if n is not None and fraction is not None:
            msg = "cannot specify both `n` and `fraction`"
            raise ValueError(msg)

        if seed is None:
            seed = random.randint(0, 10000)

        # Fraction-based sampling: scalars are wrapped in a one-element Series
        # before being handed to the underlying frame.
        if n is None and fraction is not None:
            if not isinstance(fraction, pl.Series):
                fraction = pl.Series("frac", [fraction])

            return self._from_pydf(
                self._df.sample_frac(fraction._s, with_replacement, shuffle, seed)
            )

        # Count-based sampling; a single row is drawn when nothing was requested.
        if n is None:
            n = 1

        if not isinstance(n, pl.Series):
            n = pl.Series("", [n])

        return self._from_pydf(self._df.sample_n(n._s, with_replacement, shuffle, seed))
11058

11059
    def fold(self, operation: Callable[[Series, Series], Series]) -> Series:
        """
        Apply a horizontal reduction on a DataFrame.

        This can be used to effectively determine aggregations on a row level, and can
        be applied to any DataType that can be supercast (cast to a similar parent
        type).

        An example of the supercast rules when applying an arithmetic operation on two
        DataTypes are for instance:

        - Int8 + String = String
        - Float32 + Int64 = Float32
        - Float32 + Float64 = Float64

        Parameters
        ----------
        operation
            function that takes two `Series` and returns a `Series`.

        Returns
        -------
        Series
            The result of successively applying `operation` to the accumulated
            result and each following column, starting from the first column.

        Examples
        --------
        A horizontal sum operation:

        >>> df = pl.DataFrame(
        ...     {
        ...         "a": [2, 1, 3],
        ...         "b": [1, 2, 3],
        ...         "c": [1.0, 2.0, 3.0],
        ...     }
        ... )
        >>> df.fold(lambda s1, s2: s1 + s2)
        shape: (3,)
        Series: 'a' [f64]
        [
            4.0
            5.0
            9.0
        ]

        A horizontal minimum operation:

        >>> df = pl.DataFrame({"a": [2, 1, 3], "b": [1, 2, 3], "c": [1.0, 2.0, 3.0]})
        >>> df.fold(lambda s1, s2: s1.zip_with(s1 < s2, s2))
        shape: (3,)
        Series: 'a' [f64]
        [
            1.0
            1.0
            3.0
        ]

        A horizontal string concatenation:

        >>> df = pl.DataFrame(
        ...     {
        ...         "a": ["foo", "bar", None],
        ...         "b": [1, 2, 3],
        ...         "c": [1.0, 2.0, 3.0],
        ...     }
        ... )
        >>> df.fold(lambda s1, s2: s1 + s2)
        shape: (3,)
        Series: 'a' [str]
        [
            "foo11.0"
            "bar22.0"
            null
        ]

        A horizontal boolean or, similar to a row-wise .any():

        >>> df = pl.DataFrame(
        ...     {
        ...         "a": [False, False, True],
        ...         "b": [False, True, False],
        ...     }
        ... )
        >>> df.fold(lambda s1, s2: s1 | s2)
        shape: (3,)
        Series: 'a' [bool]
        [
                false
                true
                true
        ]
        """
        # Seed the accumulator with the first column, then fold in the remaining
        # columns from left to right.
        acc = self.to_series(0)

        for i in range(1, self.width):
            acc = operation(acc, self.to_series(i))
        return acc
11151

11152
    @overload
    def row(
        self,
        index: int | None = ...,
        *,
        by_predicate: Expr | None = ...,
        named: Literal[False] = ...,
    ) -> tuple[Any, ...]: ...

    @overload
    def row(
        self,
        index: int | None = ...,
        *,
        by_predicate: Expr | None = ...,
        named: Literal[True],
    ) -> dict[str, Any]: ...

    def row(
        self,
        index: int | None = None,
        *,
        by_predicate: Expr | None = None,
        named: bool = False,
    ) -> tuple[Any, ...] | dict[str, Any]:
        """
        Get the values of a single row, either by index or by predicate.

        Parameters
        ----------
        index
            Row index.
        by_predicate
            Select the row according to a given expression/predicate.
        named
            Return a dictionary instead of a tuple. The dictionary is a mapping of
            column name to row value. This is more expensive than returning a regular
            tuple, but allows for accessing values by column name.

        Returns
        -------
        tuple (default) or dictionary of row values

        Raises
        ------
        ValueError
            If both `index` and `by_predicate` are set, or if neither is set.
        TypeError
            If an expression is passed as `index`, or if `by_predicate` is not
            an expression.
        TooManyRowsReturnedError
            If `by_predicate` matches more than one row.
        NoRowsReturnedError
            If `by_predicate` matches no rows.

        Notes
        -----
        The `index` and `by_predicate` params are mutually exclusive. Additionally,
        to ensure clarity, the `by_predicate` parameter must be supplied by keyword.

        When using `by_predicate` it is an error condition if anything other than
        one row is returned; more than one row raises `TooManyRowsReturnedError`, and
        zero rows will raise `NoRowsReturnedError` (both inherit from `RowsError`).

        Warnings
        --------
        You should NEVER use this method to iterate over a DataFrame; if you require
        row-iteration you should strongly prefer use of `iter_rows()` instead.

        See Also
        --------
        iter_rows : Row iterator over frame data (does not materialise all rows).
        rows : Materialise all frame data as a list of rows (potentially expensive).
        item : Return dataframe element as a scalar.

        Examples
        --------
        Specify an index to return the row at the given index as a tuple.

        >>> df = pl.DataFrame(
        ...     {
        ...         "foo": [1, 2, 3],
        ...         "bar": [6, 7, 8],
        ...         "ham": ["a", "b", "c"],
        ...     }
        ... )
        >>> df.row(2)
        (3, 8, 'c')

        Specify `named=True` to get a dictionary instead with a mapping of column
        names to row values.

        >>> df.row(2, named=True)
        {'foo': 3, 'bar': 8, 'ham': 'c'}

        Use `by_predicate` to return the row that matches the given predicate.

        >>> df.row(by_predicate=(pl.col("ham") == "b"))
        (2, 7, 'b')
        """
        if index is not None and by_predicate is not None:
            msg = "cannot set both 'index' and 'by_predicate'; mutually exclusive"
            raise ValueError(msg)
        elif isinstance(index, pl.Expr):
            msg = "expressions should be passed to the `by_predicate` parameter"
            raise TypeError(msg)

        if index is not None:
            row = self._df.row_tuple(index)
        elif by_predicate is not None:
            if not isinstance(by_predicate, pl.Expr):
                msg = f"expected `by_predicate` to be an expression, got {qualified_type_name(by_predicate)!r}"
                raise TypeError(msg)
            rows = self.filter(by_predicate).rows()
            n_rows = len(rows)
            # Exactly one matching row is required.
            if n_rows > 1:
                msg = f"predicate <{by_predicate!s}> returned {n_rows} rows"
                raise TooManyRowsReturnedError(msg)
            elif n_rows == 0:
                msg = f"predicate <{by_predicate!s}> returned no rows"
                raise NoRowsReturnedError(msg)
            row = rows[0]
        else:
            msg = "one of `index` or `by_predicate` must be set"
            raise ValueError(msg)

        # Both lookup paths share one exit: tuple by default, dict when `named`.
        return dict(zip(self.columns, row)) if named else row
11275

11276
    @overload
    def rows(self, *, named: Literal[False] = ...) -> list[tuple[Any, ...]]: ...

    @overload
    def rows(self, *, named: Literal[True]) -> list[dict[str, Any]]: ...

    def rows(
        self, *, named: bool = False
    ) -> list[tuple[Any, ...]] | list[dict[str, Any]]:
        """
        Returns all data in the DataFrame as a list of rows of python-native values.

        By default, each row is returned as a tuple of values given in the same order
        as the frame columns. Setting `named=True` will return rows of dictionaries
        instead.

        Parameters
        ----------
        named
            Return dictionaries instead of tuples. The dictionaries are a mapping of
            column name to row value. This is more expensive than returning a regular
            tuple, but allows for accessing values by column name.

        Returns
        -------
        list of row value tuples (default), or list of dictionaries (if `named=True`).

        Warnings
        --------
        Row-iteration is not optimal as the underlying data is stored in columnar form;
        where possible, prefer export via one of the dedicated export/output methods.
        You should also consider using `iter_rows` instead, to avoid materialising all
        the data at once; there is little performance difference between the two, but
        peak memory can be reduced if processing rows in batches.

        See Also
        --------
        iter_rows : Row iterator over frame data (does not materialise all rows).
        rows_by_key : Materialises frame data as a key-indexed dictionary.

        Notes
        -----
        If you have `ns`-precision temporal values you should be aware that Python
        natively only supports up to `μs`-precision; `ns`-precision values will be
        truncated to microseconds on conversion to Python. If this matters to your
        use-case you should export to a different format (such as Arrow or NumPy).

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     {
        ...         "x": ["a", "b", "b", "a"],
        ...         "y": [1, 2, 3, 4],
        ...         "z": [0, 3, 6, 9],
        ...     }
        ... )
        >>> df.rows()
        [('a', 1, 0), ('b', 2, 3), ('b', 3, 6), ('a', 4, 9)]
        >>> df.rows(named=True)
        [{'x': 'a', 'y': 1, 'z': 0},
         {'x': 'b', 'y': 2, 'z': 3},
         {'x': 'b', 'y': 3, 'z': 6},
         {'x': 'a', 'y': 4, 'z': 9}]
        """
        if named:
            # Load these into the local namespace for a minor performance boost
            dict_, zip_, columns = dict, zip, self.columns
            return [dict_(zip_(columns, row)) for row in self._df.row_tuples()]
        else:
            return self._df.row_tuples()
11346

11347
    @overload
    def rows_by_key(
        self,
        key: ColumnNameOrSelector | Sequence[ColumnNameOrSelector],
        *,
        named: Literal[False] = ...,
        include_key: bool = ...,
        unique: Literal[False] = ...,
    ) -> dict[Any, list[Any]]: ...

    @overload
    def rows_by_key(
        self,
        key: ColumnNameOrSelector | Sequence[ColumnNameOrSelector],
        *,
        named: Literal[False] = ...,
        include_key: bool = ...,
        unique: Literal[True],
    ) -> dict[Any, Any]: ...

    @overload
    def rows_by_key(
        self,
        key: ColumnNameOrSelector | Sequence[ColumnNameOrSelector],
        *,
        named: Literal[True],
        include_key: bool = ...,
        unique: Literal[False] = ...,
    ) -> dict[Any, list[dict[str, Any]]]: ...

    @overload
    def rows_by_key(
        self,
        key: ColumnNameOrSelector | Sequence[ColumnNameOrSelector],
        *,
        named: Literal[True],
        include_key: bool = ...,
        unique: Literal[True],
    ) -> dict[Any, dict[str, Any]]: ...

    def rows_by_key(
        self,
        key: ColumnNameOrSelector | Sequence[ColumnNameOrSelector],
        *,
        named: bool = False,
        include_key: bool = False,
        unique: bool = False,
    ) -> dict[Any, Any]:
        """
        Returns all data as a dictionary of python-native values keyed by some column.

        This method is like `rows`, but instead of returning rows in a flat list, rows
        are grouped by the values in the `key` column(s) and returned as a dictionary.

        Note that this method should not be used in place of native operations, due to
        the high cost of materializing all frame data out into a dictionary; it should
        be used only when you need to move the values out into a Python data structure
        or other object that cannot operate directly with Polars/Arrow.

        Parameters
        ----------
        key
            The column(s) to use as the key for the returned dictionary. If multiple
            columns are specified, the key will be a tuple of those values, otherwise
            it will be the (scalar) value found in that single column.
        named
            Return dictionary rows instead of tuples, mapping column name to row value.
        include_key
            Include key values inline with the associated data (by default the key
            values are omitted as a memory/performance optimisation, as they can be
            reconstructed from the key).
        unique
            Indicate that the key is unique; this will result in a 1:1 mapping from
            key to a single associated row. Note that if the key is *not* actually
            unique the last row with the given key will be returned.

        Returns
        -------
        dict
            A plain `dict` mapping each key to a single row if `unique=True`,
            otherwise a `defaultdict(list)` mapping each key to the list of rows
            that share it.

        Notes
        -----
        If you have `ns`-precision temporal values you should be aware that Python
        natively only supports up to `μs`-precision; `ns`-precision values will be
        truncated to microseconds on conversion to Python. If this matters to your
        use-case you should export to a different format (such as Arrow or NumPy).

        See Also
        --------
        rows : Materialize all frame data as a list of rows (potentially expensive).
        iter_rows : Row iterator over frame data (does not materialize all rows).
        to_dict : Convert DataFrame to a dictionary mapping column name to values.

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     {
        ...         "w": ["a", "b", "b", "a"],
        ...         "x": ["q", "q", "q", "k"],
        ...         "y": [1.0, 2.5, 3.0, 4.5],
        ...         "z": [9, 8, 7, 6],
        ...     }
        ... )

        Group rows by the given key column(s):

        >>> df.rows_by_key(key=["w"])
        defaultdict(<class 'list'>,
            {'a': [('q', 1.0, 9), ('k', 4.5, 6)],
             'b': [('q', 2.5, 8), ('q', 3.0, 7)]})

        Return the same row groupings as dictionaries:

        >>> df.rows_by_key(key=["w"], named=True)
        defaultdict(<class 'list'>,
            {'a': [{'x': 'q', 'y': 1.0, 'z': 9},
                   {'x': 'k', 'y': 4.5, 'z': 6}],
             'b': [{'x': 'q', 'y': 2.5, 'z': 8},
                   {'x': 'q', 'y': 3.0, 'z': 7}]})

        Return row groupings, assuming keys are unique:

        >>> df.rows_by_key(key=["z"], unique=True)
        {9: ('a', 'q', 1.0),
         8: ('b', 'q', 2.5),
         7: ('b', 'q', 3.0),
         6: ('a', 'k', 4.5)}

        Return row groupings as dictionaries, assuming keys are unique:

        >>> df.rows_by_key(key=["z"], named=True, unique=True)
        {9: {'w': 'a', 'x': 'q', 'y': 1.0},
         8: {'w': 'b', 'x': 'q', 'y': 2.5},
         7: {'w': 'b', 'x': 'q', 'y': 3.0},
         6: {'w': 'a', 'x': 'k', 'y': 4.5}}

        Return dictionary rows grouped by a compound key, including key values:

        >>> df.rows_by_key(key=["w", "x"], named=True, include_key=True)
        defaultdict(<class 'list'>,
            {('a', 'q'): [{'w': 'a', 'x': 'q', 'y': 1.0, 'z': 9}],
             ('b', 'q'): [{'w': 'b', 'x': 'q', 'y': 2.5, 'z': 8},
                          {'w': 'b', 'x': 'q', 'y': 3.0, 'z': 7}],
             ('a', 'k'): [{'w': 'a', 'x': 'k', 'y': 4.5, 'z': 6}]})
        """
        # keep the expanded column names under their own name, so the `key`
        # parameter is never rebound (neither here nor by the grouping loop below)
        key_columns = _expand_selectors(self, key)

        # a single key column yields scalar keys; several columns yield tuple keys
        keys = (
            iter(self.get_column(key_columns[0]))
            if len(key_columns) == 1
            else self.select(key_columns).iter_rows()
        )

        if include_key:
            values = self
        else:
            data_cols = [name for name in self.schema if name not in key_columns]
            values = self.select(data_cols)

        zipped = zip(keys, values.iter_rows(named=named))  # type: ignore[call-overload]

        # if unique, we expect to write just one entry per key (a later duplicate
        # overwrites an earlier one); otherwise, we're returning a list of rows
        # for each key, so append into a defaultdict.
        if unique:
            return dict(zipped)

        rows = defaultdict(list)
        for row_key, data in zipped:
            rows[row_key].append(data)
        return rows
11514

11515
    @overload
    def iter_rows(
        self, *, named: Literal[False] = ..., buffer_size: int = ...
    ) -> Iterator[tuple[Any, ...]]: ...

    @overload
    def iter_rows(
        self, *, named: Literal[True], buffer_size: int = ...
    ) -> Iterator[dict[str, Any]]: ...

    def iter_rows(
        self, *, named: bool = False, buffer_size: int = 512
    ) -> Iterator[tuple[Any, ...]] | Iterator[dict[str, Any]]:
        """
        Returns an iterator over the DataFrame of rows of python-native values.

        Parameters
        ----------
        named
            Return dictionaries instead of tuples. The dictionaries are a mapping of
            column name to row value. This is more expensive than returning a regular
            tuple, but allows for accessing values by column name.
        buffer_size
            Determines the number of rows that are buffered internally while iterating
            over the data; you should only modify this in very specific cases where the
            default value is determined not to be a good fit to your access pattern, as
            the speedup from using the buffer is significant (~2-4x). Setting this
            value to zero (or to a negative value) disables row buffering (not
            recommended). Buffering is also skipped when the frame contains a
            column of `Object` dtype.

        Notes
        -----
        If you have `ns`-precision temporal values you should be aware that Python
        natively only supports up to `μs`-precision; `ns`-precision values will be
        truncated to microseconds on conversion to Python. If this matters to your
        use-case you should export to a different format (such as Arrow or NumPy).

        Warnings
        --------
        Row iteration is not optimal as the underlying data is stored in columnar form;
        where possible, prefer export via one of the dedicated export/output methods
        that deals with columnar data.

        Returns
        -------
        iterator of tuples (default) or dictionaries (if named) of python row values

        See Also
        --------
        rows : Materialises all frame data as a list of rows (potentially expensive).
        rows_by_key : Materialises frame data as a key-indexed dictionary.

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     {
        ...         "a": [1, 3, 5],
        ...         "b": [2, 4, 6],
        ...     }
        ... )
        >>> [row[0] for row in df.iter_rows()]
        [1, 3, 5]
        >>> [row["b"] for row in df.iter_rows(named=True)]
        [2, 4, 6]
        """
        # load into the local namespace for a (minor) performance boost in the hot loops
        columns, get_row, dict_, zip_ = self.columns, self.row, dict, zip
        has_object = Object in self.dtypes

        # Only a strictly positive size can drive the chunked `range` below. A
        # negative size is truthy but makes `range(0, height, size)` empty, which
        # would silently drop every row, so treat it like zero (unbuffered).
        use_buffer = bool(buffer_size) and buffer_size > 0 and not has_object

        # note: buffering rows results in a 2-4x speedup over individual calls
        # to ".row(i)", so it should only be disabled in extremely specific cases.
        if use_buffer:
            for offset in range(0, self.height, buffer_size):
                zerocopy_slice = self.slice(offset, buffer_size)
                if named:
                    for row in zerocopy_slice.rows(named=False):
                        yield dict_(zip_(columns, row))
                else:
                    yield from zerocopy_slice.rows(named=False)
        elif named:
            for i in range(self.height):
                yield dict_(zip_(columns, get_row(i)))
        else:
            for i in range(self.height):
                yield get_row(i)
11599

11600
    def iter_columns(self) -> Iterator[Series]:
        """
        Lazily iterate over the columns of this DataFrame, one Series at a time.

        Yields
        ------
        Series

        Notes
        -----
        Check first whether :func:`all` can do the job; when it can, it is the
        more efficient choice.

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     {
        ...         "a": [1, 3, 5],
        ...         "b": [2, 4, 6],
        ...     }
        ... )
        >>> [s.name for s in df.iter_columns()]
        ['a', 'b']

        If you're using this to modify a dataframe's columns, e.g.

        >>> # Do NOT do this
        >>> pl.DataFrame(column * 2 for column in df.iter_columns())
        shape: (3, 2)
        ┌─────┬─────┐
        │ a   ┆ b   │
        │ --- ┆ --- │
        │ i64 ┆ i64 │
        ╞═════╪═════╡
        │ 2   ┆ 4   │
        │ 6   ┆ 8   │
        │ 10  ┆ 12  │
        └─────┴─────┘

        then consider whether you can use :func:`all` instead:

        >>> df.select(pl.all() * 2)
        shape: (3, 2)
        ┌─────┬─────┐
        │ a   ┆ b   │
        │ --- ┆ --- │
        │ i64 ┆ i64 │
        ╞═════╪═════╡
        │ 2   ┆ 4   │
        │ 6   ┆ 8   │
        │ 10  ┆ 12  │
        └─────┴─────┘
        """
        # wrap each underlying column on demand, as the consumer advances
        yield from map(wrap_s, self._df.get_columns())
11655

11656
    def iter_slices(self, n_rows: int = 10_000) -> Iterator[DataFrame]:
        r"""
        Returns a non-copying iterator of slices over the underlying DataFrame.

        Parameters
        ----------
        n_rows
            Determines the number of rows contained in each DataFrame slice.
            Must be at least 1.

        Raises
        ------
        ValueError
            If `n_rows` is less than 1. As this method is a generator, the error
            surfaces when iteration starts rather than at call time.

        Examples
        --------
        >>> from datetime import date
        >>> df = pl.DataFrame(
        ...     data={
        ...         "a": range(17_500),
        ...         "b": date(2023, 1, 1),
        ...         "c": "klmnoopqrstuvwxyz",
        ...     },
        ...     schema_overrides={"a": pl.Int32},
        ... )
        >>> for idx, frame in enumerate(df.iter_slices()):
        ...     print(f"{type(frame).__name__}:[{idx}]:{len(frame)}")
        DataFrame:[0]:10000
        DataFrame:[1]:7500

        Using `iter_slices` is an efficient way to chunk-iterate over DataFrames and
        any supported frame export/conversion types; for example, as RecordBatches:

        >>> for frame in df.iter_slices(n_rows=15_000):
        ...     record_batch = frame.to_arrow().to_batches()[0]
        ...     print(f"{record_batch.schema}\n<< {len(record_batch)}")
        a: int32
        b: date32[day]
        c: large_string
        << 15000
        a: int32
        b: date32[day]
        c: large_string
        << 2500

        See Also
        --------
        iter_rows : Row iterator over frame data (does not materialise all rows).
        partition_by : Split into multiple DataFrames, partitioned by groups.
        """
        # A zero step makes `range` raise an unhelpful error, and a negative step
        # makes it empty (silently yielding no slices at all); reject both up front.
        if n_rows < 1:
            msg = f"`n_rows` must be a positive integer, got {n_rows!r}"
            raise ValueError(msg)

        for offset in range(0, self.height, n_rows):
            yield self.slice(offset, n_rows)
11703

11704
    def shrink_to_fit(self, *, in_place: bool = False) -> DataFrame:
        """
        Reduce the memory held by the DataFrame to what its data requires.

        Shrinks to fit the exact capacity needed to hold the data.

        Parameters
        ----------
        in_place
            Shrink this DataFrame itself and return it; when `False` (default),
            a clone is shrunk and returned instead.
        """
        # choose which frame to operate on, then shrink it exactly once
        target = self if in_place else self.clone()
        target._df.shrink_to_fit()
        return target
11717

11718
    def gather_every(self, n: int, offset: int = 0) -> DataFrame:
        """
        Build a new DataFrame from every nth row of this one.

        Parameters
        ----------
        n
            Gather every *n*-th row.
        offset
            Starting index.

        Examples
        --------
        >>> s = pl.DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]})
        >>> s.gather_every(2)
        shape: (2, 2)
        ┌─────┬─────┐
        │ a   ┆ b   │
        │ --- ┆ --- │
        │ i64 ┆ i64 │
        ╞═════╪═════╡
        │ 1   ┆ 5   │
        │ 3   ┆ 7   │
        └─────┴─────┘

        >>> s.gather_every(2, offset=1)
        shape: (2, 2)
        ┌─────┬─────┐
        │ a   ┆ b   │
        │ --- ┆ --- │
        │ i64 ┆ i64 │
        ╞═════╪═════╡
        │ 2   ┆ 6   │
        │ 4   ┆ 8   │
        └─────┴─────┘
        """
        # apply the same gather expression to all columns via the "*" wildcard
        every_nth = F.col("*").gather_every(n, offset)
        return self.select(every_nth)
11755

11756
    def hash_rows(
        self,
        seed: int = 0,
        seed_1: int | None = None,
        seed_2: int | None = None,
        seed_3: int | None = None,
    ) -> Series:
        """
        Compute a combined hash for each row of this DataFrame.

        The hash value is of type `UInt64`.

        Parameters
        ----------
        seed
            Random seed parameter. Defaults to 0.
        seed_1
            Random seed parameter. Defaults to `seed` if not set.
        seed_2
            Random seed parameter. Defaults to `seed` if not set.
        seed_3
            Random seed parameter. Defaults to `seed` if not set.

        Notes
        -----
        This implementation of `hash_rows` does not guarantee stable results
        across different Polars versions. Its stability is only guaranteed within a
        single version.

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     {
        ...         "foo": [1, None, 3, 4],
        ...         "ham": ["a", "b", None, "d"],
        ...     }
        ... )
        >>> df.hash_rows(seed=42)  # doctest: +IGNORE_RESULT
        shape: (4,)
        Series: '' [u64]
        [
            10783150408545073287
            1438741209321515184
            10047419486152048166
            2047317070637311557
        ]
        """
        # any secondary seed left unset (None) inherits the primary seed
        k1, k2, k3 = (
            seed if extra is None else extra for extra in (seed_1, seed_2, seed_3)
        )
        hashed = self._df.hash_rows(seed, k1, k2, k3)
        return wrap_s(hashed)
11808

11809
    def interpolate(self) -> DataFrame:
        """
        Fill intermediate null values using linear interpolation.

        Nulls at the beginning and end of the series remain null.

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     {
        ...         "foo": [1, None, 9, 10],
        ...         "bar": [6, 7, 9, None],
        ...         "baz": [1, None, None, 9],
        ...     }
        ... )
        >>> df.interpolate()
        shape: (4, 3)
        ┌──────┬──────┬──────────┐
        │ foo  ┆ bar  ┆ baz      │
        │ ---  ┆ ---  ┆ ---      │
        │ f64  ┆ f64  ┆ f64      │
        ╞══════╪══════╪══════════╡
        │ 1.0  ┆ 6.0  ┆ 1.0      │
        │ 5.0  ┆ 7.0  ┆ 3.666667 │
        │ 9.0  ┆ 9.0  ┆ 6.333333 │
        │ 10.0 ┆ null ┆ 9.0      │
        └──────┴──────┴──────────┘
        """
        # interpolate all columns at once through the "*" wildcard expression
        all_columns = F.col("*")
        return self.select(all_columns.interpolate())
11838

11839
    def is_empty(self) -> bool:
        """
        Check whether the DataFrame has no rows, returning `True` if so.

        Examples
        --------
        >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]})
        >>> df.is_empty()
        False
        >>> df.filter(pl.col("foo") > 99).is_empty()
        True
        """
        # the answer comes straight from the underlying frame object
        has_no_rows = self._df.is_empty()
        return has_no_rows
11852

11853
    def to_struct(self, name: str = "") -> Series:
        """
        Pack the columns of a `DataFrame` into a single `Series` of type `Struct`.

        Parameters
        ----------
        name
            Name for the struct Series

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     {
        ...         "a": [1, 2, 3, 4, 5],
        ...         "b": ["one", "two", "three", "four", "five"],
        ...     }
        ... )
        >>> df.to_struct("nums")
        shape: (5,)
        Series: 'nums' [struct[2]]
        [
            {1,"one"}
            {2,"two"}
            {3,"three"}
            {4,"four"}
            {5,"five"}
        ]
        """
        # the underlying call always receives an empty list as its second argument
        struct_series = self._df.to_struct(name, [])
        return wrap_s(struct_series)
11882

11883
    def unnest(
        self,
        columns: ColumnNameOrSelector | Collection[ColumnNameOrSelector],
        *more_columns: ColumnNameOrSelector,
    ) -> DataFrame:
        """
        Expand struct columns so that each of their fields becomes its own column.

        The new columns will be inserted into the dataframe at the location of the
        struct column.

        Parameters
        ----------
        columns
            Name of the struct column(s) that should be unnested.
        *more_columns
            Additional columns to unnest, specified as positional arguments.

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     {
        ...         "before": ["foo", "bar"],
        ...         "t_a": [1, 2],
        ...         "t_b": ["a", "b"],
        ...         "t_c": [True, None],
        ...         "t_d": [[1, 2], [3]],
        ...         "after": ["baz", "womp"],
        ...     }
        ... ).select("before", pl.struct(pl.col("^t_.$")).alias("t_struct"), "after")
        >>> df
        shape: (2, 3)
        ┌────────┬─────────────────────┬───────┐
        │ before ┆ t_struct            ┆ after │
        │ ---    ┆ ---                 ┆ ---   │
        │ str    ┆ struct[4]           ┆ str   │
        ╞════════╪═════════════════════╪═══════╡
        │ foo    ┆ {1,"a",true,[1, 2]} ┆ baz   │
        │ bar    ┆ {2,"b",null,[3]}    ┆ womp  │
        └────────┴─────────────────────┴───────┘
        >>> df.unnest("t_struct")
        shape: (2, 6)
        ┌────────┬─────┬─────┬──────┬───────────┬───────┐
        │ before ┆ t_a ┆ t_b ┆ t_c  ┆ t_d       ┆ after │
        │ ---    ┆ --- ┆ --- ┆ ---  ┆ ---       ┆ ---   │
        │ str    ┆ i64 ┆ str ┆ bool ┆ list[i64] ┆ str   │
        ╞════════╪═════╪═════╪══════╪═══════════╪═══════╡
        │ foo    ┆ 1   ┆ a   ┆ true ┆ [1, 2]    ┆ baz   │
        │ bar    ┆ 2   ┆ b   ┆ null ┆ [3]       ┆ womp  │
        └────────┴─────┴─────┴──────┴───────────┴───────┘
        """
        from polars.lazyframe.opt_flags import QueryOptFlags

        # delegate to the lazy implementation, then collect with the eager flags
        unnested = self.lazy().unnest(columns, *more_columns)
        return unnested.collect(optimizations=QueryOptFlags._eager())
11941

11942
    def corr(self, **kwargs: Any) -> DataFrame:
        """
        Compute pairwise Pearson product-moment correlation coefficients of columns.

        See numpy `corrcoef` for more information:
        https://numpy.org/doc/stable/reference/generated/numpy.corrcoef.html

        Parameters
        ----------
        **kwargs
            Keyword arguments are passed to numpy `corrcoef`.

        Notes
        -----
        This functionality requires numpy to be installed.

        Examples
        --------
        >>> df = pl.DataFrame({"foo": [1, 2, 3], "bar": [3, 2, 1], "ham": [7, 8, 9]})
        >>> df.corr()
        shape: (3, 3)
        ┌──────┬──────┬──────┐
        │ foo  ┆ bar  ┆ ham  │
        │ ---  ┆ ---  ┆ ---  │
        │ f64  ┆ f64  ┆ f64  │
        ╞══════╪══════╪══════╡
        │ 1.0  ┆ -1.0 ┆ 1.0  │
        │ -1.0 ┆ 1.0  ┆ -1.0 │
        │ 1.0  ┆ -1.0 ┆ 1.0  │
        └──────┴──────┴──────┘
        """
        # columns are the variables, hence `rowvar=False`
        coefficients = np.corrcoef(self.to_numpy(), rowvar=False, **kwargs)

        # for a one-column frame, wrap the result in an array before building
        # the output frame
        if self.width == 1:
            coefficients = np.array([coefficients])

        return DataFrame(coefficients, schema=self.columns)
11977

11978
    def merge_sorted(self, other: DataFrame, key: str) -> DataFrame:
        """
        Merge two sorted DataFrames into one, ordered by the sorted key.

        The output of this operation will also be sorted.
        It is the caller's responsibility to ensure that both frames are
        sorted in ascending order by that key; otherwise the output
        will not make sense.

        The schemas of both DataFrames must be equal.

        Parameters
        ----------
        other
            Other DataFrame that must be merged
        key
            Key that is sorted.

        Examples
        --------
        >>> df0 = pl.DataFrame(
        ...     {"name": ["steve", "elise", "bob"], "age": [42, 44, 18]}
        ... ).sort("age")
        >>> df0
        shape: (3, 2)
        ┌───────┬─────┐
        │ name  ┆ age │
        │ ---   ┆ --- │
        │ str   ┆ i64 │
        ╞═══════╪═════╡
        │ bob   ┆ 18  │
        │ steve ┆ 42  │
        │ elise ┆ 44  │
        └───────┴─────┘
        >>> df1 = pl.DataFrame(
        ...     {"name": ["anna", "megan", "steve", "thomas"], "age": [21, 33, 42, 20]}
        ... ).sort("age")
        >>> df1
        shape: (4, 2)
        ┌────────┬─────┐
        │ name   ┆ age │
        │ ---    ┆ --- │
        │ str    ┆ i64 │
        ╞════════╪═════╡
        │ thomas ┆ 20  │
        │ anna   ┆ 21  │
        │ megan  ┆ 33  │
        │ steve  ┆ 42  │
        └────────┴─────┘
        >>> df0.merge_sorted(df1, key="age")
        shape: (7, 2)
        ┌────────┬─────┐
        │ name   ┆ age │
        │ ---    ┆ --- │
        │ str    ┆ i64 │
        ╞════════╪═════╡
        │ bob    ┆ 18  │
        │ thomas ┆ 20  │
        │ anna   ┆ 21  │
        │ megan  ┆ 33  │
        │ steve  ┆ 42  │
        │ steve  ┆ 42  │
        │ elise  ┆ 44  │
        └────────┴─────┘

        Notes
        -----
        No guarantee is given over the output row order when the key is equal
        between both dataframes.

        The key must be sorted in ascending order.
        """
        from polars.lazyframe.opt_flags import QueryOptFlags

        require_same_type(self, other)

        # delegate to the lazy implementation, then collect with the eager flags
        merged = self.lazy().merge_sorted(other.lazy(), key)
        return merged.collect(optimizations=QueryOptFlags._eager())
12059

12060
    def set_sorted(
        self,
        column: str,
        *,
        descending: bool = False,
    ) -> DataFrame:
        """
        Mark a column as being sorted.

        This can speed up future operations.

        Parameters
        ----------
        column
            Column that is sorted
        descending
            Whether the column is sorted in descending order.

        Warnings
        --------
        This can lead to incorrect results if the data is NOT sorted!!
        Use with care!

        """
        # NOTE: Only accepts 1 column on purpose! Users may otherwise think the
        # frame is sorted by the combined multicolumn values.
        from polars.lazyframe.opt_flags import QueryOptFlags

        # delegate to the lazy implementation, then collect with the eager flags
        flagged = self.lazy().set_sorted(column, descending=descending)
        return flagged.collect(optimizations=QueryOptFlags._eager())
12093

12094
    @unstable()
12095
    def update(
12096
        self,
12097
        other: DataFrame,
12098
        on: str | Sequence[str] | None = None,
12099
        how: Literal["left", "inner", "full"] = "left",
12100
        *,
12101
        left_on: str | Sequence[str] | None = None,
12102
        right_on: str | Sequence[str] | None = None,
12103
        include_nulls: bool = False,
12104
        maintain_order: MaintainOrderJoin | None = "left",
12105
    ) -> DataFrame:
12106
        """
12107
        Update the values in this `DataFrame` with the values in `other`.
12108

12109
        .. warning::
12110
            This functionality is considered **unstable**. It may be changed
12111
            at any point without it being considered a breaking change.
12112

12113
        Parameters
12114
        ----------
12115
        other
12116
            DataFrame that will be used to update the values
12117
        on
12118
            Column names that will be joined on. If set to `None` (default),
12119
            the implicit row index of each frame is used as a join key.
12120
        how : {'left', 'inner', 'full'}
12121
            * 'left' will keep all rows from the left table; rows may be duplicated
12122
              if multiple rows in the right frame match the left row's key.
12123
            * 'inner' keeps only those rows where the key exists in both frames.
12124
            * 'full' will update existing rows where the key matches while also
12125
              adding any new rows contained in the given frame.
12126
        left_on
12127
           Join column(s) of the left DataFrame.
12128
        right_on
12129
           Join column(s) of the right DataFrame.
12130
        include_nulls
12131
            Overwrite values in the left frame with null values from the right frame.
12132
            If set to `False` (default), null values in the right frame are ignored.
12133
        maintain_order : {'none', 'left', 'right', 'left_right', 'right_left'}
12134
            Which order of rows from the inputs to preserve. See :func:`~DataFrame.join`
12135
            for details. Unlike `join` this function preserves the left order by
12136
            default.
12137

12138
        Notes
12139
        -----
12140
        This is syntactic sugar for a left/inner join that preserves the order
12141
        of the left `DataFrame` by default, with an optional coalesce when
12142
        `include_nulls = False`.
12143

12144
        Examples
12145
        --------
12146
        >>> df = pl.DataFrame(
12147
        ...     {
12148
        ...         "A": [1, 2, 3, 4],
12149
        ...         "B": [400, 500, 600, 700],
12150
        ...     }
12151
        ... )
12152
        >>> df
12153
        shape: (4, 2)
12154
        ┌─────┬─────┐
12155
        │ A   ┆ B   │
12156
        │ --- ┆ --- │
12157
        │ i64 ┆ i64 │
12158
        ╞═════╪═════╡
12159
        │ 1   ┆ 400 │
12160
        │ 2   ┆ 500 │
12161
        │ 3   ┆ 600 │
12162
        │ 4   ┆ 700 │
12163
        └─────┴─────┘
12164
        >>> new_df = pl.DataFrame(
12165
        ...     {
12166
        ...         "B": [-66, None, -99],
12167
        ...         "C": [5, 3, 1],
12168
        ...     }
12169
        ... )
12170

12171
        Update `df` values with the non-null values in `new_df`, by row index:
12172

12173
        >>> df.update(new_df)
12174
        shape: (4, 2)
12175
        ┌─────┬─────┐
12176
        │ A   ┆ B   │
12177
        │ --- ┆ --- │
12178
        │ i64 ┆ i64 │
12179
        ╞═════╪═════╡
12180
        │ 1   ┆ -66 │
12181
        │ 2   ┆ 500 │
12182
        │ 3   ┆ -99 │
12183
        │ 4   ┆ 700 │
12184
        └─────┴─────┘
12185

12186
        Update `df` values with the non-null values in `new_df`, by row index,
12187
        but only keeping those rows that are common to both frames:
12188

12189
        >>> df.update(new_df, how="inner")
12190
        shape: (3, 2)
12191
        ┌─────┬─────┐
12192
        │ A   ┆ B   │
12193
        │ --- ┆ --- │
12194
        │ i64 ┆ i64 │
12195
        ╞═════╪═════╡
12196
        │ 1   ┆ -66 │
12197
        │ 2   ┆ 500 │
12198
        │ 3   ┆ -99 │
12199
        └─────┴─────┘
12200

12201
        Update `df` values with the non-null values in `new_df`, using a full
12202
        outer join strategy that defines explicit join columns in each frame:
12203

12204
        >>> df.update(new_df, left_on=["A"], right_on=["C"], how="full")
12205
        shape: (5, 2)
12206
        ┌─────┬─────┐
12207
        │ A   ┆ B   │
12208
        │ --- ┆ --- │
12209
        │ i64 ┆ i64 │
12210
        ╞═════╪═════╡
12211
        │ 1   ┆ -99 │
12212
        │ 2   ┆ 500 │
12213
        │ 3   ┆ 600 │
12214
        │ 4   ┆ 700 │
12215
        │ 5   ┆ -66 │
12216
        └─────┴─────┘
12217

12218
        Update `df` values including null values in `new_df`, using a full outer
12219
        join strategy that defines explicit join columns in each frame:
12220

12221
        >>> df.update(new_df, left_on="A", right_on="C", how="full", include_nulls=True)
12222
        shape: (5, 2)
12223
        ┌─────┬──────┐
12224
        │ A   ┆ B    │
12225
        │ --- ┆ ---  │
12226
        │ i64 ┆ i64  │
12227
        ╞═════╪══════╡
12228
        │ 1   ┆ -99  │
12229
        │ 2   ┆ 500  │
12230
        │ 3   ┆ null │
12231
        │ 4   ┆ 700  │
12232
        │ 5   ┆ -66  │
12233
        └─────┴──────┘
12234
        """
12235
        from polars.lazyframe.opt_flags import QueryOptFlags
12236

12237
        require_same_type(self, other)
12238
        return (
12239
            self.lazy()
12240
            .update(
12241
                other.lazy(),
12242
                on,
12243
                how,
12244
                left_on=left_on,
12245
                right_on=right_on,
12246
                include_nulls=include_nulls,
12247
                maintain_order=maintain_order,
12248
            )
12249
            .collect(optimizations=QueryOptFlags._eager())
12250
        )
12251

12252
    def count(self) -> DataFrame:
        """
        Count the non-null elements in every column.

        The computation is delegated to the lazy engine (`LazyFrame.count`) and
        collected immediately using the eager optimization flags.

        Examples
        --------
        >>> df = pl.DataFrame(
        ...     {"a": [1, 2, 3, 4], "b": [1, 2, 1, None], "c": [None, None, None, None]}
        ... )
        >>> df.count()
        shape: (1, 3)
        ┌─────┬─────┬─────┐
        │ a   ┆ b   ┆ c   │
        │ --- ┆ --- ┆ --- │
        │ u32 ┆ u32 ┆ u32 │
        ╞═════╪═════╪═════╡
        │ 4   ┆ 3   ┆ 0   │
        └─────┴─────┴─────┘
        """
        from polars.lazyframe.opt_flags import QueryOptFlags

        lazy_counts = self.lazy().count()
        return lazy_counts.collect(optimizations=QueryOptFlags._eager())
12274

12275
    @deprecated(
        "`DataFrame.melt` is deprecated; use `DataFrame.unpivot` instead, with "
        "`index` instead of `id_vars` and `on` instead of `value_vars`"
    )
    def melt(
        self,
        id_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = None,
        value_vars: ColumnNameOrSelector | Sequence[ColumnNameOrSelector] | None = None,
        variable_name: str | None = None,
        value_name: str | None = None,
    ) -> DataFrame:
        """
        Reshape a DataFrame from wide to long format.

        Identifier columns can optionally be left in place.

        Use this to bring a DataFrame into a layout where one or more columns act
        as identifier variables (id_vars) and the remaining columns, the measured
        variables (value_vars), are "unpivoted" onto the row axis so that only two
        non-identifier columns remain: 'variable' and 'value'.

        .. deprecated:: 1.0.0
            Use the :meth:`.unpivot` method instead.

        Parameters
        ----------
        id_vars
            Column(s) or selector(s) to use as identifier variables. Forwarded to
            `unpivot` as `index`.
        value_vars
            Column(s) or selector(s) to use as values variables; if `value_vars`
            is empty all columns that are not in `id_vars` will be used. Forwarded
            to `unpivot` as `on`.
        variable_name
            Name to give to the `variable` column. Defaults to "variable".
        value_name
            Name to give to the `value` column. Defaults to "value".
        """
        # Pure forwarding shim: only the keyword names differ from `unpivot`.
        unpivoted = self.unpivot(
            on=value_vars,
            index=id_vars,
            variable_name=variable_name,
            value_name=value_name,
        )
        return unpivoted
12317

12318
    @unstable()
    def match_to_schema(
        self,
        schema: SchemaDict | Schema,
        *,
        missing_columns: Literal["insert", "raise"]
        | Mapping[str, Literal["insert", "raise"] | Expr] = "raise",
        missing_struct_fields: Literal["insert", "raise"]
        | Mapping[str, Literal["insert", "raise"]] = "raise",
        extra_columns: Literal["ignore", "raise"] = "raise",
        extra_struct_fields: Literal["ignore", "raise"]
        | Mapping[str, Literal["ignore", "raise"]] = "raise",
        integer_cast: Literal["upcast", "forbid"]
        | Mapping[str, Literal["upcast", "forbid"]] = "forbid",
        float_cast: Literal["upcast", "forbid"]
        | Mapping[str, Literal["upcast", "forbid"]] = "forbid",
    ) -> DataFrame:
        """
        Match or evolve the schema of a DataFrame into a specific schema.

        By default, match_to_schema returns an error if the input schema does not
        exactly match the target schema. It also allows columns to be freely reordered,
        with additional coercion rules available through optional parameters.

        .. warning::
            This functionality is considered **unstable**. It may be changed
            at any point without it being considered a breaking change.

        Parameters
        ----------
        schema
            Target schema to match or evolve to.
        missing_columns
            Raise or insert missing columns from the input with respect to the `schema`.

            This can also be an expression per column with what to insert if it is
            missing.
        missing_struct_fields
            Raise or insert missing struct fields from the input with respect to the
            `schema`.
        extra_columns
            Raise or ignore extra columns from the input with respect to the `schema`.
        extra_struct_fields
            Raise or ignore extra struct fields from the input with respect to the
            `schema`.
        integer_cast
            Forbid or upcast for integer columns from the input to the respective column
            in `schema`.
        float_cast
            Forbid or upcast for float columns from the input to the respective column
            in `schema`.

        Examples
        --------
        Ensuring the schema matches

        >>> df = pl.DataFrame({"a": [1, 2, 3], "b": ["A", "B", "C"]})
        >>> df.match_to_schema({"a": pl.Int64, "b": pl.String})
        shape: (3, 2)
        ┌─────┬─────┐
        │ a   ┆ b   │
        │ --- ┆ --- │
        │ i64 ┆ str │
        ╞═════╪═════╡
        │ 1   ┆ A   │
        │ 2   ┆ B   │
        │ 3   ┆ C   │
        └─────┴─────┘
        >>> df.match_to_schema({"a": pl.Int64})  # doctest: +SKIP
        polars.exceptions.SchemaError: extra columns in `match_to_schema`: "b"

        Adding missing columns

        >>> (
        ...     pl.DataFrame({"a": [1, 2, 3]}).match_to_schema(
        ...         {"a": pl.Int64, "b": pl.String},
        ...         missing_columns="insert",
        ...     )
        ... )
        shape: (3, 2)
        ┌─────┬──────┐
        │ a   ┆ b    │
        │ --- ┆ ---  │
        │ i64 ┆ str  │
        ╞═════╪══════╡
        │ 1   ┆ null │
        │ 2   ┆ null │
        │ 3   ┆ null │
        └─────┴──────┘
        >>> (
        ...     pl.DataFrame({"a": [1, 2, 3]}).match_to_schema(
        ...         {"a": pl.Int64, "b": pl.String},
        ...         missing_columns={"b": pl.col.a.cast(pl.String)},
        ...     )
        ... )
        shape: (3, 2)
        ┌─────┬─────┐
        │ a   ┆ b   │
        │ --- ┆ --- │
        │ i64 ┆ str │
        ╞═════╪═════╡
        │ 1   ┆ 1   │
        │ 2   ┆ 2   │
        │ 3   ┆ 3   │
        └─────┴─────┘

        Removing extra columns

        >>> (
        ...     pl.DataFrame({"a": [1, 2, 3], "b": ["A", "B", "C"]}).match_to_schema(
        ...         {"a": pl.Int64},
        ...         extra_columns="ignore",
        ...     )
        ... )
        shape: (3, 1)
        ┌─────┐
        │ a   │
        │ --- │
        │ i64 │
        ╞═════╡
        │ 1   │
        │ 2   │
        │ 3   │
        └─────┘

        Upcasting integers and floats

        >>> (
        ...     pl.DataFrame(
        ...         {"a": [1, 2, 3], "b": [1.0, 2.0, 3.0]},
        ...         schema={"a": pl.Int32, "b": pl.Float32},
        ...     ).match_to_schema(
        ...         {"a": pl.Int64, "b": pl.Float64},
        ...         integer_cast="upcast",
        ...         float_cast="upcast",
        ...     )
        ... )
        shape: (3, 2)
        ┌─────┬─────┐
        │ a   ┆ b   │
        │ --- ┆ --- │
        │ i64 ┆ f64 │
        ╞═════╪═════╡
        │ 1   ┆ 1.0 │
        │ 2   ┆ 2.0 │
        │ 3   ┆ 3.0 │
        └─────┴─────┘
        """
        from polars.lazyframe.opt_flags import QueryOptFlags

        # Build the lazy plan first, then run it with the eager optimization flags.
        matched = self.lazy().match_to_schema(
            schema=schema,
            missing_columns=missing_columns,
            missing_struct_fields=missing_struct_fields,
            extra_columns=extra_columns,
            extra_struct_fields=extra_struct_fields,
            integer_cast=integer_cast,
            float_cast=float_cast,
        )
        return matched.collect(optimizations=QueryOptFlags._eager())
12481

12482
    def _to_metadata(
        self,
        columns: None | str | list[str] = None,
        stats: None | str | list[str] = None,
    ) -> DataFrame:
        """
        Collect the runtime metadata of each column into a DataFrame.

        This is unstable and is meant for debugging purposes.

        Parameters
        ----------
        columns
            Column(s) to show the information for. A single name is treated as a
            one-element list; `None` keeps every column.
        stats
            Statistics to show. A single name is treated as a one-element list;
            `None` keeps every statistic. The `"column_name"` column is always
            part of the selection and is put first when not listed explicitly.
        """
        source = self
        if columns is not None:
            column_names = [columns] if isinstance(columns, str) else columns
            source = source.select(column_names)

        metadata = self._from_pydf(source._df._to_metadata())
        if stats is None:
            return metadata

        stat_names = [stats] if isinstance(stats, str) else stats
        if "column_name" not in stat_names:
            # Concatenate into a new list so the caller's list is left untouched.
            stat_names = ["column_name"] + stat_names

        return metadata.select(stat_names)
12519

12520
    def _row_encode(
        self,
        *,
        unordered: bool = False,
        descending: list[bool] | None = None,
        nulls_last: list[bool] | None = None,
    ) -> Series:
        """
        Row encode all columns of this DataFrame into a single Series.

        This is an internal function not meant for outside consumption and can
        be changed or removed at any point in time.

        fields have order:
        - descending
        - nulls_last
        - no_order
        """
        # One expression covering every column; the options are passed through as-is.
        encoded = F._row_encode(
            F.all(),
            unordered=unordered,
            descending=descending,
            nulls_last=nulls_last,
        )
        return self.select_seq(encoded).to_series()
12546

12547

12548
def _prepare_other_arg(other: Any, length: int | None = None) -> Series:
    """
    Coerce `other` into a Series, optionally sized to `length`.

    Parameters
    ----------
    other
        A Series, which is used as-is, or a value that is wrapped in a
        one-element Series named `""` so that it will broadcast.
    length
        If greater than 1, the Series is extended with `length - 1` copies of the
        original `other`; if 0, an empty slice is returned. For `None` or any
        other value the Series is returned unchanged.

    Raises
    ------
    TypeError
        If `other` is a non-string `Sequence` that is not a Series.
    """
    fill_value = other
    if not isinstance(other, pl.Series):
        # Strings are sequences too, but they count as scalars here.
        if not isinstance(other, str) and isinstance(other, Sequence):
            msg = "operation not supported"
            raise TypeError(msg)
        other = pl.Series("", [other])

    if length is None:
        return other
    if length > 1:
        return other.extend_constant(value=fill_value, n=length - 1)
    if length == 0:
        return other.slice(0, 0)
    return other
12566

12567
Product

Resources

Company