# Path: blob/main/docs/source/src/python/user-guide/expressions/user-defined-functions.py
# 7890 views
# --8<-- [start:setup]

from numba import float64, guvectorize, int64
import numpy as np
import math
import warnings

import polars as pl
from polars.exceptions import PolarsInefficientMapWarning

# The examples below use Python UDFs on purpose, so silence the warning that
# Polars would otherwise emit about them.
warnings.simplefilter("ignore", PolarsInefficientMapWarning)
# --8<-- [end:setup]

# --8<-- [start:dataframe]
df = pl.DataFrame(
    {
        "keys": ["a", "a", "b", "b"],
        "values": [10, 7, 1, 23],
    }
)
print(df)
# --8<-- [end:dataframe]

# --8<-- [start:individual_log]


def my_log(value):
    """Return the natural logarithm of a single value."""
    return math.log(value)


out = df.select(pl.col("values").map_elements(my_log, return_dtype=pl.Float64))
print(out)
# --8<-- [end:individual_log]


# --8<-- [start:diff_from_mean]
def diff_from_mean(series):
    """Return a new Series with each value's difference from the mean."""
    # This will be very slow for non-trivial Series, since it's all Python
    # code:
    total = 0
    for value in series:
        total += value
    mean = total / len(series)
    return pl.Series([value - mean for value in series])


# Apply our custom function to a full Series with map_batches():
out = df.select(pl.col("values").map_batches(diff_from_mean, return_dtype=pl.Float64))
print("== select() with UDF ==")
print(out)

# Apply our custom function per group:
print("== group_by() with UDF ==")
out = df.group_by("keys").agg(
    pl.col("values").map_batches(diff_from_mean, return_dtype=pl.Float64)
)
print(out)
# --8<-- [end:diff_from_mean]

# --8<-- [start:np_log]

out = df.select(pl.col("values").map_batches(np.log, return_dtype=pl.Float64))
print(out)
# --8<-- [end:np_log]

# --8<-- [start:diff_from_mean_numba]


# This will be compiled to machine code, so it will be fast. The Series is
# converted to a NumPy array before being passed to the function. See the
# Numba documentation for more details:
# https://numba.readthedocs.io/en/stable/user/vectorize.html
@guvectorize([(int64[:], float64[:])], "(n)->(n)")
def diff_from_mean_numba(arr, result):
    # The output is written into the preallocated ``result`` array rather
    # than returned.
    total = 0
    for value in arr:
        total += value
    mean = total / len(arr)
    for i, value in enumerate(arr):
        result[i] = value - mean


out = df.select(
    pl.col("values").map_batches(diff_from_mean_numba, return_dtype=pl.Float64)
)
print("== select() with UDF ==")
print(out)

out = df.group_by("keys").agg(
    pl.col("values").map_batches(diff_from_mean_numba, return_dtype=pl.Float64)
)
print("== group_by() with UDF ==")
print(out)
# --8<-- [end:diff_from_mean_numba]


# --8<-- [start:combine]
# Add two arrays together:
@guvectorize([(int64[:], int64[:], float64[:])], "(n),(n)->(n)")
def add(arr, arr2, result):
    # Element-wise sum of the two inputs, stored into ``result``.
    for i in range(len(arr)):
        result[i] = arr[i] + arr2[i]


df3 = pl.DataFrame({"values_1": [1, 2, 3], "values_2": [10, 20, 30]})

out = df3.select(
    # Create a struct that has two columns in it:
    pl.struct(["values_1", "values_2"])
    # Pass the struct to a lambda that then passes the individual columns to
    # the add() function:
    .map_batches(
        lambda combined: add(
            combined.struct.field("values_1"), combined.struct.field("values_2")
        ),
        return_dtype=pl.Float64,
    )
    .alias("add_columns")
)
print(out)
# --8<-- [end:combine]