CoCalc -- user-defined-functions.py

GitHub Repository: pola-rs/polars
Path: blob/main/docs/source/src/python/user-guide/expressions/user-defined-functions.py
⁷⁸⁹⁰ views
1
# --8<-- [start:setup]
2

3
from numba import float64, guvectorize, int64
4
import numpy as np
5
import math
6
import warnings
7

8
import polars as pl
9
from polars.exceptions import PolarsInefficientMapWarning
10

11
# Suppress PolarsInefficientMapWarning for the rest of this script.
# NOTE(review): presumably needed because the examples below call
# map_elements() on purpose for demonstration -- confirm.
warnings.simplefilter("ignore", PolarsInefficientMapWarning)
12
# --8<-- [end:setup]
13

14
# --8<-- [start:dataframe]
15
# Example frame used by the snippets below: a string "keys" column (two
# groups, "a" and "b") and an integer "values" column.
df = pl.DataFrame(
    {
        "keys": ["a", "a", "b", "b"],
        "values": [10, 7, 1, 23],
    }
)
print(df)
22
# --8<-- [end:dataframe]
23

24
# --8<-- [start:individual_log]
25

26

27
def my_log(value):
    """Return the natural logarithm of a single number.

    Thin wrapper around ``math.log`` so it can be handed to
    ``map_elements()``, which calls it once per element.

    Raises:
        ValueError: propagated from ``math.log`` when ``value`` is zero or
            negative.
    """
    return math.log(value)
29

30

31
# Call my_log() once per element of the "values" column and collect the
# results as a Float64 column.
out = df.select(
    pl.col("values").map_elements(my_log, return_dtype=pl.Float64),
)
print(out)
33
# --8<-- [end:individual_log]
34

35

36
# --8<-- [start:diff_from_mean]
37
def diff_from_mean(series):
    """Return a new Series with each value's difference from the mean.

    The mean is accumulated with an explicit Python loop and the result is
    built element by element, so this is intentionally the slow, pure-Python
    way of doing it.

    Raises:
        ZeroDivisionError: if ``series`` is empty (the length is the divisor).
    """
    # This will be very slow for non-trivial Series, since it's all Python
    # code:
    total = 0
    for value in series:
        total += value
    mean = total / len(series)
    return pl.Series([value - mean for value in series])
45

46

47
# Apply our custom function to a full Series with map_batches():
out = df.select(pl.col("values").map_batches(diff_from_mean, return_dtype=pl.Float64))
print("== select() with UDF ==")
print(out)

# Apply our custom function per group:
print("== group_by() with UDF ==")
out = df.group_by("keys").agg(
    pl.col("values").map_batches(diff_from_mean, return_dtype=pl.Float64)
)
print(out)
58
# --8<-- [end:diff_from_mean]
59

60
# --8<-- [start:np_log]
61

62
# Hand the whole "values" column to NumPy's log in a single call through
# map_batches(), instead of calling a Python function per element.
out = df.select(
    pl.col("values").map_batches(np.log, return_dtype=pl.Float64),
)
print(out)
64
# --8<-- [end:np_log]
65

66
# --8<-- [start:diff_from_mean_numba]
67

68

69
# This will be compiled to machine code, so it will be fast. The Series is
70
# converted to a NumPy array before being passed to the function. See the
71
# Numba documentation for more details:
72
# https://numba.readthedocs.io/en/stable/user/vectorize.html
73
@guvectorize([(int64[:], float64[:])], "(n)->(n)")
def diff_from_mean_numba(arr, result):
    # Signature: one int64 input array and one float64 output array of the
    # same length "(n)->(n)". The output buffer arrives as the last
    # argument, so values are written into `result` instead of returned.
    total = 0
    for value in arr:
        total += value
    mean = total / len(arr)
    # Store each element's difference from the mean in the output buffer.
    for i, value in enumerate(arr):
        result[i] = value - mean
81

82

83
# Apply the Numba-compiled function to the full "values" column:
out = df.select(
    pl.col("values").map_batches(diff_from_mean_numba, return_dtype=pl.Float64)
)
print("== select() with UDF ==")
print(out)

# Apply the same function separately to each "keys" group:
out = df.group_by("keys").agg(
    pl.col("values").map_batches(diff_from_mean_numba, return_dtype=pl.Float64)
)
print("== group_by() with UDF ==")
print(out)
94
# --8<-- [end:diff_from_mean_numba]
95

96

97
# --8<-- [start:combine]
98
# Add two arrays together:
99
@guvectorize([(int64[:], int64[:], float64[:])], "(n),(n)->(n)")
def add(arr, arr2, result):
    # Signature: two int64 input arrays of equal length and one float64
    # output array of that length. The element-wise sums are written into
    # `result`, the output buffer passed as the last argument.
    for i in range(len(arr)):
        result[i] = arr[i] + arr2[i]
103

104

105
df3 = pl.DataFrame({"values_1": [1, 2, 3], "values_2": [10, 20, 30]})

out = df3.select(
    # Create a struct that has two columns in it:
    pl.struct(["values_1", "values_2"])
    # Pass the struct to a lambda that then passes the individual columns to
    # the add() function:
    .map_batches(
        lambda combined: add(
            combined.struct.field("values_1"), combined.struct.field("values_2")
        ),
        return_dtype=pl.Float64,
    )
    .alias("add_columns")
)
print(out)
121
# --8<-- [end:combine]
122

123
Product

Resources

Company