Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/docs/source/src/python/user-guide/expressions/user-defined-functions.py
7890 views
1
# --8<-- [start:setup]
2
3
from numba import float64, guvectorize, int64
4
import numpy as np
5
import math
6
import warnings
7
8
import polars as pl
9
from polars.exceptions import PolarsInefficientMapWarning
10
11
# Ignore every PolarsInefficientMapWarning raised from here on.
# NOTE(review): presumably needed because the examples in this file call
# Python UDFs through map_elements()/map_batches() on purpose -- confirm.
warnings.simplefilter("ignore", PolarsInefficientMapWarning)
12
# --8<-- [end:setup]
13
14
# --8<-- [start:dataframe]
15
# Example frame: a string column "keys" (two rows per key) and an integer
# column "values".
df = pl.DataFrame(
    data={"keys": ["a", "a", "b", "b"], "values": [10, 7, 1, 23]},
)
print(df)
22
# --8<-- [end:dataframe]
23
24
# --8<-- [start:individual_log]
25
26
27
def my_log(value):
28
return math.log(value)
29
30
31
# Call my_log() on each element of the "values" column; the output column is
# declared as Float64.
log_per_element = pl.col("values").map_elements(my_log, return_dtype=pl.Float64)
out = df.select(log_per_element)
print(out)
33
# --8<-- [end:individual_log]
34
35
36
# --8<-- [start:diff_from_mean]
37
def diff_from_mean(series):
    """Return every value of ``series`` minus the mean of ``series``.

    Args:
        series: A sized iterable of numbers (the callers in this file pass a
            Polars Series via ``map_batches``).

    Returns:
        A new ``pl.Series`` with one entry per input value. For empty input
        an empty Float64 Series is returned instead of dividing by zero.
    """
    count = len(series)
    if count == 0:
        # No values means no mean; return an empty result whose dtype matches
        # the Float64 return_dtype the callers declare.
        return pl.Series(values=[], dtype=pl.Float64)

    # This will be very slow for non-trivial Series, since it's all Python
    # code:
    total = 0
    for value in series:
        total += value
    mean = total / count
    return pl.Series([value - mean for value in series])
45
46
47
# Both queries use the same expression: map_batches() hands a full Series to
# diff_from_mean(), and the result is declared as Float64.
centered = pl.col("values").map_batches(diff_from_mean, return_dtype=pl.Float64)

# Whole column at once:
out = df.select(centered)
print("== select() with UDF ==")
print(out)

# Once per "keys" group:
print("== group_by() with UDF ==")
out = df.group_by("keys").agg(centered)
print(out)
58
# --8<-- [end:diff_from_mean]
59
60
# --8<-- [start:np_log]
61
62
# Hand the whole "values" Series to NumPy's log in a single call; the output
# column is declared as Float64.
log_via_numpy = pl.col("values").map_batches(np.log, return_dtype=pl.Float64)
out = df.select(log_via_numpy)
print(out)
64
# --8<-- [end:np_log]
65
66
# --8<-- [start:diff_from_mean_numba]
67
68
69
# Numba compiles this function to machine code, which makes it fast. The
# Series is turned into a NumPy array before it reaches the function. More
# details are in the Numba documentation:
# https://numba.readthedocs.io/en/stable/user/vectorize.html
@guvectorize([(int64[:], float64[:])], "(n)->(n)")
def diff_from_mean_numba(arr, result):
    """Write ``arr[i] - mean(arr)`` into ``result[i]`` for every index."""
    size = len(arr)

    # First pass: sum the input.
    running_sum = 0
    for idx in range(size):
        running_sum += arr[idx]
    mean = running_sum / size

    # Second pass: fill the output array in place.
    for idx in range(size):
        result[idx] = arr[idx] - mean
81
82
83
# Build the expression once and reuse it for both queries.
centered_numba = pl.col("values").map_batches(
    diff_from_mean_numba, return_dtype=pl.Float64
)

# Whole column at once:
out = df.select(centered_numba)
print("== select() with UDF ==")
print(out)

# Once per "keys" group:
out = df.group_by("keys").agg(centered_numba)
print("== group_by() with UDF ==")
print(out)
94
# --8<-- [end:diff_from_mean_numba]
95
96
97
# --8<-- [start:combine]
98
# Element-wise sum of two arrays:
@guvectorize([(int64[:], int64[:], float64[:])], "(n),(n)->(n)")
def add(arr, arr2, result):
    """Store ``arr[i] + arr2[i]`` in ``result[i]`` for every index of ``arr``."""
    for idx, left in enumerate(arr):
        result[idx] = left + arr2[idx]
103
104
105
df3 = pl.DataFrame({"values_1": [1, 2, 3], "values_2": [10, 20, 30]})


def _add_struct_fields(combined):
    """Pull both fields out of the struct Series and pass them to add()."""
    first = combined.struct.field("values_1")
    second = combined.struct.field("values_2")
    return add(first, second)


# Create a struct that has two columns in it, so that a single map_batches()
# call receives both of them:
both_columns = pl.struct(["values_1", "values_2"])

# Pass the struct to the helper, which forwards the individual columns to the
# add() function:
summed = both_columns.map_batches(_add_struct_fields, return_dtype=pl.Float64)

out = df3.select(summed.alias("add_columns"))
print(out)
121
# --8<-- [end:combine]
122
123