Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/crates/polars-plan/src/plans/python/pyarrow.rs
8384 views
1
use std::fmt::Write;
2
3
use polars_core::datatypes::AnyValue;
4
use polars_core::prelude::{TimeUnit, TimeZone};
5
6
use crate::prelude::*;
7
8
/// Options that steer how `predicate_to_pa` renders an expression.
#[derive(Clone, Copy, Default)]
pub struct PyarrowArgs {
    /// Whether a literal `Series` may be rendered as a Python list.
    ///
    /// pyarrow doesn't allow `filter([True, False])`,
    /// but it does allow `filter(field("a").isin([True, False]))`,
    /// so this is only switched on while rendering the values of an `is_in`.
    allow_literal_series: bool,
}
14
15
fn to_py_datetime(v: i64, tu: &TimeUnit, tz: Option<&TimeZone>) -> String {
16
// note: `to_py_datetime` and the `Datetime`
17
// dtype have to be in-scope on the python side
18
match tz {
19
None => format!("to_py_datetime({},'{}')", v, tu.to_ascii()),
20
Some(tz) => format!("to_py_datetime({},'{}','{}')", v, tu.to_ascii(), tz),
21
}
22
}
23
24
/// Returns `name` unchanged when it is safe to embed in a generated Python
/// expression, and `None` otherwise.
///
/// A name is accepted when every character is alphanumeric, a space, `-` or
/// `_`; in particular quotes, parentheses and backslashes are rejected. The
/// empty string is accepted.
fn sanitize(name: &str) -> Option<&str> {
    let is_allowed = |c: char| matches!(c, ' ' | '-' | '_') || c.is_alphanumeric();
    name.chars().all(is_allowed).then_some(name)
}
36
37
// convert to a pyarrow expression that can be evaluated with pythons eval
38
pub fn predicate_to_pa(
39
predicate: Node,
40
expr_arena: &Arena<AExpr>,
41
args: PyarrowArgs,
42
) -> Option<String> {
43
match expr_arena.get(predicate) {
44
AExpr::BinaryExpr { left, right, op } => {
45
if op.is_comparison_or_bitwise() {
46
let left = predicate_to_pa(*left, expr_arena, args)?;
47
let right = predicate_to_pa(*right, expr_arena, args)?;
48
Some(format!("({left} {op} {right})"))
49
} else {
50
None
51
}
52
},
53
AExpr::Column(name) => {
54
let name = sanitize(name)?;
55
Some(format!("pa.compute.field('{name}')"))
56
},
57
AExpr::Literal(LiteralValue::Series(s)) => {
58
if !args.allow_literal_series || s.is_empty() || s.len() > 100 {
59
None
60
} else {
61
let mut list_repr = String::with_capacity(s.len() * 5);
62
list_repr.push('[');
63
for av in s.iter() {
64
match av {
65
AnyValue::Boolean(v) => {
66
let s = if v { "True" } else { "False" };
67
write!(list_repr, "{s},").unwrap();
68
},
69
#[cfg(feature = "dtype-datetime")]
70
AnyValue::Datetime(v, tu, tz) => {
71
let dtm = to_py_datetime(v, &tu, tz);
72
write!(list_repr, "{dtm},").unwrap();
73
},
74
#[cfg(feature = "dtype-date")]
75
AnyValue::Date(v) => {
76
write!(list_repr, "to_py_date({v}),").unwrap();
77
},
78
AnyValue::String(s) => {
79
let _ = sanitize(s)?;
80
write!(list_repr, "{av},").unwrap();
81
},
82
// Hard to sanitize
83
AnyValue::Binary(_) | AnyValue::List(_) => return None,
84
#[cfg(feature = "dtype-array")]
85
AnyValue::Array(_, _) => return None,
86
#[cfg(feature = "dtype-struct")]
87
AnyValue::Struct(_, _, _) => return None,
88
_ => {
89
write!(list_repr, "{av},").unwrap();
90
},
91
}
92
}
93
// pop last comma
94
list_repr.pop();
95
list_repr.push(']');
96
Some(list_repr)
97
}
98
},
99
AExpr::Literal(lv) => {
100
let av = lv.to_any_value()?;
101
let dtype = av.dtype();
102
match av.as_borrowed() {
103
AnyValue::String(s) => {
104
let s = sanitize(s)?;
105
Some(format!("'{s}'"))
106
},
107
AnyValue::Boolean(val) => {
108
// python bools are capitalized
109
if val {
110
Some("pa.compute.scalar(True)".to_string())
111
} else {
112
Some("pa.compute.scalar(False)".to_string())
113
}
114
},
115
#[cfg(feature = "dtype-date")]
116
AnyValue::Date(v) => {
117
// the function `to_py_date` and the `Date`
118
// dtype have to be in scope on the python side
119
Some(format!("to_py_date({v})"))
120
},
121
#[cfg(feature = "dtype-datetime")]
122
AnyValue::Datetime(v, tu, tz) => Some(to_py_datetime(v, &tu, tz)),
123
// Hard to sanitize
124
AnyValue::Binary(_) | AnyValue::List(_) => None,
125
#[cfg(feature = "dtype-array")]
126
AnyValue::Array(_, _) => None,
127
#[cfg(feature = "dtype-struct")]
128
AnyValue::Struct(_, _, _) => None,
129
// Activate once pyarrow supports them
130
// #[cfg(feature = "dtype-time")]
131
// AnyValue::Time(v) => {
132
// // the function `to_py_time` has to be in scope
133
// // on the python side
134
// Some(format!("to_py_time(value={v})"))
135
// }
136
// #[cfg(feature = "dtype-duration")]
137
// AnyValue::Duration(v, tu) => {
138
// // the function `to_py_timedelta` has to be in scope
139
// // on the python side
140
// Some(format!(
141
// "to_py_timedelta(value={}, tu='{}')",
142
// v,
143
// tu.to_ascii()
144
// ))
145
// }
146
av => {
147
if dtype.is_float() {
148
let val = av.extract::<f64>()?;
149
Some(format!("{val}"))
150
} else if dtype.is_integer() {
151
let val = av.extract::<i64>()?;
152
Some(format!("{val}"))
153
} else {
154
None
155
}
156
},
157
}
158
},
159
#[cfg(feature = "is_in")]
160
AExpr::Function {
161
function: IRFunctionExpr::Boolean(IRBooleanFunction::IsIn { .. }),
162
input,
163
..
164
} => {
165
let col = predicate_to_pa(input.first()?.node(), expr_arena, args)?;
166
let mut args = args;
167
args.allow_literal_series = true;
168
let values = predicate_to_pa(input.get(1)?.node(), expr_arena, args)?;
169
170
Some(format!("({col}).isin({values})"))
171
},
172
#[cfg(feature = "is_between")]
173
AExpr::Function {
174
function: IRFunctionExpr::Boolean(IRBooleanFunction::IsBetween { closed }),
175
input,
176
..
177
} => {
178
if !matches!(expr_arena.get(input.first()?.node()), AExpr::Column(_)) {
179
None
180
} else {
181
let col = predicate_to_pa(input.first()?.node(), expr_arena, args)?;
182
let left_cmp_op = match closed {
183
ClosedInterval::None | ClosedInterval::Right => Operator::Gt,
184
ClosedInterval::Both | ClosedInterval::Left => Operator::GtEq,
185
};
186
let right_cmp_op = match closed {
187
ClosedInterval::None | ClosedInterval::Left => Operator::Lt,
188
ClosedInterval::Both | ClosedInterval::Right => Operator::LtEq,
189
};
190
191
let lower = predicate_to_pa(input.get(1)?.node(), expr_arena, args)?;
192
let upper = predicate_to_pa(input.get(2)?.node(), expr_arena, args)?;
193
194
Some(format!(
195
"(({col} {left_cmp_op} {lower}) & ({col} {right_cmp_op} {upper}))"
196
))
197
}
198
},
199
AExpr::Function {
200
function, input, ..
201
} => {
202
let input = input.first().unwrap().node();
203
let input = predicate_to_pa(input, expr_arena, args)?;
204
205
match function {
206
IRFunctionExpr::Boolean(IRBooleanFunction::Not) => Some(format!("~({input})")),
207
IRFunctionExpr::Boolean(IRBooleanFunction::IsNull) => {
208
Some(format!("({input}).is_null()"))
209
},
210
IRFunctionExpr::Boolean(IRBooleanFunction::IsNotNull) => {
211
Some(format!("~({input}).is_null()"))
212
},
213
_ => None,
214
}
215
},
216
_ => None,
217
}
218
}
219
220