Book a Demo!
CoCalc Logo Icon
Store | Features | Docs | Share | Support | News | About | Policies | Sign Up | Sign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/crates/polars-plan/src/plans/python/pyarrow.rs
6940 views
1
use std::fmt::Write;
2
3
use polars_core::datatypes::AnyValue;
4
use polars_core::prelude::{TimeUnit, TimeZone};
5
6
use crate::prelude::*;
7
8
/// Options that control how a predicate is rendered as a pyarrow expression string.
///
/// The default value has `allow_literal_series` set to `false`.
#[derive(Debug, Default, Copy, Clone)]
pub struct PyarrowArgs {
    // pyarrow doesn't allow `filter([True, False])`
    // but does allow `filter(field("a").isin([True, False]))`
    allow_literal_series: bool,
}
14
15
fn to_py_datetime(v: i64, tu: &TimeUnit, tz: Option<&TimeZone>) -> String {
16
// note: `to_py_datetime` and the `Datetime`
17
// dtype have to be in-scope on the python side
18
match tz {
19
None => format!("to_py_datetime({},'{}')", v, tu.to_ascii()),
20
Some(tz) => format!("to_py_datetime({},'{}',{})", v, tu.to_ascii(), tz),
21
}
22
}
23
24
/// Returns `name` unchanged if every character is alphanumeric, a space, `-` or `_`.
///
/// Any other character (quotes, parentheses, dots, ...) makes the whole string
/// unacceptable and yields `None`. An empty string is accepted.
fn sanitize(name: &str) -> Option<&str> {
    let is_allowed = |ch: char| matches!(ch, ' ' | '-' | '_') || ch.is_alphanumeric();
    name.chars().all(is_allowed).then_some(name)
}
36
37
// convert to a pyarrow expression that can be evaluated with pythons eval
38
pub fn predicate_to_pa(
39
predicate: Node,
40
expr_arena: &Arena<AExpr>,
41
args: PyarrowArgs,
42
) -> Option<String> {
43
match expr_arena.get(predicate) {
44
AExpr::BinaryExpr { left, right, op } => {
45
if op.is_comparison_or_bitwise() {
46
let left = predicate_to_pa(*left, expr_arena, args)?;
47
let right = predicate_to_pa(*right, expr_arena, args)?;
48
Some(format!("({left} {op} {right})"))
49
} else {
50
None
51
}
52
},
53
AExpr::Column(name) => {
54
let name = sanitize(name)?;
55
Some(format!("pa.compute.field('{name}')"))
56
},
57
AExpr::Literal(LiteralValue::Series(s)) => {
58
if !args.allow_literal_series || s.is_empty() || s.len() > 100 {
59
None
60
} else {
61
let mut list_repr = String::with_capacity(s.len() * 5);
62
list_repr.push('[');
63
for av in s.rechunk().iter() {
64
match av {
65
AnyValue::Boolean(v) => {
66
let s = if v { "True" } else { "False" };
67
write!(list_repr, "{s},").unwrap();
68
},
69
#[cfg(feature = "dtype-datetime")]
70
AnyValue::Datetime(v, tu, tz) => {
71
let dtm = to_py_datetime(v, &tu, tz);
72
write!(list_repr, "{dtm},").unwrap();
73
},
74
#[cfg(feature = "dtype-date")]
75
AnyValue::Date(v) => {
76
write!(list_repr, "to_py_date({v}),").unwrap();
77
},
78
AnyValue::String(s) => {
79
let _ = sanitize(s)?;
80
write!(list_repr, "{av},").unwrap();
81
},
82
// Hard to sanitize
83
AnyValue::Binary(_)
84
| AnyValue::Struct(_, _, _)
85
| AnyValue::List(_)
86
| AnyValue::Array(_, _) => return None,
87
_ => {
88
write!(list_repr, "{av},").unwrap();
89
},
90
}
91
}
92
// pop last comma
93
list_repr.pop();
94
list_repr.push(']');
95
Some(list_repr)
96
}
97
},
98
AExpr::Literal(lv) => {
99
let av = lv.to_any_value()?;
100
let dtype = av.dtype();
101
match av.as_borrowed() {
102
AnyValue::String(s) => {
103
let s = sanitize(s)?;
104
Some(format!("'{s}'"))
105
},
106
AnyValue::Boolean(val) => {
107
// python bools are capitalized
108
if val {
109
Some("pa.compute.scalar(True)".to_string())
110
} else {
111
Some("pa.compute.scalar(False)".to_string())
112
}
113
},
114
#[cfg(feature = "dtype-date")]
115
AnyValue::Date(v) => {
116
// the function `to_py_date` and the `Date`
117
// dtype have to be in scope on the python side
118
Some(format!("to_py_date({v})"))
119
},
120
#[cfg(feature = "dtype-datetime")]
121
AnyValue::Datetime(v, tu, tz) => Some(to_py_datetime(v, &tu, tz)),
122
// Hard to sanitize
123
AnyValue::Binary(_)
124
| AnyValue::Struct(_, _, _)
125
| AnyValue::List(_)
126
| AnyValue::Array(_, _) => None,
127
// Activate once pyarrow supports them
128
// #[cfg(feature = "dtype-time")]
129
// AnyValue::Time(v) => {
130
// // the function `to_py_time` has to be in scope
131
// // on the python side
132
// Some(format!("to_py_time(value={v})"))
133
// }
134
// #[cfg(feature = "dtype-duration")]
135
// AnyValue::Duration(v, tu) => {
136
// // the function `to_py_timedelta` has to be in scope
137
// // on the python side
138
// Some(format!(
139
// "to_py_timedelta(value={}, tu='{}')",
140
// v,
141
// tu.to_ascii()
142
// ))
143
// }
144
av => {
145
if dtype.is_float() {
146
let val = av.extract::<f64>()?;
147
Some(format!("{val}"))
148
} else if dtype.is_integer() {
149
let val = av.extract::<i64>()?;
150
Some(format!("{val}"))
151
} else {
152
None
153
}
154
},
155
}
156
},
157
#[cfg(feature = "is_in")]
158
AExpr::Function {
159
function: IRFunctionExpr::Boolean(IRBooleanFunction::IsIn { .. }),
160
input,
161
..
162
} => {
163
let col = predicate_to_pa(input.first()?.node(), expr_arena, args)?;
164
let mut args = args;
165
args.allow_literal_series = true;
166
let values = predicate_to_pa(input.get(1)?.node(), expr_arena, args)?;
167
168
Some(format!("({col}).isin({values})"))
169
},
170
#[cfg(feature = "is_between")]
171
AExpr::Function {
172
function: IRFunctionExpr::Boolean(IRBooleanFunction::IsBetween { closed }),
173
input,
174
..
175
} => {
176
if !matches!(expr_arena.get(input.first()?.node()), AExpr::Column(_)) {
177
None
178
} else {
179
let col = predicate_to_pa(input.first()?.node(), expr_arena, args)?;
180
let left_cmp_op = match closed {
181
ClosedInterval::None | ClosedInterval::Right => Operator::Gt,
182
ClosedInterval::Both | ClosedInterval::Left => Operator::GtEq,
183
};
184
let right_cmp_op = match closed {
185
ClosedInterval::None | ClosedInterval::Left => Operator::Lt,
186
ClosedInterval::Both | ClosedInterval::Right => Operator::LtEq,
187
};
188
189
let lower = predicate_to_pa(input.get(1)?.node(), expr_arena, args)?;
190
let upper = predicate_to_pa(input.get(2)?.node(), expr_arena, args)?;
191
192
Some(format!(
193
"(({col} {left_cmp_op} {lower}) & ({col} {right_cmp_op} {upper}))"
194
))
195
}
196
},
197
AExpr::Function {
198
function, input, ..
199
} => {
200
let input = input.first().unwrap().node();
201
let input = predicate_to_pa(input, expr_arena, args)?;
202
203
match function {
204
IRFunctionExpr::Boolean(IRBooleanFunction::Not) => Some(format!("~({input})")),
205
IRFunctionExpr::Boolean(IRBooleanFunction::IsNull) => {
206
Some(format!("({input}).is_null()"))
207
},
208
IRFunctionExpr::Boolean(IRBooleanFunction::IsNotNull) => {
209
Some(format!("~({input}).is_null()"))
210
},
211
_ => None,
212
}
213
},
214
_ => None,
215
}
216
}
217
218