Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/crates/polars-time/src/chunkedarray/string/mod.rs
6940 views
1
pub mod infer;
2
use chrono::DateTime;
3
mod patterns;
4
mod strptime;
5
pub use patterns::Pattern;
6
#[cfg(feature = "dtype-time")]
7
use polars_core::chunked_array::temporal::time_to_time64ns;
8
use polars_core::prelude::arity::unary_elementwise;
9
use polars_utils::cache::LruCachedFunc;
10
11
use super::*;
12
#[cfg(feature = "dtype-date")]
13
use crate::chunkedarray::date::naive_date_to_date;
14
use crate::prelude::string::strptime::StrpTimeState;
15
16
#[cfg(feature = "dtype-time")]
/// Finds the first built-in time format that successfully parses `val`.
///
/// `convert` is invoked as `convert(val, fmt)` for every format in
/// `patterns::TIME_H_M_S`, in declaration order, and must return a
/// `chrono::ParseResult`. The first format for which it returns `Ok` is
/// returned; `None` means no format in the list accepted `val`.
fn time_pattern<F, K>(val: &str, convert: F) -> Option<&'static str>
where
    F: Fn(&str, &str) -> chrono::ParseResult<K>,
{
    // One pass over the list is sufficient: trying the same formats a second
    // time cannot find a match that the first pass rejected.
    patterns::TIME_H_M_S
        .iter()
        .find(|fmt| convert(val, fmt).is_ok())
        .copied()
}
28
29
fn datetime_pattern<F, K>(val: &str, convert: F) -> Option<&'static str>
30
// (string, fmt) -> PolarsResult
31
where
32
F: Fn(&str, &str) -> chrono::ParseResult<K>,
33
{
34
patterns::DATETIME_Y_M_D
35
.iter()
36
.chain(patterns::DATETIME_D_M_Y)
37
.find(|fmt| convert(val, fmt).is_ok())
38
.copied()
39
}
40
41
fn date_pattern<F, K>(val: &str, convert: F) -> Option<&'static str>
42
// (string, fmt) -> PolarsResult
43
where
44
F: Fn(&str, &str) -> chrono::ParseResult<K>,
45
{
46
patterns::DATE_Y_M_D
47
.iter()
48
.chain(patterns::DATE_D_M_Y)
49
.find(|fmt| convert(val, fmt).is_ok())
50
.copied()
51
}
52
53
fn get_first_val(ca: &StringChunked) -> PolarsResult<&str> {
54
let idx = ca.first_non_null().ok_or_else(|| {
55
polars_err!(ComputeError:
56
"unable to determine date parsing format, all values are null",
57
)
58
})?;
59
Ok(ca.get(idx).expect("should not be null"))
60
}
61
62
#[cfg(feature = "dtype-datetime")]
/// Infers a datetime format from the first non-null value of `ca_string`.
///
/// The value is first matched against the datetime patterns using
/// `NaiveDateTime::parse_from_str`; if none fits, the same patterns are
/// retried with `NaiveDate::parse_from_str`.
///
/// # Errors
/// Fails when all values are null or when no pattern parses the value.
fn sniff_fmt_datetime(ca_string: &StringChunked) -> PolarsResult<&'static str> {
    let first = get_first_val(ca_string)?;
    if let Some(fmt) = datetime_pattern(first, NaiveDateTime::parse_from_str) {
        return Ok(fmt);
    }
    if let Some(fmt) = datetime_pattern(first, NaiveDate::parse_from_str) {
        return Ok(fmt);
    }
    Err(polars_err!(parse_fmt_idk = "datetime"))
}
69
70
#[cfg(feature = "dtype-date")]
/// Infers a date format from the first non-null value of `ca_string`.
///
/// # Errors
/// Fails when all values are null or when no date pattern parses the value.
fn sniff_fmt_date(ca_string: &StringChunked) -> PolarsResult<&'static str> {
    let first = get_first_val(ca_string)?;
    match date_pattern(first, NaiveDate::parse_from_str) {
        Some(fmt) => Ok(fmt),
        None => Err(polars_err!(parse_fmt_idk = "date")),
    }
}
75
76
#[cfg(feature = "dtype-time")]
/// Infers a time format from the first non-null value of `ca_string`.
///
/// # Errors
/// Fails when all values are null or when no time pattern parses the value.
fn sniff_fmt_time(ca_string: &StringChunked) -> PolarsResult<&'static str> {
    let first = get_first_val(ca_string)?;
    match time_pattern(first, NaiveTime::parse_from_str) {
        Some(fmt) => Ok(fmt),
        None => Err(polars_err!(parse_fmt_idk = "time")),
    }
}
81
82
pub trait StringMethods: AsString {
83
#[cfg(feature = "dtype-time")]
/// Parses the string values and returns a [`TimeChunked`].
///
/// When `fmt` is `None`, the format is inferred from the first non-null
/// value. A value that does not parse with the format maps to `None`.
/// `use_cache` is forwarded to the `LruCachedFunc` wrapper, but only when the
/// array holds more than 50 values.
///
/// # Errors
/// Fails when `fmt` is `None` and no format can be inferred.
fn as_time(&self, fmt: Option<&str>, use_cache: bool) -> PolarsResult<TimeChunked> {
    let string_ca = self.as_string();
    let fmt = if let Some(user_fmt) = fmt {
        user_fmt
    } else {
        sniff_fmt_time(string_ca)?
    };
    let n_values = string_ca.len();
    let cache_enabled = use_cache && n_values > 50;
    let cache_size = (n_values as f64).sqrt() as usize;

    let mut convert = LruCachedFunc::new(
        |s| {
            NaiveTime::parse_from_str(s, fmt)
                .ok()
                .map(|naive_time| time_to_time64ns(&naive_time))
        },
        cache_size,
    );
    let parsed = unary_elementwise(string_ca, |opt_s| convert.eval(opt_s?, cache_enabled));
    Ok(parsed.with_name(string_ca.name().clone()).into_time())
}
103
104
#[cfg(feature = "dtype-date")]
/// Parses the string values and returns a [`DateChunked`].
///
/// Unlike `as_date`, the match does not have to span the whole string,
/// e.g. "foo-2021-01-01-bar" can match "2021-01-01". Each value is scanned
/// from left to right and the first position at which `fmt` parses wins;
/// whatever follows the match is ignored. Values with no match map to `None`.
///
/// # Errors
/// Fails when `fmt` is `None` and no format can be inferred from the first
/// non-null value.
fn as_date_not_exact(&self, fmt: Option<&str>) -> PolarsResult<DateChunked> {
    let string_ca = self.as_string();
    let fmt = if let Some(user_fmt) = fmt {
        user_fmt
    } else {
        sniff_fmt_date(string_ca)?
    };
    let parsed = unary_elementwise(string_ca, |opt_s| {
        let haystack = opt_s?;
        // Try every suffix that starts on a character boundary, left to right.
        haystack.char_indices().find_map(|(start, _)| {
            NaiveDate::parse_and_remainder(&haystack[start..], fmt)
                .ok()
                .map(|(date, _rest)| naive_date_to_date(date))
        })
    });
    Ok(parsed.with_name(string_ca.name().clone()).into_date())
}
131
132
#[cfg(feature = "dtype-datetime")]
/// Parses the string values and returns a [`DatetimeChunked`].
///
/// Unlike `as_datetime`, the match does not have to span the whole string,
/// e.g. "foo-2021-01-01-bar" can match "2021-01-01". Each value is scanned
/// from left to right and the first position at which `fmt` parses wins;
/// values with no match map to `None`.
///
/// * `fmt` - format to parse with; inferred from the first non-null value
///   when `None`.
/// * `tu` - selects the timestamp conversion (ns / us / ms) and is the time
///   unit of the result.
/// * `tz_aware` - when `true`, values are parsed with
///   `DateTime::parse_and_remainder` and converted through `naive_utc()`;
///   otherwise `NaiveDateTime::parse_and_remainder` is used.
/// * `tz` - with the `timezones` feature: for `tz_aware` input the result is
///   tagged with `tz` (or `TimeZone::UTC` when `None`); for naive input the
///   result goes through `replace_time_zone` with `tz`.
/// * `_ambiguous` - forwarded to `replace_time_zone`.
/// * `ensure_matching_tz` - see the parameter comment below.
///
/// # Errors
/// Fails when no format can be inferred, when the time-zone check below is
/// violated (`to_datetime_tz_mismatch`), or when `replace_time_zone` fails.
fn as_datetime_not_exact(
    &self,
    fmt: Option<&str>,
    tu: TimeUnit,
    tz_aware: bool,
    tz: Option<&TimeZone>,
    _ambiguous: &StringChunked,
    // Ensure that the inferred time_zone matches the given time_zone.
    ensure_matching_tz: bool,
) -> PolarsResult<DatetimeChunked> {
    let string_ca = self.as_string();
    let had_format = fmt.is_some();
    let fmt = match fmt {
        Some(fmt) => fmt,
        None => sniff_fmt_datetime(string_ca)?,
    };

    // This check only depends on the arguments, so run it before the
    // per-value parsing: an invalid call fails without parsing the column.
    polars_ensure!(
        !ensure_matching_tz || had_format || !(tz_aware && tz.is_none()),
        to_datetime_tz_mismatch
    );

    let func = match tu {
        TimeUnit::Nanoseconds => datetime_to_timestamp_ns,
        TimeUnit::Microseconds => datetime_to_timestamp_us,
        TimeUnit::Milliseconds => datetime_to_timestamp_ms,
    };

    let ca = unary_elementwise(string_ca, |opt_s| {
        let haystack = opt_s?;
        // Try every suffix that starts on a character boundary, left to right.
        haystack.char_indices().find_map(|(start, _)| {
            let tail = &haystack[start..];
            if tz_aware {
                DateTime::parse_and_remainder(tail, fmt)
                    .map(|(dt, _rest)| func(dt.naive_utc()))
                    .ok()
            } else {
                NaiveDateTime::parse_and_remainder(tail, fmt)
                    .map(|(ndt, _rest)| func(ndt))
                    .ok()
            }
        })
    })
    .with_name(string_ca.name().clone());

    match (tz_aware, tz) {
        #[cfg(feature = "timezones")]
        (false, Some(tz)) => polars_ops::prelude::replace_time_zone(
            &ca.into_datetime(tu, None),
            Some(tz),
            _ambiguous,
            NonExistent::Raise,
        ),
        #[cfg(feature = "timezones")]
        (true, tz) => Ok(ca.into_datetime(tu, Some(tz.cloned().unwrap_or(TimeZone::UTC)))),
        _ => Ok(ca.into_datetime(tu, None)),
    }
}
198
199
#[cfg(feature = "dtype-date")]
/// Parses the string values and returns a [`DateChunked`].
///
/// When `fmt` is `None`, parsing is delegated to `infer::to_date`. Otherwise
/// the format is compiled with `strptime::compile_fmt`; if
/// `strptime::fmt_len` reports a length for it, the `StrpTimeState` parser is
/// tried first, with chrono as the fallback. Values that do not parse map to
/// `None`. `use_cache` is forwarded to the `LruCachedFunc` wrapper, but only
/// when the array holds more than 50 values.
///
/// # Errors
/// Propagates errors from `infer::to_date` and `strptime::compile_fmt`.
fn as_date(&self, fmt: Option<&str>, use_cache: bool) -> PolarsResult<DateChunked> {
    let string_ca = self.as_string();
    let fmt = if let Some(user_fmt) = fmt {
        user_fmt
    } else {
        return infer::to_date(string_ca);
    };
    let n_values = string_ca.len();
    let use_cache = use_cache && n_values > 50;
    let cache_size = (n_values as f64).sqrt() as usize;
    let fmt = strptime::compile_fmt(fmt)?;

    let ca = match strptime::fmt_len(fmt.as_bytes()) {
        // We can use the fast parser.
        Some(fmt_len) => {
            let mut strptime_state = StrpTimeState::default();
            let mut convert = LruCachedFunc::new(
                |s: &str| {
                    // SAFETY: fmt_len is correct, it was computed with this `fmt` str.
                    let fast =
                        unsafe { strptime_state.parse(s.as_bytes(), fmt.as_bytes(), fmt_len) };
                    let date = match fast {
                        Some(ndt) => ndt.date(),
                        // Fallback to chrono.
                        None => NaiveDate::parse_from_str(s, &fmt).ok()?,
                    };
                    Some(naive_date_to_date(date))
                },
                cache_size,
            );
            unary_elementwise(string_ca, |val| convert.eval(val?, use_cache))
        },
        None => {
            let mut convert = LruCachedFunc::new(
                |s| {
                    NaiveDate::parse_from_str(s, &fmt)
                        .ok()
                        .map(naive_date_to_date)
                },
                cache_size,
            );
            unary_elementwise(string_ca, |val| convert.eval(val?, use_cache))
        },
    };

    Ok(ca.with_name(string_ca.name().clone()).into_date())
}
239
240
#[cfg(feature = "dtype-datetime")]
/// Parses the string values and returns a [`DatetimeChunked`].
///
/// When `fmt` is `None`, parsing is delegated to `infer::to_datetime`.
/// Otherwise the format is compiled with `strptime::compile_fmt` and applied
/// to every value; values that do not parse map to `None`.
///
/// * `tu` - selects the timestamp conversion (ns / us / ms) and is the time
///   unit of the result.
/// * `use_cache` - forwarded to the `LruCachedFunc` wrapper, but only when the
///   array holds more than 50 values.
/// * `tz_aware` - when `true`, values are parsed with
///   `DateTime::parse_from_str`, converted through `naive_utc()`, and the
///   result is tagged with `tz` (or `TimeZone::UTC` when `None`).
/// * `tz` / `ambiguous` - for naive input with the `timezones` feature, the
///   result goes through `replace_time_zone` with these arguments.
///
/// # Errors
/// Propagates errors from `infer::to_datetime`, `strptime::compile_fmt` and
/// `replace_time_zone`.
///
/// # Panics
/// Panics when `tz_aware` is `true` and the `timezones` feature is disabled.
fn as_datetime(
    &self,
    fmt: Option<&str>,
    tu: TimeUnit,
    use_cache: bool,
    tz_aware: bool,
    tz: Option<&TimeZone>,
    ambiguous: &StringChunked,
) -> PolarsResult<DatetimeChunked> {
    let string_ca = self.as_string();
    let fmt = if let Some(user_fmt) = fmt {
        user_fmt
    } else {
        return infer::to_datetime(string_ca, tu, tz, ambiguous, true);
    };
    let fmt = strptime::compile_fmt(fmt)?;
    let n_values = string_ca.len();
    let use_cache = use_cache && n_values > 50;
    let cache_size = (n_values as f64).sqrt() as usize;

    let func = match tu {
        TimeUnit::Nanoseconds => datetime_to_timestamp_ns,
        TimeUnit::Microseconds => datetime_to_timestamp_us,
        TimeUnit::Milliseconds => datetime_to_timestamp_ms,
    };

    if tz_aware {
        #[cfg(feature = "timezones")]
        {
            let mut convert = LruCachedFunc::new(
                |s: &str| {
                    DateTime::parse_from_str(s, &fmt)
                        .ok()
                        .map(|dt| func(dt.naive_utc()))
                },
                cache_size,
            );
            let parsed = unary_elementwise(string_ca, |opt_s| convert.eval(opt_s?, use_cache));
            let time_zone = tz.cloned().unwrap_or(TimeZone::UTC);
            Ok(parsed
                .with_name(string_ca.name().clone())
                .into_datetime(tu, Some(time_zone)))
        }
        #[cfg(not(feature = "timezones"))]
        {
            panic!("activate 'timezones' feature")
        }
    } else {
        let transform = match tu {
            TimeUnit::Nanoseconds => infer::transform_datetime_ns,
            TimeUnit::Microseconds => infer::transform_datetime_us,
            TimeUnit::Milliseconds => infer::transform_datetime_ms,
        };
        let parsed = match strptime::fmt_len(fmt.as_bytes()) {
            // We can use the fast parser.
            Some(fmt_len) => {
                let mut strptime_state = StrpTimeState::default();
                let mut convert = LruCachedFunc::new(
                    |s: &str| {
                        // SAFETY: fmt_len is correct, it was computed with this `fmt` str.
                        let fast = unsafe {
                            strptime_state.parse(s.as_bytes(), fmt.as_bytes(), fmt_len)
                        };
                        match fast {
                            Some(ndt) => Some(func(ndt)),
                            // The fast parser rejected the value; use `transform` instead.
                            None => transform(s, &fmt),
                        }
                    },
                    cache_size,
                );
                unary_elementwise(string_ca, |opt_s| convert.eval(opt_s?, use_cache))
            },
            None => {
                let mut convert = LruCachedFunc::new(|s| transform(s, &fmt), cache_size);
                unary_elementwise(string_ca, |opt_s| convert.eval(opt_s?, use_cache))
            },
        };
        let naive = parsed
            .with_name(string_ca.name().clone())
            .into_datetime(tu, None);
        match tz {
            #[cfg(feature = "timezones")]
            Some(tz) => polars_ops::prelude::replace_time_zone(
                &naive,
                Some(tz),
                ambiguous,
                NonExistent::Raise,
            ),
            _ => Ok(naive),
        }
    }
}
328
}
329
330
pub trait AsString {
331
fn as_string(&self) -> &StringChunked;
332
}
333
334
impl AsString for StringChunked {
    /// A `StringChunked` is already the requested type, so it returns itself.
    fn as_string(&self) -> &Self {
        self
    }
}
339
340
// `StringChunked` takes every `StringMethods` method from the trait's default bodies.
impl StringMethods for StringChunked {}
341
342