// Source: GitHub repository pola-rs/polars
// Path: crates/polars-time/src/chunkedarray/string/infer.rs (branch: main)
1
use arrow::array::PrimitiveArray;
2
use chrono::format::ParseErrorKind;
3
use chrono::{DateTime, NaiveDate, NaiveDateTime, NaiveTime};
4
use polars_core::prelude::*;
5
6
use super::patterns::{self, Pattern};
7
#[cfg(feature = "dtype-date")]
8
use crate::chunkedarray::date::naive_date_to_date;
9
use crate::chunkedarray::string::strptime;
10
use crate::prelude::string::strptime::StrpTimeState;
11
12
// Pre-filter regexes used by `Pattern::is_inferable`. All of them are written in
// verbose mode (`(?x)`): whitespace inside the pattern is ignored and `#` starts
// a comment that runs to the end of the line. Each one exposes the month as the
// named group `month` so the caller can range-check it.
polars_utils::regex_cache::cached_regex! {
    // Day-first date, optionally followed by a time part.
    static DATETIME_DMY_RE = r#"(?x)
        ^
        ['"]?                        # optional quotes
        (?:\d{1,2})                  # day
        [-/\.]                       # separator
        (?P<month>[01]?\d{1})        # month
        [-/\.]                       # separator
        (?:\d{4,})                   # year
        (?:
            [T\ ]                    # separator
            (?:\d{1,2})              # hour
            :?                       # separator
            (?:\d{1,2})              # minute
            (?:
                :?                   # separator
                (?:\d{1,2})          # second
                (?:
                    \.(?:\d{1,9})    # subsecond
                )?
            )?
        )?
        ['"]?                        # optional quotes
        $
        "#;

    // Year-first date, optionally followed by a time part.
    static DATETIME_YMD_RE = r#"(?x)
        ^
        ['"]?                        # optional quotes
        (?:\d{4,})                   # year
        [-/\.]                       # separator
        (?P<month>[01]?\d{1})        # month
        [-/\.]                       # separator
        (?:\d{1,2})                  # day
        (?:
            [T\ ]                    # separator
            (?:\d{1,2})              # hour
            :?                       # separator
            (?:\d{1,2})              # minute
            (?:
                :?                   # separator
                (?:\d{1,2})          # seconds
                (?:
                    \.(?:\d{1,9})    # subsecond
                )?
            )?
        )?
        ['"]?                        # optional quotes
        $
        "#;

    // Year-first date with a mandatory time part and a mandatory UTC offset or `Z`.
    static DATETIME_YMDZ_RE = r#"(?x)
        ^
        ['"]?                        # optional quotes
        (?:\d{4,})                   # year
        [-/\.]                       # separator
        (?P<month>[01]?\d{1})        # month
        [-/\.]                       # separator
        (?:\d{1,2})                  # day
        [T\ ]                        # separator
        (?:\d{2})                    # hour
        :?                           # separator
        (?:\d{2})                    # minute
        (?:
            :?                       # separator
            (?:\d{2})                # second
            (?:
                \.(?:\d{1,9})        # subsecond
            )?
        )?
        (?:
            # offset (e.g. +01:00, +0100, or +01)
            [+-](?:\d{2})
            (?::?\d{2})?
            # or Zulu suffix
            |Z
        )
        ['"]?                        # optional quotes
        $
        "#;
}
93
94
impl Pattern {
95
pub fn is_inferable(&self, val: &str) -> bool {
96
match self {
97
Pattern::DateDMY => true, // there are very few Date patterns, so it's cheaper
98
Pattern::DateYMD => true, // to just try them
99
Pattern::Time => true,
100
Pattern::DatetimeDMY => match DATETIME_DMY_RE.captures(val) {
101
Some(search) => (1..=12).contains(
102
&search
103
.name("month")
104
.unwrap()
105
.as_str()
106
.parse::<u8>()
107
.unwrap(),
108
),
109
None => false,
110
},
111
Pattern::DatetimeYMD => match DATETIME_YMD_RE.captures(val) {
112
Some(search) => (1..=12).contains(
113
&search
114
.name("month")
115
.unwrap()
116
.as_str()
117
.parse::<u8>()
118
.unwrap(),
119
),
120
None => false,
121
},
122
Pattern::DatetimeYMDZ => match DATETIME_YMDZ_RE.captures(val) {
123
Some(search) => (1..=12).contains(
124
&search
125
.name("month")
126
.unwrap()
127
.as_str()
128
.parse::<u8>()
129
.unwrap(),
130
),
131
None => false,
132
},
133
}
134
}
135
}
136
137
pub trait StrpTimeParser<T> {
138
fn parse_bytes(&mut self, val: &[u8], time_unit: Option<TimeUnit>) -> Option<T>;
139
}
140
141
#[cfg(feature = "dtype-datetime")]
impl StrpTimeParser<i64> for DatetimeInfer<Int64Type> {
    /// Parses `val` into a timestamp expressed in `time_unit`.
    ///
    /// The most recently successful format is tried first. If that fails, every
    /// candidate format of this pattern family is tried in order; the first one
    /// that parses becomes the new preferred format.
    ///
    /// Returns `None` when no format parses `val`, or when the length of the
    /// preferred format cannot be determined.
    ///
    /// # Panics
    /// Panics if `time_unit` is `None`: a time unit has to be provided for datetimes.
    fn parse_bytes(&mut self, val: &[u8], time_unit: Option<TimeUnit>) -> Option<i64> {
        if self.fmt_len == 0 {
            self.fmt_len = strptime::fmt_len(self.latest_fmt.as_bytes())?;
        }
        let transform = match time_unit {
            Some(TimeUnit::Nanoseconds) => datetime_to_timestamp_ns,
            Some(TimeUnit::Microseconds) => datetime_to_timestamp_us,
            Some(TimeUnit::Milliseconds) => datetime_to_timestamp_ms,
            _ => unreachable!(), // time_unit has to be provided for datetime
        };

        // SAFETY: `self.fmt_len` was computed from `self.latest_fmt` (above, or when
        // the format was last switched below).
        // NOTE(review): assumes the contract of `StrpTimeState::parse` is that
        // `fmt_len` belongs to the format being passed -- confirm.
        let fast_path = unsafe {
            self.transform_bytes
                .parse(val, self.latest_fmt.as_bytes(), self.fmt_len)
        };
        if let Some(ndt) = fast_path {
            return Some(transform(ndt));
        }

        // TODO! this will try all patterns.
        // somehow we must early escape if value is invalid
        for fmt in self.patterns {
            // The length has to be that of the candidate format itself, not the
            // one cached for the previous format. Formats without a computable
            // length cannot be handled by the byte parser and are skipped.
            let Some(fmt_len) = strptime::fmt_len(fmt.as_bytes()) else {
                continue;
            };
            // SAFETY: `fmt_len` was computed from `fmt` right above (same assumption
            // about the callee's contract as for the fast path).
            let parsed = unsafe { self.transform_bytes.parse(val, fmt.as_bytes(), fmt_len) };
            if let Some(ndt) = parsed {
                // Keep the cached length in sync with the preferred format.
                self.latest_fmt = fmt;
                self.fmt_len = fmt_len;
                // Use the requested unit here as well, not a hard-coded one.
                return Some(transform(ndt));
            }
        }
        None
    }
}
178
179
#[cfg(feature = "dtype-date")]
impl StrpTimeParser<i32> for DatetimeInfer<Int32Type> {
    /// Parses `val` into a physical date value.
    ///
    /// The most recently successful format is tried first. If that fails, every
    /// candidate format of this pattern family is tried in order; the first one
    /// that parses becomes the new preferred format. `_time_unit` is ignored.
    ///
    /// Returns `None` when no format parses `val`, or when the length of the
    /// preferred format cannot be determined.
    fn parse_bytes(&mut self, val: &[u8], _time_unit: Option<TimeUnit>) -> Option<i32> {
        if self.fmt_len == 0 {
            self.fmt_len = strptime::fmt_len(self.latest_fmt.as_bytes())?;
        }

        // SAFETY: `self.fmt_len` was computed from `self.latest_fmt` (above, or when
        // the format was last switched below).
        // NOTE(review): assumes the contract of `StrpTimeState::parse` is that
        // `fmt_len` belongs to the format being passed -- confirm.
        let fast_path = unsafe {
            self.transform_bytes
                .parse(val, self.latest_fmt.as_bytes(), self.fmt_len)
        };
        if let Some(ndt) = fast_path {
            return Some(naive_date_to_date(ndt.date()));
        }

        // TODO! this will try all patterns.
        // somehow we must early escape if value is invalid
        for fmt in self.patterns {
            // The length has to be that of the candidate format itself, not the
            // one cached for the previous format. Formats without a computable
            // length cannot be handled by the byte parser and are skipped.
            let Some(fmt_len) = strptime::fmt_len(fmt.as_bytes()) else {
                continue;
            };
            // SAFETY: `fmt_len` was computed from `fmt` right above (same assumption
            // about the callee's contract as for the fast path).
            let parsed = unsafe { self.transform_bytes.parse(val, fmt.as_bytes(), fmt_len) };
            if let Some(ndt) = parsed {
                // Keep the cached length in sync with the preferred format.
                self.latest_fmt = fmt;
                self.fmt_len = fmt_len;
                return Some(naive_date_to_date(ndt.date()));
            }
        }
        None
    }
}
210
211
#[derive(Clone)]
212
pub struct DatetimeInfer<T: PolarsNumericType> {
213
pub pattern: Pattern,
214
patterns: &'static [&'static str],
215
latest_fmt: &'static str,
216
transform: fn(&str, &str) -> Option<T::Native>,
217
transform_bytes: StrpTimeState,
218
fmt_len: u16,
219
pub logical_type: DataType,
220
}
221
222
pub trait TryFromWithUnit<T>: Sized {
223
type Error;
224
fn try_from_with_unit(pattern: T, unit: Option<TimeUnit>) -> PolarsResult<Self>;
225
}
226
227
#[cfg(feature = "dtype-datetime")]
impl TryFromWithUnit<Pattern> for DatetimeInfer<Int64Type> {
    type Error = PolarsError;

    /// Builds a datetime parser for the pattern family `value`.
    ///
    /// Date families are widened to the corresponding datetime family. The
    /// string transform is picked from the time unit and from whether the
    /// family is timezone-aware (`DatetimeYMDZ`).
    ///
    /// # Errors
    /// Returns a `ComputeError` when `time_unit` is `None`; a time unit is
    /// mandatory for datetimes.
    fn try_from_with_unit(value: Pattern, time_unit: Option<TimeUnit>) -> PolarsResult<Self> {
        // Report the missing unit through the `PolarsResult` instead of panicking.
        let Some(time_unit) = time_unit else {
            polars_bail!(ComputeError: "time_unit must be provided for datetime");
        };

        let transform = match (time_unit, value) {
            (TimeUnit::Milliseconds, Pattern::DatetimeYMDZ) => transform_tzaware_datetime_ms,
            (TimeUnit::Milliseconds, _) => transform_datetime_ms,
            (TimeUnit::Microseconds, Pattern::DatetimeYMDZ) => transform_tzaware_datetime_us,
            (TimeUnit::Microseconds, _) => transform_datetime_us,
            (TimeUnit::Nanoseconds, Pattern::DatetimeYMDZ) => transform_tzaware_datetime_ns,
            (TimeUnit::Nanoseconds, _) => transform_datetime_ns,
        };
        let (pattern, patterns) = match value {
            Pattern::DatetimeDMY | Pattern::DateDMY => {
                (Pattern::DatetimeDMY, patterns::DATETIME_D_M_Y)
            },
            Pattern::DatetimeYMD | Pattern::DateYMD => {
                (Pattern::DatetimeYMD, patterns::DATETIME_Y_M_D)
            },
            Pattern::DatetimeYMDZ => (Pattern::DatetimeYMDZ, patterns::DATETIME_Y_M_D_Z),
            Pattern::Time => (Pattern::Time, patterns::TIME_H_M_S),
        };

        Ok(DatetimeInfer {
            pattern,
            patterns,
            // Start with the first candidate format of the family.
            latest_fmt: patterns[0],
            transform,
            transform_bytes: StrpTimeState::default(),
            // Computed lazily by `parse_bytes`.
            fmt_len: 0,
            logical_type: DataType::Datetime(time_unit, None),
        })
    }
}
264
265
#[cfg(feature = "dtype-date")]
impl TryFromWithUnit<Pattern> for DatetimeInfer<Int32Type> {
    type Error = PolarsError;

    /// Builds a date parser for `value`; the time unit is ignored.
    ///
    /// Only the two date families are accepted, any other pattern yields a
    /// `ComputeError`.
    fn try_from_with_unit(value: Pattern, _time_unit: Option<TimeUnit>) -> PolarsResult<Self> {
        // Pick the family and its candidate formats first, then build the
        // parser once instead of spelling the struct out per family.
        let (pattern, patterns) = match value {
            Pattern::DateDMY => (Pattern::DateDMY, patterns::DATE_D_M_Y),
            Pattern::DateYMD => (Pattern::DateYMD, patterns::DATE_Y_M_D),
            _ => polars_bail!(ComputeError: "could not convert pattern"),
        };

        Ok(DatetimeInfer {
            pattern,
            patterns,
            latest_fmt: patterns[0],
            transform: transform_date,
            transform_bytes: StrpTimeState::default(),
            fmt_len: 0,
            logical_type: DataType::Date,
        })
    }
}
293
294
impl<T: PolarsNumericType> DatetimeInfer<T> {
    /// Parses `val` with the string transform of this parser.
    ///
    /// The preferred format is tried first. If it fails and `val` passes the
    /// `is_inferable` pre-check, all candidate formats are tried in order and
    /// the first successful one becomes the preferred format.
    pub fn parse(&mut self, val: &str) -> Option<T::Native> {
        let transform = self.transform;

        // Fast path: the format that worked last time.
        if let Some(parsed) = transform(val, self.latest_fmt) {
            return Some(parsed);
        }
        // Do not bother with the other formats if the value cannot match anyway.
        if !self.pattern.is_inferable(val) {
            return None;
        }

        for candidate in self.patterns {
            // Invalidate the cached byte-parser length for the preferred format.
            self.fmt_len = 0;
            if let Some(parsed) = transform(val, candidate) {
                self.latest_fmt = candidate;
                return Some(parsed);
            }
        }
        None
    }
}
315
316
impl<T: PolarsNumericType> DatetimeInfer<T> {
317
fn coerce_string(&mut self, ca: &StringChunked) -> Series {
318
let chunks = ca.downcast_iter().map(|array| {
319
let iter = array
320
.into_iter()
321
.map(|opt_val| opt_val.and_then(|val| self.parse(val)));
322
PrimitiveArray::from_trusted_len_iter(iter)
323
});
324
ChunkedArray::<T>::from_chunk_iter(ca.name().clone(), chunks)
325
.into_series()
326
.cast(&self.logical_type)
327
.unwrap()
328
.with_name(ca.name().clone())
329
}
330
}
331
332
/// Parses `val` as a date with `fmt` and converts it to the physical date value.
///
/// Returns `None` when `val` does not match `fmt`.
#[cfg(feature = "dtype-date")]
fn transform_date(val: &str, fmt: &str) -> Option<i32> {
    let date = NaiveDate::parse_from_str(val, fmt).ok()?;
    Some(naive_date_to_date(date))
}
338
339
/// Parses `val` with `fmt` into a nanosecond timestamp.
///
/// When datetime parsing fails with `ParseErrorKind::NotEnough`, `val` is
/// parsed as a plain date instead and midnight is used as the time of day.
/// Any other failure yields `None`.
#[cfg(feature = "dtype-datetime")]
pub(crate) fn transform_datetime_ns(val: &str, fmt: &str) -> Option<i64> {
    let ndt = match NaiveDateTime::parse_from_str(val, fmt) {
        Ok(ndt) => ndt,
        Err(err) if matches!(err.kind(), ParseErrorKind::NotEnough) => {
            NaiveDate::parse_from_str(val, fmt)
                .ok()?
                .and_hms_opt(0, 0, 0)
                .unwrap()
        },
        Err(_) => return None,
    };
    Some(datetime_to_timestamp_ns(ndt))
}
351
352
fn transform_tzaware_datetime_ns(val: &str, fmt: &str) -> Option<i64> {
353
let dt = DateTime::parse_from_str(val, fmt);
354
dt.ok().map(|dt| datetime_to_timestamp_ns(dt.naive_utc()))
355
}
356
357
/// Parses `val` with `fmt` into a microsecond timestamp.
///
/// When datetime parsing fails with `ParseErrorKind::NotEnough`, `val` is
/// parsed as a plain date instead and midnight is used as the time of day.
/// Any other failure yields `None`.
#[cfg(feature = "dtype-datetime")]
pub(crate) fn transform_datetime_us(val: &str, fmt: &str) -> Option<i64> {
    let ndt = match NaiveDateTime::parse_from_str(val, fmt) {
        Ok(ndt) => ndt,
        Err(err) if matches!(err.kind(), ParseErrorKind::NotEnough) => {
            NaiveDate::parse_from_str(val, fmt)
                .ok()?
                .and_hms_opt(0, 0, 0)
                .unwrap()
        },
        Err(_) => return None,
    };
    Some(datetime_to_timestamp_us(ndt))
}
369
370
fn transform_tzaware_datetime_us(val: &str, fmt: &str) -> Option<i64> {
371
let dt = DateTime::parse_from_str(val, fmt);
372
dt.ok().map(|dt| datetime_to_timestamp_us(dt.naive_utc()))
373
}
374
375
/// Parses `val` with `fmt` into a millisecond timestamp.
///
/// When datetime parsing fails with `ParseErrorKind::NotEnough`, `val` is
/// parsed as a plain date instead and midnight is used as the time of day.
/// Any other failure yields `None`.
#[cfg(feature = "dtype-datetime")]
pub(crate) fn transform_datetime_ms(val: &str, fmt: &str) -> Option<i64> {
    let ndt = match NaiveDateTime::parse_from_str(val, fmt) {
        Ok(ndt) => ndt,
        Err(err) if matches!(err.kind(), ParseErrorKind::NotEnough) => {
            NaiveDate::parse_from_str(val, fmt)
                .ok()?
                .and_hms_opt(0, 0, 0)
                .unwrap()
        },
        Err(_) => return None,
    };
    Some(datetime_to_timestamp_ms(ndt))
}
387
388
fn transform_tzaware_datetime_ms(val: &str, fmt: &str) -> Option<i64> {
389
let dt = DateTime::parse_from_str(val, fmt);
390
dt.ok().map(|dt| datetime_to_timestamp_ms(dt.naive_utc()))
391
}
392
393
pub fn infer_pattern_single(val: &str) -> Option<Pattern> {
394
// Dates come first, because we see datetimes as superset of dates
395
infer_pattern_date_single(val)
396
.or_else(|| infer_pattern_time_single(val))
397
.or_else(|| infer_pattern_datetime_single(val))
398
}
399
400
fn infer_pattern_datetime_single(val: &str) -> Option<Pattern> {
401
if patterns::DATETIME_D_M_Y.iter().any(|fmt| {
402
NaiveDateTime::parse_from_str(val, fmt).is_ok()
403
|| NaiveDate::parse_from_str(val, fmt).is_ok()
404
}) {
405
Some(Pattern::DatetimeDMY)
406
} else if patterns::DATETIME_Y_M_D.iter().any(|fmt| {
407
NaiveDateTime::parse_from_str(val, fmt).is_ok()
408
|| NaiveDate::parse_from_str(val, fmt).is_ok()
409
}) {
410
Some(Pattern::DatetimeYMD)
411
} else if patterns::DATETIME_Y_M_D_Z
412
.iter()
413
.any(|fmt| NaiveDateTime::parse_from_str(val, fmt).is_ok())
414
{
415
Some(Pattern::DatetimeYMDZ)
416
} else {
417
None
418
}
419
}
420
421
fn infer_pattern_date_single(val: &str) -> Option<Pattern> {
422
if patterns::DATE_D_M_Y
423
.iter()
424
.any(|fmt| NaiveDate::parse_from_str(val, fmt).is_ok())
425
{
426
Some(Pattern::DateDMY)
427
} else if patterns::DATE_Y_M_D
428
.iter()
429
.any(|fmt| NaiveDate::parse_from_str(val, fmt).is_ok())
430
{
431
Some(Pattern::DateYMD)
432
} else {
433
None
434
}
435
}
436
437
fn infer_pattern_time_single(val: &str) -> Option<Pattern> {
438
patterns::TIME_H_M_S
439
.iter()
440
.any(|fmt| NaiveTime::parse_from_str(val, fmt).is_ok())
441
.then_some(Pattern::Time)
442
}
443
444
/// Converts `ca` to datetimes with time unit `tu`, without passing an
/// explicit time zone.
///
/// With `exact` the values are parsed through [`to_datetime`], otherwise through
/// `as_datetime_not_exact`. With `strict`, a difference between the null count of
/// the input and of the output is handed to `handle_casting_failures`, whose error
/// (if any) is returned.
#[cfg(feature = "dtype-datetime")]
pub fn to_datetime_with_inferred_tz(
    ca: &StringChunked,
    tu: TimeUnit,
    strict: bool,
    exact: bool,
    ambiguous: &StringChunked,
) -> PolarsResult<DatetimeChunked> {
    use super::StringMethods;

    let parsed = if exact {
        to_datetime(ca, tu, None, ambiguous, false)?
    } else {
        ca.as_datetime_not_exact(None, tu, false, None, ambiguous, false)?
    };

    // Nothing to report unless strict mode is on and parsing introduced nulls.
    if !strict || ca.null_count() == parsed.null_count() {
        return Ok(parsed);
    }

    let input = ca.clone().into_series();
    let output = parsed.clone().into_series();
    polars_core::utils::handle_casting_failures(&input, &output)?;

    Ok(parsed)
}
469
470
/// Converts `ca` to datetimes with time unit `tu`, inferring the format from
/// the first value (starting at the first non-null one) that matches a known
/// datetime pattern.
///
/// * An all-null input yields an all-null datetime column with `tz`.
/// * Values with an offset (`DatetimeYMDZ`, `timezones` feature) get `tz` as
///   time zone, falling back to UTC when `tz` is `None`.
/// * Otherwise the values are parsed as naive datetimes and, when `tz` is given
///   (`timezones` feature), passed to `replace_time_zone` with `_ambiguous`.
///
/// # Errors
/// Fails when no value matches a known pattern, when the parsed series is not
/// a datetime series, when `ensure_matching_time_zone` is set but offset values
/// are parsed without a `tz`, or when setting/replacing the time zone fails.
#[cfg(feature = "dtype-datetime")]
pub fn to_datetime(
    ca: &StringChunked,
    tu: TimeUnit,
    tz: Option<&TimeZone>,
    _ambiguous: &StringChunked,
    // Ensure that the inferred time_zone matches the given time_zone.
    ensure_matching_time_zone: bool,
) -> PolarsResult<DatetimeChunked> {
    let Some(first_valid) = ca.first_non_null() else {
        // Nothing to infer from.
        return Ok(
            Int64Chunked::full_null(ca.name().clone(), ca.len()).into_datetime(tu, tz.cloned())
        );
    };

    let subset = ca.slice(first_valid as i64, ca.len());
    let pattern = subset
        .into_iter()
        .find_map(|opt_val| opt_val.and_then(infer_pattern_datetime_single))
        .ok_or_else(|| polars_err!(parse_fmt_idk = "date"))?;
    let mut infer = DatetimeInfer::<Int64Type>::try_from_with_unit(pattern, Some(tu))?;

    let parsed = infer.coerce_string(ca);
    let physical = parsed.datetime()?;

    match pattern {
        #[cfg(feature = "timezones")]
        Pattern::DatetimeYMDZ => {
            polars_ensure!(
                !ensure_matching_time_zone || tz.is_some(),
                to_datetime_tz_mismatch
            );

            let mut out = physical.clone();
            // `tz` has already been validated.
            out.set_time_unit_and_time_zone(tu, tz.cloned().unwrap_or(TimeZone::UTC))?;
            Ok(out)
        },
        _ => {
            let mut out = physical.clone();
            out.set_time_unit(tu);
            match tz {
                #[cfg(feature = "timezones")]
                Some(tz) => polars_ops::prelude::replace_time_zone(
                    &out,
                    Some(tz),
                    _ambiguous,
                    NonExistent::Raise,
                ),
                _ => Ok(out),
            }
        },
    }
}
521
/// Converts `ca` to dates, inferring the format from the first value (starting
/// at the first non-null one) that matches a known date pattern.
///
/// An all-null input yields an all-null date column.
///
/// # Errors
/// Fails when no value matches a known date pattern, when the parser cannot be
/// built for the inferred pattern, or when the parsed series is not a date series.
#[cfg(feature = "dtype-date")]
pub(crate) fn to_date(ca: &StringChunked) -> PolarsResult<DateChunked> {
    let Some(first_valid) = ca.first_non_null() else {
        // Nothing to infer from.
        return Ok(Int32Chunked::full_null(ca.name().clone(), ca.len()).into_date());
    };

    let subset = ca.slice(first_valid as i64, ca.len());
    let pattern = subset
        .into_iter()
        .find_map(|opt_val| opt_val.and_then(infer_pattern_date_single))
        .ok_or_else(|| polars_err!(parse_fmt_idk = "date"))?;
    // Propagate a construction failure instead of panicking, as `to_datetime` does.
    let mut infer = DatetimeInfer::<Int32Type>::try_from_with_unit(pattern, None)?;
    infer.coerce_string(ca).date().cloned()
}
536
537