Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/crates/polars-time/src/chunkedarray/string/strptime.rs
6940 views
1
#![allow(unsafe_op_in_unsafe_fn)]
2
//! Much more opinionated, but also much faster strptime than the one given in Chrono.
3
4
use chrono::{NaiveDate, NaiveDateTime};
5
6
use crate::chunkedarray::{PolarsResult, polars_bail};
7
8
polars_utils::regex_cache::cached_regex! {
    // Directive detectors used to validate a format string in `compile_fmt`.
    // Every pattern accepts an optional `_` or `-` flag between `%` and the
    // directive letter.

    // Hour directives: any of `H`, `k`, `I`, `l`.
    static HOUR_PATTERN = r"%[_-]?[HkIl]";
    // The 12-hour subset of the hour directives: `I` or `l`.
    static TWELVE_HOUR_PATTERN = r"%[_-]?[Il]";
    // Meridiem directives: `p` or `P`.
    static MERIDIEM_PATTERN = r"%[_-]?[pP]";
    // Minute directive: `M`.
    static MINUTE_PATTERN = r"%[_-]?M";
    // Second directive: `S`.
    static SECOND_PATTERN = r"%[_-]?S";
}
15
16
#[inline]
17
fn update_and_parse<T: atoi_simd::Parse>(
18
incr: usize,
19
offset: usize,
20
vals: &[u8],
21
) -> Option<(T, usize)> {
22
// this maybe oob because we cannot entirely sure about fmt lengths
23
let new_offset = offset + incr;
24
let bytes = vals.get(offset..new_offset)?;
25
let (val, parsed) = atoi_simd::parse_any(bytes).ok()?;
26
if parsed != incr {
27
None
28
} else {
29
Some((val, new_offset))
30
}
31
}
32
33
/// Parses a three-letter English month abbreviation (`Jan` .. `Dec`) at `offset`.
///
/// Returns the month number (1-12) and the offset just past the abbreviation.
/// Returns `None` when the bytes at `offset` are not a known abbreviation or
/// when fewer than three bytes are available; truncated input never panics.
/// Matching is case-sensitive.
#[inline]
fn parse_month_abbrev(val: &[u8], offset: usize) -> Option<(u32, usize)> {
    let new_offset = offset.checked_add(3)?;
    // Checked access: a too-short `val` yields `None` instead of a slice-index panic.
    let month = match val.get(offset..new_offset)? {
        b"Jan" => 1,
        b"Feb" => 2,
        b"Mar" => 3,
        b"Apr" => 4,
        b"May" => 5,
        b"Jun" => 6,
        b"Jul" => 7,
        b"Aug" => 8,
        b"Sep" => 9,
        b"Oct" => 10,
        b"Nov" => 11,
        b"Dec" => 12,
        _ => return None,
    };
    Some((month, new_offset))
}
52
/// Parses a full English month name (`January` .. `December`) at `offset`.
///
/// Returns the month number (1-12) and the offset just past the name.
/// Returns `None` when the bytes at `offset` do not spell a full month name,
/// including when `val` ends before the name is complete (e.g. `Sep 2021`);
/// truncated input never panics. Matching is case-sensitive.
#[inline]
fn parse_month_full(val: &[u8], offset: usize) -> Option<(u32, usize)> {
    // What must follow the three-letter prefix, indexed by `month - 1`.
    const SUFFIXES: [&[u8]; 12] = [
        b"uary", b"ruary", b"ch", b"il", b"", b"e", b"y", b"ust", b"tember", b"ober", b"ember",
        b"ember",
    ];

    let rest = val.get(offset..)?;
    // The three-letter prefix identifies the month unambiguously.
    let month: u32 = match rest.get(..3)? {
        b"Jan" => 1,
        b"Feb" => 2,
        b"Mar" => 3,
        b"Apr" => 4,
        b"May" => 5,
        b"Jun" => 6,
        b"Jul" => 7,
        b"Aug" => 8,
        b"Sep" => 9,
        b"Oct" => 10,
        b"Nov" => 11,
        b"Dec" => 12,
        _ => return None,
    };
    let suffix = SUFFIXES[(month - 1) as usize];

    // `rest` holds at least three bytes here, so `rest[3..]` is in bounds.
    // `starts_with` handles a tail shorter than the suffix without panicking.
    if rest[3..].starts_with(suffix) {
        Some((month, offset + 3 + suffix.len()))
    } else {
        None
    }
}
137
/// Tries to convert a chrono `fmt` to a `fmt` that the polars parser consumes.
138
/// E.g. chrono supports single letter date identifiers like %F, whereas polars only consumes
139
/// year, day, month distinctively with %Y, %d, %m.
140
pub(super) fn compile_fmt(fmt: &str) -> PolarsResult<String> {
141
// (hopefully) temporary hacks. Ideally, chrono would return a ParseKindError indicating
142
// if `fmt` is too long for NaiveDate. If that's implemented, then this check could
143
// be removed, and that error could be matched against in `transform_datetime_*s`
144
// See https://github.com/chronotope/chrono/issues/1075.
145
if HOUR_PATTERN.is_match(fmt) ^ MINUTE_PATTERN.is_match(fmt) {
146
polars_bail!(ComputeError: "Invalid format string: \
147
Please either specify both hour and minute, or neither.");
148
}
149
if SECOND_PATTERN.is_match(fmt) && !HOUR_PATTERN.is_match(fmt) {
150
polars_bail!(ComputeError: "Invalid format string: \
151
Found seconds directive, but no hours directive.");
152
}
153
if TWELVE_HOUR_PATTERN.is_match(fmt) ^ MERIDIEM_PATTERN.is_match(fmt) {
154
polars_bail!(ComputeError: "Invalid format string: \
155
Please either specify both 12-hour directive and meridiem directive, or neither.");
156
}
157
158
Ok(fmt
159
.replace("%D", "%m/%d/%y")
160
.replace("%R", "%H:%M")
161
.replace("%T", "%H:%M:%S")
162
.replace("%X", "%H:%M:%S")
163
.replace("%F", "%Y-%m-%d"))
164
}
165
166
#[derive(Default, Clone)]
167
pub(super) struct StrpTimeState {}
168
169
impl StrpTimeState {
170
#[inline]
171
// # Safety
172
// Caller must ensure that fmt adheres to the fmt rules of chrono and `fmt_len` is correct.
173
pub(super) unsafe fn parse(
174
&mut self,
175
val: &[u8],
176
fmt: &[u8],
177
fmt_len_val: u16,
178
) -> Option<NaiveDateTime> {
179
let mut offset = 0;
180
let mut negative = false;
181
if val.starts_with(b"-") && fmt.starts_with(b"%Y") {
182
offset = 1;
183
negative = true;
184
}
185
#[allow(non_snake_case)]
186
let has_B_code = fmt.windows(2).any(|w| w == b"%B");
187
// SAFETY: this still ensures get_unchecked won't be out of bounds as val will be at least as big as we expect.
188
// After consuming the full month name, we'll double check remaining len is exactly equal.
189
let is_too_short = has_B_code && val.len() - offset < (fmt_len_val as usize);
190
if (!has_B_code && val.len() - offset != (fmt_len_val as usize)) || is_too_short {
191
return None;
192
}
193
194
const ESCAPE: u8 = b'%';
195
let mut year: i32 = 1;
196
// minimal day/month is always 1
197
// otherwise chrono may panic.
198
let mut month: u32 = 1;
199
let mut day: u32 = 1;
200
let mut hour: u32 = 0;
201
let mut min: u32 = 0;
202
let mut sec: u32 = 0;
203
let mut nano: u32 = 0;
204
205
let mut fmt_iter = fmt.iter();
206
207
while let Some(fmt_b) = fmt_iter.next() {
208
debug_assert!(offset < val.len());
209
let b = *val.get_unchecked(offset);
210
if *fmt_b == ESCAPE {
211
// SAFETY: we must ensure we provide valid patterns
212
let next = fmt_iter.next();
213
debug_assert!(next.is_some());
214
match next.unwrap_unchecked() {
215
b'Y' => {
216
(year, offset) = update_and_parse(4, offset, val)?;
217
if negative {
218
year *= -1
219
}
220
},
221
b'm' => {
222
(month, offset) = update_and_parse(2, offset, val)?;
223
if month > 12 {
224
return None;
225
}
226
},
227
b'b' => {
228
(month, offset) = parse_month_abbrev(val, offset)?;
229
},
230
b'B' => {
231
(month, offset) = parse_month_full(val, offset)?;
232
// After variable sized month is consumed, verify remaining is exact len
233
let new_fmt_len = fmt_len(fmt_iter.as_slice())?;
234
let remaining_val_len = val.len() - offset;
235
if remaining_val_len != (new_fmt_len as usize) {
236
return None;
237
}
238
},
239
b'd' => {
240
(day, offset) = update_and_parse(2, offset, val)?;
241
},
242
b'H' => {
243
(hour, offset) = update_and_parse(2, offset, val)?;
244
},
245
b'M' => {
246
(min, offset) = update_and_parse(2, offset, val)?;
247
},
248
b'S' => {
249
(sec, offset) = update_and_parse(2, offset, val)?;
250
},
251
b'y' => {
252
let new_offset = offset + 2;
253
let bytes = val.get_unchecked(offset..new_offset);
254
255
let (decade, parsed) = atoi_simd::parse_any::<i32>(bytes).ok()?;
256
if parsed == 0 {
257
return None;
258
}
259
260
if decade < 70 {
261
year = 2000 + decade;
262
} else {
263
year = 1900 + decade;
264
}
265
offset = new_offset;
266
},
267
b'9' => {
268
(nano, offset) = update_and_parse(9, offset, val)?;
269
break;
270
},
271
b'6' => {
272
(nano, offset) = update_and_parse(6, offset, val)?;
273
nano *= 1000;
274
break;
275
},
276
b'3' => {
277
(nano, offset) = update_and_parse(3, offset, val)?;
278
nano *= 1_000_000;
279
break;
280
},
281
_ => return None,
282
}
283
}
284
// consume
285
else if b == *fmt_b {
286
offset += 1;
287
} else {
288
return None;
289
}
290
}
291
// all values processed
292
if offset == val.len() {
293
NaiveDate::from_ymd_opt(year, month, day)
294
.and_then(|nd| nd.and_hms_nano_opt(hour, min, sec, nano))
295
}
296
// remaining values did not match pattern
297
else {
298
None
299
}
300
}
301
}
302
303
pub(super) fn fmt_len(fmt: &[u8]) -> Option<u16> {
304
let mut iter = fmt.iter();
305
let mut cnt = 0;
306
307
while let Some(&val) = iter.next() {
308
match val {
309
b'%' => match iter.next() {
310
Some(&next_val) => match next_val {
311
b'Y' => cnt += 4,
312
b'y' => cnt += 2,
313
b'd' => cnt += 2,
314
b'm' => cnt += 2,
315
b'b' => cnt += 3,
316
b'B' => cnt += 3, // This is minimum size for full month
317
b'H' => cnt += 2,
318
b'M' => cnt += 2,
319
b'S' => cnt += 2,
320
b'9' => {
321
cnt += 9;
322
if matches!(iter.next(), Some(&b'f')) && iter.next().is_none() {
323
return Some(cnt);
324
} else {
325
return None;
326
}
327
},
328
b'6' => {
329
cnt += 6;
330
if matches!(iter.next(), Some(&b'f')) && iter.next().is_none() {
331
return Some(cnt);
332
} else {
333
return None;
334
}
335
},
336
b'3' => {
337
cnt += 3;
338
if matches!(iter.next(), Some(&b'f')) && iter.next().is_none() {
339
return Some(cnt);
340
} else {
341
return None;
342
}
343
},
344
_ => return None,
345
},
346
None => return None,
347
},
348
_ => {
349
cnt += 1;
350
},
351
}
352
}
353
Some(cnt)
354
}
355
356
#[cfg(test)]
mod test {
    use super::*;

    /// Builds the expected `NaiveDateTime`; panics when a component is out of range.
    fn datetime(ymd: (i32, u32, u32), hms_nano: (u32, u32, u32, u32)) -> NaiveDateTime {
        let (year, month, day) = ymd;
        let (hour, min, sec, nano) = hms_nano;
        NaiveDate::from_ymd_opt(year, month, day)
            .unwrap()
            .and_hms_nano_opt(hour, min, sec, nano)
            .unwrap()
    }

    #[test]
    fn test_parsing() {
        // (value, format, expected format length, expected result)
        let patterns = [
            (
                "2021-01-01",
                "%Y-%m-%d",
                10,
                Some(datetime((2021, 1, 1), (0, 0, 0, 0))),
            ),
            (
                "2021-01-01 07:45:12",
                "%Y-%m-%d %H:%M:%S",
                19,
                Some(datetime((2021, 1, 1), (7, 45, 12, 0))),
            ),
            (
                "2019-04-18T02:45:55.555000000",
                "%Y-%m-%dT%H:%M:%S.%9f",
                29,
                Some(datetime((2019, 4, 18), (2, 45, 55, 555_000_000))),
            ),
            (
                "2019-04-18T02:45:55.555000",
                "%Y-%m-%dT%H:%M:%S.%6f",
                26,
                Some(datetime((2019, 4, 18), (2, 45, 55, 555_000_000))),
            ),
            (
                "2019-04-18T02:45:55.555",
                "%Y-%m-%dT%H:%M:%S.%3f",
                23,
                Some(datetime((2019, 4, 18), (2, 45, 55, 555_000_000))),
            ),
            // Leading minus in front of `%Y` negates the year.
            (
                "-2021-01-01",
                "%Y-%m-%d",
                10,
                Some(datetime((-2021, 1, 1), (0, 0, 0, 0))),
            ),
            // Two-digit years: >= 70 maps to the 1900s.
            (
                "31/12/99",
                "%d/%m/%y",
                8,
                Some(datetime((1999, 12, 31), (0, 0, 0, 0))),
            ),
            // Abbreviated and full month names.
            (
                "01 Jan 2021",
                "%d %b %Y",
                11,
                Some(datetime((2021, 1, 1), (0, 0, 0, 0))),
            ),
            (
                "12 March 2021",
                "%d %B %Y",
                11,
                Some(datetime((2021, 3, 12), (0, 0, 0, 0))),
            ),
            // Rejections: wrong length, wrong separator, month out of range.
            ("2021-01-1", "%Y-%m-%d", 10, None),
            ("2021/01/01", "%Y-%m-%d", 10, None),
            ("2021-13-01", "%Y-%m-%d", 10, None),
        ];

        for (val, fmt, len, expected) in patterns {
            assert_eq!(fmt_len(fmt.as_bytes()).unwrap(), len);
            // SAFETY: every `fmt` above uses supported directives only and `len`
            // was just verified against `fmt_len`.
            let parsed =
                unsafe { StrpTimeState::default().parse(val.as_bytes(), fmt.as_bytes(), len) };
            assert_eq!(parsed, expected, "val: {val:?}, fmt: {fmt:?}");
        }
    }
}
442
443