Path: blob/main/crates/polars-time/src/chunkedarray/string/strptime.rs
6940 views
#![allow(unsafe_op_in_unsafe_fn)]1//! Much more opinionated, but also much faster strptrime than the one given in Chrono.23use chrono::{NaiveDate, NaiveDateTime};45use crate::chunkedarray::{PolarsResult, polars_bail};67polars_utils::regex_cache::cached_regex! {8static HOUR_PATTERN = r"%[_-]?[HkIl]";9static MINUTE_PATTERN = r"%[_-]?M";10static SECOND_PATTERN = r"%[_-]?S";11static TWELVE_HOUR_PATTERN = r"%[_-]?[Il]";12static MERIDIEM_PATTERN = r"%[_-]?[pP]";13}1415#[inline]16fn update_and_parse<T: atoi_simd::Parse>(17incr: usize,18offset: usize,19vals: &[u8],20) -> Option<(T, usize)> {21// this maybe oob because we cannot entirely sure about fmt lengths22let new_offset = offset + incr;23let bytes = vals.get(offset..new_offset)?;24let (val, parsed) = atoi_simd::parse_any(bytes).ok()?;25if parsed != incr {26None27} else {28Some((val, new_offset))29}30}3132#[inline]33fn parse_month_abbrev(val: &[u8], offset: usize) -> Option<(u32, usize)> {34let new_offset = offset + 3;35match &val[offset..new_offset] {36b"Jan" => Some((1, new_offset)),37b"Feb" => Some((2, new_offset)),38b"Mar" => Some((3, new_offset)),39b"Apr" => Some((4, new_offset)),40b"May" => Some((5, new_offset)),41b"Jun" => Some((6, new_offset)),42b"Jul" => Some((7, new_offset)),43b"Aug" => Some((8, new_offset)),44b"Sep" => Some((9, new_offset)),45b"Oct" => Some((10, new_offset)),46b"Nov" => Some((11, new_offset)),47b"Dec" => Some((12, new_offset)),48_ => None,49}50}51#[inline]52fn parse_month_full(val: &[u8], offset: usize) -> Option<(u32, usize)> {53let min_offset = offset + 3;54match &val[offset..min_offset] {55b"Jan" => {56let new_offset = min_offset + 4;57match &val[min_offset..new_offset] {58b"uary" => Some((1, new_offset)),59_ => None,60}61},62b"Feb" => {63let new_offset = min_offset + 5;64match &val[min_offset..new_offset] {65b"ruary" => Some((2, new_offset)),66_ => None,67}68},69b"Mar" => {70let new_offset = min_offset + 2;71match &val[min_offset..new_offset] {72b"ch" => Some((3, new_offset)),73_ => None,74}75},76b"Apr" => {77let new_offset = min_offset + 2;78match &val[min_offset..new_offset] {79b"il" => Some((4, new_offset)),80_ => None,81}82},83b"May" => Some((5, min_offset)),84b"Jun" => {85let new_offset = min_offset + 1;86match &val[min_offset..new_offset] {87b"e" => Some((6, new_offset)),88_ => None,89}90},91b"Jul" => {92let new_offset = min_offset + 1;93match &val[min_offset..new_offset] {94b"y" => Some((7, new_offset)),95_ => None,96}97},98b"Aug" => {99let new_offset = min_offset + 3;100match &val[min_offset..new_offset] {101b"ust" => Some((8, new_offset)),102_ => None,103}104},105b"Sep" => {106let new_offset = min_offset + 6;107match &val[min_offset..new_offset] {108b"tember" => Some((9, new_offset)),109_ => None,110}111},112b"Oct" => {113let new_offset = min_offset + 4;114match &val[min_offset..new_offset] {115b"ober" => Some((10, new_offset)),116_ => None,117}118},119b"Nov" => {120let new_offset = min_offset + 5;121match &val[min_offset..new_offset] {122b"ember" => Some((11, new_offset)),123_ => None,124}125},126b"Dec" => {127let new_offset = min_offset + 5;128match &val[min_offset..new_offset] {129b"ember" => Some((12, new_offset)),130_ => None,131}132},133_ => None,134}135}136/// Tries to convert a chrono `fmt` to a `fmt` that the polars parser consumes.137/// E.g. chrono supports single letter date identifiers like %F, whereas polars only consumes138/// year, day, month distinctively with %Y, %d, %m.139pub(super) fn compile_fmt(fmt: &str) -> PolarsResult<String> {140// (hopefully) temporary hacks. Ideally, chrono would return a ParseKindError indicating141// if `fmt` is too long for NaiveDate. If that's implemented, then this check could142// be removed, and that error could be matched against in `transform_datetime_*s`143// See https://github.com/chronotope/chrono/issues/1075.144if HOUR_PATTERN.is_match(fmt) ^ MINUTE_PATTERN.is_match(fmt) {145polars_bail!(ComputeError: "Invalid format string: \146Please either specify both hour and minute, or neither.");147}148if SECOND_PATTERN.is_match(fmt) && !HOUR_PATTERN.is_match(fmt) {149polars_bail!(ComputeError: "Invalid format string: \150Found seconds directive, but no hours directive.");151}152if TWELVE_HOUR_PATTERN.is_match(fmt) ^ MERIDIEM_PATTERN.is_match(fmt) {153polars_bail!(ComputeError: "Invalid format string: \154Please either specify both 12-hour directive and meridiem directive, or neither.");155}156157Ok(fmt158.replace("%D", "%m/%d/%y")159.replace("%R", "%H:%M")160.replace("%T", "%H:%M:%S")161.replace("%X", "%H:%M:%S")162.replace("%F", "%Y-%m-%d"))163}164165#[derive(Default, Clone)]166pub(super) struct StrpTimeState {}167168impl StrpTimeState {169#[inline]170// # Safety171// Caller must ensure that fmt adheres to the fmt rules of chrono and `fmt_len` is correct.172pub(super) unsafe fn parse(173&mut self,174val: &[u8],175fmt: &[u8],176fmt_len_val: u16,177) -> Option<NaiveDateTime> {178let mut offset = 0;179let mut negative = false;180if val.starts_with(b"-") && fmt.starts_with(b"%Y") {181offset = 1;182negative = true;183}184#[allow(non_snake_case)]185let has_B_code = fmt.windows(2).any(|w| w == b"%B");186// SAFETY: this still ensures get_unchecked won't be out of bounds as val will be at least as big as we expect.187// After consuming the full month name, we'll double check remaining len is exactly equal.188let is_too_short = has_B_code && val.len() - offset < (fmt_len_val as usize);189if (!has_B_code && val.len() - offset != (fmt_len_val as usize)) || is_too_short {190return None;191}192193const ESCAPE: u8 = b'%';194let mut year: i32 = 1;195// minimal day/month is always 1196// otherwise chrono may panic.197let mut month: u32 = 1;198let mut day: u32 = 1;199let mut hour: u32 = 0;200let mut min: u32 = 0;201let mut sec: u32 = 0;202let mut nano: u32 = 0;203204let mut fmt_iter = fmt.iter();205206while let Some(fmt_b) = fmt_iter.next() {207debug_assert!(offset < val.len());208let b = *val.get_unchecked(offset);209if *fmt_b == ESCAPE {210// SAFETY: we must ensure we provide valid patterns211let next = fmt_iter.next();212debug_assert!(next.is_some());213match next.unwrap_unchecked() {214b'Y' => {215(year, offset) = update_and_parse(4, offset, val)?;216if negative {217year *= -1218}219},220b'm' => {221(month, offset) = update_and_parse(2, offset, val)?;222if month > 12 {223return None;224}225},226b'b' => {227(month, offset) = parse_month_abbrev(val, offset)?;228},229b'B' => {230(month, offset) = parse_month_full(val, offset)?;231// After variable sized month is consumed, verify remaining is exact len232let new_fmt_len = fmt_len(fmt_iter.as_slice())?;233let remaining_val_len = val.len() - offset;234if remaining_val_len != (new_fmt_len as usize) {235return None;236}237},238b'd' => {239(day, offset) = update_and_parse(2, offset, val)?;240},241b'H' => {242(hour, offset) = update_and_parse(2, offset, val)?;243},244b'M' => {245(min, offset) = update_and_parse(2, offset, val)?;246},247b'S' => {248(sec, offset) = update_and_parse(2, offset, val)?;249},250b'y' => {251let new_offset = offset + 2;252let bytes = val.get_unchecked(offset..new_offset);253254let (decade, parsed) = atoi_simd::parse_any::<i32>(bytes).ok()?;255if parsed == 0 {256return None;257}258259if decade < 70 {260year = 2000 + decade;261} else {262year = 1900 + decade;263}264offset = new_offset;265},266b'9' => {267(nano, offset) = update_and_parse(9, offset, val)?;268break;269},270b'6' => {271(nano, offset) = update_and_parse(6, offset, val)?;272nano *= 1000;273break;274},275b'3' => {276(nano, offset) = update_and_parse(3, offset, val)?;277nano *= 1_000_000;278break;279},280_ => return None,281}282}283// consume284else if b == *fmt_b {285offset += 1;286} else {287return None;288}289}290// all values processed291if offset == val.len() {292NaiveDate::from_ymd_opt(year, month, day)293.and_then(|nd| nd.and_hms_nano_opt(hour, min, sec, nano))294}295// remaining values did not match pattern296else {297None298}299}300}301302pub(super) fn fmt_len(fmt: &[u8]) -> Option<u16> {303let mut iter = fmt.iter();304let mut cnt = 0;305306while let Some(&val) = iter.next() {307match val {308b'%' => match iter.next() {309Some(&next_val) => match next_val {310b'Y' => cnt += 4,311b'y' => cnt += 2,312b'd' => cnt += 2,313b'm' => cnt += 2,314b'b' => cnt += 3,315b'B' => cnt += 3, // This is minimum size for full month316b'H' => cnt += 2,317b'M' => cnt += 2,318b'S' => cnt += 2,319b'9' => {320cnt += 9;321if matches!(iter.next(), Some(&b'f')) && iter.next().is_none() {322return Some(cnt);323} else {324return None;325}326},327b'6' => {328cnt += 6;329if matches!(iter.next(), Some(&b'f')) && iter.next().is_none() {330return Some(cnt);331} else {332return None;333}334},335b'3' => {336cnt += 3;337if matches!(iter.next(), Some(&b'f')) && iter.next().is_none() {338return Some(cnt);339} else {340return None;341}342},343_ => return None,344},345None => return None,346},347_ => {348cnt += 1;349},350}351}352Some(cnt)353}354355#[cfg(test)]356mod test {357use super::*;358359#[test]360fn test_parsing() {361let patterns = [362(363"2021-01-01",364"%Y-%m-%d",36510,366Some(367NaiveDate::from_ymd_opt(2021, 1, 1)368.unwrap()369.and_hms_nano_opt(0, 0, 0, 0)370.unwrap(),371),372),373(374"2021-01-01 07:45:12",375"%Y-%m-%d %H:%M:%S",37619,377Some(378NaiveDate::from_ymd_opt(2021, 1, 1)379.unwrap()380.and_hms_nano_opt(7, 45, 12, 0)381.unwrap(),382),383),384(385"2021-01-01 07:45:12",386"%Y-%m-%d %H:%M:%S",38719,388Some(389NaiveDate::from_ymd_opt(2021, 1, 1)390.unwrap()391.and_hms_nano_opt(7, 45, 12, 0)392.unwrap(),393),394),395(396"2019-04-18T02:45:55.555000000",397"%Y-%m-%dT%H:%M:%S.%9f",39829,399Some(400NaiveDate::from_ymd_opt(2019, 4, 18)401.unwrap()402.and_hms_nano_opt(2, 45, 55, 555000000)403.unwrap(),404),405),406(407"2019-04-18T02:45:55.555000",408"%Y-%m-%dT%H:%M:%S.%6f",40926,410Some(411NaiveDate::from_ymd_opt(2019, 4, 18)412.unwrap()413.and_hms_nano_opt(2, 45, 55, 555000000)414.unwrap(),415),416),417(418"2019-04-18T02:45:55.555",419"%Y-%m-%dT%H:%M:%S.%3f",42023,421Some(422NaiveDate::from_ymd_opt(2019, 4, 18)423.unwrap()424.and_hms_nano_opt(2, 45, 55, 555000000)425.unwrap(),426),427),428];429430for (val, fmt, len, expected) in patterns {431assert_eq!(fmt_len(fmt.as_bytes()).unwrap(), len);432unsafe {433assert_eq!(434StrpTimeState::default().parse(val.as_bytes(), fmt.as_bytes(), len),435expected436)437};438}439}440}441442443