// Path: blob/main/crates/polars-time/src/chunkedarray/string/infer.rs
// 6940 views
use arrow::array::PrimitiveArray;1use chrono::format::ParseErrorKind;2use chrono::{DateTime, NaiveDate, NaiveDateTime, NaiveTime};3use polars_core::prelude::*;45use super::patterns::{self, Pattern};6#[cfg(feature = "dtype-date")]7use crate::chunkedarray::date::naive_date_to_date;8use crate::chunkedarray::string::strptime;9use crate::prelude::string::strptime::StrpTimeState;1011polars_utils::regex_cache::cached_regex! {12static DATETIME_DMY_RE = r#"(?x)13^14['"]? # optional quotes15(?:\d{1,2}) # day16[-/\.] # separator17(?P<month>[01]?\d{1}) # month18[-/\.] # separator19(?:\d{4,}) # year20(?:21[T\ ] # separator22(?:\d{1,2}) # hour23:? # separator24(?:\d{1,2}) # minute25(?:26:? # separator27(?:\d{1,2}) # second28(?:29\.(?:\d{1,9}) # subsecond30)?31)?32)?33['"]? # optional quotes34$35"#;3637static DATETIME_YMD_RE = r#"(?x)38^39['"]? # optional quotes40(?:\d{4,}) # year41[-/\.] # separator42(?P<month>[01]?\d{1}) # month43[-/\.] # separator44(?:\d{1,2}) # day45(?:46[T\ ] # separator47(?:\d{1,2}) # hour48:? # separator49(?:\d{1,2}) # minute50(?:51:? # separator52(?:\d{1,2}) # seconds53(?:54\.(?:\d{1,9}) # subsecond55)?56)?57)?58['"]? # optional quotes59$60"#;6162static DATETIME_YMDZ_RE = r#"(?x)63^64['"]? # optional quotes65(?:\d{4,}) # year66[-/\.] # separator67(?P<month>[01]?\d{1}) # month68[-/\.] # separator69(?:\d{1,2}) # year70[T\ ] # separator71(?:\d{2}) # hour72:? # separator73(?:\d{2}) # minute74(?:75:? # separator76(?:\d{2}) # second77(?:78\.(?:\d{1,9}) # subsecond79)?80)?81(?:82# offset (e.g. +01:00, +0100, or +01)83[+-](?:\d{2})84(?::?\d{2})?85# or Zulu suffix86|Z87)88['"]? 
# optional quotes89$90"#;91}9293impl Pattern {94pub fn is_inferable(&self, val: &str) -> bool {95match self {96Pattern::DateDMY => true, // there are very few Date patterns, so it's cheaper97Pattern::DateYMD => true, // to just try them98Pattern::Time => true,99Pattern::DatetimeDMY => match DATETIME_DMY_RE.captures(val) {100Some(search) => (1..=12).contains(101&search102.name("month")103.unwrap()104.as_str()105.parse::<u8>()106.unwrap(),107),108None => false,109},110Pattern::DatetimeYMD => match DATETIME_YMD_RE.captures(val) {111Some(search) => (1..=12).contains(112&search113.name("month")114.unwrap()115.as_str()116.parse::<u8>()117.unwrap(),118),119None => false,120},121Pattern::DatetimeYMDZ => match DATETIME_YMDZ_RE.captures(val) {122Some(search) => (1..=12).contains(123&search124.name("month")125.unwrap()126.as_str()127.parse::<u8>()128.unwrap(),129),130None => false,131},132}133}134}135136pub trait StrpTimeParser<T> {137fn parse_bytes(&mut self, val: &[u8], time_unit: Option<TimeUnit>) -> Option<T>;138}139140#[cfg(feature = "dtype-datetime")]141impl StrpTimeParser<i64> for DatetimeInfer<Int64Type> {142fn parse_bytes(&mut self, val: &[u8], time_unit: Option<TimeUnit>) -> Option<i64> {143if self.fmt_len == 0 {144self.fmt_len = strptime::fmt_len(self.latest_fmt.as_bytes())?;145}146let transform = match time_unit {147Some(TimeUnit::Nanoseconds) => datetime_to_timestamp_ns,148Some(TimeUnit::Microseconds) => datetime_to_timestamp_us,149Some(TimeUnit::Milliseconds) => datetime_to_timestamp_ms,150_ => unreachable!(), // time_unit has to be provided for datetime151};152unsafe {153self.transform_bytes154.parse(val, self.latest_fmt.as_bytes(), self.fmt_len)155.map(transform)156.or_else(|| {157// TODO! 
this will try all patterns.158// somehow we must early escape if value is invalid159for fmt in self.patterns {160if self.fmt_len == 0 {161self.fmt_len = strptime::fmt_len(fmt.as_bytes())?;162}163if let Some(parsed) = self164.transform_bytes165.parse(val, fmt.as_bytes(), self.fmt_len)166.map(datetime_to_timestamp_us)167{168self.latest_fmt = fmt;169return Some(parsed);170}171}172None173})174}175}176}177178#[cfg(feature = "dtype-date")]179impl StrpTimeParser<i32> for DatetimeInfer<Int32Type> {180fn parse_bytes(&mut self, val: &[u8], _time_unit: Option<TimeUnit>) -> Option<i32> {181if self.fmt_len == 0 {182self.fmt_len = strptime::fmt_len(self.latest_fmt.as_bytes())?;183}184unsafe {185self.transform_bytes186.parse(val, self.latest_fmt.as_bytes(), self.fmt_len)187.map(|ndt| naive_date_to_date(ndt.date()))188.or_else(|| {189// TODO! this will try all patterns.190// somehow we must early escape if value is invalid191for fmt in self.patterns {192if self.fmt_len == 0 {193self.fmt_len = strptime::fmt_len(fmt.as_bytes())?;194}195if let Some(parsed) = self196.transform_bytes197.parse(val, fmt.as_bytes(), self.fmt_len)198.map(|ndt| naive_date_to_date(ndt.date()))199{200self.latest_fmt = fmt;201return Some(parsed);202}203}204None205})206}207}208}209210#[derive(Clone)]211pub struct DatetimeInfer<T: PolarsNumericType> {212pub pattern: Pattern,213patterns: &'static [&'static str],214latest_fmt: &'static str,215transform: fn(&str, &str) -> Option<T::Native>,216transform_bytes: StrpTimeState,217fmt_len: u16,218pub logical_type: DataType,219}220221pub trait TryFromWithUnit<T>: Sized {222type Error;223fn try_from_with_unit(pattern: T, unit: Option<TimeUnit>) -> PolarsResult<Self>;224}225226#[cfg(feature = "dtype-datetime")]227impl TryFromWithUnit<Pattern> for DatetimeInfer<Int64Type> {228type Error = PolarsError;229230fn try_from_with_unit(value: Pattern, time_unit: Option<TimeUnit>) -> PolarsResult<Self> {231let time_unit = time_unit.expect("time_unit must be provided for 
datetime");232233let transform = match (time_unit, value) {234(TimeUnit::Milliseconds, Pattern::DatetimeYMDZ) => transform_tzaware_datetime_ms,235(TimeUnit::Milliseconds, _) => transform_datetime_ms,236(TimeUnit::Microseconds, Pattern::DatetimeYMDZ) => transform_tzaware_datetime_us,237(TimeUnit::Microseconds, _) => transform_datetime_us,238(TimeUnit::Nanoseconds, Pattern::DatetimeYMDZ) => transform_tzaware_datetime_ns,239(TimeUnit::Nanoseconds, _) => transform_datetime_ns,240};241let (pattern, patterns) = match value {242Pattern::DatetimeDMY | Pattern::DateDMY => {243(Pattern::DatetimeDMY, patterns::DATETIME_D_M_Y)244},245Pattern::DatetimeYMD | Pattern::DateYMD => {246(Pattern::DatetimeYMD, patterns::DATETIME_Y_M_D)247},248Pattern::DatetimeYMDZ => (Pattern::DatetimeYMDZ, patterns::DATETIME_Y_M_D_Z),249Pattern::Time => (Pattern::Time, patterns::TIME_H_M_S),250};251252Ok(DatetimeInfer {253pattern,254patterns,255latest_fmt: patterns[0],256transform,257transform_bytes: StrpTimeState::default(),258fmt_len: 0,259logical_type: DataType::Datetime(time_unit, None),260})261}262}263264#[cfg(feature = "dtype-date")]265impl TryFromWithUnit<Pattern> for DatetimeInfer<Int32Type> {266type Error = PolarsError;267268fn try_from_with_unit(value: Pattern, _time_unit: Option<TimeUnit>) -> PolarsResult<Self> {269match value {270Pattern::DateDMY => Ok(DatetimeInfer {271pattern: Pattern::DateDMY,272patterns: patterns::DATE_D_M_Y,273latest_fmt: patterns::DATE_D_M_Y[0],274transform: transform_date,275transform_bytes: StrpTimeState::default(),276fmt_len: 0,277logical_type: DataType::Date,278}),279Pattern::DateYMD => Ok(DatetimeInfer {280pattern: Pattern::DateYMD,281patterns: patterns::DATE_Y_M_D,282latest_fmt: patterns::DATE_Y_M_D[0],283transform: transform_date,284transform_bytes: StrpTimeState::default(),285fmt_len: 0,286logical_type: DataType::Date,287}),288_ => polars_bail!(ComputeError: "could not convert pattern"),289}290}291}292293impl<T: PolarsNumericType> DatetimeInfer<T> {294pub fn 
parse(&mut self, val: &str) -> Option<T::Native> {295match (self.transform)(val, self.latest_fmt) {296Some(parsed) => Some(parsed),297// try other patterns298None => {299if !self.pattern.is_inferable(val) {300return None;301}302for fmt in self.patterns {303self.fmt_len = 0;304if let Some(parsed) = (self.transform)(val, fmt) {305self.latest_fmt = fmt;306return Some(parsed);307}308}309None310},311}312}313}314315impl<T: PolarsNumericType> DatetimeInfer<T> {316fn coerce_string(&mut self, ca: &StringChunked) -> Series {317let chunks = ca.downcast_iter().map(|array| {318let iter = array319.into_iter()320.map(|opt_val| opt_val.and_then(|val| self.parse(val)));321PrimitiveArray::from_trusted_len_iter(iter)322});323ChunkedArray::<T>::from_chunk_iter(ca.name().clone(), chunks)324.into_series()325.cast(&self.logical_type)326.unwrap()327.with_name(ca.name().clone())328}329}330331#[cfg(feature = "dtype-date")]332fn transform_date(val: &str, fmt: &str) -> Option<i32> {333NaiveDate::parse_from_str(val, fmt)334.ok()335.map(naive_date_to_date)336}337338#[cfg(feature = "dtype-datetime")]339pub(crate) fn transform_datetime_ns(val: &str, fmt: &str) -> Option<i64> {340match NaiveDateTime::parse_from_str(val, fmt) {341Ok(ndt) => Some(datetime_to_timestamp_ns(ndt)),342Err(parse_error) => match parse_error.kind() {343ParseErrorKind::NotEnough => NaiveDate::parse_from_str(val, fmt)344.ok()345.map(|nd| datetime_to_timestamp_ns(nd.and_hms_opt(0, 0, 0).unwrap())),346_ => None,347},348}349}350351fn transform_tzaware_datetime_ns(val: &str, fmt: &str) -> Option<i64> {352let dt = DateTime::parse_from_str(val, fmt);353dt.ok().map(|dt| datetime_to_timestamp_ns(dt.naive_utc()))354}355356#[cfg(feature = "dtype-datetime")]357pub(crate) fn transform_datetime_us(val: &str, fmt: &str) -> Option<i64> {358match NaiveDateTime::parse_from_str(val, fmt) {359Ok(ndt) => Some(datetime_to_timestamp_us(ndt)),360Err(parse_error) => match parse_error.kind() {361ParseErrorKind::NotEnough => 
NaiveDate::parse_from_str(val, fmt)362.ok()363.map(|nd| datetime_to_timestamp_us(nd.and_hms_opt(0, 0, 0).unwrap())),364_ => None,365},366}367}368369fn transform_tzaware_datetime_us(val: &str, fmt: &str) -> Option<i64> {370let dt = DateTime::parse_from_str(val, fmt);371dt.ok().map(|dt| datetime_to_timestamp_us(dt.naive_utc()))372}373374#[cfg(feature = "dtype-datetime")]375pub(crate) fn transform_datetime_ms(val: &str, fmt: &str) -> Option<i64> {376match NaiveDateTime::parse_from_str(val, fmt) {377Ok(ndt) => Some(datetime_to_timestamp_ms(ndt)),378Err(parse_error) => match parse_error.kind() {379ParseErrorKind::NotEnough => NaiveDate::parse_from_str(val, fmt)380.ok()381.map(|nd| datetime_to_timestamp_ms(nd.and_hms_opt(0, 0, 0).unwrap())),382_ => None,383},384}385}386387fn transform_tzaware_datetime_ms(val: &str, fmt: &str) -> Option<i64> {388let dt = DateTime::parse_from_str(val, fmt);389dt.ok().map(|dt| datetime_to_timestamp_ms(dt.naive_utc()))390}391392pub fn infer_pattern_single(val: &str) -> Option<Pattern> {393// Dates come first, because we see datetimes as superset of dates394infer_pattern_date_single(val)395.or_else(|| infer_pattern_time_single(val))396.or_else(|| infer_pattern_datetime_single(val))397}398399fn infer_pattern_datetime_single(val: &str) -> Option<Pattern> {400if patterns::DATETIME_D_M_Y.iter().any(|fmt| {401NaiveDateTime::parse_from_str(val, fmt).is_ok()402|| NaiveDate::parse_from_str(val, fmt).is_ok()403}) {404Some(Pattern::DatetimeDMY)405} else if patterns::DATETIME_Y_M_D.iter().any(|fmt| {406NaiveDateTime::parse_from_str(val, fmt).is_ok()407|| NaiveDate::parse_from_str(val, fmt).is_ok()408}) {409Some(Pattern::DatetimeYMD)410} else if patterns::DATETIME_Y_M_D_Z411.iter()412.any(|fmt| NaiveDateTime::parse_from_str(val, fmt).is_ok())413{414Some(Pattern::DatetimeYMDZ)415} else {416None417}418}419420fn infer_pattern_date_single(val: &str) -> Option<Pattern> {421if patterns::DATE_D_M_Y422.iter()423.any(|fmt| NaiveDate::parse_from_str(val, 
fmt).is_ok())424{425Some(Pattern::DateDMY)426} else if patterns::DATE_Y_M_D427.iter()428.any(|fmt| NaiveDate::parse_from_str(val, fmt).is_ok())429{430Some(Pattern::DateYMD)431} else {432None433}434}435436fn infer_pattern_time_single(val: &str) -> Option<Pattern> {437patterns::TIME_H_M_S438.iter()439.any(|fmt| NaiveTime::parse_from_str(val, fmt).is_ok())440.then_some(Pattern::Time)441}442443#[cfg(feature = "dtype-datetime")]444pub fn to_datetime_with_inferred_tz(445ca: &StringChunked,446tu: TimeUnit,447strict: bool,448exact: bool,449ambiguous: &StringChunked,450) -> PolarsResult<DatetimeChunked> {451use super::StringMethods;452453let out = if exact {454to_datetime(ca, tu, None, ambiguous, false)455} else {456ca.as_datetime_not_exact(None, tu, false, None, ambiguous, false)457}?;458459if strict && ca.null_count() != out.null_count() {460polars_core::utils::handle_casting_failures(461&ca.clone().into_series(),462&out.clone().into_series(),463)?;464}465466Ok(out)467}468469#[cfg(feature = "dtype-datetime")]470pub fn to_datetime(471ca: &StringChunked,472tu: TimeUnit,473tz: Option<&TimeZone>,474_ambiguous: &StringChunked,475// Ensure that the inferred time_zone matches the given time_zone.476ensure_matching_time_zone: bool,477) -> PolarsResult<DatetimeChunked> {478match ca.first_non_null() {479None => {480Ok(Int64Chunked::full_null(ca.name().clone(), ca.len()).into_datetime(tu, tz.cloned()))481},482Some(idx) => {483let subset = ca.slice(idx as i64, ca.len());484let pattern = subset485.into_iter()486.find_map(|opt_val| opt_val.and_then(infer_pattern_datetime_single))487.ok_or_else(|| polars_err!(parse_fmt_idk = "date"))?;488let mut infer = DatetimeInfer::<Int64Type>::try_from_with_unit(pattern, Some(tu))?;489match pattern {490#[cfg(feature = "timezones")]491Pattern::DatetimeYMDZ => infer.coerce_string(ca).datetime().map(|ca| {492polars_ensure!(493!ensure_matching_time_zone || tz.is_some(),494to_datetime_tz_mismatch495);496497let mut ca = ca.clone();498// `tz` has already 
been validated.499ca.set_time_unit_and_time_zone(tu, tz.cloned().unwrap_or(TimeZone::UTC))?;500Ok(ca)501})?,502_ => infer.coerce_string(ca).datetime().map(|ca| {503let mut ca = ca.clone();504ca.set_time_unit(tu);505match tz {506#[cfg(feature = "timezones")]507Some(tz) => polars_ops::prelude::replace_time_zone(508&ca,509Some(tz),510_ambiguous,511NonExistent::Raise,512),513_ => Ok(ca),514}515})?,516}517},518}519}520#[cfg(feature = "dtype-date")]521pub(crate) fn to_date(ca: &StringChunked) -> PolarsResult<DateChunked> {522match ca.first_non_null() {523None => Ok(Int32Chunked::full_null(ca.name().clone(), ca.len()).into_date()),524Some(idx) => {525let subset = ca.slice(idx as i64, ca.len());526let pattern = subset527.into_iter()528.find_map(|opt_val| opt_val.and_then(infer_pattern_date_single))529.ok_or_else(|| polars_err!(parse_fmt_idk = "date"))?;530let mut infer = DatetimeInfer::<Int32Type>::try_from_with_unit(pattern, None).unwrap();531infer.coerce_string(ca).date().cloned()532},533}534}535536537