Path: blob/main/crates/polars-time/src/chunkedarray/string/mod.rs
6940 views
pub mod infer;1use chrono::DateTime;2mod patterns;3mod strptime;4pub use patterns::Pattern;5#[cfg(feature = "dtype-time")]6use polars_core::chunked_array::temporal::time_to_time64ns;7use polars_core::prelude::arity::unary_elementwise;8use polars_utils::cache::LruCachedFunc;910use super::*;11#[cfg(feature = "dtype-date")]12use crate::chunkedarray::date::naive_date_to_date;13use crate::prelude::string::strptime::StrpTimeState;1415#[cfg(feature = "dtype-time")]16fn time_pattern<F, K>(val: &str, convert: F) -> Option<&'static str>17// (string, fmt) -> PolarsResult18where19F: Fn(&str, &str) -> chrono::ParseResult<K>,20{21patterns::TIME_H_M_S22.iter()23.chain(patterns::TIME_H_M_S)24.find(|fmt| convert(val, fmt).is_ok())25.copied()26}2728fn datetime_pattern<F, K>(val: &str, convert: F) -> Option<&'static str>29// (string, fmt) -> PolarsResult30where31F: Fn(&str, &str) -> chrono::ParseResult<K>,32{33patterns::DATETIME_Y_M_D34.iter()35.chain(patterns::DATETIME_D_M_Y)36.find(|fmt| convert(val, fmt).is_ok())37.copied()38}3940fn date_pattern<F, K>(val: &str, convert: F) -> Option<&'static str>41// (string, fmt) -> PolarsResult42where43F: Fn(&str, &str) -> chrono::ParseResult<K>,44{45patterns::DATE_Y_M_D46.iter()47.chain(patterns::DATE_D_M_Y)48.find(|fmt| convert(val, fmt).is_ok())49.copied()50}5152fn get_first_val(ca: &StringChunked) -> PolarsResult<&str> {53let idx = ca.first_non_null().ok_or_else(|| {54polars_err!(ComputeError:55"unable to determine date parsing format, all values are null",56)57})?;58Ok(ca.get(idx).expect("should not be null"))59}6061#[cfg(feature = "dtype-datetime")]62fn sniff_fmt_datetime(ca_string: &StringChunked) -> PolarsResult<&'static str> {63let val = get_first_val(ca_string)?;64datetime_pattern(val, NaiveDateTime::parse_from_str)65.or_else(|| datetime_pattern(val, NaiveDate::parse_from_str))66.ok_or_else(|| polars_err!(parse_fmt_idk = "datetime"))67}6869#[cfg(feature = "dtype-date")]70fn sniff_fmt_date(ca_string: &StringChunked) -> PolarsResult<&'static str> {71let val = get_first_val(ca_string)?;72date_pattern(val, NaiveDate::parse_from_str).ok_or_else(|| polars_err!(parse_fmt_idk = "date"))73}7475#[cfg(feature = "dtype-time")]76fn sniff_fmt_time(ca_string: &StringChunked) -> PolarsResult<&'static str> {77let val = get_first_val(ca_string)?;78time_pattern(val, NaiveTime::parse_from_str).ok_or_else(|| polars_err!(parse_fmt_idk = "time"))79}8081pub trait StringMethods: AsString {82#[cfg(feature = "dtype-time")]83/// Parsing string values and return a [`TimeChunked`]84fn as_time(&self, fmt: Option<&str>, use_cache: bool) -> PolarsResult<TimeChunked> {85let string_ca = self.as_string();86let fmt = match fmt {87Some(fmt) => fmt,88None => sniff_fmt_time(string_ca)?,89};90let use_cache = use_cache && string_ca.len() > 50;9192let mut convert = LruCachedFunc::new(93|s| {94let naive_time = NaiveTime::parse_from_str(s, fmt).ok()?;95Some(time_to_time64ns(&naive_time))96},97(string_ca.len() as f64).sqrt() as usize,98);99let ca = unary_elementwise(string_ca, |opt_s| convert.eval(opt_s?, use_cache));100Ok(ca.with_name(string_ca.name().clone()).into_time())101}102103#[cfg(feature = "dtype-date")]104/// Parsing string values and return a [`DateChunked`]105/// Different from `as_date` this function allows matches that not contain the whole string106/// e.g. "foo-2021-01-01-bar" could match "2021-01-01"107fn as_date_not_exact(&self, fmt: Option<&str>) -> PolarsResult<DateChunked> {108let string_ca = self.as_string();109let fmt = match fmt {110Some(fmt) => fmt,111None => sniff_fmt_date(string_ca)?,112};113let ca = unary_elementwise(string_ca, |opt_s| {114let mut s = opt_s?;115while !s.is_empty() {116match NaiveDate::parse_and_remainder(s, fmt) {117Ok((nd, _)) => return Some(naive_date_to_date(nd)),118Err(_) => {119let mut it = s.chars();120it.next();121s = it.as_str();122},123}124}125126None127});128Ok(ca.with_name(string_ca.name().clone()).into_date())129}130131#[cfg(feature = "dtype-datetime")]132/// Parsing string values and return a [`DatetimeChunked`]133/// Different from `as_datetime` this function allows matches that not contain the whole string134/// e.g. "foo-2021-01-01-bar" could match "2021-01-01"135fn as_datetime_not_exact(136&self,137fmt: Option<&str>,138tu: TimeUnit,139tz_aware: bool,140tz: Option<&TimeZone>,141_ambiguous: &StringChunked,142// Ensure that the inferred time_zone matches the given time_zone.143ensure_matching_tz: bool,144) -> PolarsResult<DatetimeChunked> {145let string_ca = self.as_string();146let had_format = fmt.is_some();147let fmt = match fmt {148Some(fmt) => fmt,149None => sniff_fmt_datetime(string_ca)?,150};151152let func = match tu {153TimeUnit::Nanoseconds => datetime_to_timestamp_ns,154TimeUnit::Microseconds => datetime_to_timestamp_us,155TimeUnit::Milliseconds => datetime_to_timestamp_ms,156};157158let ca = unary_elementwise(string_ca, |opt_s| {159let mut s = opt_s?;160while !s.is_empty() {161let timestamp = if tz_aware {162DateTime::parse_and_remainder(s, fmt).map(|(dt, _r)| func(dt.naive_utc()))163} else {164NaiveDateTime::parse_and_remainder(s, fmt).map(|(nd, _r)| func(nd))165};166match timestamp {167Ok(ts) => return Some(ts),168Err(_) => {169let mut it = s.chars();170it.next();171s = it.as_str();172},173}174}175None176})177.with_name(string_ca.name().clone());178179polars_ensure!(180!ensure_matching_tz || had_format || !(tz_aware && tz.is_none()),181to_datetime_tz_mismatch182);183184match (tz_aware, tz) {185#[cfg(feature = "timezones")]186(false, Some(tz)) => polars_ops::prelude::replace_time_zone(187&ca.into_datetime(tu, None),188Some(tz),189_ambiguous,190NonExistent::Raise,191),192#[cfg(feature = "timezones")]193(true, tz) => Ok(ca.into_datetime(tu, Some(tz.cloned().unwrap_or(TimeZone::UTC)))),194_ => Ok(ca.into_datetime(tu, None)),195}196}197198#[cfg(feature = "dtype-date")]199/// Parsing string values and return a [`DateChunked`]200fn as_date(&self, fmt: Option<&str>, use_cache: bool) -> PolarsResult<DateChunked> {201let string_ca = self.as_string();202let fmt = match fmt {203Some(fmt) => fmt,204None => return infer::to_date(string_ca),205};206let use_cache = use_cache && string_ca.len() > 50;207let fmt = strptime::compile_fmt(fmt)?;208209// We can use the fast parser.210let ca = if let Some(fmt_len) = strptime::fmt_len(fmt.as_bytes()) {211let mut strptime_cache = StrpTimeState::default();212let mut convert = LruCachedFunc::new(213|s: &str| {214// SAFETY: fmt_len is correct, it was computed with this `fmt` str.215match unsafe { strptime_cache.parse(s.as_bytes(), fmt.as_bytes(), fmt_len) } {216// Fallback to chrono.217None => NaiveDate::parse_from_str(s, &fmt).ok(),218Some(ndt) => Some(ndt.date()),219}220.map(naive_date_to_date)221},222(string_ca.len() as f64).sqrt() as usize,223);224unary_elementwise(string_ca, |val| convert.eval(val?, use_cache))225} else {226let mut convert = LruCachedFunc::new(227|s| {228let naive_date = NaiveDate::parse_from_str(s, &fmt).ok()?;229Some(naive_date_to_date(naive_date))230},231(string_ca.len() as f64).sqrt() as usize,232);233unary_elementwise(string_ca, |val| convert.eval(val?, use_cache))234};235236Ok(ca.with_name(string_ca.name().clone()).into_date())237}238239#[cfg(feature = "dtype-datetime")]240/// Parsing string values and return a [`DatetimeChunked`].241fn as_datetime(242&self,243fmt: Option<&str>,244tu: TimeUnit,245use_cache: bool,246tz_aware: bool,247tz: Option<&TimeZone>,248ambiguous: &StringChunked,249) -> PolarsResult<DatetimeChunked> {250let string_ca = self.as_string();251let fmt = match fmt {252Some(fmt) => fmt,253None => return infer::to_datetime(string_ca, tu, tz, ambiguous, true),254};255let fmt = strptime::compile_fmt(fmt)?;256let use_cache = use_cache && string_ca.len() > 50;257258let func = match tu {259TimeUnit::Nanoseconds => datetime_to_timestamp_ns,260TimeUnit::Microseconds => datetime_to_timestamp_us,261TimeUnit::Milliseconds => datetime_to_timestamp_ms,262};263264if tz_aware {265#[cfg(feature = "timezones")]266{267let mut convert = LruCachedFunc::new(268|s: &str| {269let dt = DateTime::parse_from_str(s, &fmt).ok()?;270Some(func(dt.naive_utc()))271},272(string_ca.len() as f64).sqrt() as usize,273);274Ok(275unary_elementwise(string_ca, |opt_s| convert.eval(opt_s?, use_cache))276.with_name(string_ca.name().clone())277.into_datetime(tu, Some(tz.cloned().unwrap_or(TimeZone::UTC))),278)279}280#[cfg(not(feature = "timezones"))]281{282panic!("activate 'timezones' feature")283}284} else {285let transform = match tu {286TimeUnit::Nanoseconds => infer::transform_datetime_ns,287TimeUnit::Microseconds => infer::transform_datetime_us,288TimeUnit::Milliseconds => infer::transform_datetime_ms,289};290// We can use the fast parser.291let ca = if let Some(fmt_len) = self::strptime::fmt_len(fmt.as_bytes()) {292let mut strptime_cache = StrpTimeState::default();293let mut convert = LruCachedFunc::new(294|s: &str| {295// SAFETY: fmt_len is correct, it was computed with this `fmt` str.296match unsafe { strptime_cache.parse(s.as_bytes(), fmt.as_bytes(), fmt_len) }297{298None => transform(s, &fmt),299Some(ndt) => Some(func(ndt)),300}301},302(string_ca.len() as f64).sqrt() as usize,303);304unary_elementwise(string_ca, |opt_s| convert.eval(opt_s?, use_cache))305} else {306let mut convert = LruCachedFunc::new(307|s| transform(s, &fmt),308(string_ca.len() as f64).sqrt() as usize,309);310unary_elementwise(string_ca, |opt_s| convert.eval(opt_s?, use_cache))311};312let dt = ca313.with_name(string_ca.name().clone())314.into_datetime(tu, None);315match tz {316#[cfg(feature = "timezones")]317Some(tz) => polars_ops::prelude::replace_time_zone(318&dt,319Some(tz),320ambiguous,321NonExistent::Raise,322),323_ => Ok(dt),324}325}326}327}328329pub trait AsString {330fn as_string(&self) -> &StringChunked;331}332333impl AsString for StringChunked {334fn as_string(&self) -> &StringChunked {335self336}337}338339impl StringMethods for StringChunked {}340341342