// Path: blob/main/crates/polars-python/src/conversion/any_value.rs
// 8362 views
use std::borrow::{Borrow, Cow};1use std::sync::{Arc, Mutex};23use chrono::{4DateTime, Datelike, FixedOffset, NaiveDate, NaiveDateTime, NaiveTime, TimeDelta, Timelike,5};6use chrono_tz::Tz;7use hashbrown::HashMap;8use num_traits::ToPrimitive;9#[cfg(feature = "object")]10use polars::chunked_array::object::PolarsObjectSafe;11#[cfg(feature = "object")]12use polars::datatypes::OwnedObject;13use polars::datatypes::{DataType, Field, TimeUnit};14use polars::prelude::{AnyValue, PlSmallStr, Series, TimeZone};15use polars_compute::decimal::{DEC128_MAX_PREC, DecimalFmtBuffer, dec128_fits};16use polars_core::utils::any_values_to_supertype_and_n_dtypes;17use polars_core::utils::arrow::temporal_conversions::date32_to_date;18use polars_utils::aliases::PlFixedStateQuality;19use pyo3::exceptions::{PyOverflowError, PyTypeError, PyValueError};20use pyo3::prelude::*;21use pyo3::sync::PyOnceLock;22use pyo3::types::{23PyBool, PyBytes, PyDate, PyDateTime, PyDelta, PyDict, PyFloat, PyInt, PyList, PyMapping,24PyRange, PySequence, PyString, PyTime, PyTuple, PyType, PyTzInfo,25};26use pyo3::{IntoPyObjectExt, PyTypeCheck, intern};2728use super::datetime::{29datetime_to_py_object, elapsed_offset_to_timedelta, nanos_since_midnight_to_naivetime,30};31use super::{ObjectValue, Wrap, struct_dict};32use crate::error::PyPolarsErr;33use crate::py_modules::{pl_series, pl_utils};34use crate::series::PySeries;3536impl<'py> IntoPyObject<'py> for Wrap<AnyValue<'_>> {37type Target = PyAny;38type Output = Bound<'py, Self::Target>;39type Error = PyErr;4041fn into_pyobject(self, py: Python<'py>) -> Result<Self::Output, Self::Error> {42any_value_into_py_object(self.0, py)43}44}4546impl<'py> IntoPyObject<'py> for &Wrap<AnyValue<'_>> {47type Target = PyAny;48type Output = Bound<'py, Self::Target>;49type Error = PyErr;5051fn into_pyobject(self, py: Python<'py>) -> Result<Self::Output, Self::Error> {52self.clone().into_pyobject(py)53}54}5556impl<'a, 'py> FromPyObject<'a, 'py> for Wrap<AnyValue<'static>> {57type 
Error = PyErr;5859fn extract(ob: Borrowed<'a, 'py, PyAny>) -> PyResult<Self> {60py_object_to_any_value(&ob.to_owned(), true, true).map(Wrap)61}62}6364pub(crate) fn any_value_into_py_object<'py>(65av: AnyValue<'_>,66py: Python<'py>,67) -> PyResult<Bound<'py, PyAny>> {68let utils = pl_utils(py).bind(py);69match av {70AnyValue::UInt8(v) => v.into_bound_py_any(py),71AnyValue::UInt16(v) => v.into_bound_py_any(py),72AnyValue::UInt32(v) => v.into_bound_py_any(py),73AnyValue::UInt64(v) => v.into_bound_py_any(py),74AnyValue::UInt128(v) => v.into_bound_py_any(py),75AnyValue::Int8(v) => v.into_bound_py_any(py),76AnyValue::Int16(v) => v.into_bound_py_any(py),77AnyValue::Int32(v) => v.into_bound_py_any(py),78AnyValue::Int64(v) => v.into_bound_py_any(py),79AnyValue::Int128(v) => v.into_bound_py_any(py),80AnyValue::Float16(v) => v.to_f32().into_bound_py_any(py),81AnyValue::Float32(v) => v.into_bound_py_any(py),82AnyValue::Float64(v) => v.into_bound_py_any(py),83AnyValue::Null => py.None().into_bound_py_any(py),84AnyValue::Boolean(v) => v.into_bound_py_any(py),85AnyValue::String(v) => v.into_bound_py_any(py),86AnyValue::StringOwned(v) => v.into_bound_py_any(py),87AnyValue::Categorical(cat, map) | AnyValue::Enum(cat, map) => unsafe {88map.cat_to_str_unchecked(cat).into_bound_py_any(py)89},90AnyValue::CategoricalOwned(cat, map) | AnyValue::EnumOwned(cat, map) => unsafe {91map.cat_to_str_unchecked(cat).into_bound_py_any(py)92},93AnyValue::Date(v) => {94let date = date32_to_date(v);95date.into_bound_py_any(py)96},97AnyValue::Datetime(v, time_unit, time_zone) => {98datetime_to_py_object(py, v, time_unit, time_zone)99},100AnyValue::DatetimeOwned(v, time_unit, time_zone) => {101datetime_to_py_object(py, v, time_unit, time_zone.as_ref().map(AsRef::as_ref))102},103AnyValue::Duration(v, time_unit) => {104let time_delta = elapsed_offset_to_timedelta(v, time_unit);105time_delta.into_bound_py_any(py)106},107AnyValue::Time(v) => 
nanos_since_midnight_to_naivetime(v).into_bound_py_any(py),108AnyValue::Array(v, _) | AnyValue::List(v) => PySeries::new(v).to_list(py),109ref av @ AnyValue::Struct(_, _, flds) => {110Ok(struct_dict(py, av._iter_struct_av(), flds)?.into_any())111},112AnyValue::StructOwned(payload) => {113Ok(struct_dict(py, payload.0.into_iter(), &payload.1)?.into_any())114},115#[cfg(feature = "object")]116AnyValue::Object(v) => {117let object = v.as_any().downcast_ref::<ObjectValue>().unwrap();118Ok(object.inner.clone_ref(py).into_bound(py))119},120#[cfg(feature = "object")]121AnyValue::ObjectOwned(v) => {122let object = v.0.as_any().downcast_ref::<ObjectValue>().unwrap();123Ok(object.inner.clone_ref(py).into_bound(py))124},125AnyValue::Binary(v) => PyBytes::new(py, v).into_bound_py_any(py),126AnyValue::BinaryOwned(v) => PyBytes::new(py, &v).into_bound_py_any(py),127AnyValue::Decimal(v, prec, scale) => {128let convert = utils.getattr(intern!(py, "to_py_decimal"))?;129let mut buf = DecimalFmtBuffer::new();130let s = buf.format_dec128(v, scale, false, false);131convert.call1((prec, s))132},133}134}135136/// Holds a Python type object and implements hashing / equality based on the pointer address of the137/// type object. 
This is used as a hashtable key instead of only the `usize` pointer value, as we138/// need to hold a ref to the Python type object to keep it alive.139#[derive(Debug)]140pub struct TypeObjectKey {141#[allow(unused)]142type_object: Py<PyType>,143/// We need to store this in a field for `Borrow<usize>`144address: usize,145}146147impl TypeObjectKey {148fn new(type_object: Py<PyType>) -> Self {149let address = type_object.as_ptr() as usize;150Self {151type_object,152address,153}154}155}156157impl PartialEq for TypeObjectKey {158fn eq(&self, other: &Self) -> bool {159self.address == other.address160}161}162163impl Eq for TypeObjectKey {}164165impl std::borrow::Borrow<usize> for TypeObjectKey {166fn borrow(&self) -> &usize {167&self.address168}169}170171impl std::hash::Hash for TypeObjectKey {172fn hash<H: std::hash::Hasher>(&self, state: &mut H) {173let v: &usize = self.borrow();174v.hash(state)175}176}177178type InitFn = fn(&Bound<'_, PyAny>, bool) -> PyResult<AnyValue<'static>>;179pub(crate) static LUT: Mutex<HashMap<TypeObjectKey, InitFn, PlFixedStateQuality>> =180Mutex::new(HashMap::with_hasher(PlFixedStateQuality::with_seed(0)));181182/// Convert a Python object to an [`AnyValue`].183pub(crate) fn py_object_to_any_value(184ob: &Bound<'_, PyAny>,185strict: bool,186allow_object: bool,187) -> PyResult<AnyValue<'static>> {188// Conversion functions.189fn get_null(_ob: &Bound<'_, PyAny>, _strict: bool) -> PyResult<AnyValue<'static>> {190Ok(AnyValue::Null)191}192193fn get_bool(ob: &Bound<'_, PyAny>, _strict: bool) -> PyResult<AnyValue<'static>> {194let b = ob.extract::<bool>()?;195Ok(AnyValue::Boolean(b))196}197198fn get_int(ob: &Bound<'_, PyAny>, strict: bool) -> PyResult<AnyValue<'static>> {199if let Ok(v) = ob.extract::<i64>() {200Ok(AnyValue::Int64(v))201} else if let Ok(v) = ob.extract::<i128>() {202Ok(AnyValue::Int128(v))203} else if let Ok(v) = ob.extract::<u64>() {204Ok(AnyValue::UInt64(v))205} else if let Ok(v) = ob.extract::<u128>() 
{206Ok(AnyValue::UInt128(v))207} else if !strict {208let f = ob.extract::<f64>()?;209Ok(AnyValue::Float64(f))210} else {211Err(PyOverflowError::new_err(format!(212"int value too large for Polars integer types: {ob}"213)))214}215}216217fn get_float(ob: &Bound<'_, PyAny>, _strict: bool) -> PyResult<AnyValue<'static>> {218Ok(AnyValue::Float64(ob.extract::<f64>()?))219}220221fn get_str(ob: &Bound<'_, PyAny>, _strict: bool) -> PyResult<AnyValue<'static>> {222Ok(AnyValue::StringOwned(PlSmallStr::from(223ob.extract::<&str>()?,224)))225}226227fn get_bytes(ob: &Bound<'_, PyAny>, _strict: bool) -> PyResult<AnyValue<'static>> {228let value = ob.extract::<Vec<u8>>()?;229Ok(AnyValue::BinaryOwned(value))230}231232fn get_date(ob: &Bound<'_, PyAny>, _strict: bool) -> PyResult<AnyValue<'static>> {233const UNIX_EPOCH: NaiveDate = DateTime::UNIX_EPOCH.naive_utc().date();234let date = ob.extract::<NaiveDate>()?;235let elapsed = date.signed_duration_since(UNIX_EPOCH);236Ok(AnyValue::Date(elapsed.num_days() as i32))237}238239fn get_datetime(ob: &Bound<'_, PyAny>, _strict: bool) -> PyResult<AnyValue<'static>> {240let py = ob.py();241let tzinfo = ob.getattr(intern!(py, "tzinfo"))?;242243if tzinfo.is_none() {244let datetime = ob.extract::<NaiveDateTime>()?;245let delta = datetime - DateTime::UNIX_EPOCH.naive_utc();246let timestamp = delta.num_microseconds().unwrap();247return Ok(AnyValue::Datetime(timestamp, TimeUnit::Microseconds, None));248}249250// Try converting `pytz` timezone to `zoneinfo` timezone251let (ob, tzinfo) = if let Some(tz) = tzinfo252.getattr(intern!(py, "zone"))253.ok()254.and_then(|tz| (!tz.is_none()).then_some(tz))255{256let tzinfo = PyTzInfo::timezone(py, tz.cast_into::<PyString>()?)?;257(258&ob.call_method(intern!(py, "astimezone"), (&tzinfo,), None)?,259tzinfo,260)261} else {262(ob, tzinfo.cast_into()?)263};264265let (timestamp, tz) = if tzinfo.hasattr(intern!(py, "key"))? 
{266let datetime = ob.extract::<DateTime<Tz>>()?;267let tz = unsafe { TimeZone::from_static(datetime.timezone().name()) };268if datetime.year() >= 2100 {269// chrono-tz does not support dates after 2100270// https://github.com/chronotope/chrono-tz/issues/135271(272pl_utils(py)273.bind(py)274.getattr(intern!(py, "datetime_to_int"))?275.call1((ob, intern!(py, "us")))?276.extract::<i64>()?,277tz,278)279} else {280let delta = datetime.to_utc() - DateTime::UNIX_EPOCH;281(delta.num_microseconds().unwrap(), tz)282}283} else {284let datetime = ob.extract::<DateTime<FixedOffset>>()?;285let delta = datetime.to_utc() - DateTime::UNIX_EPOCH;286(delta.num_microseconds().unwrap(), TimeZone::UTC)287};288289Ok(AnyValue::DatetimeOwned(290timestamp,291TimeUnit::Microseconds,292Some(Arc::new(tz)),293))294}295296fn get_timedelta(ob: &Bound<'_, PyAny>, _strict: bool) -> PyResult<AnyValue<'static>> {297let timedelta = ob.extract::<TimeDelta>()?;298if let Some(micros) = timedelta.num_microseconds() {299Ok(AnyValue::Duration(micros, TimeUnit::Microseconds))300} else {301Ok(AnyValue::Duration(302timedelta.num_milliseconds(),303TimeUnit::Milliseconds,304))305}306}307308fn get_time(ob: &Bound<'_, PyAny>, _strict: bool) -> PyResult<AnyValue<'static>> {309let time = ob.extract::<NaiveTime>()?;310311Ok(AnyValue::Time(312(time.num_seconds_from_midnight() as i64) * 1_000_000_000 + time.nanosecond() as i64,313))314}315316fn get_decimal(ob: &Bound<'_, PyAny>, _strict: bool) -> PyResult<AnyValue<'static>> {317fn abs_decimal_from_digits(318digits: impl IntoIterator<Item = u8>,319exp: i32,320) -> Option<(i128, usize)> {321let mut v = 0_i128;322for d in digits {323v = v.checked_mul(10)?.checked_add(d as i128)?;324}325let scale = if exp > 0 {326v = 10_i128.checked_pow(exp as u32)?.checked_mul(v)?;3270328} else {329(-exp) as usize330};331dec128_fits(v, DEC128_MAX_PREC).then_some((v, scale))332}333334// Note: Using Vec<u8> is not the most efficient thing here (input is a tuple)335let (sign, digits, exp): 
(i8, Vec<u8>, i32) = ob336.call_method0(intern!(ob.py(), "as_tuple"))337.unwrap()338.extract()339.unwrap();340let (mut v, scale) = abs_decimal_from_digits(digits, exp).ok_or_else(|| {341PyErr::from(PyPolarsErr::Other(342"Decimal is too large to fit in Decimal128".into(),343))344})?;345if sign > 0 {346v = -v; // Won't overflow since -i128::MAX > i128::MIN347}348Ok(AnyValue::Decimal(v, DEC128_MAX_PREC, scale))349}350351fn get_list(ob: &Bound<'_, PyAny>, strict: bool) -> PyResult<AnyValue<'static>> {352fn get_list_with_constructor(353ob: &Bound<'_, PyAny>,354strict: bool,355) -> PyResult<AnyValue<'static>> {356// Use the dedicated constructor.357// This constructor is able to go via dedicated type constructors358// so it can be much faster.359let py = ob.py();360let kwargs = PyDict::new(py);361kwargs.set_item("strict", strict)?;362let s = pl_series(py).call(py, (ob,), Some(&kwargs))?;363get_list_from_series(s.bind(py), strict)364}365366if ob.is_empty()? {367Ok(AnyValue::List(Series::new_empty(368PlSmallStr::EMPTY,369&DataType::Null,370)))371} else if ob.is_instance_of::<PyList>() | ob.is_instance_of::<PyTuple>() {372let list = ob.cast::<PySequence>()?;373374// Try to find first non-null.375let length = list.len()?;376let mut iter = list.try_iter()?;377let mut avs = Vec::new();378for item in &mut iter {379let av = py_object_to_any_value(&item?, strict, true)?;380let is_null = av.is_null();381avs.push(av);382if is_null {383break;384}385}386387// Try to use a faster converter.388if let Some(av) = avs.last()389&& !av.is_null()390&& av.dtype().is_primitive()391{392// Always use strict, we will filter the error if we're not393// strict and try again using a slower converter with supertype.394match get_list_with_constructor(ob, true) {395Ok(ret) => return Ok(ret),396Err(e) => {397if strict {398return Err(e);399}400},401}402}403404// Push the rest of the anyvalues and use slower converter.405avs.reserve(length);406for item in &mut iter 
{407avs.push(py_object_to_any_value(&item?, strict, true)?);408}409410let (dtype, _n_dtypes) = any_values_to_supertype_and_n_dtypes(&avs)411.map_err(|e| PyTypeError::new_err(e.to_string()))?;412let s = Series::from_any_values_and_dtype(PlSmallStr::EMPTY, &avs, &dtype, strict)413.map_err(|e| {414PyTypeError::new_err(format!(415"{e}\n\nHint: Try setting `strict=False` to allow passing data with mixed types."416))417})?;418Ok(AnyValue::List(s))419} else {420// range will take this branch421get_list_with_constructor(ob, strict)422}423}424425fn get_list_from_series(ob: &Bound<'_, PyAny>, _strict: bool) -> PyResult<AnyValue<'static>> {426let s = super::get_series(ob)?;427Ok(AnyValue::List(s))428}429430fn get_mapping(ob: &Bound<'_, PyAny>, strict: bool) -> PyResult<AnyValue<'static>> {431let mapping = ob.cast::<PyMapping>()?;432let len = mapping.len()?;433let mut keys = Vec::with_capacity(len);434let mut vals = Vec::with_capacity(len);435436for item in mapping.items()?.try_iter()? {437let item = item?.cast_into::<PyTuple>()?;438let (key_py, val_py) = (item.get_item(0)?, item.get_item(1)?);439440let key: Cow<str> = key_py.extract()?;441let val = py_object_to_any_value(&val_py, strict, true)?;442443keys.push(Field::new(key.as_ref().into(), val.dtype()));444vals.push(val);445}446Ok(AnyValue::StructOwned(Box::new((vals, keys))))447}448449fn get_struct(ob: &Bound<'_, PyAny>, strict: bool) -> PyResult<AnyValue<'static>> {450let dict = ob.cast::<PyDict>().unwrap();451let len = dict.len();452let mut keys = Vec::with_capacity(len);453let mut vals = Vec::with_capacity(len);454for (k, v) in dict.into_iter() {455let key = k.extract::<Cow<str>>()?;456let val = py_object_to_any_value(&v, strict, true)?;457let dtype = val.dtype();458keys.push(Field::new(key.as_ref().into(), dtype));459vals.push(val)460}461Ok(AnyValue::StructOwned(Box::new((vals, keys))))462}463464fn get_namedtuple(ob: &Bound<'_, PyAny>, strict: bool) -> PyResult<AnyValue<'static>> {465let tuple = 
ob.cast::<PyTuple>().unwrap();466let len = tuple.len();467let fields = ob468.getattr(intern!(ob.py(), "_fields"))?469.cast_into::<PyTuple>()?;470let mut keys = Vec::with_capacity(len);471let mut vals = Vec::with_capacity(len);472for (k, v) in fields.into_iter().zip(tuple.into_iter()) {473let key = k.extract::<Cow<str>>()?;474let val = py_object_to_any_value(&v, strict, true)?;475let dtype = val.dtype();476keys.push(Field::new(key.as_ref().into(), dtype));477vals.push(val)478}479Ok(AnyValue::StructOwned(Box::new((vals, keys))))480}481482fn get_object(ob: &Bound<'_, PyAny>, _strict: bool) -> PyResult<AnyValue<'static>> {483#[cfg(feature = "object")]484{485// This is slow, but hey don't use objects.486let v = &ObjectValue {487inner: ob.clone().unbind(),488};489Ok(AnyValue::ObjectOwned(OwnedObject(v.to_boxed())))490}491#[cfg(not(feature = "object"))]492panic!("activate object")493}494495/// Determine which conversion function to use for the given object.496///497/// Note: This function is only ran if the object's type is not already in the498/// lookup table.499fn get_conversion_function(ob: &Bound<'_, PyAny>, allow_object: bool) -> PyResult<InitFn> {500let py = ob.py();501if ob.is_none() {502Ok(get_null)503}504// bool must be checked before int because Python bool is an instance of int.505else if ob.is_instance_of::<PyBool>() {506Ok(get_bool)507} else if ob.is_instance_of::<PyInt>() {508Ok(get_int)509} else if ob.is_instance_of::<PyFloat>() {510Ok(get_float)511} else if ob.is_instance_of::<PyString>() {512Ok(get_str)513} else if ob.is_instance_of::<PyBytes>() {514Ok(get_bytes)515} else if ob.is_instance_of::<PyTuple>() {516// NamedTuple-like object?517if ob.hasattr(intern!(py, "_fields"))? 
{518Ok(get_namedtuple)519} else {520Ok(get_list)521}522} else if ob.is_instance_of::<PyList>() {523Ok(get_list)524} else if ob.is_instance_of::<PyDict>() {525Ok(get_struct)526} else if PyMapping::type_check(ob) {527Ok(get_mapping)528}529// note: datetime must be checked *before* date530// (as python datetime is an instance of date)531else if PyDateTime::type_check(ob) {532Ok(get_datetime as InitFn)533} else if PyDate::type_check(ob) {534Ok(get_date as InitFn)535} else if PyTime::type_check(ob) {536Ok(get_time as InitFn)537} else if PyDelta::type_check(ob) {538Ok(get_timedelta as InitFn)539} else if ob.is_instance_of::<PyRange>() {540Ok(get_list as InitFn)541} else if ob.is_instance(pl_series(py).bind(py))? {542Ok(get_list_from_series as InitFn)543} else {544static NDARRAY_TYPE: PyOnceLock<Py<PyType>> = PyOnceLock::new();545if let Ok(ndarray_type) = NDARRAY_TYPE.import(py, "numpy", "ndarray") {546if ob.is_instance(ndarray_type)? {547// will convert via Series -> mmap_numpy_array548return Ok(get_list as InitFn);549}550}551static DECIMAL_TYPE: PyOnceLock<Py<PyType>> = PyOnceLock::new();552if ob.is_instance(DECIMAL_TYPE.import(py, "decimal", "Decimal")?)? 
{553return Ok(get_decimal as InitFn);554}555556// support NumPy scalars557if ob.extract::<i64>().is_ok() || ob.extract::<u64>().is_ok() {558return Ok(get_int as InitFn);559} else if ob.extract::<f64>().is_ok() {560return Ok(get_float as InitFn);561}562563if allow_object {564Ok(get_object as InitFn)565} else {566Err(PyValueError::new_err(format!("Cannot convert {ob}")))567}568}569}570571let py_type = ob.get_type();572let py_type_address = py_type.as_ptr() as usize;573574let conversion_func = {575if let Some(cached_func) = LUT.lock().unwrap().get(&py_type_address) {576*cached_func577} else {578let k = TypeObjectKey::new(py_type.clone().unbind());579assert_eq!(k.address, py_type_address);580581let func = get_conversion_function(ob, allow_object)?;582LUT.lock().unwrap().insert(k, func);583func584}585};586587conversion_func(ob, strict)588}589590591