Path: blob/main/crates/polars-io/src/csv/read/builder.rs
8420 views
use arrow::array::MutableBinaryViewArray;1#[cfg(feature = "dtype-decimal")]2use polars_compute::decimal::str_to_dec128;3#[cfg(feature = "dtype-categorical")]4use polars_core::chunked_array::builder::CategoricalChunkedBuilder;5use polars_core::prelude::*;6use polars_error::to_compute_err;7#[cfg(any(feature = "dtype-datetime", feature = "dtype-date"))]8use polars_time::chunkedarray::string::Pattern;9#[cfg(any(feature = "dtype-datetime", feature = "dtype-date"))]10use polars_time::prelude::string::infer::{11DatetimeInfer, StrpTimeParser, TryFromWithUnit, infer_pattern_single,12};13#[cfg(feature = "dtype-f16")]14use polars_utils::float16::pf16;15use polars_utils::vec::PushUnchecked;1617use super::options::CsvEncoding;18use super::parser::{could_be_whitespace_fast, skip_whitespace};19use super::utils::escape_field;2021pub(crate) trait PrimitiveParser: PolarsNumericType {22fn parse(bytes: &[u8]) -> Option<Self::Native>;23}2425#[cfg(feature = "dtype-f16")]26impl PrimitiveParser for Float16Type {27#[inline]28fn parse(bytes: &[u8]) -> Option<pf16> {29use num_traits::FromPrimitive;3031pf16::from_f32(fast_float2::parse(bytes).ok()?)32}33}3435impl PrimitiveParser for Float32Type {36#[inline]37fn parse(bytes: &[u8]) -> Option<f32> {38fast_float2::parse(bytes).ok()39}40}41impl PrimitiveParser for Float64Type {42#[inline]43fn parse(bytes: &[u8]) -> Option<f64> {44fast_float2::parse(bytes).ok()45}46}4748#[cfg(feature = "dtype-u8")]49impl PrimitiveParser for UInt8Type {50#[inline]51fn parse(bytes: &[u8]) -> Option<u8> {52atoi_simd::parse_skipped(bytes).ok()53}54}55#[cfg(feature = "dtype-u16")]56impl PrimitiveParser for UInt16Type {57#[inline]58fn parse(bytes: &[u8]) -> Option<u16> {59atoi_simd::parse_skipped(bytes).ok()60}61}62impl PrimitiveParser for UInt32Type {63#[inline]64fn parse(bytes: &[u8]) -> Option<u32> {65atoi_simd::parse_skipped(bytes).ok()66}67}68impl PrimitiveParser for UInt64Type {69#[inline]70fn parse(bytes: &[u8]) -> Option<u64> {71atoi_simd::parse_skipped(bytes).ok()72}73}74#[cfg(feature = "dtype-u128")]75impl PrimitiveParser for UInt128Type {76#[inline]77fn parse(bytes: &[u8]) -> Option<u128> {78atoi_simd::parse_skipped(bytes).ok()79}80}81#[cfg(feature = "dtype-i8")]82impl PrimitiveParser for Int8Type {83#[inline]84fn parse(bytes: &[u8]) -> Option<i8> {85atoi_simd::parse_skipped(bytes).ok()86}87}88#[cfg(feature = "dtype-i16")]89impl PrimitiveParser for Int16Type {90#[inline]91fn parse(bytes: &[u8]) -> Option<i16> {92atoi_simd::parse_skipped(bytes).ok()93}94}95impl PrimitiveParser for Int32Type {96#[inline]97fn parse(bytes: &[u8]) -> Option<i32> {98atoi_simd::parse_skipped(bytes).ok()99}100}101impl PrimitiveParser for Int64Type {102#[inline]103fn parse(bytes: &[u8]) -> Option<i64> {104atoi_simd::parse_skipped(bytes).ok()105}106}107#[cfg(feature = "dtype-i128")]108impl PrimitiveParser for Int128Type {109#[inline]110fn parse(bytes: &[u8]) -> Option<i128> {111atoi_simd::parse_skipped(bytes).ok()112}113}114115trait ParsedBuilder {116fn parse_bytes(117&mut self,118bytes: &[u8],119ignore_errors: bool,120_needs_escaping: bool,121_missing_is_null: bool,122_time_unit: Option<TimeUnit>,123) -> PolarsResult<()>;124}125126impl<T> ParsedBuilder for PrimitiveChunkedBuilder<T>127where128T: PolarsNumericType + PrimitiveParser,129{130#[inline]131fn parse_bytes(132&mut self,133mut bytes: &[u8],134ignore_errors: bool,135needs_escaping: bool,136_missing_is_null: bool,137_time_unit: Option<TimeUnit>,138) -> PolarsResult<()> {139if !bytes.is_empty() && needs_escaping {140bytes = &bytes[1..bytes.len() - 1];141}142143if !bytes.is_empty() && could_be_whitespace_fast(bytes[0]) {144bytes = skip_whitespace(bytes);145}146147if bytes.is_empty() {148self.append_null();149return Ok(());150}151152match T::parse(bytes) {153Some(value) => self.append_value(value),154None => {155if ignore_errors {156self.append_null()157} else {158polars_bail!(ComputeError: "invalid primitive value found during CSV parsing")159}160},161}162Ok(())163}164}165166pub struct Utf8Field {167name: PlSmallStr,168mutable: MutableBinaryViewArray<[u8]>,169scratch: Vec<u8>,170quote_char: u8,171encoding: CsvEncoding,172}173174impl Utf8Field {175fn new(176name: PlSmallStr,177capacity: usize,178quote_char: Option<u8>,179encoding: CsvEncoding,180) -> Self {181Self {182name,183mutable: MutableBinaryViewArray::with_capacity(capacity),184scratch: vec![],185quote_char: quote_char.unwrap_or(b'"'),186encoding,187}188}189}190191#[inline]192pub fn validate_utf8(bytes: &[u8]) -> bool {193simdutf8::basic::from_utf8(bytes).is_ok()194}195196impl ParsedBuilder for Utf8Field {197#[inline]198fn parse_bytes(199&mut self,200bytes: &[u8],201ignore_errors: bool,202needs_escaping: bool,203missing_is_null: bool,204_time_unit: Option<TimeUnit>,205) -> PolarsResult<()> {206if bytes.is_empty() {207if missing_is_null {208self.mutable.push_null()209} else {210self.mutable.push(Some([]))211}212return Ok(());213}214215// note that one branch writes without updating the length, so we must do that later.216let escaped_bytes = if needs_escaping {217self.scratch.clear();218self.scratch.reserve(bytes.len());219polars_ensure!(bytes.len() > 1 && bytes.last() == Some(&self.quote_char), ComputeError: "invalid csv file\n\nField `{}` is not properly escaped.", std::str::from_utf8(bytes).map_err(to_compute_err)?);220221// SAFETY:222// we just allocated enough capacity and data_len is correct.223unsafe {224let n_written =225escape_field(bytes, self.quote_char, self.scratch.spare_capacity_mut());226self.scratch.set_len(n_written);227}228229self.scratch.as_slice()230} else {231bytes232};233234if matches!(self.encoding, CsvEncoding::LossyUtf8) | ignore_errors {235// It is important that this happens after escaping, as invalid escaped string can produce236// invalid utf8.237let parse_result = validate_utf8(escaped_bytes);238239match parse_result {240true => {241let value = escaped_bytes;242self.mutable.push_value(value)243},244false => {245if matches!(self.encoding, CsvEncoding::LossyUtf8) {246// TODO! do this without allocating247let s = String::from_utf8_lossy(escaped_bytes);248self.mutable.push_value(s.as_ref().as_bytes())249} else if ignore_errors {250self.mutable.push_null()251} else {252// If field before escaping is valid utf8, the escaping is incorrect.253if needs_escaping && validate_utf8(bytes) {254polars_bail!(ComputeError: "string field is not properly escaped");255} else {256polars_bail!(ComputeError: "invalid utf-8 sequence");257}258}259},260}261} else {262self.mutable.push_value(escaped_bytes)263}264265Ok(())266}267}268269#[cfg(feature = "dtype-categorical")]270pub struct CategoricalField<T: PolarsCategoricalType> {271escape_scratch: Vec<u8>,272quote_char: u8,273builder: CategoricalChunkedBuilder<T>,274}275276#[cfg(feature = "dtype-categorical")]277impl<T: PolarsCategoricalType> CategoricalField<T> {278fn new(name: PlSmallStr, capacity: usize, quote_char: Option<u8>, dtype: DataType) -> Self {279let mut builder = CategoricalChunkedBuilder::new(name, dtype);280builder.reserve(capacity);281282Self {283escape_scratch: vec![],284quote_char: quote_char.unwrap_or(b'"'),285builder,286}287}288289#[inline]290fn parse_bytes(291&mut self,292bytes: &[u8],293ignore_errors: bool,294needs_escaping: bool,295_missing_is_null: bool,296_time_unit: Option<TimeUnit>,297) -> PolarsResult<()> {298if bytes.is_empty() {299self.builder.append_null();300return Ok(());301}302if validate_utf8(bytes) {303if needs_escaping {304polars_ensure!(bytes.len() > 1, ComputeError: "invalid csv file\n\nField `{}` is not properly escaped.", std::str::from_utf8(bytes).map_err(to_compute_err)?);305self.escape_scratch.clear();306self.escape_scratch.reserve(bytes.len());307// SAFETY:308// we just allocated enough capacity and data_len is correct.309unsafe {310let n_written = escape_field(311bytes,312self.quote_char,313self.escape_scratch.spare_capacity_mut(),314);315self.escape_scratch.set_len(n_written);316}317318// SAFETY:319// just did utf8 check320let key = unsafe { std::str::from_utf8_unchecked(&self.escape_scratch) };321self.builder.append_str(key)?;322} else {323// SAFETY:324// just did utf8 check325let key = unsafe { std::str::from_utf8_unchecked(bytes) };326self.builder.append_str(key)?;327}328} else if ignore_errors {329self.builder.append_null()330} else {331polars_bail!(ComputeError: "invalid utf-8 sequence");332}333Ok(())334}335}336337impl ParsedBuilder for BooleanChunkedBuilder {338#[inline]339fn parse_bytes(340&mut self,341bytes: &[u8],342ignore_errors: bool,343needs_escaping: bool,344_missing_is_null: bool,345_time_unit: Option<TimeUnit>,346) -> PolarsResult<()> {347let bytes = if needs_escaping {348&bytes[1..bytes.len() - 1]349} else {350bytes351};352if bytes.eq_ignore_ascii_case(b"false") {353self.append_value(false);354} else if bytes.eq_ignore_ascii_case(b"true") {355self.append_value(true);356} else if ignore_errors || bytes.is_empty() {357self.append_null();358} else {359polars_bail!(360ComputeError: "error while parsing value {} as boolean",361String::from_utf8_lossy(bytes),362);363}364Ok(())365}366}367368#[cfg(feature = "dtype-decimal")]369pub struct DecimalField {370builder: PrimitiveChunkedBuilder<Int128Type>,371precision: usize,372scale: usize,373decimal_comma: bool,374}375376#[cfg(feature = "dtype-decimal")]377impl DecimalField {378fn new(379name: PlSmallStr,380capacity: usize,381precision: usize,382scale: usize,383decimal_comma: bool,384) -> Self {385let builder = PrimitiveChunkedBuilder::<Int128Type>::new(name, capacity);386Self {387builder,388precision,389scale,390decimal_comma,391}392}393}394395#[cfg(feature = "dtype-decimal")]396impl ParsedBuilder for DecimalField {397#[inline]398fn parse_bytes(399&mut self,400mut bytes: &[u8],401ignore_errors: bool,402needs_escaping: bool,403_missing_is_null: bool,404_time_unit: Option<TimeUnit>,405) -> PolarsResult<()> {406if !bytes.is_empty() && needs_escaping {407bytes = &bytes[1..bytes.len() - 1];408}409410if !bytes.is_empty() && could_be_whitespace_fast(bytes[0]) {411bytes = skip_whitespace(bytes);412}413414if bytes.is_empty() {415self.builder.append_null();416return Ok(());417}418419match str_to_dec128(bytes, self.precision, self.scale, self.decimal_comma) {420Some(value) => self.builder.append_value(value),421None => {422if ignore_errors {423self.builder.append_null()424} else {425polars_bail!(ComputeError: "invalid decimal value found during CSV parsing")426}427},428}429430Ok(())431}432}433434#[cfg(any(feature = "dtype-datetime", feature = "dtype-date"))]435pub struct DatetimeField<T: PolarsNumericType> {436compiled: Option<DatetimeInfer<T>>,437builder: PrimitiveChunkedBuilder<T>,438}439440#[cfg(any(feature = "dtype-datetime", feature = "dtype-date"))]441impl<T: PolarsNumericType> DatetimeField<T> {442fn new(name: PlSmallStr, capacity: usize) -> Self {443let builder = PrimitiveChunkedBuilder::<T>::new(name, capacity);444Self {445compiled: None,446builder,447}448}449}450451#[cfg(any(feature = "dtype-datetime", feature = "dtype-date"))]452fn slow_datetime_parser<T>(453buf: &mut DatetimeField<T>,454bytes: &[u8],455time_unit: Option<TimeUnit>,456ignore_errors: bool,457) -> PolarsResult<()>458where459T: PolarsNumericType,460DatetimeInfer<T>: TryFromWithUnit<Pattern>,461{462let val = if bytes.is_ascii() {463// SAFETY:464// we just checked it is ascii465unsafe { std::str::from_utf8_unchecked(bytes) }466} else {467match std::str::from_utf8(bytes) {468Ok(val) => val,469Err(_) => {470if ignore_errors {471buf.builder.append_null();472return Ok(());473} else {474polars_bail!(ComputeError: "invalid utf-8 sequence");475}476},477}478};479480let pattern = match &buf.compiled {481Some(compiled) => compiled.pattern,482None => match infer_pattern_single(val) {483Some(pattern) => pattern,484None => {485if ignore_errors {486buf.builder.append_null();487return Ok(());488} else {489polars_bail!(ComputeError: "could not find a 'date/datetime' pattern for '{}'", val)490}491},492},493};494match DatetimeInfer::try_from_with_unit(pattern, time_unit) {495Ok(mut infer) => {496let parsed = infer.parse(val);497let Some(parsed) = parsed else {498if ignore_errors {499buf.builder.append_null();500return Ok(());501} else {502polars_bail!(ComputeError: "could not parse '{}' with pattern '{:?}'", val, pattern)503}504};505506buf.compiled = Some(infer);507buf.builder.append_value(parsed);508Ok(())509},510Err(err) => {511if ignore_errors {512buf.builder.append_null();513Ok(())514} else {515Err(err)516}517},518}519}520521#[cfg(any(feature = "dtype-datetime", feature = "dtype-date"))]522impl<T> ParsedBuilder for DatetimeField<T>523where524T: PolarsNumericType,525DatetimeInfer<T>: TryFromWithUnit<Pattern> + StrpTimeParser<T::Native>,526{527#[inline]528fn parse_bytes(529&mut self,530mut bytes: &[u8],531ignore_errors: bool,532needs_escaping: bool,533_missing_is_null: bool,534time_unit: Option<TimeUnit>,535) -> PolarsResult<()> {536if needs_escaping && bytes.len() >= 2 {537bytes = &bytes[1..bytes.len() - 1]538}539540if bytes.is_empty() {541// for types other than string `_missing_is_null` is irrelevant; we always append null542self.builder.append_null();543return Ok(());544}545546match &mut self.compiled {547None => slow_datetime_parser(self, bytes, time_unit, ignore_errors),548Some(compiled) => {549match compiled.parse_bytes(bytes, time_unit) {550Some(parsed) => {551self.builder.append_value(parsed);552Ok(())553},554// fall back on chrono parser555// this is a lot slower, we need to do utf8 checking and use556// the slower parser557None => slow_datetime_parser(self, bytes, time_unit, ignore_errors),558}559},560}561}562}563564pub fn init_builders(565projection: &[usize],566capacity: usize,567schema: &Schema,568quote_char: Option<u8>,569encoding: CsvEncoding,570decimal_comma: bool,571) -> PolarsResult<Vec<Builder>> {572projection573.iter()574.map(|&i| {575let (name, dtype) = schema.get_at_index(i).unwrap();576let name = name.clone();577let builder = match dtype {578&DataType::Boolean => Builder::Boolean(BooleanChunkedBuilder::new(name, capacity)),579#[cfg(feature = "dtype-i8")]580&DataType::Int8 => Builder::Int8(PrimitiveChunkedBuilder::new(name, capacity)),581#[cfg(feature = "dtype-i16")]582&DataType::Int16 => Builder::Int16(PrimitiveChunkedBuilder::new(name, capacity)),583&DataType::Int32 => Builder::Int32(PrimitiveChunkedBuilder::new(name, capacity)),584&DataType::Int64 => Builder::Int64(PrimitiveChunkedBuilder::new(name, capacity)),585#[cfg(feature = "dtype-i128")]586&DataType::Int128 => Builder::Int128(PrimitiveChunkedBuilder::new(name, capacity)),587#[cfg(feature = "dtype-u8")]588&DataType::UInt8 => Builder::UInt8(PrimitiveChunkedBuilder::new(name, capacity)),589#[cfg(feature = "dtype-u16")]590&DataType::UInt16 => Builder::UInt16(PrimitiveChunkedBuilder::new(name, capacity)),591&DataType::UInt32 => Builder::UInt32(PrimitiveChunkedBuilder::new(name, capacity)),592&DataType::UInt64 => Builder::UInt64(PrimitiveChunkedBuilder::new(name, capacity)),593#[cfg(feature = "dtype-u128")]594&DataType::UInt128 => {595Builder::UInt128(PrimitiveChunkedBuilder::new(name, capacity))596},597#[cfg(feature = "dtype-f16")]598&DataType::Float16 => {599if decimal_comma {600Builder::DecimalFloat16(601PrimitiveChunkedBuilder::new(name, capacity),602Default::default(),603)604} else {605Builder::Float16(PrimitiveChunkedBuilder::new(name, capacity))606}607},608&DataType::Float32 => {609if decimal_comma {610Builder::DecimalFloat32(611PrimitiveChunkedBuilder::new(name, capacity),612Default::default(),613)614} else {615Builder::Float32(PrimitiveChunkedBuilder::new(name, capacity))616}617},618&DataType::Float64 => {619if decimal_comma {620Builder::DecimalFloat64(621PrimitiveChunkedBuilder::new(name, capacity),622Default::default(),623)624} else {625Builder::Float64(PrimitiveChunkedBuilder::new(name, capacity))626}627},628#[cfg(feature = "dtype-decimal")]629&DataType::Decimal(precision, scale) => Builder::Decimal(DecimalField::new(630name,631capacity,632precision,633scale,634decimal_comma,635)),636&DataType::String => {637Builder::Utf8(Utf8Field::new(name, capacity, quote_char, encoding))638},639#[cfg(feature = "dtype-datetime")]640DataType::Datetime(time_unit, time_zone) => Builder::Datetime {641buf: DatetimeField::new(name, capacity),642time_unit: *time_unit,643time_zone: time_zone.clone(),644},645#[cfg(feature = "dtype-date")]646&DataType::Date => Builder::Date(DatetimeField::new(name, capacity)),647#[cfg(feature = "dtype-categorical")]648DataType::Categorical(_, _) | DataType::Enum(_, _) => {649match dtype.cat_physical().unwrap() {650CategoricalPhysical::U8 => {651Builder::Categorical8(CategoricalField::<Categorical8Type>::new(652name,653capacity,654quote_char,655dtype.clone(),656))657},658CategoricalPhysical::U16 => {659Builder::Categorical16(CategoricalField::<Categorical16Type>::new(660name,661capacity,662quote_char,663dtype.clone(),664))665},666CategoricalPhysical::U32 => {667Builder::Categorical32(CategoricalField::<Categorical32Type>::new(668name,669capacity,670quote_char,671dtype.clone(),672))673},674}675},676dt => polars_bail!(677ComputeError: "unsupported data type when reading CSV: {} when reading CSV", dt,678),679};680Ok(builder)681})682.collect()683}684685#[allow(clippy::large_enum_variant)]686pub enum Builder {687Boolean(BooleanChunkedBuilder),688#[cfg(feature = "dtype-i8")]689Int8(PrimitiveChunkedBuilder<Int8Type>),690#[cfg(feature = "dtype-i16")]691Int16(PrimitiveChunkedBuilder<Int16Type>),692Int32(PrimitiveChunkedBuilder<Int32Type>),693Int64(PrimitiveChunkedBuilder<Int64Type>),694#[cfg(feature = "dtype-i128")]695Int128(PrimitiveChunkedBuilder<Int128Type>),696#[cfg(feature = "dtype-u8")]697UInt8(PrimitiveChunkedBuilder<UInt8Type>),698#[cfg(feature = "dtype-u16")]699UInt16(PrimitiveChunkedBuilder<UInt16Type>),700UInt32(PrimitiveChunkedBuilder<UInt32Type>),701UInt64(PrimitiveChunkedBuilder<UInt64Type>),702#[cfg(feature = "dtype-u128")]703UInt128(PrimitiveChunkedBuilder<UInt128Type>),704#[cfg(feature = "dtype-f16")]705Float16(PrimitiveChunkedBuilder<Float16Type>),706Float32(PrimitiveChunkedBuilder<Float32Type>),707Float64(PrimitiveChunkedBuilder<Float64Type>),708#[cfg(feature = "dtype-decimal")]709Decimal(DecimalField),710/// Stores the Utf8 fields and the total string length seen for that column711Utf8(Utf8Field),712#[cfg(feature = "dtype-datetime")]713Datetime {714buf: DatetimeField<Int64Type>,715time_unit: TimeUnit,716time_zone: Option<TimeZone>,717},718#[cfg(feature = "dtype-date")]719Date(DatetimeField<Int32Type>),720#[cfg(feature = "dtype-categorical")]721Categorical8(CategoricalField<Categorical8Type>),722#[cfg(feature = "dtype-categorical")]723Categorical16(CategoricalField<Categorical16Type>),724#[cfg(feature = "dtype-categorical")]725Categorical32(CategoricalField<Categorical32Type>),726#[cfg(feature = "dtype-f16")]727DecimalFloat16(PrimitiveChunkedBuilder<Float16Type>, Vec<u8>),728DecimalFloat32(PrimitiveChunkedBuilder<Float32Type>, Vec<u8>),729DecimalFloat64(PrimitiveChunkedBuilder<Float64Type>, Vec<u8>),730}731732impl Builder {733pub fn into_series(self) -> PolarsResult<Series> {734let s = match self {735Builder::Boolean(v) => v.finish().into_series(),736#[cfg(feature = "dtype-i8")]737Builder::Int8(v) => v.finish().into_series(),738#[cfg(feature = "dtype-i16")]739Builder::Int16(v) => v.finish().into_series(),740Builder::Int32(v) => v.finish().into_series(),741Builder::Int64(v) => v.finish().into_series(),742#[cfg(feature = "dtype-i128")]743Builder::Int128(v) => v.finish().into_series(),744#[cfg(feature = "dtype-u8")]745Builder::UInt8(v) => v.finish().into_series(),746#[cfg(feature = "dtype-u16")]747Builder::UInt16(v) => v.finish().into_series(),748Builder::UInt32(v) => v.finish().into_series(),749Builder::UInt64(v) => v.finish().into_series(),750#[cfg(feature = "dtype-u128")]751Builder::UInt128(v) => v.finish().into_series(),752#[cfg(feature = "dtype-f16")]753Builder::Float16(v) => v.finish().into_series(),754Builder::Float32(v) => v.finish().into_series(),755Builder::Float64(v) => v.finish().into_series(),756#[cfg(feature = "dtype-f16")]757Builder::DecimalFloat16(v, _) => v.finish().into_series(),758Builder::DecimalFloat32(v, _) => v.finish().into_series(),759Builder::DecimalFloat64(v, _) => v.finish().into_series(),760#[cfg(feature = "dtype-decimal")]761Builder::Decimal(DecimalField {762builder,763precision,764scale,765..766}) => unsafe {767builder768.finish()769.into_series()770.from_physical_unchecked(&DataType::Decimal(precision, scale))771.unwrap()772},773#[cfg(feature = "dtype-datetime")]774Builder::Datetime {775buf,776time_unit,777time_zone,778} => buf779.builder780.finish()781.into_series()782.cast(&DataType::Datetime(time_unit, time_zone))783.unwrap(),784#[cfg(feature = "dtype-date")]785Builder::Date(v) => v786.builder787.finish()788.into_series()789.cast(&DataType::Date)790.unwrap(),791792Builder::Utf8(v) => {793let arr = v.mutable.freeze();794StringChunked::with_chunk(v.name, unsafe { arr.to_utf8view_unchecked() })795.into_series()796},797#[cfg(feature = "dtype-categorical")]798Builder::Categorical8(buf) => buf.builder.finish().into_series(),799#[cfg(feature = "dtype-categorical")]800Builder::Categorical16(buf) => buf.builder.finish().into_series(),801#[cfg(feature = "dtype-categorical")]802Builder::Categorical32(buf) => buf.builder.finish().into_series(),803};804Ok(s)805}806807pub fn add_null(&mut self, valid: bool) {808match self {809Builder::Boolean(v) => v.append_null(),810#[cfg(feature = "dtype-i8")]811Builder::Int8(v) => v.append_null(),812#[cfg(feature = "dtype-i16")]813Builder::Int16(v) => v.append_null(),814Builder::Int32(v) => v.append_null(),815Builder::Int64(v) => v.append_null(),816#[cfg(feature = "dtype-i128")]817Builder::Int128(v) => v.append_null(),818#[cfg(feature = "dtype-u8")]819Builder::UInt8(v) => v.append_null(),820#[cfg(feature = "dtype-u16")]821Builder::UInt16(v) => v.append_null(),822Builder::UInt32(v) => v.append_null(),823Builder::UInt64(v) => v.append_null(),824#[cfg(feature = "dtype-u128")]825Builder::UInt128(v) => v.append_null(),826#[cfg(feature = "dtype-f16")]827Builder::Float16(v) => v.append_null(),828Builder::Float32(v) => v.append_null(),829Builder::Float64(v) => v.append_null(),830#[cfg(feature = "dtype-decimal")]831Builder::Decimal(buf) => buf.builder.append_null(),832#[cfg(feature = "dtype-f16")]833Builder::DecimalFloat16(v, _) => v.append_null(),834Builder::DecimalFloat32(v, _) => v.append_null(),835Builder::DecimalFloat64(v, _) => v.append_null(),836Builder::Utf8(v) => {837if valid {838v.mutable.push_value("")839} else {840v.mutable.push_null()841}842},843#[cfg(feature = "dtype-datetime")]844Builder::Datetime { buf, .. } => buf.builder.append_null(),845#[cfg(feature = "dtype-date")]846Builder::Date(v) => v.builder.append_null(),847#[cfg(feature = "dtype-categorical")]848Builder::Categorical8(buf) => buf.builder.append_null(),849#[cfg(feature = "dtype-categorical")]850Builder::Categorical16(buf) => buf.builder.append_null(),851#[cfg(feature = "dtype-categorical")]852Builder::Categorical32(buf) => buf.builder.append_null(),853};854}855856pub fn dtype(&self) -> DataType {857match self {858Builder::Boolean(_) => DataType::Boolean,859#[cfg(feature = "dtype-i8")]860Builder::Int8(_) => DataType::Int8,861#[cfg(feature = "dtype-i16")]862Builder::Int16(_) => DataType::Int16,863Builder::Int32(_) => DataType::Int32,864Builder::Int64(_) => DataType::Int64,865#[cfg(feature = "dtype-i128")]866Builder::Int128(_) => DataType::Int128,867#[cfg(feature = "dtype-u8")]868Builder::UInt8(_) => DataType::UInt8,869#[cfg(feature = "dtype-u16")]870Builder::UInt16(_) => DataType::UInt16,871Builder::UInt32(_) => DataType::UInt32,872Builder::UInt64(_) => DataType::UInt64,873#[cfg(feature = "dtype-u128")]874Builder::UInt128(_) => DataType::UInt128,875#[cfg(feature = "dtype-f16")]876Builder::Float16(_) | Builder::DecimalFloat16(_, _) => DataType::Float16,877Builder::Float32(_) | Builder::DecimalFloat32(_, _) => DataType::Float32,878Builder::Float64(_) | Builder::DecimalFloat64(_, _) => DataType::Float64,879#[cfg(feature = "dtype-decimal")]880Builder::Decimal(DecimalField {881precision, scale, ..882}) => DataType::Decimal(*precision, *scale),883Builder::Utf8(_) => DataType::String,884#[cfg(feature = "dtype-datetime")]885Builder::Datetime { time_unit, .. } => DataType::Datetime(*time_unit, None),886#[cfg(feature = "dtype-date")]887Builder::Date(_) => DataType::Date,888#[cfg(feature = "dtype-categorical")]889Builder::Categorical8(buf) => buf.builder.dtype().clone(),890#[cfg(feature = "dtype-categorical")]891Builder::Categorical16(buf) => buf.builder.dtype().clone(),892#[cfg(feature = "dtype-categorical")]893Builder::Categorical32(buf) => buf.builder.dtype().clone(),894}895}896897#[inline]898pub fn add(899&mut self,900bytes: &[u8],901ignore_errors: bool,902needs_escaping: bool,903missing_is_null: bool,904) -> PolarsResult<()> {905use Builder::*;906match self {907Boolean(buf) => <BooleanChunkedBuilder as ParsedBuilder>::parse_bytes(908buf,909bytes,910ignore_errors,911needs_escaping,912missing_is_null,913None,914),915#[cfg(feature = "dtype-i8")]916Int8(buf) => <PrimitiveChunkedBuilder<Int8Type> as ParsedBuilder>::parse_bytes(917buf,918bytes,919ignore_errors,920needs_escaping,921missing_is_null,922None,923),924#[cfg(feature = "dtype-i16")]925Int16(buf) => <PrimitiveChunkedBuilder<Int16Type> as ParsedBuilder>::parse_bytes(926buf,927bytes,928ignore_errors,929needs_escaping,930missing_is_null,931None,932),933Int32(buf) => <PrimitiveChunkedBuilder<Int32Type> as ParsedBuilder>::parse_bytes(934buf,935bytes,936ignore_errors,937needs_escaping,938missing_is_null,939None,940),941Int64(buf) => <PrimitiveChunkedBuilder<Int64Type> as ParsedBuilder>::parse_bytes(942buf,943bytes,944ignore_errors,945needs_escaping,946missing_is_null,947None,948),949#[cfg(feature = "dtype-i128")]950Int128(buf) => <PrimitiveChunkedBuilder<Int128Type> as ParsedBuilder>::parse_bytes(951buf,952bytes,953ignore_errors,954needs_escaping,955missing_is_null,956None,957),958#[cfg(feature = "dtype-u8")]959UInt8(buf) => <PrimitiveChunkedBuilder<UInt8Type> as ParsedBuilder>::parse_bytes(960buf,961bytes,962ignore_errors,963needs_escaping,964missing_is_null,965None,966),967#[cfg(feature = "dtype-u16")]968UInt16(buf) => <PrimitiveChunkedBuilder<UInt16Type> as ParsedBuilder>::parse_bytes(969buf,970bytes,971ignore_errors,972needs_escaping,973missing_is_null,974None,975),976UInt32(buf) => <PrimitiveChunkedBuilder<UInt32Type> as ParsedBuilder>::parse_bytes(977buf,978bytes,979ignore_errors,980needs_escaping,981missing_is_null,982None,983),984UInt64(buf) => <PrimitiveChunkedBuilder<UInt64Type> as ParsedBuilder>::parse_bytes(985buf,986bytes,987ignore_errors,988needs_escaping,989missing_is_null,990None,991),992#[cfg(feature = "dtype-u128")]993UInt128(buf) => <PrimitiveChunkedBuilder<UInt128Type> as ParsedBuilder>::parse_bytes(994buf,995bytes,996ignore_errors,997needs_escaping,998missing_is_null,999None,1000),1001#[cfg(feature = "dtype-f16")]1002Float16(buf) => <PrimitiveChunkedBuilder<Float16Type> as ParsedBuilder>::parse_bytes(1003buf,1004bytes,1005ignore_errors,1006needs_escaping,1007missing_is_null,1008None,1009),1010Float32(buf) => <PrimitiveChunkedBuilder<Float32Type> as ParsedBuilder>::parse_bytes(1011buf,1012bytes,1013ignore_errors,1014needs_escaping,1015missing_is_null,1016None,1017),1018Float64(buf) => <PrimitiveChunkedBuilder<Float64Type> as ParsedBuilder>::parse_bytes(1019buf,1020bytes,1021ignore_errors,1022needs_escaping,1023missing_is_null,1024None,1025),1026#[cfg(feature = "dtype-f16")]1027DecimalFloat16(buf, scratch) => {1028prepare_decimal_comma(bytes, scratch);1029<PrimitiveChunkedBuilder<Float16Type> as ParsedBuilder>::parse_bytes(1030buf,1031scratch,1032ignore_errors,1033needs_escaping,1034missing_is_null,1035None,1036)1037},1038DecimalFloat32(buf, scratch) => {1039prepare_decimal_comma(bytes, scratch);1040<PrimitiveChunkedBuilder<Float32Type> as ParsedBuilder>::parse_bytes(1041buf,1042scratch,1043ignore_errors,1044needs_escaping,1045missing_is_null,1046None,1047)1048},1049DecimalFloat64(buf, scratch) => {1050prepare_decimal_comma(bytes, scratch);1051<PrimitiveChunkedBuilder<Float64Type> as ParsedBuilder>::parse_bytes(1052buf,1053scratch,1054ignore_errors,1055needs_escaping,1056missing_is_null,1057None,1058)1059},1060#[cfg(feature = "dtype-decimal")]1061Decimal(buf) => <DecimalField as ParsedBuilder>::parse_bytes(1062buf,1063bytes,1064ignore_errors,1065needs_escaping,1066missing_is_null,1067None,1068),1069Utf8(buf) => <Utf8Field as ParsedBuilder>::parse_bytes(1070buf,1071bytes,1072ignore_errors,1073needs_escaping,1074missing_is_null,1075None,1076),1077#[cfg(feature = "dtype-datetime")]1078Datetime { buf, time_unit, .. } => {1079<DatetimeField<Int64Type> as ParsedBuilder>::parse_bytes(1080buf,1081bytes,1082ignore_errors,1083needs_escaping,1084missing_is_null,1085Some(*time_unit),1086)1087},1088#[cfg(feature = "dtype-date")]1089Date(buf) => <DatetimeField<Int32Type> as ParsedBuilder>::parse_bytes(1090buf,1091bytes,1092ignore_errors,1093needs_escaping,1094missing_is_null,1095None,1096),1097#[cfg(feature = "dtype-categorical")]1098Categorical8(buf) => {1099buf.parse_bytes(bytes, ignore_errors, needs_escaping, missing_is_null, None)1100},1101#[cfg(feature = "dtype-categorical")]1102Categorical16(buf) => {1103buf.parse_bytes(bytes, ignore_errors, needs_escaping, missing_is_null, None)1104},1105#[cfg(feature = "dtype-categorical")]1106Categorical32(buf) => {1107buf.parse_bytes(bytes, ignore_errors, needs_escaping, missing_is_null, None)1108},1109}1110}1111}11121113#[inline]1114fn prepare_decimal_comma(bytes: &[u8], scratch: &mut Vec<u8>) {1115scratch.clear();1116scratch.reserve(bytes.len());11171118// SAFETY: we pre-allocated.1119for &byte in bytes {1120if byte == b',' {1121unsafe { scratch.push_unchecked(b'.') }1122} else {1123unsafe { scratch.push_unchecked(byte) }1124}1125}1126}112711281129