Path: blob/main/crates/polars-io/src/csv/read/options.rs
#![allow(unsafe_op_in_unsafe_fn)]
use std::path::PathBuf;
use std::sync::Arc;

use polars_core::datatypes::{DataType, Field};
use polars_core::schema::{Schema, SchemaRef};
use polars_error::PolarsResult;
use polars_utils::pl_str::PlSmallStr;
#[cfg(feature = "serde")]
use serde::{Deserialize, Serialize};

use crate::RowIndex;

#[derive(Clone, Debug, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "dsl-schema", derive(schemars::JsonSchema))]
pub struct CsvReadOptions {
    pub path: Option<PathBuf>,
    // Performance-related options
    pub rechunk: bool,
    pub n_threads: Option<usize>,
    pub low_memory: bool,
    // Row-wise options
    pub n_rows: Option<usize>,
    pub row_index: Option<RowIndex>,
    // Column-wise options
    pub columns: Option<Arc<[PlSmallStr]>>,
    pub projection: Option<Arc<Vec<usize>>>,
    pub schema: Option<SchemaRef>,
    pub schema_overwrite: Option<SchemaRef>,
    pub dtype_overwrite: Option<Arc<Vec<DataType>>>,
    // CSV-specific options
    pub parse_options: Arc<CsvParseOptions>,
    pub has_header: bool,
    pub chunk_size: usize,
    /// Skip rows according to the CSV spec.
    pub skip_rows: usize,
    /// Skip lines according to the newline char (i.e. escaping is ignored).
    pub skip_lines: usize,
    pub skip_rows_after_header: usize,
    pub infer_schema_length: Option<usize>,
    pub raise_if_empty: bool,
    pub ignore_errors: bool,
    pub fields_to_cast: Vec<Field>,
}

/// Options related to parsing the CSV format.
#[derive(Clone, Debug, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "dsl-schema", derive(schemars::JsonSchema))]
pub struct CsvParseOptions {
    pub separator: u8,
    pub quote_char: Option<u8>,
    pub eol_char: u8,
    pub encoding: CsvEncoding,
    pub null_values: Option<NullValues>,
    pub missing_is_null: bool,
    pub truncate_ragged_lines: bool,
    pub comment_prefix: Option<CommentPrefix>,
    pub try_parse_dates: bool,
    pub decimal_comma: bool,
}

impl Default for CsvReadOptions {
    fn default() -> Self {
        Self {
            path: None,

            rechunk: false,
            n_threads: None,
            low_memory: false,

            n_rows: None,
            row_index: None,

            columns: None,
            projection: None,
            schema: None,
            schema_overwrite: None,
            dtype_overwrite: None,

            parse_options: Default::default(),
            has_header: true,
            chunk_size: 1 << 18,
            skip_rows: 0,
            skip_lines: 0,
            skip_rows_after_header: 0,
            infer_schema_length: Some(100),
            raise_if_empty: true,
            ignore_errors: false,
            fields_to_cast: vec![],
        }
    }
}

impl Default for CsvParseOptions {
    fn default() -> Self {
        Self {
            separator: b',',
            quote_char: Some(b'"'),
            eol_char: b'\n',
            encoding: Default::default(),
            null_values: None,
            missing_is_null: true,
            truncate_ragged_lines: false,
            comment_prefix: None,
            try_parse_dates: false,
            decimal_comma: false,
        }
    }
}
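// A minimal sketch of the defaults in use (illustrative only; everything
// asserted here follows directly from the two `Default` impls above): with no
// further configuration the reader expects comma-separated, double-quoted,
// '\n'-terminated UTF-8 with a header row, and infers dtypes from the first
// 100 rows.
//
//     let opts = CsvReadOptions::default();
//     assert!(opts.has_header);
//     assert_eq!(opts.infer_schema_length, Some(100));
//     assert_eq!(opts.parse_options.separator, b',');
//     assert_eq!(opts.parse_options.quote_char, Some(b'"'));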
impl CsvReadOptions {
    pub fn get_parse_options(&self) -> Arc<CsvParseOptions> {
        self.parse_options.clone()
    }

    pub fn with_path<P: Into<PathBuf>>(mut self, path: Option<P>) -> Self {
        self.path = path.map(|p| p.into());
        self
    }

    /// Whether to make the columns contiguous in memory.
    pub fn with_rechunk(mut self, rechunk: bool) -> Self {
        self.rechunk = rechunk;
        self
    }

    /// Number of threads to use for reading. Defaults to the size of the
    /// polars thread pool.
    pub fn with_n_threads(mut self, n_threads: Option<usize>) -> Self {
        self.n_threads = n_threads;
        self
    }

    /// Reduce memory consumption at the expense of performance.
    pub fn with_low_memory(mut self, low_memory: bool) -> Self {
        self.low_memory = low_memory;
        self
    }

    /// Limits the number of rows to read.
    pub fn with_n_rows(mut self, n_rows: Option<usize>) -> Self {
        self.n_rows = n_rows;
        self
    }

    /// Adds a row index column.
    pub fn with_row_index(mut self, row_index: Option<RowIndex>) -> Self {
        self.row_index = row_index;
        self
    }

    /// Which columns to select.
    pub fn with_columns(mut self, columns: Option<Arc<[PlSmallStr]>>) -> Self {
        self.columns = columns;
        self
    }

    /// Which columns to select, denoted by their index. The index starts from 0
    /// (i.e. [0, 4] would select the 1st and 5th column).
    pub fn with_projection(mut self, projection: Option<Arc<Vec<usize>>>) -> Self {
        self.projection = projection;
        self
    }

    /// Set the schema to use for the CSV file. The length of the schema must
    /// match the number of columns in the file. If this is [None], the schema
    /// is inferred from the file.
    pub fn with_schema(mut self, schema: Option<SchemaRef>) -> Self {
        self.schema = schema;
        self
    }

    /// Overwrites the data types in the schema by column name.
    pub fn with_schema_overwrite(mut self, schema_overwrite: Option<SchemaRef>) -> Self {
        self.schema_overwrite = schema_overwrite;
        self
    }

    /// Overwrite the dtypes in the schema in the order of the slice that's given.
    /// This is useful if you don't know the column names beforehand.
    pub fn with_dtype_overwrite(mut self, dtype_overwrite: Option<Arc<Vec<DataType>>>) -> Self {
        self.dtype_overwrite = dtype_overwrite;
        self
    }

    /// Sets the CSV parsing options. See [map_parse_options][Self::map_parse_options]
    /// for an easier way to mutate them in-place.
    pub fn with_parse_options(mut self, parse_options: CsvParseOptions) -> Self {
        self.parse_options = Arc::new(parse_options);
        self
    }

    /// Sets whether the CSV file has a header row.
    pub fn with_has_header(mut self, has_header: bool) -> Self {
        self.has_header = has_header;
        self
    }

    /// Sets the chunk size used by the parser. This influences performance.
    pub fn with_chunk_size(mut self, chunk_size: usize) -> Self {
        self.chunk_size = chunk_size;
        self
    }

    /// Start reading after `skip_rows` rows. The header will be parsed at this
    /// offset. Note that we respect CSV escaping/comments when skipping rows.
    /// If you want to skip by newline char only, use `skip_lines`.
    pub fn with_skip_rows(mut self, skip_rows: usize) -> Self {
        self.skip_rows = skip_rows;
        self
    }

    /// Start reading after `skip_lines` lines. The header will be parsed at this
    /// offset. Note that CSV escaping will not be respected when skipping lines.
    /// If you want to skip valid CSV rows, use `skip_rows`.
    pub fn with_skip_lines(mut self, skip_lines: usize) -> Self {
        self.skip_lines = skip_lines;
        self
    }
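    // Illustrative sketch of the `skip_rows` vs. `skip_lines` distinction
    // described in the doc comments above (hypothetical input; exact behavior
    // depends on the parser internals). Given a file beginning with
    //
    //     junk,"field with
    //     embedded newline"
    //     a,b,c
    //
    // `with_skip_rows(1)` treats the quoted block as a single CSV row and
    // starts reading at `a,b,c`, while `with_skip_lines(1)` counts raw
    // newline bytes and would start mid-way through the quoted field.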
    /// Number of rows to skip after the header row.
    pub fn with_skip_rows_after_header(mut self, skip_rows_after_header: usize) -> Self {
        self.skip_rows_after_header = skip_rows_after_header;
        self
    }

    /// Set the number of rows to use when inferring the CSV schema.
    /// The default is 100 rows.
    /// Setting to [None] will do a full table scan, which is very slow.
    pub fn with_infer_schema_length(mut self, infer_schema_length: Option<usize>) -> Self {
        self.infer_schema_length = infer_schema_length;
        self
    }

    /// Whether to raise an error if the frame is empty. By default an empty
    /// DataFrame is returned.
    pub fn with_raise_if_empty(mut self, raise_if_empty: bool) -> Self {
        self.raise_if_empty = raise_if_empty;
        self
    }

    /// Continue with the next batch when a ParserError is encountered.
    pub fn with_ignore_errors(mut self, ignore_errors: bool) -> Self {
        self.ignore_errors = ignore_errors;
        self
    }

    /// Apply a function to the parse options.
    pub fn map_parse_options<F: Fn(CsvParseOptions) -> CsvParseOptions>(
        mut self,
        map_func: F,
    ) -> Self {
        let parse_options = Arc::unwrap_or_clone(self.parse_options);
        self.parse_options = Arc::new(map_func(parse_options));
        self
    }
}

impl CsvParseOptions {
    /// The character used to separate fields in the CSV file. This
    /// is most often a comma ','.
    pub fn with_separator(mut self, separator: u8) -> Self {
        self.separator = separator;
        self
    }

    /// Set the character used for field quoting. This is most often a double
    /// quote '"'. Set this to [None] to disable quote parsing.
    pub fn with_quote_char(mut self, quote_char: Option<u8>) -> Self {
        self.quote_char = quote_char;
        self
    }

    /// Set the character used to indicate an end-of-line (eol).
    pub fn with_eol_char(mut self, eol_char: u8) -> Self {
        self.eol_char = eol_char;
        self
    }

    /// Set the encoding used by the file.
    pub fn with_encoding(mut self, encoding: CsvEncoding) -> Self {
        self.encoding = encoding;
        self
    }

    /// Set values that will be interpreted as missing/null.
    ///
    /// Note: These values are matched before quote-parsing, so if the null values
    /// are quoted then those quotes also need to be included here.
    pub fn with_null_values(mut self, null_values: Option<NullValues>) -> Self {
        self.null_values = null_values;
        self
    }

    /// Treat missing fields as null.
    pub fn with_missing_is_null(mut self, missing_is_null: bool) -> Self {
        self.missing_is_null = missing_is_null;
        self
    }

    /// Truncate lines that are longer than the schema.
    pub fn with_truncate_ragged_lines(mut self, truncate_ragged_lines: bool) -> Self {
        self.truncate_ragged_lines = truncate_ragged_lines;
        self
    }

    /// Sets the comment prefix for this instance. Lines starting with this
    /// prefix will be ignored.
    pub fn with_comment_prefix<T: Into<CommentPrefix>>(
        mut self,
        comment_prefix: Option<T>,
    ) -> Self {
        self.comment_prefix = comment_prefix.map(Into::into);
        self
    }

    /// Automatically try to parse dates/datetimes and times. If parsing fails,
    /// columns remain of dtype [`DataType::String`].
    pub fn with_try_parse_dates(mut self, try_parse_dates: bool) -> Self {
        self.try_parse_dates = try_parse_dates;
        self
    }

    /// Parse floats with a comma as decimal separator.
    pub fn with_decimal_comma(mut self, decimal_comma: bool) -> Self {
        self.decimal_comma = decimal_comma;
        self
    }
}
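// A hedged usage sketch (illustrative values, not from the original source):
// `map_parse_options` spares the caller from unwrapping the
// `Arc<CsvParseOptions>` by hand when tweaking individual parse settings on
// otherwise-default read options.
//
//     let opts = CsvReadOptions::default()
//         .with_has_header(false)
//         .map_parse_options(|p| p.with_separator(b';').with_decimal_comma(true));
//
// Because `Arc::unwrap_or_clone` is used internally, this clones the parse
// options only when the `Arc` is shared.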
#[derive(Copy, Clone, Debug, Default, Eq, PartialEq, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "dsl-schema", derive(schemars::JsonSchema))]
pub enum CsvEncoding {
    /// Utf8 encoding.
    #[default]
    Utf8,
    /// Utf8 encoding; unknown bytes are replaced with �.
    LossyUtf8,
}

#[derive(Clone, Debug, Eq, PartialEq, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "dsl-schema", derive(schemars::JsonSchema))]
pub enum CommentPrefix {
    /// A single byte character that indicates the start of a comment line.
    Single(u8),
    /// A string that indicates the start of a comment line.
    /// This allows for multiple characters to be used as a comment identifier.
    Multi(PlSmallStr),
}

impl CommentPrefix {
    /// Creates a new `CommentPrefix` for the `Single` variant.
    pub fn new_single(prefix: u8) -> Self {
        CommentPrefix::Single(prefix)
    }

    /// Creates a new `CommentPrefix` for the `Multi` variant.
    pub fn new_multi(prefix: PlSmallStr) -> Self {
        CommentPrefix::Multi(prefix)
    }

    /// Creates a new `CommentPrefix` from a `&str`.
    pub fn new_from_str(prefix: &str) -> Self {
        if prefix.len() == 1 && prefix.chars().next().unwrap().is_ascii() {
            let c = prefix.as_bytes()[0];
            CommentPrefix::Single(c)
        } else {
            CommentPrefix::Multi(PlSmallStr::from_str(prefix))
        }
    }
}

impl From<&str> for CommentPrefix {
    fn from(value: &str) -> Self {
        Self::new_from_str(value)
    }
}

#[derive(Clone, Debug, Eq, PartialEq, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "dsl-schema", derive(schemars::JsonSchema))]
pub enum NullValues {
    /// A single value that's used for all columns.
    AllColumnsSingle(PlSmallStr),
    /// Multiple values that are used for all columns.
    AllColumns(Vec<PlSmallStr>),
    /// Tuples that map column names to the null value of that column.
    Named(Vec<(PlSmallStr, PlSmallStr)>),
}

impl NullValues {
    pub fn compile(self, schema: &Schema) -> PolarsResult<NullValuesCompiled> {
        Ok(match self {
            NullValues::AllColumnsSingle(v) => NullValuesCompiled::AllColumnsSingle(v),
            NullValues::AllColumns(v) => NullValuesCompiled::AllColumns(v),
            NullValues::Named(v) => {
                let mut null_values = vec![PlSmallStr::from_static(""); schema.len()];
                for (name, null_value) in v {
                    let i = schema.try_index_of(&name)?;
                    null_values[i] = null_value;
                }
                NullValuesCompiled::Columns(null_values)
            },
        })
    }
}
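// Sketch of the `NullValues::Named` compilation above ("height" and "weight"
// are hypothetical column names): given a schema with columns
// ["height", "weight"], compiling
// `NullValues::Named(vec![("weight".into(), "NA".into())])` yields
// `NullValuesCompiled::Columns` holding ["", "NA"], i.e. one slot per schema
// column in schema order, with unnamed columns left at the empty-string
// default. An unknown column name makes `compile` return an error via
// `schema.try_index_of`.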
#[derive(Debug, Clone)]
pub enum NullValuesCompiled {
    /// A single value that's used for all columns.
    AllColumnsSingle(PlSmallStr),
    /// Multiple null values that apply to all columns.
    AllColumns(Vec<PlSmallStr>),
    /// A different null value per column, computed from `NullValues::Named`.
    Columns(Vec<PlSmallStr>),
}

impl NullValuesCompiled {
    /// # Safety
    ///
    /// The caller must ensure that `index` is in bounds.
    pub(super) unsafe fn is_null(&self, field: &[u8], index: usize) -> bool {
        use NullValuesCompiled::*;
        match self {
            AllColumnsSingle(v) => v.as_bytes() == field,
            AllColumns(v) => v.iter().any(|v| v.as_bytes() == field),
            Columns(v) => {
                debug_assert!(index < v.len());
                v.get_unchecked(index).as_bytes() == field
            },
        }
    }
}
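// Minimal sanity tests; a sketch exercising only the pure builder and
// `CommentPrefix` logic defined in this file (the asserted values follow from
// the definitions above).
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn builder_roundtrip() {
        let opts = CsvReadOptions::default()
            .with_has_header(false)
            .with_n_rows(Some(10))
            .map_parse_options(|p| p.with_separator(b';'));
        assert!(!opts.has_header);
        assert_eq!(opts.n_rows, Some(10));
        assert_eq!(opts.parse_options.separator, b';');
    }

    #[test]
    fn comment_prefix_from_str() {
        // A single ASCII byte becomes the `Single` variant...
        assert_eq!(CommentPrefix::new_from_str("#"), CommentPrefix::Single(b'#'));
        // ...while multi-character prefixes become `Multi`.
        assert_eq!(
            CommentPrefix::new_from_str("//"),
            CommentPrefix::Multi(PlSmallStr::from_str("//"))
        );
    }
}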