Path: blob/main/crates/polars-lazy/src/scan/file_list_reader.rs
8430 views
use polars_buffer::Buffer;1use polars_core::prelude::*;2use polars_io::RowIndex;3use polars_io::cloud::CloudOptions;4use polars_plan::prelude::UnionArgs;5use polars_utils::pl_path::PlRefPath;67use crate::prelude::*;89/// Reads [LazyFrame] from a filesystem or a cloud storage.10/// Supports glob patterns.11///12/// Use [LazyFileListReader::finish] to get the final [LazyFrame].13pub trait LazyFileListReader: Clone {14/// Get the final [LazyFrame].15fn finish(self) -> PolarsResult<LazyFrame> {16if !self.glob() {17return self.finish_no_glob();18}1920let ScanSources::Paths(paths) = self.sources() else {21unreachable!("opened-files or in-memory buffers should never be globbed");22};2324let lfs = paths25.iter()26.map(|path| {27self.clone()28// Each individual reader should not apply a row limit.29.with_n_rows(None)30// Each individual reader should not apply a row index.31.with_row_index(None)32.with_paths(Buffer::from_iter([path.clone()]))33.with_rechunk(false)34.finish_no_glob()35.map_err(|e| {36polars_err!(37ComputeError: "error while reading {}: {}", path, e38)39})40})41.collect::<PolarsResult<Vec<_>>>()?;4243polars_ensure!(44!lfs.is_empty(),45ComputeError: "no matching files found in {:?}", paths.iter().map(|x| x.as_str()).collect::<Vec<_>>()46);4748let mut lf = self.concat_impl(lfs)?;49if let Some(n_rows) = self.n_rows() {50lf = lf.slice(0, n_rows as IdxSize)51};52if let Some(rc) = self.row_index() {53lf = lf.with_row_index(rc.name.clone(), Some(rc.offset))54};5556Ok(lf)57}5859/// Recommended concatenation of [LazyFrame]s from many input files.60///61/// This method should not take into consideration [LazyFileListReader::n_rows]62/// nor [LazyFileListReader::row_index].63fn concat_impl(&self, lfs: Vec<LazyFrame>) -> PolarsResult<LazyFrame> {64let args = UnionArgs {65rechunk: self.rechunk(),66parallel: true,67to_supertypes: false,68from_partitioned_ds: true,69..Default::default()70};71concat_impl(&lfs, args)72}7374/// Get the final [LazyFrame].75/// This method assumes, that path is *not* a glob.76///77/// It is recommended to always use [LazyFileListReader::finish] method.78fn finish_no_glob(self) -> PolarsResult<LazyFrame>;7980fn glob(&self) -> bool {81true82}8384/// Get the sources for this reader.85fn sources(&self) -> &ScanSources;8687/// Set sources of the scanned files.88#[must_use]89fn with_sources(self, source: ScanSources) -> Self;9091/// Set paths of the scanned files.92#[must_use]93fn with_paths(self, paths: Buffer<PlRefPath>) -> Self {94self.with_sources(ScanSources::Paths(paths))95}9697/// Configure the row limit.98fn with_n_rows(self, n_rows: impl Into<Option<usize>>) -> Self;99100/// Configure the row index.101fn with_row_index(self, row_index: impl Into<Option<RowIndex>>) -> Self;102103/// Rechunk the memory to contiguous chunks when parsing is done.104fn rechunk(&self) -> bool;105106/// Rechunk the memory to contiguous chunks when parsing is done.107#[must_use]108fn with_rechunk(self, toggle: bool) -> Self;109110/// Try to stop parsing when `n` rows are parsed. During multithreaded parsing the upper bound `n` cannot111/// be guaranteed.112fn n_rows(&self) -> Option<usize>;113114/// Add a row index column.115fn row_index(&self) -> Option<&RowIndex>;116117/// [CloudOptions] used to list files.118fn cloud_options(&self) -> Option<&CloudOptions> {119None120}121}122123124