Path: blob/main/crates/polars-parquet/src/parquet/read/metadata.rs
6940 views
use std::cmp::min;1use std::io::{Read, Seek, SeekFrom};23use polars_parquet_format::FileMetaData as TFileMetadata;4use polars_parquet_format::thrift::protocol::TCompactInputProtocol;56use super::super::metadata::FileMetadata;7use super::super::{DEFAULT_FOOTER_READ_SIZE, FOOTER_SIZE, HEADER_SIZE, PARQUET_MAGIC};8use crate::parquet::error::{ParquetError, ParquetResult};910pub(super) fn metadata_len(buffer: &[u8], len: usize) -> u32 {11u32::from_le_bytes(buffer[len - 8..len - 4].try_into().unwrap())12}1314// see (unstable) Seek::stream_len15fn stream_len(seek: &mut impl Seek) -> std::result::Result<u64, std::io::Error> {16let old_pos = seek.stream_position()?;17let len = seek.seek(SeekFrom::End(0))?;1819// Avoid seeking a third time when we were already at the end of the20// stream. The branch is usually way cheaper than a seek operation.21if old_pos != len {22seek.seek(SeekFrom::Start(old_pos))?;23}2425Ok(len)26}2728/// Reads a [`FileMetadata`] from the reader, located at the end of the file.29pub fn read_metadata<R: Read + Seek>(reader: &mut R) -> ParquetResult<FileMetadata> {30// check file is large enough to hold footer31let file_size = stream_len(reader)?;32read_metadata_with_size(reader, file_size)33}3435/// Reads a [`FileMetadata`] from the reader, located at the end of the file, with known file size.36pub fn read_metadata_with_size<R: Read + Seek>(37reader: &mut R,38file_size: u64,39) -> ParquetResult<FileMetadata> {40if file_size < HEADER_SIZE + FOOTER_SIZE {41return Err(ParquetError::oos(42"A parquet file must contain a header and footer with at least 12 bytes",43));44}4546// read and cache up to DEFAULT_FOOTER_READ_SIZE bytes from the end and process the footer47let default_end_len = min(DEFAULT_FOOTER_READ_SIZE, file_size) as usize;48reader.seek(SeekFrom::End(-(default_end_len as i64)))?;4950let mut buffer = Vec::with_capacity(default_end_len);51reader52.by_ref()53.take(default_end_len as u64)54.read_to_end(&mut buffer)?;5556// check this is indeed a parquet file57if buffer[default_end_len - 4..] != PARQUET_MAGIC {58return Err(ParquetError::oos("The file must end with PAR1"));59}6061let metadata_len: u32 = metadata_len(&buffer, default_end_len);62let metadata_len: u64 = metadata_len as u64;6364let footer_len = FOOTER_SIZE + metadata_len;65if footer_len > file_size {66return Err(ParquetError::oos(67"The footer size must be smaller or equal to the file's size",68));69}7071let reader: &[u8] = if (footer_len as usize) < buffer.len() {72// the whole metadata is in the bytes we already read73let remaining = buffer.len() - footer_len as usize;74&buffer[remaining..]75} else {76// the end of file read by default is not long enough, read again including the metadata.77reader.seek(SeekFrom::End(-(footer_len as i64)))?;7879buffer.clear();80buffer.try_reserve(footer_len as usize)?;81reader.take(footer_len).read_to_end(&mut buffer)?;8283&buffer84};8586// a highly nested but sparse struct could result in many allocations87let max_size = reader.len() * 2 + 1024;8889deserialize_metadata(reader, max_size)90}9192/// Parse loaded metadata bytes93pub fn deserialize_metadata<R: Read>(reader: R, max_size: usize) -> ParquetResult<FileMetadata> {94let mut prot = TCompactInputProtocol::new(reader, max_size);95let metadata = TFileMetadata::read_from_in_protocol(&mut prot)?;9697FileMetadata::try_from_thrift(metadata)98}99100101