Path: blob/main/crates/polars-ops/src/chunked_array/binary/slice.rs
7884 views
use std::cmp::Ordering;12use polars_core::prelude::arity::{binary_elementwise, ternary_elementwise, unary_elementwise};3use polars_core::prelude::{BinaryChunked, ChunkFullNull, DataType, Int64Chunked, UInt64Chunked};4use polars_error::{PolarsResult, polars_ensure};56fn head_binary(opt_bytes: Option<&[u8]>, opt_n: Option<i64>) -> Option<&[u8]> {7if let (Some(bytes), Some(n)) = (opt_bytes, opt_n) {8let end_idx = head_binary_values(bytes, n);9Some(&bytes[..end_idx])10} else {11None12}13}1415fn head_binary_values(bytes: &[u8], n: i64) -> usize {16match n.cmp(&0) {17Ordering::Equal => 0,18Ordering::Greater => {19// Take first n bytes20std::cmp::min(n as usize, bytes.len())21},22Ordering::Less => {23// End n bytes from the end24bytes.len().saturating_sub((-n) as usize)25},26}27}2829fn tail_binary(opt_bytes: Option<&[u8]>, opt_n: Option<i64>) -> Option<&[u8]> {30if let (Some(bytes), Some(n)) = (opt_bytes, opt_n) {31let start_idx = tail_binary_values(bytes, n);32Some(&bytes[start_idx..])33} else {34None35}36}3738fn tail_binary_values(bytes: &[u8], n: i64) -> usize {39let max_len = bytes.len();4041match n.cmp(&0) {42Ordering::Equal => max_len,43Ordering::Greater => {44// Start from nth byte from the end45max_len.saturating_sub(n as usize)46},47Ordering::Less => {48// Start after the nth byte49std::cmp::min((-n) as usize, max_len)50},51}52}5354fn slice_ternary_offsets(55opt_bytes: Option<&[u8]>,56opt_offset: Option<i64>,57opt_length: Option<u64>,58) -> Option<(usize, usize)> {59let bytes = opt_bytes?;60let offset = opt_offset?;61Some(slice_ternary_offsets_value(62bytes,63offset,64opt_length.unwrap_or(u64::MAX),65))66}6768pub fn slice_ternary_offsets_value(bytes: &[u8], offset: i64, length: u64) -> (usize, usize) {69// Fast-path: always empty slice70if length == 0 || offset >= bytes.len() as i64 {71return (0, 0);72}7374let start_byte_offset = if offset >= 0 {75std::cmp::min(offset as usize, bytes.len())76} else {77// If `offset` is negative, it counts from the end78let abs_offset = (-offset) as usize;79if abs_offset > bytes.len() {80// Offset is before the start - handle length reduction81let length_reduction = abs_offset - bytes.len();82let adjusted_length = (length as usize).saturating_sub(length_reduction);83return (0, std::cmp::min(adjusted_length, bytes.len()));84}85bytes.len() - abs_offset86};8788let remaining = bytes.len() - start_byte_offset;89let end_byte_offset = start_byte_offset + std::cmp::min(length as usize, remaining);9091(start_byte_offset, end_byte_offset)92}9394fn slice_ternary(95opt_bytes: Option<&[u8]>,96opt_offset: Option<i64>,97opt_length: Option<u64>,98) -> Option<&[u8]> {99let (start, end) = slice_ternary_offsets(opt_bytes, opt_offset, opt_length)?;100opt_bytes.map(|bytes| &bytes[start..end])101}102103pub(super) fn slice(104ca: &BinaryChunked,105offset: &Int64Chunked,106length: &UInt64Chunked,107) -> BinaryChunked {108match (ca.len(), offset.len(), length.len()) {109(1, 1, _) => {110let bytes = ca.get(0);111let offset = offset.get(0);112unary_elementwise(length, |length| slice_ternary(bytes, offset, length))113.with_name(ca.name().clone())114},115(_, 1, 1) => {116let offset = offset.get(0);117let length = length.get(0).unwrap_or(u64::MAX);118119let Some(offset) = offset else {120return BinaryChunked::full_null(ca.name().clone(), ca.len());121};122123ca.apply_nonnull_values_generic(DataType::Binary, |val| {124let (start, end) = slice_ternary_offsets_value(val, offset, length);125&val[start..end]126})127},128(1, _, 1) => {129let bytes = ca.get(0);130let length = length.get(0);131unary_elementwise(offset, |offset| slice_ternary(bytes, offset, length))132.with_name(ca.name().clone())133},134(1, len_b, len_c) if len_b == len_c => {135let bytes = ca.get(0);136binary_elementwise(offset, length, |offset, length| {137slice_ternary(bytes, offset, length)138})139},140(len_a, 1, len_c) if len_a == len_c => {141fn infer<F: for<'a> FnMut(Option<&'a [u8]>, Option<u64>) -> Option<&'a [u8]>>(142f: F,143) -> F {144f145}146let offset = offset.get(0);147binary_elementwise(148ca,149length,150infer(|bytes, length| slice_ternary(bytes, offset, length)),151)152},153(len_a, len_b, 1) if len_a == len_b => {154fn infer<F: for<'a> FnMut(Option<&'a [u8]>, Option<i64>) -> Option<&'a [u8]>>(155f: F,156) -> F {157f158}159let length = length.get(0);160binary_elementwise(161ca,162offset,163infer(|bytes, offset| slice_ternary(bytes, offset, length)),164)165},166_ => ternary_elementwise(ca, offset, length, slice_ternary),167}168}169170pub(super) fn head(ca: &BinaryChunked, n: &Int64Chunked) -> PolarsResult<BinaryChunked> {171match (ca.len(), n.len()) {172(len, 1) => {173let n = n.get(0);174let Some(n) = n else {175return Ok(BinaryChunked::full_null(ca.name().clone(), len));176};177178Ok(ca.apply_nonnull_values_generic(DataType::Binary, |val| {179let end = head_binary_values(val, n);180&val[..end]181}))182},183(1, _) => {184let bytes = ca.get(0);185Ok(unary_elementwise(n, |n| head_binary(bytes, n)).with_name(ca.name().clone()))186},187(a, b) => {188polars_ensure!(a == b, ShapeMismatch: "lengths of arguments do not align in 'bin.head' got length: {} for column: {}, got length: {} for argument 'n'", a, ca.name(), b);189Ok(binary_elementwise(ca, n, head_binary))190},191}192}193194pub(super) fn tail(ca: &BinaryChunked, n: &Int64Chunked) -> PolarsResult<BinaryChunked> {195Ok(match (ca.len(), n.len()) {196(len, 1) => {197let n = n.get(0);198let Some(n) = n else {199return Ok(BinaryChunked::full_null(ca.name().clone(), len));200};201202ca.apply_nonnull_values_generic(DataType::Binary, |val| {203let start = tail_binary_values(val, n);204&val[start..]205})206},207(1, _) => {208let bytes = ca.get(0);209unary_elementwise(n, |n| tail_binary(bytes, n)).with_name(ca.name().clone())210},211(a, b) => {212polars_ensure!(a == b, ShapeMismatch: "lengths of arguments do not align in 'bin.tail' got length: {} for column: {}, got length: {} for argument 'n'", a, ca.name(), b);213binary_elementwise(ca, n, tail_binary)214},215})216}217218219