Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/crates/polars-ops/src/chunked_array/binary/namespace.rs
8375 views
1
#[cfg(feature = "binary_encoding")]
2
use std::borrow::Cow;
3
4
#[cfg(feature = "binary_encoding")]
5
use arrow::array::Array;
6
#[cfg(feature = "binary_encoding")]
7
use base64::Engine as _;
8
#[cfg(feature = "binary_encoding")]
9
use base64::engine::general_purpose;
10
use memchr::memmem::find;
11
use polars_compute::cast::{binview_to_fixed_size_list_dyn, binview_to_primitive_dyn};
12
use polars_compute::size::binary_size_bytes;
13
use polars_core::prelude::arity::{broadcast_binary_elementwise_values, unary_elementwise_values};
14
15
use super::*;
16
17
pub trait BinaryNameSpaceImpl: AsBinary {
18
/// Slice the binary values.
19
///
20
/// Determines a slice starting from `offset` and with length `length` of each of the elements.
21
/// `offset` can be negative, in which case the start counts from the end of the bytes.
22
fn bin_slice(&self, offset: &Column, length: &Column) -> PolarsResult<BinaryChunked> {
23
let ca = self.as_binary();
24
let offset = offset.cast(&DataType::Int64)?;
25
let length = length.strict_cast(&DataType::UInt64)?;
26
27
Ok(super::slice::slice(ca, offset.i64()?, length.u64()?))
28
}
29
/// Slice the first `n` bytes of the binary value.
30
///
31
/// Determines a slice starting at the beginning of the binary data up to offset `n` of each
32
/// element. `n` can be negative, in which case the slice ends `n` bytes from the end.
33
fn bin_head(&self, n: &Column) -> PolarsResult<BinaryChunked> {
34
let ca = self.as_binary();
35
let n = n.strict_cast(&DataType::Int64)?;
36
37
super::slice::head(ca, n.i64()?)
38
}
39
40
/// Slice the last `n` bytes of the binary value.
41
///
42
/// Determines a slice starting at offset `n` of each element. `n` can be
43
/// negative, in which case the slice begins `n` bytes from the start.
44
fn bin_tail(&self, n: &Column) -> PolarsResult<BinaryChunked> {
45
let ca = self.as_binary();
46
let n = n.strict_cast(&DataType::Int64)?;
47
48
super::slice::tail(ca, n.i64()?)
49
}
50
51
/// Check if binary contains given literal
52
fn contains(&self, lit: &[u8]) -> BooleanChunked {
53
let ca = self.as_binary();
54
let f = |s: &[u8]| find(s, lit).is_some();
55
unary_elementwise_values(ca, f)
56
}
57
58
fn contains_chunked(&self, lit: &BinaryChunked) -> PolarsResult<BooleanChunked> {
59
let ca = self.as_binary();
60
Ok(match lit.len() {
61
1 => match lit.get(0) {
62
Some(lit) => ca.contains(lit),
63
None => BooleanChunked::full_null(ca.name().clone(), ca.len()),
64
},
65
_ => {
66
polars_ensure!(
67
ca.len() == lit.len() || ca.len() == 1,
68
length_mismatch = "bin.contains",
69
ca.len(),
70
lit.len()
71
);
72
broadcast_binary_elementwise_values(ca, lit, |src, lit| find(src, lit).is_some())
73
},
74
})
75
}
76
77
/// Check if strings ends with a substring
78
fn ends_with(&self, sub: &[u8]) -> BooleanChunked {
79
let ca = self.as_binary();
80
let f = |s: &[u8]| s.ends_with(sub);
81
ca.apply_nonnull_values_generic(DataType::Boolean, f)
82
}
83
84
/// Check if strings starts with a substring
85
fn starts_with(&self, sub: &[u8]) -> BooleanChunked {
86
let ca = self.as_binary();
87
let f = |s: &[u8]| s.starts_with(sub);
88
ca.apply_nonnull_values_generic(DataType::Boolean, f)
89
}
90
91
fn starts_with_chunked(&self, prefix: &BinaryChunked) -> PolarsResult<BooleanChunked> {
92
let ca = self.as_binary();
93
Ok(match prefix.len() {
94
1 => match prefix.get(0) {
95
Some(s) => self.starts_with(s),
96
None => BooleanChunked::full_null(ca.name().clone(), ca.len()),
97
},
98
_ => {
99
polars_ensure!(
100
ca.len() == prefix.len() || ca.len() == 1,
101
length_mismatch = "bin.starts_with",
102
ca.len(),
103
prefix.len()
104
);
105
broadcast_binary_elementwise_values(ca, prefix, |s, sub| s.starts_with(sub))
106
},
107
})
108
}
109
110
fn ends_with_chunked(&self, suffix: &BinaryChunked) -> PolarsResult<BooleanChunked> {
111
let ca = self.as_binary();
112
Ok(match suffix.len() {
113
1 => match suffix.get(0) {
114
Some(s) => self.ends_with(s),
115
None => BooleanChunked::full_null(ca.name().clone(), ca.len()),
116
},
117
_ => {
118
polars_ensure!(
119
ca.len() == suffix.len() || ca.len() == 1,
120
length_mismatch = "bin.ends_with",
121
ca.len(),
122
suffix.len()
123
);
124
broadcast_binary_elementwise_values(ca, suffix, |s, sub| s.ends_with(sub))
125
},
126
})
127
}
128
129
/// Get the size of the binary values in bytes.
130
fn size_bytes(&self) -> UInt32Chunked {
131
let ca = self.as_binary();
132
ca.apply_kernel_cast(&binary_size_bytes)
133
}
134
135
/// Decode hex-encoded binary values.
///
/// With `strict = false`, a value that fails to decode is replaced by a null. With
/// `strict = true`, the first value that fails to decode aborts the operation.
///
/// # Errors
///
/// In strict mode, returns a `ComputeError` when a value is not valid hex.
#[cfg(feature = "binary_encoding")]
fn hex_decode(&self, strict: bool) -> PolarsResult<BinaryChunked> {
    let ca = self.as_binary();

    if !strict {
        // Lenient mode: a failed decode turns into `None` for that element.
        let decoded =
            ca.apply(|raw| raw.and_then(|bytes| hex::decode(bytes).ok()).map(Cow::Owned));
        return Ok(decoded);
    }

    ca.try_apply_nonnull_values_generic(|bytes| {
        hex::decode(bytes).map_err(|_| {
            polars_err!(
                ComputeError:
                "invalid `hex` encoding found; try setting `strict=false` to ignore"
            )
        })
    })
}
151
152
/// Hex-encode every binary value and return the result as a `String` series.
///
/// # Panics
///
/// Panics if the unchecked cast to `DataType::String` reports an error.
#[cfg(feature = "binary_encoding")]
fn hex_encode(&self) -> Series {
    let encoded = self
        .as_binary()
        .apply_values(|bytes| hex::encode(bytes).into_bytes().into());

    // SAFETY: each produced value is the byte content of the `String` returned by
    // `hex::encode`, hence valid UTF-8.
    // NOTE(review): confirm this is the full contract of `cast_unchecked`.
    let as_string = unsafe { encoded.cast_unchecked(&DataType::String) };
    as_string.unwrap()
}
161
162
/// Decode base64-encoded binary values using the standard engine.
///
/// With `strict = false`, a value that fails to decode is replaced by a null. With
/// `strict = true`, the first value that fails to decode aborts the operation.
///
/// # Errors
///
/// In strict mode, returns a `ComputeError` when a value is not valid base64.
#[cfg(feature = "binary_encoding")]
fn base64_decode(&self, strict: bool) -> PolarsResult<BinaryChunked> {
    let ca = self.as_binary();

    if !strict {
        // Lenient mode: a failed decode turns into `None` for that element.
        let decoded = ca.apply(|raw| {
            raw.and_then(|bytes| general_purpose::STANDARD.decode(bytes).ok())
                .map(Cow::Owned)
        });
        return Ok(decoded);
    }

    ca.try_apply_nonnull_values_generic(|bytes| {
        general_purpose::STANDARD.decode(bytes).map_err(|_e| {
            polars_err!(
                ComputeError:
                "invalid `base64` encoding found; try setting `strict=false` to ignore"
            )
        })
    })
}
180
181
/// Base64-encode every binary value (standard engine) and return a `String` series.
///
/// # Panics
///
/// Panics if the unchecked cast to `DataType::String` reports an error.
#[cfg(feature = "binary_encoding")]
fn base64_encode(&self) -> Series {
    let encoded = self
        .as_binary()
        .apply_values(|bytes| general_purpose::STANDARD.encode(bytes).into_bytes().into());

    // SAFETY: each produced value is the byte content of the `String` returned by the
    // base64 engine's `encode`, hence valid UTF-8.
    // NOTE(review): confirm this is the full contract of `cast_unchecked`.
    let as_string = unsafe { encoded.cast_unchecked(&DataType::String) };
    as_string.unwrap()
}
190
191
/// Reinterpret the raw bytes of every binary value as values of `dtype`.
///
/// `is_little_endian` is forwarded to the conversion kernels and selects the byte order
/// used to read the bytes. The resulting series keeps the name of `self`.
///
/// # Errors
///
/// Propagates any error from `_reinterpret_inner`, including the `InvalidOperation`
/// error raised for unsupported target types.
#[cfg(feature = "binary_encoding")]
fn reinterpret(&self, dtype: &DataType, is_little_endian: bool) -> PolarsResult<Series> {
    let name = self.as_binary().name().clone();
    let chunks = self._reinterpret_inner(dtype, is_little_endian)?;

    // SAFETY: the chunks were built by `_reinterpret_inner` for this same `dtype` (via
    // its physical representation).
    // NOTE(review): confirm this satisfies `from_chunks_and_dtype_unchecked`.
    Ok(unsafe { Series::from_chunks_and_dtype_unchecked(name, chunks, dtype) })
}
201
202
/// Convert every chunk of the binary array into an Arrow array for `dtype`.
///
/// Supported targets are primitive numeric and temporal types and, with the
/// `dtype-array` feature, `Array`s whose inner type is one of those. The conversion
/// works on the physical representation of the target type; `is_little_endian` is
/// passed through to the conversion kernels.
///
/// # Errors
///
/// Returns `InvalidOperation` for any other target type, and propagates errors reported
/// by the conversion kernels.
#[cfg(feature = "binary_encoding")]
fn _reinterpret_inner(
    &self,
    dtype: &DataType,
    is_little_endian: bool,
) -> PolarsResult<Vec<Box<dyn Array>>> {
    use polars_core::with_match_physical_numeric_polars_type;

    let ca = self.as_binary();

    match dtype {
        // Numeric/temporal target: each chunk goes through `binview_to_primitive_dyn`.
        target if target.is_primitive_numeric() || target.is_temporal() => {
            let physical = target.to_physical();
            let arrow_dtype = physical
                .to_arrow(CompatLevel::newest())
                .underlying_physical_type();
            // NOTE(review): the `unsafe` block is kept as in the original; the safety
            // contract of the kernel is not visible from this file. Confirm.
            with_match_physical_numeric_polars_type!(physical, |$T| {
                unsafe {
                    ca.chunks().iter().map(|arr| {
                        binview_to_primitive_dyn::<<$T as PolarsNumericType>::Native>(
                            &**arr,
                            &arrow_dtype,
                            is_little_endian,
                        )
                    }).collect()
                }
            })
        },
        // Fixed-width array target: each chunk goes through
        // `binview_to_fixed_size_list_dyn` with the array width.
        #[cfg(feature = "dtype-array")]
        DataType::Array(inner, width)
            if inner.is_primitive_numeric() || inner.is_temporal() =>
        {
            let physical_inner = inner.to_physical();
            let chunks: Vec<ArrayRef> = with_match_physical_numeric_polars_type!(physical_inner, |$T| {
                unsafe {
                    ca.chunks().iter().map(|arr| {
                        binview_to_fixed_size_list_dyn::<<$T as PolarsNumericType>::Native>(
                            &**arr,
                            *width,
                            is_little_endian
                        )
                    }).collect::<Result<Vec<ArrayRef>, _>>()
                }
            })?;
            Ok(chunks)
        },
        _ => Err(
            polars_err!(InvalidOperation: "unsupported data type {:?} in reinterpret. Only numeric or temporal types, or Arrays of those, are allowed.", dtype),
        ),
    }
}
253
}
254
255
// Every method of `BinaryNameSpaceImpl` has a provided body in the trait, so the impl
// for `BinaryChunked` is intentionally empty.
impl BinaryNameSpaceImpl for BinaryChunked {}
256
257