Book a Demo!
CoCalc Logo Icon
Store · Features · Docs · Share · Support · News · About · Policies · Sign Up · Sign In
pola-rs
GitHub Repository: pola-rs/polars
Path: blob/main/crates/polars-utils/src/plpath.rs
6939 views
1
use core::fmt;
2
use std::path::{Path, PathBuf};
3
use std::str::FromStr;
4
use std::sync::Arc;
5
6
/// A Path or URI
7
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
8
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
9
#[cfg_attr(feature = "dsl-schema", derive(schemars::JsonSchema))]
10
pub enum PlPath {
11
Local(Arc<Path>),
12
Cloud(PlCloudPath),
13
}
14
15
/// A reference to a Path or URI
16
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
17
pub enum PlPathRef<'a> {
18
Local(&'a Path),
19
Cloud(PlCloudPathRef<'a>),
20
}
21
22
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
23
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
24
#[cfg_attr(feature = "dsl-schema", derive(schemars::JsonSchema))]
25
pub struct PlCloudPath {
26
/// The scheme used in cloud e.g. `s3://` or `file://`.
27
scheme: CloudScheme,
28
/// The full URI e.g. `s3://path/to/bucket`.
29
uri: Arc<str>,
30
}
31
32
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
33
pub struct PlCloudPathRef<'a> {
34
/// The scheme used in cloud e.g. `s3://` or `file://`.
35
scheme: CloudScheme,
36
/// The full URI e.g. `s3://path/to/bucket`.
37
uri: &'a str,
38
}
39
40
impl<'a> fmt::Display for PlCloudPathRef<'a> {
41
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
42
f.write_str(self.uri())
43
}
44
}
45
46
impl fmt::Display for PlCloudPath {
47
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
48
self.as_ref().fmt(f)
49
}
50
}
51
52
impl PlCloudPath {
53
pub fn as_ref(&self) -> PlCloudPathRef<'_> {
54
PlCloudPathRef {
55
scheme: self.scheme,
56
uri: self.uri.as_ref(),
57
}
58
}
59
60
pub fn strip_scheme(&self) -> &str {
61
&self.uri[self.scheme.as_str().len() + 3..]
62
}
63
}
64
65
impl PlCloudPathRef<'_> {
66
pub fn into_owned(self) -> PlCloudPath {
67
PlCloudPath {
68
scheme: self.scheme,
69
uri: self.uri.into(),
70
}
71
}
72
73
pub fn scheme(&self) -> CloudScheme {
74
self.scheme
75
}
76
77
pub fn uri(&self) -> &str {
78
self.uri
79
}
80
81
pub fn strip_scheme(&self) -> &str {
82
&self.uri[self.scheme.as_str().len() + "://".len()..]
83
}
84
}
85
86
pub struct AddressDisplay<'a> {
87
addr: PlPathRef<'a>,
88
}
89
90
impl<'a> fmt::Display for AddressDisplay<'a> {
91
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
92
match self.addr {
93
PlPathRef::Local(p) => p.display().fmt(f),
94
PlPathRef::Cloud(p) => p.fmt(f),
95
}
96
}
97
}
98
99
// Generates the `CloudScheme` enum from a list of `Variant = "name",` pairs, together with
// the two conversions between a variant and its scheme name:
//
// * `FromStr`: exact, case-sensitive match on the name; any other input yields `Err(())`.
// * `CloudScheme::as_str`: the name a variant was declared with.
//
// NOTE(review): notes are plain `//` comments because doc comments on a
// `schemars::JsonSchema` type may end up in the generated schema.
macro_rules! impl_scheme {
    ($($variant:ident = $name:literal,)+) => {
        #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)]
        #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
        #[cfg_attr(feature = "dsl-schema", derive(schemars::JsonSchema))]
        pub enum CloudScheme {
            $($variant,)+
        }

        impl FromStr for CloudScheme {
            type Err = ();

            fn from_str(s: &str) -> Result<Self, Self::Err> {
                Ok(match s {
                    $($name => Self::$variant,)+
                    _ => return Err(()),
                })
            }
        }

        impl CloudScheme {
            pub fn as_str(&self) -> &'static str {
                match *self {
                    $(Self::$variant => $name,)+
                }
            }
        }
    };
}
128
129
// The recognized schemes, as `Variant = "name"` pairs.
//
// Keep this list in sync with `CLOUD_SCHEME_REGEX`: `PlPathRef::new` unwraps
// `CloudScheme::from_str` on any prefix that regex accepts.
impl_scheme! {
    S3 = "s3",
    S3a = "s3a",
    Gs = "gs",
    Gcs = "gcs",
    File = "file",
    Abfs = "abfs",
    Abfss = "abfss",
    Azure = "azure",
    Az = "az",
    Adl = "adl",
    Http = "http",
    Https = "https",
    Hf = "hf",
}
144
145
impl fmt::Display for CloudScheme {
146
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
147
f.write_str(self.as_str())
148
}
149
}
150
151
// Matches a complete scheme name (the text in front of `://`) and nothing else.
//
// Keep the alternatives in sync with the `impl_scheme!` list: `PlPathRef::new` unwraps
// `CloudScheme::from_str` on every string this pattern accepts.
crate::regex_cache::cached_regex! {
    static CLOUD_SCHEME_REGEX = r"^(s3a?|gs|gcs|file|abfss?|azure|az|adl|https?|hf)$";
}
154
155
impl<'a> PlPathRef<'a> {
156
pub fn scheme(&self) -> Option<CloudScheme> {
157
match self {
158
Self::Local(_) => None,
159
Self::Cloud(p) => Some(p.scheme),
160
}
161
}
162
163
pub fn is_local(&self) -> bool {
164
matches!(self, Self::Local(_))
165
}
166
167
pub fn is_cloud_url(&self) -> bool {
168
matches!(self, Self::Cloud(_))
169
}
170
171
pub fn as_local_path(&self) -> Option<&Path> {
172
match self {
173
Self::Local(p) => Some(p),
174
Self::Cloud(_) => None,
175
}
176
}
177
178
pub fn as_cloud_addr(&self) -> Option<PlCloudPathRef<'_>> {
179
match self {
180
Self::Local(_) => None,
181
Self::Cloud(p) => Some(*p),
182
}
183
}
184
185
pub fn join(&self, other: impl AsRef<str>) -> PlPath {
186
let other = other.as_ref();
187
if other.is_empty() {
188
return self.into_owned();
189
}
190
191
match self {
192
Self::Local(p) => PlPath::Local(p.join(other).into()),
193
Self::Cloud(p) => {
194
let needs_slash = !p.uri.ends_with('/') && !other.starts_with('/');
195
196
let mut out =
197
String::with_capacity(p.uri.len() + usize::from(needs_slash) + other.len());
198
199
out.push_str(p.uri);
200
if needs_slash {
201
out.push('/');
202
}
203
// NOTE: This has as a consequence that pushing an absolute path into a URI
204
// just pushes the slashes while for a path it will make that absolute path the new
205
// path. I think this is acceptable as I don't really know what the alternative
206
// would be.
207
out.push_str(other);
208
209
let uri = out.into();
210
PlPath::Cloud(PlCloudPath {
211
scheme: p.scheme,
212
uri,
213
})
214
},
215
}
216
}
217
218
pub fn display(&self) -> AddressDisplay<'_> {
219
AddressDisplay { addr: *self }
220
}
221
222
pub fn from_local_path(path: &'a Path) -> Self {
223
Self::Local(path)
224
}
225
226
pub fn new(uri: &'a str) -> Self {
227
if let Some(i) = uri.find([':', '/']) {
228
if uri[i..].starts_with("://") && CLOUD_SCHEME_REGEX.is_match(&uri[..i]) {
229
let scheme = CloudScheme::from_str(&uri[..i]).unwrap();
230
return Self::Cloud(PlCloudPathRef { scheme, uri });
231
}
232
}
233
234
Self::from_local_path(Path::new(uri))
235
}
236
237
pub fn into_owned(self) -> PlPath {
238
match self {
239
Self::Local(p) => PlPath::Local(p.into()),
240
Self::Cloud(p) => PlPath::Cloud(p.into_owned()),
241
}
242
}
243
244
pub fn strip_scheme(&self) -> &str {
245
match self {
246
Self::Local(p) => p.to_str().unwrap(),
247
Self::Cloud(p) => p.strip_scheme(),
248
}
249
}
250
251
pub fn parent(&self) -> Option<Self> {
252
Some(match self {
253
Self::Local(p) => Self::Local(p.parent()?),
254
Self::Cloud(p) => {
255
let uri = p.uri;
256
let offset_start = p.scheme.as_str().len() + 3;
257
let last_slash = uri[offset_start..]
258
.char_indices()
259
.rev()
260
.find(|(_, c)| *c == '/')?
261
.0;
262
let uri = &uri[..offset_start + last_slash];
263
264
Self::Cloud(PlCloudPathRef {
265
scheme: p.scheme,
266
uri,
267
})
268
},
269
})
270
}
271
272
pub fn extension(&self) -> Option<&str> {
273
match self {
274
Self::Local(path) => path.extension().and_then(|e| e.to_str()),
275
Self::Cloud(_) => {
276
let offset_path = self.strip_scheme();
277
let separator = '/';
278
279
let mut ext_start = None;
280
for (i, c) in offset_path.char_indices() {
281
if c == separator {
282
ext_start = None;
283
}
284
285
if c == '.' && ext_start.is_none() {
286
ext_start = Some(i);
287
}
288
}
289
290
ext_start.map(|i| &offset_path[i + 1..])
291
},
292
}
293
}
294
295
pub fn to_str(&self) -> &'a str {
296
match self {
297
Self::Local(p) => p.to_str().unwrap(),
298
Self::Cloud(p) => p.uri,
299
}
300
}
301
302
// It is up to the caller to ensure that the offset parameter 'n' matches
303
// a valid path segment starting index
304
pub fn offset_bytes(&'a self, n: usize) -> PathBuf {
305
let s = self.to_str();
306
if let Some(scheme) = self.scheme()
307
&& n > 0
308
{
309
debug_assert!(n >= scheme.as_str().len())
310
}
311
PathBuf::from(&s[n..])
312
}
313
}
314
315
impl PlPath {
316
pub fn new(uri: &str) -> Self {
317
PlPathRef::new(uri).into_owned()
318
}
319
320
pub fn display(&self) -> AddressDisplay<'_> {
321
AddressDisplay {
322
addr: match self {
323
Self::Local(p) => PlPathRef::Local(p.as_ref()),
324
Self::Cloud(p) => PlPathRef::Cloud(p.as_ref()),
325
},
326
}
327
}
328
329
pub fn is_local(&self) -> bool {
330
self.as_ref().is_local()
331
}
332
333
pub fn is_cloud_url(&self) -> bool {
334
self.as_ref().is_cloud_url()
335
}
336
337
// We don't want FromStr since we are infallible.
338
#[expect(clippy::should_implement_trait)]
339
pub fn from_str(uri: &str) -> Self {
340
Self::new(uri)
341
}
342
343
pub fn from_string(uri: String) -> Self {
344
Self::new(&uri)
345
}
346
347
pub fn as_ref(&self) -> PlPathRef<'_> {
348
match self {
349
Self::Local(p) => PlPathRef::Local(p.as_ref()),
350
Self::Cloud(p) => PlPathRef::Cloud(p.as_ref()),
351
}
352
}
353
354
pub fn cloud_scheme(&self) -> Option<CloudScheme> {
355
match self {
356
Self::Local(_) => None,
357
Self::Cloud(p) => Some(p.scheme),
358
}
359
}
360
361
pub fn to_str(&self) -> &str {
362
match self {
363
Self::Local(p) => p.to_str().unwrap(),
364
Self::Cloud(p) => p.uri.as_ref(),
365
}
366
}
367
368
pub fn into_local_path(self) -> Option<Arc<Path>> {
369
match self {
370
PlPath::Local(path) => Some(path),
371
PlPath::Cloud(_) => None,
372
}
373
}
374
}
375
376
#[cfg(test)]
mod tests {
    use super::*;

    /// Rewrites `/` into the platform's main separator so the cases below can be written once.
    fn to_native(path: &str) -> String {
        path.replace('/', std::path::MAIN_SEPARATOR_STR)
    }

    /// Checks `base.join(added)` twice: once as a local path (expecting `local_expected`) and
    /// once with `base` behind a `file://` prefix (expecting `file://` + `uri_expected`).
    #[track_caller]
    fn check_join(base: &str, added: &str, local_expected: &str, uri_expected: &str) {
        // Normal path test
        let joined = PlPath::new(&to_native(base))
            .as_ref()
            .join(to_native(added));
        assert_eq!(joined.to_str(), to_native(local_expected));

        // URI path test
        let uri_base = format!("file://{base}");
        let joined = PlPath::new(uri_base.as_str()).as_ref().join(added);
        assert_eq!(joined.to_str(), format!("file://{uri_expected}"));
    }

    #[test]
    fn plpath_join() {
        // `base + added => expected`, optionally followed by a different expectation for the
        // URI form of the same join.
        macro_rules! assert_plpath_join {
            ($base:literal + $added:literal => $result:literal) => {
                check_join($base, $added, $result, $result)
            };
            ($base:literal + $added:literal => $result:literal, $uri_result:literal) => {
                check_join($base, $added, $result, $uri_result)
            };
        }

        assert_plpath_join!("a/b/c/" + "d/e" => "a/b/c/d/e");
        assert_plpath_join!("a/b/c" + "d/e" => "a/b/c/d/e");
        assert_plpath_join!("a/b/c" + "d/e/" => "a/b/c/d/e/");
        assert_plpath_join!("a/b/c" + "" => "a/b/c");
        assert_plpath_join!("a/b/c" + "/d" => "/d", "a/b/c/d");
        assert_plpath_join!("a/b/c" + "/d/" => "/d/", "a/b/c/d/");
        assert_plpath_join!("" + "/d/" => "/d/");
        assert_plpath_join!("/" + "/d/" => "/d/", "//d/");
        assert_plpath_join!("/x/y" + "/d/" => "/d/", "/x/y/d/");
        assert_plpath_join!("/x/y" + "/d" => "/d", "/x/y/d");
        assert_plpath_join!("/x/y" + "d" => "/x/y/d");

        assert_plpath_join!("/a/longer" + "path" => "/a/longer/path");
        assert_plpath_join!("/a/longer" + "/path" => "/path", "/a/longer/path");
        assert_plpath_join!("/a/longer" + "path/wow" => "/a/longer/path/wow");
        assert_plpath_join!("/a/longer" + "/path/wow" => "/path/wow", "/a/longer/path/wow");
        assert_plpath_join!("/an/even/longer" + "path" => "/an/even/longer/path");
        assert_plpath_join!("/an/even/longer" + "/path" => "/path", "/an/even/longer/path");
        assert_plpath_join!("/an/even/longer" + "path/wow" => "/an/even/longer/path/wow");
        assert_plpath_join!("/an/even/longer" + "/path/wow" => "/path/wow", "/an/even/longer/path/wow");
    }
}
440
441