// Source: GitHub repository pola-rs/polars, path blob/main/crates/polars-utils/src/pl_path.rs
use std::borrow::{Borrow, Cow};
use std::ffi::OsStr;
use std::fmt::Display;
use std::ops::{Deref, Range};
use std::path::{Path, PathBuf};

use polars_error::{PolarsResult, polars_err};

use crate::format_pl_refstr;
use crate::pl_str::PlRefStr;
/// Prefix (`\\?\`) that Windows paths can carry.
///
/// See
/// <https://learn.microsoft.com/en-us/windows/win32/fileio/maximum-file-path-limitation?tabs=registry>
pub const WINDOWS_EXTPATH_PREFIX: &str = r#"\\?\"#;
/// Path represented as a UTF-8 string.
///
/// Equality and ordering are based on the string value, which can be sensitive to duplicate
/// separators. `as_std_path()` can be used to return a `&std::path::Path` for comparisons / API
/// access.
///
/// This is an unsized type that is `repr(transparent)` over `str`, so it is only ever handled
/// behind a pointer (e.g. `&PlPath`, `Box<PlPath>`).
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[repr(transparent)]
pub struct PlPath {
    /// The raw path text.
    inner: str,
}
27
#[derive(Default, Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
28
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
29
#[cfg_attr(feature = "dsl-schema", derive(schemars::JsonSchema))]
30
/// Reference-counted [`PlPath`].
31
///
32
/// # Windows paths invariant
33
/// Windows paths will have leading `\\?\` prefix stripped, and all backslashes normalized to
34
/// forward slashes.
35
pub struct PlRefPath {
36
inner: PlRefStr,
37
}
38
39
impl PlPath {
40
// Note: Do not expose the following constructors, they do not normalize paths.
41
fn _new<S: AsRef<str> + ?Sized>(s: &S) -> &PlPath {
42
let s: &str = s.as_ref();
43
// Safety: `PlPath` is `repr(transparent)` on `str`.
44
unsafe { &*(s as *const str as *const PlPath) }
45
}
46
47
fn _try_from_path(path: &Path) -> PolarsResult<&PlPath> {
48
path.to_str()
49
.ok_or_else(|| polars_err!(non_utf8_path))
50
.map(Self::_new)
51
}
52
53
pub fn as_str(&self) -> &str {
54
unsafe { &*(self as *const PlPath as *const str) }
55
}
56
57
pub fn as_bytes(&self) -> &[u8] {
58
self.as_str().as_bytes()
59
}
60
61
pub fn as_os_str(&self) -> &OsStr {
62
OsStr::new(self)
63
}
64
65
pub fn as_std_path(&self) -> &Path {
66
Path::new(self)
67
}
68
69
pub fn to_ref_path(&self) -> PlRefPath {
70
PlRefPath::_new_no_normalize(self.as_str().into())
71
}
72
73
pub fn scheme(&self) -> Option<CloudScheme> {
74
CloudScheme::from_path(self.as_str())
75
}
76
77
/// Shorthand for `self.scheme().is_some()`.
78
pub fn has_scheme(&self) -> bool {
79
self.scheme().is_some()
80
}
81
82
/// Return a string with the scheme prefix removed (if any).
83
pub fn strip_scheme(&self) -> &str {
84
&self.as_str()[self.scheme().map_or(0, |x| x.strip_scheme_index())..self.inner.len()]
85
}
86
87
pub fn file_name(&self) -> Option<&OsStr> {
88
Path::new(self.strip_scheme()).file_name()
89
}
90
91
pub fn extension(&self) -> Option<&str> {
92
Path::new(self.strip_scheme())
93
.extension()
94
.map(|x| x.to_str().unwrap())
95
}
96
97
pub fn parent(&self) -> Option<&str> {
98
Path::new(self.strip_scheme())
99
.parent()
100
.map(|x| x.to_str().unwrap())
101
}
102
103
/// Slices the path.
104
pub fn sliced(&self, range: Range<usize>) -> &PlPath {
105
Self::_new(&self.as_str()[range])
106
}
107
108
/// Strips the scheme, then returns the authority component, and the remaining
109
/// string after the authority component. This can be understood as extracting
110
/// the bucket/prefix for cloud URIs.
111
///
112
/// E.g. `https://user@host:port/dir/file?param=value`
113
/// * Authority: `user@host:port`
114
/// * Remaining: `/dir/file?param=value`
115
///
116
/// Note, for local / `file:` URIs, the returned authority will be empty, and
117
/// the remainder will be the full URI.
118
///
119
/// # Returns
120
/// (authority, remaining).
121
pub fn strip_scheme_split_authority(&self) -> Option<(&'_ str, &'_ str)> {
122
match self.scheme() {
123
None | Some(CloudScheme::FileNoHostname) => Some(("", self.strip_scheme())),
124
Some(scheme) => {
125
let path_str = self.as_str();
126
let position = self.authority_end_position();
127
128
if position < path_str.len() {
129
assert!(path_str[position..].starts_with('/'));
130
}
131
132
(position < path_str.len()).then_some((
133
&path_str[scheme.strip_scheme_index()..position],
134
&path_str[position..],
135
))
136
},
137
}
138
}
139
140
/// Returns 0 if `self.scheme()` is `None`. Otherwise, returns `i` such that
141
/// `&self.to_str()[..i]` trims to the authority.
142
/// * If there is no '/', separator found, `i` will simply be the length of the string.
143
/// * This is except if the scheme is `FileNoHostname`, where instead `i` will be "file:".len()
144
/// * If `self` has no `CloudScheme`, returns 0
145
pub fn authority_end_position(&self) -> usize {
146
match self.scheme() {
147
None => 0,
148
Some(scheme @ CloudScheme::FileNoHostname) => scheme.strip_scheme_index(),
149
Some(_) => {
150
let after_scheme = self.strip_scheme();
151
let offset = self.as_str().len() - after_scheme.len();
152
153
offset + after_scheme.find('/').unwrap_or(after_scheme.len())
154
},
155
}
156
}
157
158
pub fn to_absolute_path(&self) -> PolarsResult<PlRefPath> {
159
PlRefPath::try_from_pathbuf(std::path::absolute(Path::new(self.strip_scheme()))?)
160
}
161
162
pub fn join(&self, other: impl AsRef<str>) -> PlRefPath {
163
let other = other.as_ref();
164
165
if CloudScheme::from_path(other).is_some() {
166
PlRefPath::new(other)
167
} else {
168
PlRefPath::try_from_pathbuf(self.as_std_path().join(other)).unwrap()
169
}
170
}
171
172
/// Converts backslashes to forward-slashes, and removes `\\?\` prefix.
173
pub fn normalize_windows_path(path_str: &str) -> Option<PlRefPath> {
174
let has_extpath_prefix = path_str.starts_with(WINDOWS_EXTPATH_PREFIX);
175
176
if has_extpath_prefix || cfg!(target_family = "windows") {
177
let path_str = path_str
178
.strip_prefix(WINDOWS_EXTPATH_PREFIX)
179
.unwrap_or(path_str);
180
181
if matches!(
182
CloudScheme::from_path(path_str),
183
None | Some(CloudScheme::File | CloudScheme::FileNoHostname)
184
) && path_str.contains('\\')
185
{
186
let new_path = path_str.replace('\\', "/");
187
let inner = PlRefStr::from_string(new_path);
188
return Some(PlRefPath { inner });
189
}
190
}
191
192
None
193
}
194
}
195
196
impl AsRef<str> for PlPath {
197
fn as_ref(&self) -> &str {
198
self.as_str()
199
}
200
}
201
202
impl AsRef<OsStr> for PlPath {
203
fn as_ref(&self) -> &OsStr {
204
OsStr::new(self.as_str())
205
}
206
}
207
208
impl AsRef<Path> for PlPath {
209
fn as_ref(&self) -> &Path {
210
self.as_std_path()
211
}
212
}
213
214
impl From<&PlPath> for Box<PlPath> {
215
fn from(value: &PlPath) -> Self {
216
let s: &str = value.as_str();
217
let s: Box<str> = s.into();
218
// Safety: `PlPath` is `repr(transparent)` on `str`.
219
let out: Box<PlPath> = unsafe { std::mem::transmute(s) };
220
out
221
}
222
}
223
224
impl Display for PlPath {
225
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
226
Display::fmt(self.as_str(), f)
227
}
228
}
229
230
impl PlRefPath {
231
pub fn empty() -> Self {
232
Self::default()
233
}
234
235
/// Normalizes Windows paths.
236
pub fn new(path: impl AsRef<str> + Into<PlRefStr>) -> Self {
237
if let Some(path) = PlPath::normalize_windows_path(path.as_ref()) {
238
return path;
239
}
240
241
Self::_new_no_normalize(path.into())
242
}
243
244
const fn _new_no_normalize(path: PlRefStr) -> Self {
245
Self { inner: path }
246
}
247
248
pub fn try_from_path(path: &Path) -> PolarsResult<PlRefPath> {
249
Ok(Self::new(PlPath::_try_from_path(path)?.as_str()))
250
}
251
252
pub fn try_from_pathbuf(path: PathBuf) -> PolarsResult<PlRefPath> {
253
Self::try_from_path(&path)
254
}
255
256
pub fn as_str(&self) -> &str {
257
&self.inner
258
}
259
260
pub fn as_ref_str(&self) -> &PlRefStr {
261
&self.inner
262
}
263
264
pub fn into_ref_str(self) -> PlRefStr {
265
self.inner
266
}
267
268
/// Slices the path.
269
pub fn sliced(&self, range: Range<usize>) -> PlRefPath {
270
if range == (0..self.as_str().len()) {
271
self.clone()
272
} else {
273
Self::_new_no_normalize(PlPath::sliced(self, range).as_str().into())
274
}
275
}
276
277
/// # Returns
278
/// Returns an absolute local path if this path ref is a relative local path, otherwise returns None.
279
pub fn to_absolute_path(&self) -> PolarsResult<Cow<'_, PlRefPath>> {
280
Ok(if self.has_scheme() || self.as_std_path().is_absolute() {
281
Cow::Borrowed(self)
282
} else {
283
Cow::Owned(PlPath::to_absolute_path(self)?)
284
})
285
}
286
287
/// Checks if references point to the same allocation.
288
pub fn ptr_eq(this: &Self, other: &Self) -> bool {
289
PlRefStr::ptr_eq(this.as_ref_str(), other.as_ref_str())
290
}
291
}
292
293
impl AsRef<str> for PlRefPath {
294
fn as_ref(&self) -> &str {
295
self.as_str()
296
}
297
}
298
299
impl AsRef<OsStr> for PlRefPath {
300
fn as_ref(&self) -> &OsStr {
301
self.as_os_str()
302
}
303
}
304
305
impl AsRef<Path> for PlRefPath {
306
fn as_ref(&self) -> &Path {
307
self.as_std_path()
308
}
309
}
310
311
impl Deref for PlRefPath {
312
type Target = PlPath;
313
314
fn deref(&self) -> &Self::Target {
315
PlPath::_new(self)
316
}
317
}
318
319
impl Display for PlRefPath {
320
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
321
Display::fmt(self.as_str(), f)
322
}
323
}
324
325
impl ToOwned for PlPath {
326
type Owned = PlRefPath;
327
328
fn to_owned(&self) -> Self::Owned {
329
self.to_ref_path()
330
}
331
}
332
333
impl Borrow<PlPath> for PlRefPath {
334
fn borrow(&self) -> &PlPath {
335
self
336
}
337
}
338
339
impl From<&str> for PlRefPath {
340
fn from(value: &str) -> Self {
341
Self::new(value)
342
}
343
}
344
/// Generates the `CloudScheme` enum together with the mapping between each variant and its
/// scheme string.
///
/// Several variants may share the same string (e.g. `File` / `FileNoHostname`); when parsing,
/// the first listed variant wins.
macro_rules! impl_cloud_scheme {
    ($($t:ident = $n:literal,)+) => {
        #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)]
        #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
        #[cfg_attr(feature = "dsl-schema", derive(schemars::JsonSchema))]
        pub enum CloudScheme {
            $($t,)+
        }

        impl CloudScheme {
            /// Note, private function. Users should use [`CloudScheme::from_path`], that will handle e.g.
            /// `file:/` without hostname properly.
            // Duplicate scheme strings make the later arms unreachable by design.
            #[expect(unreachable_patterns)]
            fn from_scheme_str(s: &str) -> Option<Self> {
                Some(match s {
                    $($n => Self::$t,)+
                    _ => return None,
                })
            }

            /// Returns the scheme string of this variant, without the `://` separator.
            pub const fn as_str(&self) -> &'static str {
                match self {
                    $(Self::$t => $n,)+
                }
            }
        }
    };
}

impl_cloud_scheme! {
    Abfs = "abfs",
    Abfss = "abfss",
    Adl = "adl",
    Az = "az",
    Azure = "azure",
    File = "file",
    FileNoHostname = "file",
    Gcs = "gcs",
    Gs = "gs",
    Hf = "hf",
    Http = "http",
    Https = "https",
    S3 = "s3",
    S3a = "s3a",
}

impl CloudScheme {
    /// Detects the scheme at the start of `path`.
    ///
    /// * `file://...` is `File`; any other string starting with `file:` is `FileNoHostname`.
    /// * Otherwise the text before the first `://` must be one of the known scheme strings.
    ///
    /// Returns `None` if no known scheme is found.
    pub fn from_path(path: &str) -> Option<Self> {
        if let Some(stripped) = path.strip_prefix("file:") {
            return Some(if stripped.starts_with("//") {
                Self::File
            } else {
                Self::FileNoHostname
            });
        }

        let separator = path.find("://")?;
        Self::from_scheme_str(&path[..separator])
    }

    /// Returns `i` such that `&path[i..]` strips the scheme from a path that has this scheme,
    /// as well as the `://` if it exists.
    pub fn strip_scheme_index(&self) -> usize {
        if let Self::FileNoHostname = self {
            // Only the `file:` part is a prefix; there is no `//`.
            "file:".len()
        } else {
            self.as_str().len() + "://".len()
        }
    }
}
415
impl Display for CloudScheme {
416
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
417
Display::fmt(self.as_str(), f)
418
}
419
}
420
421
/// Formats a local path to begin with `file:///`.
422
///
423
/// # Panics
424
/// May panic if `absolute_local_path` is not an absolute local path.
425
pub fn format_file_uri(absolute_local_path: &str) -> PlRefPath {
426
// Windows needs an extra slash, i.e.:
427
//
428
// # Windows
429
// Absolute path: "C:/Windows/system32"
430
// Formatted: "file:///C:/Windows/system32"
431
//
432
// # Unix
433
// Absolute path: "/root/.vimrc"
434
// Formatted: "file:///root/.vimrc"
435
if cfg!(target_family = "windows") || absolute_local_path.starts_with(WINDOWS_EXTPATH_PREFIX) {
436
if let Some(path) = PlPath::normalize_windows_path(absolute_local_path) {
437
PlRefPath::new(format_pl_refstr!("file:///{path}"))
438
} else {
439
PlRefPath::new(format_pl_refstr!("file:///{absolute_local_path}"))
440
}
441
} else {
442
PlRefPath::new(format_pl_refstr!("file://{absolute_local_path}"))
443
}
444
}
445
#[cfg(test)]
mod tests {
    use super::*;

    /// Scheme detection, scheme stripping and authority splitting for `file:` URIs, plus
    /// `\\?\` prefix normalization.
    #[test]
    fn test_plpath_file() {
        let p = PlRefPath::new("file:///home/user");
        assert_eq!(
            (
                p.scheme(),
                p.scheme().map(|x| x.as_str()),
                p.as_str(),
                p.strip_scheme(),
            ),
            (
                Some(CloudScheme::File),
                Some("file"),
                "file:///home/user",
                "/home/user"
            )
        );

        let p = PlRefPath::new("file:/home/user");
        assert_eq!(
            (
                p.scheme(),
                p.scheme().map(|x| x.as_str()),
                p.as_str(),
                p.strip_scheme(),
            ),
            (
                Some(CloudScheme::FileNoHostname),
                Some("file"),
                "file:/home/user",
                "/home/user"
            )
        );

        assert_eq!(PlRefPath::new("file://").scheme(), Some(CloudScheme::File));

        assert_eq!(
            PlRefPath::new("file://").strip_scheme_split_authority(),
            None
        );

        assert_eq!(
            PlRefPath::new("file:///").strip_scheme_split_authority(),
            Some(("", "/"))
        );

        assert_eq!(
            PlRefPath::new("file:///path").strip_scheme_split_authority(),
            Some(("", "/path"))
        );

        assert_eq!(
            PlRefPath::new("file://hostname:80/path").strip_scheme_split_authority(),
            Some(("hostname:80", "/path"))
        );

        assert_eq!(
            PlRefPath::new("file:").scheme(),
            Some(CloudScheme::FileNoHostname)
        );
        assert_eq!(
            PlRefPath::new("file:/").scheme(),
            Some(CloudScheme::FileNoHostname)
        );
        assert_eq!(
            PlRefPath::new("file:").strip_scheme_split_authority(),
            Some(("", ""))
        );
        assert_eq!(
            PlRefPath::new("file:/Local/path").strip_scheme_split_authority(),
            Some(("", "/Local/path"))
        );

        assert_eq!(
            PlRefPath::new(r#"\\?\C:\Windows\system32"#).as_str(),
            "C:/Windows/system32"
        );
    }

    /// Joining plain paths and `file://` URIs, including absolute right-hand sides.
    #[test]
    fn test_plpath_join() {
        assert_eq!(
            PlRefPath::new("s3://.../...").join("az://.../...").as_str(),
            "az://.../..."
        );

        fn _assert_plpath_join(base: &str, added: &str, expect: &str) {
            // Normal path test
            let expect = PlRefPath::new(expect);
            let base = base.replace('/', std::path::MAIN_SEPARATOR_STR);
            let added = added.replace('/', std::path::MAIN_SEPARATOR_STR);

            assert_eq!(PlRefPath::new(&base).join(&added), expect);

            // URI path test
            let uri_base = format_file_uri(&base);
            let expect_uri = if added.starts_with(std::path::MAIN_SEPARATOR_STR) {
                expect.clone()
            } else {
                format_file_uri(expect.as_str())
            };

            assert_eq!(PlRefPath::new(uri_base.as_str()).join(added), expect_uri);
        }

        macro_rules! assert_plpath_join {
            ($base:literal + $added:literal => $expect:literal) => {
                _assert_plpath_join($base, $added, $expect)
            };
        }

        assert_plpath_join!("a/b/c/" + "d/e" => "a/b/c/d/e");
        assert_plpath_join!("a/b/c" + "d/e" => "a/b/c/d/e");
        assert_plpath_join!("a/b/c" + "d/e/" => "a/b/c/d/e/");
        assert_plpath_join!("a/b/c" + "/d" => "/d");
        assert_plpath_join!("a/b/c" + "/d/" => "/d/");
        assert_plpath_join!("" + "/d/" => "/d/");
        assert_plpath_join!("/" + "/d/" => "/d/");
        assert_plpath_join!("/x/y" + "/d/" => "/d/");
        assert_plpath_join!("/x/y" + "/d" => "/d");
        assert_plpath_join!("/x/y" + "d" => "/x/y/d");

        assert_plpath_join!("/a/longer" + "path" => "/a/longer/path");
        assert_plpath_join!("/a/longer" + "/path" => "/path");
        assert_plpath_join!("/a/longer" + "path/test" => "/a/longer/path/test");
        assert_plpath_join!("/a/longer" + "/path/test" => "/path/test");
    }

    /// `file_name()` on cloud URIs, relative paths and empty inputs.
    #[test]
    fn test_plpath_name() {
        assert_eq!(PlRefPath::new("s3://...").file_name(), Some("...".as_ref()));
        assert_eq!(
            PlRefPath::new("a/b/file.parquet").file_name(),
            Some("file.parquet".as_ref())
        );
        assert_eq!(
            PlRefPath::new("file.parquet").file_name(),
            Some("file.parquet".as_ref())
        );

        assert_eq!(PlRefPath::new("s3://").file_name(), None);
        assert_eq!(PlRefPath::new("").file_name(), None);
    }
}