15
15
// specific language governing permissions and limitations
16
16
// under the License.
17
17
18
- use std:: fs ;
18
+ use std:: path :: { Component , PathBuf } ;
19
19
20
20
use crate :: datasource:: object_store:: ObjectStoreUrl ;
21
21
use crate :: execution:: context:: SessionState ;
@@ -46,37 +46,49 @@ pub struct ListingTableUrl {
46
46
impl ListingTableUrl {
47
47
/// Parse a provided string as a `ListingTableUrl`
48
48
///
49
+ /// A URL can either refer to a single object, or a collection of objects with a
50
+ /// common prefix, with the presence of a trailing `/` indicating a collection.
51
+ ///
52
+ /// For example, `file:///foo.txt` refers to the file at `/foo.txt`, whereas
53
+ /// `file:///foo/` refers to all the files under the directory `/foo` and its
54
+ /// subdirectories.
55
+ ///
56
+ /// Similarly `s3://BUCKET/blob.csv` refers to `blob.csv` in the S3 bucket `BUCKET`,
57
+ /// whereas `s3://BUCKET/foo/` refers to all objects with the prefix `foo/` in the
58
+ /// S3 bucket `BUCKET`.
59
+ ///
49
60
/// # Paths without a Scheme
50
61
///
51
62
/// If no scheme is provided, or the string is an absolute filesystem path
52
- /// as determined [`std::path::Path::is_absolute`], the string will be
63
+ /// as determined by [`std::path::Path::is_absolute`], the string will be
53
64
/// interpreted as a path on the local filesystem using the operating
54
65
/// system's standard path delimiter, i.e. `\` on Windows, `/` on Unix.
55
66
///
56
67
/// If the path contains any of `'?', '*', '['`, it will be considered
57
68
/// a glob expression and resolved as described in the section below.
58
69
///
59
- /// Otherwise, the path will be resolved to an absolute path, returning
60
- /// an error if it does not exist , and converted to a [file URI]
70
+ /// Otherwise, the path will be resolved to an absolute path based on the current
71
+ /// working directory , and converted to a [file URI].
61
72
///
62
- /// If you wish to specify a path that does not exist on the local
63
- /// machine you must provide it as a fully-qualified [file URI]
64
- /// e.g. `file:///myfile.txt`
73
+ /// If the path already exists in the local filesystem this will be used to determine if this
74
+ /// [`ListingTableUrl`] refers to a collection or a single object, otherwise the presence
75
+ /// of a trailing path delimiter will be used to indicate a directory. For the avoidance
76
+ /// of ambiguity it is recommended users always include trailing `/` when intending to
77
+ /// refer to a directory.
65
78
///
66
79
/// ## Glob File Paths
67
80
///
68
81
/// If no scheme is provided, and the path contains a glob expression, it will
69
82
/// be resolved as follows.
70
83
///
71
84
/// The string up to the first path segment containing a glob expression will be extracted,
72
- /// and resolved in the same manner as a normal scheme-less path. That is, resolved to
73
- /// an absolute path on the local filesystem, returning an error if it does not exist,
74
- /// and converted to a [file URI]
85
+ /// and resolved in the same manner as a normal scheme-less path above.
75
86
///
76
87
/// The remaining string will be interpreted as a [`glob::Pattern`] and used as a
77
88
/// filter when listing files from object storage
78
89
///
79
90
/// [file URI]: https://en.wikipedia.org/wiki/File_URI_scheme
91
+ /// [URL]: https://url.spec.whatwg.org/
80
92
pub fn parse ( s : impl AsRef < str > ) -> Result < Self > {
81
93
let s = s. as_ref ( ) ;
82
94
@@ -92,32 +104,6 @@ impl ListingTableUrl {
92
104
}
93
105
}
94
106
95
- /// Get object store for specified input_url
96
- /// if input_url is actually not a url, we assume it is a local file path
97
- /// if we have a local path, create it if not exists so ListingTableUrl::parse works
98
- pub fn parse_create_local_if_not_exists (
99
- s : impl AsRef < str > ,
100
- is_directory : bool ,
101
- ) -> Result < Self > {
102
- let s = s. as_ref ( ) ;
103
- let is_valid_url = Url :: parse ( s) . is_ok ( ) ;
104
-
105
- match is_valid_url {
106
- true => ListingTableUrl :: parse ( s) ,
107
- false => {
108
- let path = std:: path:: PathBuf :: from ( s) ;
109
- if !path. exists ( ) {
110
- if is_directory {
111
- fs:: create_dir_all ( path) ?;
112
- } else {
113
- fs:: File :: create ( path) ?;
114
- }
115
- }
116
- ListingTableUrl :: parse ( s)
117
- }
118
- }
119
- }
120
-
121
107
/// Creates a new [`ListingTableUrl`] interpreting `s` as a filesystem path
122
108
fn parse_path ( s : & str ) -> Result < Self > {
123
109
let ( prefix, glob) = match split_glob_expression ( s) {
@@ -129,15 +115,9 @@ impl ListingTableUrl {
129
115
None => ( s, None ) ,
130
116
} ;
131
117
132
- let path = std:: path:: Path :: new ( prefix) . canonicalize ( ) ?;
133
- let url = if path. is_dir ( ) {
134
- Url :: from_directory_path ( path)
135
- } else {
136
- Url :: from_file_path ( path)
137
- }
138
- . map_err ( |_| DataFusionError :: Internal ( format ! ( "Can not open path: {s}" ) ) ) ?;
139
- // TODO: Currently we do not have an IO-related error variant that accepts ()
140
- // or a string. Once we have such a variant, change the error type above.
118
+ let url = url_from_path ( prefix) . ok_or_else ( || {
119
+ DataFusionError :: Internal ( format ! ( "Can not open path: {s}" ) )
120
+ } ) ?;
141
121
Ok ( Self :: new ( url, glob) )
142
122
}
143
123
@@ -214,7 +194,12 @@ impl ListingTableUrl {
214
194
}
215
195
}
216
196
} ,
217
- false => futures:: stream:: once ( store. head ( & self . prefix ) ) . boxed ( ) ,
197
+ false => futures:: stream:: once ( store. head ( & self . prefix ) )
198
+ . filter ( |r| {
199
+ let p = !matches ! ( r, Err ( object_store:: Error :: NotFound { .. } ) ) ;
200
+ futures:: future:: ready ( p)
201
+ } )
202
+ . boxed ( ) ,
218
203
} ;
219
204
Ok ( list
220
205
. try_filter ( move |meta| {
@@ -257,6 +242,45 @@ impl std::fmt::Display for ListingTableUrl {
257
242
}
258
243
}
259
244
245
+ fn url_from_path ( s : & str ) -> Option < Url > {
246
+ let path = std:: path:: Path :: new ( s) ;
247
+ let is_dir = match path. exists ( ) {
248
+ true => path. is_dir ( ) ,
249
+ // Fallback to inferring from trailing separator
250
+ false => std:: path:: is_separator ( s. chars ( ) . last ( ) ?) ,
251
+ } ;
252
+
253
+ let p = match path. is_absolute ( ) {
254
+ true => resolve_path ( path) ?,
255
+ false => {
256
+ let absolute = std:: env:: current_dir ( ) . ok ( ) ?. join ( path) ;
257
+ resolve_path ( & absolute) ?
258
+ }
259
+ } ;
260
+
261
+ match is_dir {
262
+ true => Url :: from_directory_path ( p) . ok ( ) ,
263
+ false => Url :: from_file_path ( p) . ok ( ) ,
264
+ }
265
+ }
266
+
267
/// Lexically normalises `path`, dropping `.` segments and applying `..`
/// segments to the components accumulated so far.
///
/// The filesystem is never consulted. Returns `None` when a `..` segment has
/// nothing left to remove.
fn resolve_path(path: &std::path::Path) -> Option<PathBuf> {
    let capacity = path.as_os_str().len();
    path.components()
        .try_fold(PathBuf::with_capacity(capacity), |mut resolved, part| {
            match part {
                // `.` refers to the directory already accumulated
                Component::CurDir => {}
                Component::ParentDir => {
                    if !resolved.pop() {
                        return None;
                    }
                }
                Component::Normal(name) => resolved.push(name),
                Component::Prefix(_) | Component::RootDir => {
                    resolved.push(part.as_os_str())
                }
            }
            Some(resolved)
        })
}
283
+
260
284
/// Characters whose presence in a path marks it as a glob expression
const GLOB_START_CHARS : [ char ; 3 ] = [ '?' , '*' , '[' ] ;
261
285
262
286
/// Splits `path` at the first path segment containing a glob expression, returning
@@ -368,4 +392,25 @@ mod tests {
368
392
Some ( ( "/a/b/c//" , "alltypes_plain*.parquet" ) ) ,
369
393
) ;
370
394
}
395
+
396
#[test]
fn test_resolve_path() {
    // Inputs that stay below the root, paired with their expected resolution
    let resolvable = [
        ("/foo/bar/../baz.txt", "/foo/baz.txt"),
        ("/foo/bar/./baz.txt", "/foo/bar/baz.txt"),
    ];
    for (input, expected) in resolvable {
        let resolved = resolve_path(std::path::Path::new(input)).unwrap();
        assert_eq!(resolved.to_str().unwrap(), expected);
    }

    // More `..` segments than there are directories to leave
    let escaped = resolve_path(std::path::Path::new("/foo/bar/../../../baz.txt"));
    assert_eq!(escaped, None);
}
407
+
408
#[test]
fn test_url_from_path() {
    // The URL path must end with the input as given, including any trailing `/`
    for input in ["foo/bar", "foo/bar/"] {
        let url = url_from_path(input).unwrap();
        assert!(url.path().ends_with(input));
    }
}
371
416
}
0 commit comments