Turi Create  4.0
fs_utils.hpp
1 /* Copyright © 2017 Apple Inc. All rights reserved.
2  *
3  * Use of this source code is governed by a BSD-3-clause license that can
4  * be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
5  */
6 /**
7  * This file implements a collection of routines that operate and behave
8  * uniformly on all supported protocols. (currently, HDFS, S3, local fs)
9  */
10 #ifndef TURI_FILEIO_FS_UTILS_HPP
11 #define TURI_FILEIO_FS_UTILS_HPP
12 #include <tuple>
13 #include <string>
14 #include <vector>
15 #include <core/storage/fileio/sanitize_url.hpp>
16 
17 namespace turi {
18 namespace fileio {
19 
20 /**
21  * \ingroup fileio
22  * Return values of \ref get_file_status. Also accepted as the optional status hint of \ref delete_path and \ref delete_path_impl, where FS_UNAVAILABLE is the default argument.
23  */
24 enum class file_status {
25  MISSING, REGULAR_FILE, DIRECTORY, FS_UNAVAILABLE
26 };
27 
28 /**
29  * \ingroup fileio
30  * Checks a path (can be hdfs, s3, or regular) and reports its status as a
31  * \ref file_status value (the first value of the returned pair).
32  *
33  * If the file is missing, the second value of the returned pair is the error message, if one is provided.
34  */
35 std::pair<file_status, std::string> get_file_status(const std::string& path);
36 
37 
38 /**
39  * \ingroup fileio
40  * Enumerates the contents of a directory, listing all the files as well as
41  * the file type. Path can be hdfs, s3, or regular filesystem.
42  */
43 std::vector<std::pair<std::string, file_status>> get_directory_listing(const std::string& path);
44 
45 
46 /**
47  * \ingroup fileio
48  * Creates a directory and all parent required directories (like mkdir -p).
49  * Path can be hdfs, s3, or regular filesystem.
50  * Returns true on creation, false on failure or if the directory already exists.
51  * To get meaningful error messages thrown on failure, use create_directory_or_throw.
52  */
53 bool create_directory(const std::string& path);
54 
55 
56 /**
57  * \ingroup fileio
58  * Creates a directory and all parent required directories (like mkdir -p).
59  * Path can be hdfs, s3, or regular filesystem.
60  * Returns true on creation, false if the directory already exists.
61  * Throws std::ios_base::failure on failure.
62  */
63 bool create_directory_or_throw(const std::string& path);
64 
65 /**
66  * \ingroup fileio
67  * Tries to delete a given path. Path can be hdfs, s3, or regular filesystem.
68  * If the path is a directory, then tries to remove all files under the directory.
69  * If the path is a file, then the file will be deleted immediately if it
70  * is not currently in use; otherwise the file is to be removed
71  * later, when it is no longer used by anyone.
72  * If the path doesn't exist, this returns true.
73  * Returns true on success, false on failure.
74  *
75  * \param path The path to delete
76  * \param status The file status if known. (Optional)
77  */
78 bool delete_path(const std::string& path,
79  file_status status = file_status::FS_UNAVAILABLE);
80 
81 /**
82  * \ingroup fileio
83  * \internal
84  * Deletes a path.
85  * Internal function not meant to be called by external components
86  *
87  * If the path is a directory, then tries to remove all files under the directory.
88  * If the path is a file, then the file will be removed.
89  * If the path doesn't exist, this returns true.
90  * Returns true on success, false on failure.
91  *
92  * \param path The path to delete
93  * \param status The file status if known. (Optional)
94  */
95 bool delete_path_impl(const std::string& path,
96  file_status status = file_status::FS_UNAVAILABLE);
97 
98 /**
99  * \ingroup fileio
100  * Deletes a path. If the path is a directory, deletion will delete
101  * all files and directories it contains.
102  * Path can be hdfs, s3, or regular filesystem.
103  * If path doesn't exist, this returns true.
104  * Returns true on success, false on failure.
105  */
106 bool delete_path_recursive(const std::string& path);
107 
108 
109 
110 // this is slightly out of place at the moment.
111 
112 
113 /**
114  * \ingroup fileio
115  * A helper function to parse the hdfs url.
116  * Return a tuple of host, port, and path.
117  */
118 std::tuple<std::string, std::string, std::string> parse_hdfs_url(std::string url);
119 
120 
121 /**
122  * Returns true if the protocol is writable (S3, HDFS, cache, and local
123  * filesystem); returns false otherwise.
124  */
125 bool is_writable_protocol(std::string protocol);
126 /**
127  * Returns true if the protocol is a protocol we will make curl handle.
128  */
129 bool is_web_protocol(std::string protocol);
130 
131 /**
132  * Returns the protocol header. (everything before the ://).
133  *
134  * local file can have protocol "file" or ""
135  *
136  * get_protocol("http://www.google.com") == "http"
137  * get_protocol("s3://www.google.com") == "s3"
138  * get_protocol("/root/test") == ""
139  * get_protocol("file:///root/test") == "file"
140  */
141 std::string get_protocol(std::string path);
142 
143 /**
144  * \ingroup fileio
145  * Returns the path with the protocol header removed, if there is one.
146  *
147  * remove_protocol("http://www.google.com") == "www.google.com"
148  * remove_protocol("s3://www.google.com") == "www.google.com"
149  * remove_protocol("/root/test") == "/root/test"
150  * remove_protocol("file:///root/test") == "/root/test"
151  */
152 std::string remove_protocol(std::string path);
153 
154 /**
155  * \ingroup fileio
156  * Extracts the file name from a fully qualified path.
157  * So given: s3://bucket/data/123
158  * This will return "123".
159  *
160  * In short, this will return everything to the right of the last trailing "/".
161  */
162 std::string get_filename(std::string path);
163 
164 /**
165  * \ingroup fileio
166  * Extracts the directory name from a fully qualified path.
167  * So given: s3://bucket/data/123
168  * This will return "s3://bucket/data"
169  *
170  * In short, this will return everything to the left of the last trailing "/".
171  */
172 std::string get_dirname(std::string path);
173 
174 /**
175  * \ingroup fileio
176  * Converts the path to a generic format for operation.
177  *
178  * Currently, all this means is that backslashes are converted to forward slashes.
179  */
180 std::string convert_to_generic(const std::string &path);
181 
182 /**
183  * \ingroup fileio
184  * Given a root directory and an absolute path, tries to create a relative path
185  * address between root_directory and the path; if not possible, returns
186  * the original path with no changes.
187  *
188  * This function is relatively limited. It will not add "../" structures to the
189  * returned relative path. In other words, the path must point to a file/folder
190  * inside of the root directory for this to return a relative path.
191  *
192  * Example:
193  * make_relative_path("s3://bucket/data", "s3://bucket/data/123")
194  * returns "123".
195  *
196  * make_relative_path("s3://bucket/data", "s3://foo/123")
197  * returns "s3://foo/123".
198  */
199 std::string make_relative_path(std::string root_directory, std::string path);
200 
201 /**
202  * \ingroup fileio
203  * Given a root directory and a relative path, tries to convert the relative
204  * path to an absolute path. If the path is already an absolute path, returns
205  * the original path with no changes.
206  *
207  * This function is limited. It will not handle "../" structures in
208  * the given relative path. In other words, the output path must point
209  * to a file/folder inside of the root directory.
210  *
211  * Example:
212  * make_absolute_path("s3://bucket/data", "123")
213  * returns "s3://bucket/data/123".
214  *
215  * make_absolute_path("s3://bucket/data", "s3://foo/123")
216  * returns "s3://foo/123".
217  */
218 std::string make_absolute_path(std::string root_directory, std::string path);
219 
220 std::pair<std::string, std::string> split_path_elements(
221  const std::string& url, file_status& status);
222 
223 /**
224  * \ingroup fileio
225  * Given a URL that is a glob of the form directory1/directory2/[glob]
226  * (glob must only be on the file portion), returns a list of files matching
227  * the glob pattern.
228  */
229 std::vector<std::pair<std::string, file_status>> get_glob_files(
230  const std::string& url);
231 
232 
233 /**
234  * \ingroup fileio
235  * Given a URL, returns an ID value where URLs which return different ID values
236  * are ok to be read in parallel, and URLs which return the same ID value
237  * are probably sub-optimal if read in parallel. An ID of (size_t)(-1) indicates
238  * that the URL can be read in parallel with everything.
239  */
240 size_t get_io_parallelism_id(const std::string url);
241 
242 /**
243  * \ingroup fileio
244  * Returns true if the file can be opened. False otherwise.
245  */
246 bool try_to_open_file(const std::string url);
247 
248 /**
249  * \ingroup fileio
250  * Copies a file from src to dest
251  */
252 void copy(const std::string src, const std::string dest);
253 
254 /**
255  * \ingroup fileio
256  * Changes the file mode bits of the given file or directory in the url
257  */
258 bool change_file_mode(const std::string path, short mode);
259 
260 /**
261  * \ingroup fileio
262  * Returns the canonical absolute path, eliminating dots and symlinks.
263  */
264 std::string make_canonical_path(const std::string& path);
265 
266 } // namespace fileio
267 } // namespace turi
268 #endif
bool delete_path_recursive(const std::string &path)
bool delete_path_impl(const std::string &path, file_status status=file_status::FS_UNAVAILABLE)
std::string convert_to_generic(const std::string &path)
bool delete_path(const std::string &path, file_status status=file_status::FS_UNAVAILABLE)
bool create_directory(const std::string &path)
bool create_directory_or_throw(const std::string &path)
std::tuple< std::string, std::string, std::string > parse_hdfs_url(std::string url)
bool try_to_open_file(const std::string url)
bool change_file_mode(const std::string path, short mode)
std::string make_canonical_path(const std::string &path)
std::string remove_protocol(std::string path)
std::pair< file_status, std::string > get_file_status(const std::string &path)
size_t get_io_parallelism_id(const std::string url)
std::vector< std::pair< std::string, file_status > > get_directory_listing(const std::string &path)
std::string get_dirname(std::string path)
std::vector< std::pair< std::string, file_status > > get_glob_files(const std::string &url)
std::string make_relative_path(std::string root_directory, std::string path)
void copy(Iterator begin, Iterator end, SWriter &&writer)
Definition: algorithm.hpp:416
std::string make_absolute_path(std::string root_directory, std::string path)
std::string get_filename(std::string path)