Turi Create  4.0
s3_api.hpp
1 /* Copyright © 2020 Apple Inc. All rights reserved.
2  *
3  * Use of this source code is governed by a BSD-3-clause license that can
4  * be found in the LICENSE.txt file or at
5  * https://opensource.org/licenses/BSD-3-Clause
6  */
7 #ifndef TURI_S3_UPLOADER_HPP
8 #define TURI_S3_UPLOADER_HPP
9 
10 #ifndef TC_DISABLE_REMOTEFS
11 
12 #include <aws/s3/S3Client.h>
13 
14 #include <core/logging/assertions.hpp>
15 #include <core/storage/fileio/fs_utils.hpp>
16 #include <fstream>
17 #include <future>
18 #include <memory>
19 #include <string>
20 #include <vector>
21 
22 namespace turi {
23 
24 /**
25  * \ingroup fileio
26  * \internal
27  *
28  * constructed **only** from user provided url
29  *
30  * A complete specification of an S3 bucket and object,
31  * including all authentication required.
32  */
33 struct s3url {
34  std::string access_key_id;
35  std::string secret_key;
36  std::string bucket;
37  std::string object_name;
38  // endpoint that embeded in the url
39  std::string endpoint;
40 
41  // endpoint used by sdk, not in the url
42  boost::optional<std::string> sdk_endpoint;
43  boost::optional<std::string> sdk_region;
44  boost::optional<std::string> sdk_proxy;
45 
46  // this call doesn't compare the optional members
47  bool operator==(const s3url& other) const {
48  return access_key_id == other.access_key_id &&
49  secret_key == other.secret_key && bucket == other.bucket &&
50  object_name == other.object_name && endpoint == other.endpoint;
51  }
52 
53  /*
54  * @param with_credentials: user should not see this
55  *
56  * reconstruct to url format,
57  * s3://[access_key_id]:[secret_key]:[endpoint/][bucket]/[object_name],
58  * which turi uses everywhere.
59  */
60  std::string string_from_s3url(bool with_credentials = true) const {
61  std::string ret("s3://");
62  ret.reserve(128);
63  const size_t prot_len = ret.size();
64 
65  if (with_credentials && !access_key_id.empty()) {
66  ASSERT_TRUE(!secret_key.empty());
67  ret.append(access_key_id);
68  ret.append(1, ':');
69  ret.append(secret_key);
70  ret.append(1, ':');
71  }
72 
73  // this is embeded form
74  // something like: s3://s3.amazonaws.com/bucket/object/name
75  if (!endpoint.empty()) {
76  ret.append(endpoint);
77  ret.append(1, '/');
78  }
79 
80  ASSERT_TRUE(!bucket.empty());
81  ret.append(bucket);
82 
83  if (!object_name.empty()) {
84  // s3://object_key is a valid case
85  if (ret.size() > prot_len) ret.append(1, '/');
86  ret.append(object_name);
87  }
88 
89  return ret;
90  }
91 
92  friend std::ostream& operator<<(std::ostream& os, const s3url& url) {
93  if (url.sdk_endpoint)
94  os << "endpoint used by sdk: " << *url.sdk_endpoint << "; ";
95  if (url.sdk_region) os << "region used by sdk: " << *url.sdk_region << "; ";
96  if (url.sdk_proxy) os << "proxy used by sdk: " << *url.sdk_proxy << "; ";
97  return os << url.string_from_s3url(false);
98  }
99 };
100 
101 /**
102  * \ingroup fileio
103  * \internal
104  *
105  * Initializes the AWS SDK using the Turi-customized environment variables.
106  *
107  * Sets the endpoint/region that are used to configure the returned client.
108  *
109  * \param parsed_url parsed S3 url; its optional sdk_* members are modified.
110  */
111 Aws::S3::S3Client init_aws_sdk_with_turi_env(s3url& parsed_url);
112 
113 /**
114  * \ingroup fileio
115  * \internal
116  * Gets the last-modified time stamp of the file at the given url.
117  *
118  * Throws an exception if the url cannot be fetched.
119  *
120  * Returns an empty string if the last-modified time is not available,
121  * e.g. the url is a directory path or the file does not exist.
122  */
123 std::string get_s3_file_last_modified(const std::string& url);
124 
/**
 * \ingroup fileio
 * \internal
 * Return type of list_objects().
 */
struct list_objects_response {
  /// Non-empty if there was an error
  std::string error;
  /// A list of all the "sub-directories" found. Encoded with url, see s3url.
  std::vector<std::string> directories;
  /// A list of all the objects found. Encoded with url, see s3url.
  /// (A more accurate name for this member would be object_urls.)
  std::vector<std::string> objects;
  /// A list of the sizes of all the objects.
  /// NOTE(review): presumably parallel to `objects` — confirm against the
  /// implementation.
  std::vector<size_t> objects_size;
  /// Last modified time for the objects.
  std::vector<std::string> objects_last_modified;
};
143 
144 /**
145  * \ingroup fileio
146  * \internal
147  * Lists objects or prefixes prefixed by a given s3 url.
148  *
149  * This is a thin wrapper around the S3 API
150  * http://docs.aws.amazon.com/AmazonS3/latest/API/RESTBucketGET.html
151  * and may not quite do what you think it does.
152  *
153  * If s3_url points to a valid prefix, it will return only the prefix
154  * as a directory. For instance, if an S3 bucket contains
155  *
156  * foo/hello.txt
157  *
158  * list_objects("s3://foo") will return simply "foo/" as a directory.
159  *
160  * See list_directory() and is_directory() for a more sensible implementation
161  * which behaves somewhat more file system like.
162  *
163  * \returns A list_objects_response object.
164  * If list_objects_response.error is an empty string, it indicates success.
165  * Otherwise, it contains an error code. list_objects_response.directories
166  * lists all "directories" stored with the requested prefix, and
167  * list_objects_response.objects lists all regular objects stored with the
168  * requested prefix.
169  * NOTE(review): proxy is presumably an optional proxy address — confirm.
170  */
171 list_objects_response list_objects(std::string s3_url, std::string proxy = "");
172 
173 /**
174  * \ingroup fileio
175  * \internal
176  * Lists all objects prefixed by a given s3 url.
177  *
178  * If s3_url points to a valid prefix, it returns the prefix's contents
179  * like a directory. For instance, if an S3 bucket contains
180  *
181  * foo/hello.txt
182  *
183  * list_directory("s3://foo") will return "foo/hello.txt"
184  *
185  * If s3_url points to an object it will just return the object.
186  *
187  * \returns A list_objects_response object.
188  * If list_objects_response.error is an empty string, it indicates success.
189  * Otherwise, it contains an error code. list_objects_response.directories
190  * lists all "directories" stored with the requested prefix, and
191  * list_objects_response.objects lists all regular objects stored with the
192  * requested prefix.
193  * NOTE(review): proxy is presumably an optional proxy address — confirm.
194  */
195 list_objects_response list_directory(std::string s3_url,
196  std::string proxy = "");
197 
198 /**
199  * \ingroup fileio
200  * \internal
201  * Tests if url is a directory or a regular file.
202  * Returns a pair of (file_status, list_objects_response).
203  * NOTE(review): file_status presumably encodes missing/file/directory — confirm.
204  */
206 std::pair<file_status, list_objects_response> is_directory(
207  std::string s3_url, std::string proxy = "");
208 
209 /**
210  * \ingroup fileio
211  * \internal
212  * Where s3_url points to a single object, this deletes that object.
213  * Returns an empty string on success, and an error string on failure.
214  */
215 std::string delete_object(std::string s3_url, std::string proxy = "");
216 
217 /**
218  * \ingroup fileio
219  * \internal
220  * Where s3_url points to a prefix, this deletes all objects with the
221  * specified prefix.
222  * Returns an empty string on success, and an error string on failure.
223  */
224 std::string delete_prefix(std::string s3_url, std::string proxy = "");
225 
226 /**
227  * \ingroup fileio
228  * \internal
229  * Given an S3 URL of the form expected by parse_s3url,
230  * this function drops the access_key_id and the secret_key from the string,
231  * returning s3://[bucket]/[object_name].
232  *
233  * If the url cannot be parsed, it makes a best effort to remove the
234  * information associated with ':'.
235  *
236  * If the url does not begin with s3://, it is returned as is.
237  */
238 std::string sanitize_s3_url(const std::string& url);
239 
240 /**
241  * \ingroup fileio
242  * \internal
243  * Splits a URL of the form
244  * s3://[access_key_id]:[secret_key]:[endpoint/][bucket]/[object_name]
245  * into several pieces (presumably written to ret — confirm).
246  *
247  * endpoint and object_name are optional.
248  * NOTE(review): err_msg presumably describes the failure — confirm.
249  * Returns true on success, false on failure.
250  */
251 bool parse_s3url(const std::string& url, s3url& ret, std::string& err_msg);
252 
253 /**
254  * \ingroup fileio
255  * \internal
256  * Sets the timeout for S3 uploads.
257  * \param timeout Timeout value in seconds.
258  */
259 void set_upload_timeout(long timeout);
260 
261 /**
262  * \ingroup fileio
263  * \internal
264  * Sets the timeout for S3 downloads.
265  * \param timeout Timeout value in seconds.
266  */
267 void set_download_timeout(long timeout);
268 
/**
 * Groups the kinds of S3 operation together with a lookup of their
 * printable names.
 */
struct S3Operation {
  // Printable name of each ops_enum value, looked up by the enumerator's
  // numeric value. Only declared here; no initializer is given in this class.
  static const std::vector<std::string> _enum_to_str;

  enum ops_enum { Delete, List, HEAD };

  // Returns the printable name of `operation`. The lookup goes through
  // vector::at, so a value outside the table throws std::out_of_range.
  static std::string toString(ops_enum operation) {
    const std::vector<std::string>::size_type index = operation;
    return _enum_to_str.at(index);
  }
};
282 
283 template <class Response>
284 std::ostream& reportS3Error(std::ostream& ss, const s3url& parsed_url,
285  S3Operation::ops_enum operation,
286  const Response& outcome) {
287  auto error = outcome.GetError();
288  ss << "('" << parsed_url << ", proxy: '" << parsed_url.sdk_proxy
289  << "', region: '" << parsed_url.sdk_region << "')"
290  << " Error while performing " << S3Operation::toString(operation)
291  << ". Error Name: " << error.GetExceptionName()
292  << ". Error Message: " << error.GetMessage()
293  << ". HTTP Error Code: " << static_cast<int>(error.GetResponseCode());
294 
295  return ss;
296 }
297 
/**
 * Same report as reportS3Error(), followed by the source file and line of
 * the place where the macro is invoked.
 *
 * The replacement list is parenthesized so that the expansion stays a single
 * expression regardless of the operators surrounding the macro call.
 */
#define reportS3ErrorDetailed(ss, parsed_url, operation, outcome) \
  (reportS3Error(ss, parsed_url, operation, outcome)               \
   << " in " << __FILE__ << " at " << __LINE__)
301 
302 } // namespace turi
303 
304 #endif // End ifndef TC_DISABLE_REMOTEFS
305 
306 #endif
std::string sanitize_s3_url(const std::string &url)
std::vector< std::string > directories
A list of all the "sub-directories" found. Encoded with url, see s3url.
Definition: s3_api.hpp:134
void set_download_timeout(long timeout)
std::string get_s3_file_last_modified(const std::string &url)
std::string error
Non-empty if there was an error.
Definition: s3_api.hpp:132
std::string delete_object(std::string s3_url, std::string proxy="")
std::string delete_prefix(std::string s3_url, std::string proxy="")
std::vector< std::string > objects
Definition: s3_api.hpp:137
list_objects_response list_directory(std::string s3_url, std::string proxy="")
#define ASSERT_TRUE(cond)
Definition: assertions.hpp:309
bool parse_s3url(const std::string &url, s3url &ret, std::string &err_msg)
std::vector< std::string > objects_last_modified
Last modified time for the objects.
Definition: s3_api.hpp:141
list_objects_response list_objects(std::string s3_url, std::string proxy="")
Aws::S3::S3Client init_aws_sdk_with_turi_env(s3url &parsed_url)
void set_upload_timeout(long timeout)
std::vector< size_t > objects_size
A list of the sizes of all the objects.
Definition: s3_api.hpp:139