// Turi Create  4.0
// dir_archive.hpp
1 /* Copyright © 2017 Apple Inc. All rights reserved.
2  *
3  * Use of this source code is governed by a BSD-3-clause license that can
4  * be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
5  */
6 #ifndef TURI_SERIALIZATION_DIR_ARCHIVE_HPP
7 #define TURI_SERIALIZATION_DIR_ARCHIVE_HPP
#include <cstddef>
#include <functional>
#include <map>
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include <core/storage/fileio/fs_utils.hpp>
#include <core/storage/fileio/general_fstream.hpp>
15 namespace turi {
16 
/**
 * Names the human readable INI file, located in the archive directory,
 * which contains information about the archive.
 */
extern const char* DIR_ARCHIVE_INI_FILE;

/**
 * Names the binary archive file used to hold serializable objects.
 */
extern const char* DIR_ARCHIVE_OBJECTS_BIN;
27 
28 
29 
namespace dir_archive_impl {

/**
 * The archive index.
 *
 * The archive index file simply consists of the following:
 * \code
 * [archive]
 * version = 1
 * num_prefixes = 4
 * [prefixes]
 * 0000 = "dir_archive.ini"
 * 0001 = "objects.bin"
 * 0002 = "0001"
 * 0003 = "0002"
 * \endcode
 * The prefix section lists all the prefixes stored inside the directory
 * archive. Every file in the directory whose file name begins with one of
 * these prefixes is a file belonging to the archive.
 *
 * The objects.bin and dir_archive.ini files are always among the prefixes.
 *
 * Once read into the archive_index_information struct, however, the prefixes
 * will all be absolute paths.
 */
struct archive_index_information {
  /// Archive version; holds size_t(-1) until a real version is assigned.
  size_t version = static_cast<size_t>(-1);

  /// The prefixes of the archive (absolute paths once the index is read).
  std::vector<std::string> prefixes;

  /// String key/value metadata entries.
  /// NOTE(review): presumably the storage behind dir_archive::set_metadata()
  /// and get_metadata() -- confirm against the implementation file.
  std::map<std::string, std::string> metadata;
};

} // namespace dir_archive_impl
62 
63 
64 /**
65  * \ingroup group_serialization
66  * The dir_archive object manages a directory archive. It is an internal
67  * class which provides two basic containers:
68  * - A single file stream object (a general_ifstream / general_ofstream)
69  * which points to an "objects.bin" file in the directory.
70  * - The ability to obtain prefixes (for instance [directory]/0000) which
71  * consumers can then use for other file storage purposes. (for instance,
72  * an sframe could create 0000.sidx, 0000.0001, 0000.0002, etc),
73  *
74  * The directory archive provide management for the prefixes and the objects
75  * as well as directory archive creation / deletion.
76  *
77  * To use:
78  * \code
79  * dir_archive archive;
80  * archive.open_directory_for_write(dir)
81  * oarchive oarc(archive)
82  * oarc << ...
83  * oarc.get_prefix()
84  * etc.
85  * \endcode
86  * Similarly, to read:
87  * \code
88  * dir_archive archive;
89  * archive.open_directory_for_read(dir)
90  * iarchive iarc(archive)
91  * iarc >> ...
92  * iarc.get_prefix()
93  * etc.
94  * \endcode
95  */
96 class dir_archive {
97  public:
98  inline dir_archive() { }
99 
100  /**
101  * Destructor. Also closes.
102  */
103  ~dir_archive();
104 
105  /**
106  * Opens a directory for writing. Directory must be an absolute path.
107  *
108  * if fail_on_existing is false: (default)
109  * - This function will only fail if the directory exists, and does not
110  * contain an archive. It will overwrite in all other cases.
111  *
112  * if fail_on_existing is true:
113  * - The function will fail if the the directory points to a file name.
114  * - The function will fail if the the directory exists.
115  *
116  * Throws an exception with a string message if the directory cannot
117  * be opened.
118  */
119  void open_directory_for_write(std::string directory,
120  bool fail_on_existing = false);
121 
122  /**
123  * Opens a directory for reading. Directory must be an absolute path.
124  * This function will fail if the directory is not an archive.
125  *
126  * Throws an exception with a string message if the directory cannot
127  * be opened.
128  */
129  void open_directory_for_read(std::string directory);
130 
131  /**
132  * Returns the current directory opened by either
133  * open_directory_for_read() or open_directory_for_write();
134  * if nothing is opened, this returns an empty string.
135  */
136  std::string get_directory() const;
137 
138  /**
139  * The directory must be opened for write.
140  * This returns a new prefix which can be written to.
141  */
142  std::string get_next_write_prefix();
143 
144  /**
145  * The directory must be opened for read.
146  * This returns the next prefix in the sequence of generated prefixes.
147  * The order of prefixes returns is the same order as the prefixes generated
148  * by get_next_write_prefix() when the archive was created.
149  */
150  std::string get_next_read_prefix();
151  /**
152  * Returns a pointer to the object stream reader. Returns NULL if the
153  * input directory is not opened for read.
154  */
155  general_ifstream* get_input_stream();
156  /**
157  * Returns a pointer to the object stream writer. Returns NULL if the
158  * input directory is not opened for write.
159  */
160  general_ofstream* get_output_stream();
161 
162  /**
163  * Closes the directory archive, committing all writes.
164  */
165  void close();
166 
167  /**
168  * Associates additional metadata with the archive that can be read back
169  * with get_metadata() when it is loaded.
170  */
171  void set_metadata(std::string key, std::string val);
172 
173  /**
174  * Reads any metadata associated with the archive.
175  * Returns true if the key exists, false otherwise.
176  */
177  bool get_metadata(std::string key, std::string& val) const;
178 
179  /**
180  * Deletes the contents of an archive safely. (i.e. performing
181  * a non-recursive delete so we don't *ever*, even by accident, delete
182  * stuff we are not meant to delete).
183  *
184  * It will delete the directory the archive is in if the directory is empty
185  * after deletion of all the archive files.
186  *
187  * Never throws.
188  */
189  static void delete_archive(std::string directory);
190 
191  /*
192  * Given a directory where one Turi object is stored, return the requested
193  metadata of the object.
194  Could throw if key does not exist or directory does not store a valid Turi
195  object
196  */
197  static std::string get_directory_metadata(
198  std::string directory,
199  const std::string& key);
200 
201  /**
202  * Returns true if the directory contains an archive
203  */
204  static bool directory_has_existing_archive(
205  const std::vector<std::pair<std::string, fileio::file_status> >& dircontents);
206 
207  private:
208 
209  void set_close_callback(std::function<void()>&);
210 
211  void init_for_read(const std::string& directory);
212 
213  void init_for_write(const std::string& directory);
214 
215  void make_s3_read_cache(const std::string& directory);
216 
217  /**
218  * The index information for the archive
219  */
221 
222  std::string m_directory;
223 
224  /**
225  * The pointer to the objects.bin write stream
226  */
227  std::unique_ptr<general_ofstream> m_objects_out;
228 
229  /**
230  * The pointer to the objects.bin read stream
231  */
232  std::unique_ptr<general_ifstream> m_objects_in;
233 
234  /// The next element in m_index_info.prefixes to return
235  size_t m_read_prefix_index = 0;
236 
237  /// The next prefix number to return
238  size_t m_write_prefix_index = 0;
239 
240  /// Cache dir_archive
241  std::unique_ptr<dir_archive> m_cache_archive;
242 
243  /// callback on close
244  std::function<void()> m_close_callback;
245 };
246 
247 
248 } // namespace turi
249 #endif
// const char * DIR_ARCHIVE_INI_FILE
// const char * DIR_ARCHIVE_OBJECTS_BIN