Turi Create  4.0
block_cache.hpp
1 /* Copyright © 2017 Apple Inc. All rights reserved.
2  *
3  * Use of this source code is governed by a BSD-3-clause license that can
4  * be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
5  */
6 #ifndef TURI_FILEIO_BLOCK_CACHE_HPP
7 #define TURI_FILEIO_BLOCK_CACHE_HPP
8 #include <cstdint>
9 #include <core/util/lru.hpp>
10 #include <core/parallel/mutex.hpp>
11 #include <core/storage/fileio/fixed_size_cache_manager.hpp>
12 
13 namespace turi {
14 class general_ifstream;
15 
16 /**
17  * \ingroup fileio
18  *
19  * The block cache implements a simple key-value store for extremely large
20  * values (~16MB at least). Every key can only be written to exactly once,
21  * and allows for arbitrary range reads (i.e. read byte X to byte Y of this key)
22  *
23  * Essentially every value is stored as a single file inside the
24  * storage_prefix parameter set at \ref init.
25  *
26  * The block_cache is safe for concurrent use.
27  *
28  * Use On a Distributed File System
29  * --------------------------------
30  *
31  * The storage prefix be located on a distributed filesystem
32  * (for instance HDFS or NFS). In which case, *every* machine sharing the same
33  * storage prefix also shares keys.
34  *
35  * When sharing a storage prefix with other processes on a distributed
36  * filesystem, the atomicity guarantees of the filesystem becomes important.
37  *
38  * In particular, on HDFS, you may find keys in a "indeterminate" state, where
39  * it cannot be written to, but cannot be queried (because the writer has
40  * created the file but has not finished writing to it yet). On NFS multiple
41  * machines may be able to write to the same key, but only one will win. Also
42  * the length and contents of the key may be wrong if you read the key
43  * while someone else is writing to it.
44  *
45  * Design Notes
46  * ------------
47  * We will like these "interesting" distributed file system properties to not
48  * be true when the block_cache is merely used concurrently. So a bit of care
49  * is needed to ensure atomicity, at least within the context of the same
50  * block_cache object. Essentially we want write-once, but arbitrary parallel
51  * reads semantics.
52  */
53 class block_cache {
54  public:
55  /**
56  * Constructs the block cache. init must be called before the block_cache
57  * can be used.
58  *
59  * make sure the cache manager is detroyed after all cache files are deleted.
60  */
61  block_cache() = default;
62 
63  /// Default destructor. Deletes all associated files.
64  ~block_cache();
65 
66  // copying is disabled
67  block_cache(const block_cache&) = delete;
68  block_cache& operator=(const block_cache&) = delete;
69  // but moves are ok.
70  block_cache(block_cache&&) = default;
71  block_cache& operator=(block_cache&&) = default;
72 
73  /**
74  * init must be called exactly once on block cache construction before the
75  * block cache can be used. Multiple calls to init will raise an exception.
76  *
77  * \param storage_prefix The location where all values are stored
78  * \param max_file_handle_cache The maximum number of file handles to cache
79  *
80  * Essentially, every value is stored as a separate file inside the directory.
81  */
82  void init(const std::string& storage_prefix,
83  size_t max_file_handle_cache=16);
84 
85  /**
86  * Writes a string to a key. Returns true on success. The key must not
87  * already exist. If the key already exists this fails and false is returned.
88  * When operating on a distributed filesystem, note that every machine
89  * sharing the same storage prefix have a common key space.
90  *
91  * \param key The key to write to
92  * \param value The value to write
93  *
94  * \returns true on success, false on failure
95  */
96  bool write(const std::string& key, const std::string& value);
97 
98  /**
99  * Evicts a particular key. Returns true on success, false on failure
100  *
101  * \param key The key name
102  */
103  bool evict_key(const std::string& key);
104 
105  /**
106  * Returns the length of the value of a particular key.
107  *
108  * \param key The key to query
109  *
110  * \returns the length of the value on success, a value < 0 on failure.
111  */
112  int64_t value_length(const std::string& key);
113 
114  /**
115  * Reads the value of a key into an output string, resizing the output string
116  * if necessary; Returns the number of bytes read.
117  *
118  * \param key The key to read
119  * \param output A reference to the output string
120  * \param start Optional, denotes the start offset of the value to read.
121  * Defaults to 0.
122  * \param end Optional, denotes the end offset of the value to read. The byte
123  * at end is not read. i.e. to read the first 5 bytes of the file,
124  * you call read(key, output, 0, 5); Defaults to the length of the
125  * file.
126  *
127  * Note that the number of bytes read can be 0 if:
128  * - start is past the end of the value
129  * - end is less than start
130  *
131  * If start and end are not passed, the entire block is read.
132  *
133  * \returns A value less than 0 on failure.
134  */
135  int64_t read(const std::string& key,
136  char* output,
137  size_t start = 0,
138  size_t end = (size_t)(-1)) ;
139 
140  /**
141  * string overload. Note the char* reader is faster.
142  */
143  int64_t read(const std::string& key,
144  std::string& output,
145  size_t start = 0,
146  size_t end = (size_t)(-1)) ;
147  /**
148  * This returns then number of file handle cache hits.
149  * This function is for profiling purposes since file handles are cached for
150  * performance reasons.
151  */
152  size_t file_handle_cache_hits() const;
153 
154  /**
155  * This returns then number of file handle cache misses.
156  * This function is for profiling purposes since file handles are cached for
157  * performance reasons.
158  */
159  size_t file_handle_cache_misses() const;
160 
161  /** Sets the maximum number of files managed.
162  * If 0, there is no max capacity.
163  */
164  size_t get_max_capacity();
165 
166  /** Gets the maximum number of files managed.
167  * If 0, there is no max capacity.
168  */
169  void set_max_capacity(size_t);
170 
171  /** dependency injection, meaning,
172  * underlying cache provider instance should be released
173  * after block_cache singleton is released.
174  */
175  void hold_cache_provider(std::shared_ptr<const fileio::fixed_size_cache_manager> instance) {
176  m_cache_provider = instance;
177  }
178 
179  /**
180  * Gets a singleton instance. The singleton instance has this default behavior:
181  *
182  * Location of storage:
183  * - If temp files are located on HDFS, the cache just writes
184  * through and is always located on HDFS.
185  * - If temp files are located on local disk, the cache is set to the
186  * cache:// file system. This allows for a degree of in-memory caching.
187  *
188  * File handle LRU cache size:
189  * - 4 * ncpus
190  */
191  static block_cache& get_instance();
192  static void release_instance();
193 
194  private:
195  static constexpr size_t KEY_LOCK_SIZE = 256;
196 
197  /// whether the block_cache is initialized
198  bool m_initialized = false;
199 
200  // hold the reference of the underlying cache storage provider
201  // put it at the top of the member variables so it will be destructed at last
202  std::shared_ptr<const fileio::fixed_size_cache_manager> m_cache_provider;
203 
204 
205  /// The storage prefix
206  std::string m_storage_prefix;
207 
208  /// Lock on internal datastructures
209  mutex m_lock;
210 
211  /// The set of files I created.
212  std::set<std::string> m_created_files;
213 
214  /// The maximum number of files managed. If 0 there is no limit.
215  size_t m_max_capacity = 0;
216 
217  /**
218  * A lock on each key.
219  * The key is hashed and key_lock[hash] is used to lock the key.
220  */
221  mutex key_lock[KEY_LOCK_SIZE];
222 
223  /// A cache of files to file handles
225 
226  /// An LRU cache of the set of files we maintain. The value_type (bool) is unused.
227  lru_cache<std::string, bool> m_lru_files;
228 
229 
230 }; // class block_cache
231 } // turicreate
232 #endif
int64_t read(const std::string &key, char *output, size_t start=0, size_t end=(size_t)(-1))
int64_t value_length(const std::string &key)
bool write(const std::string &key, const std::string &value)
bool evict_key(const std::string &key)
block_cache()=default
void init(const std::string &storage_prefix, size_t max_file_handle_cache=16)
static block_cache & get_instance()
~block_cache()
Default destructor. Deletes all associated files.
void set_max_capacity(size_t)
size_t get_max_capacity()
size_t file_handle_cache_misses() const
size_t file_handle_cache_hits() const
void hold_cache_provider(std::shared_ptr< const fileio::fixed_size_cache_manager > instance)