Turi Create  4.0
sarray_file_format_interface.hpp
1 /* Copyright © 2017 Apple Inc. All rights reserved.
2  *
3  * Use of this source code is governed by a BSD-3-clause license that can
4  * be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
5  */
6 #ifndef TURI_UNITY_SARRAY_FILE_FORMAT_INTERFACE_HPP
7 #define TURI_UNITY_SARRAY_FILE_FORMAT_INTERFACE_HPP
8 
9 #define BOOST_SPIRIT_THREADSAFE
10 
11 #include <cstdlib>
12 #include <string>
13 #include <sstream>
14 #include <map>
15 #include <core/storage/fileio/general_fstream.hpp>
16 #include <boost/property_tree/ptree.hpp>
17 #include <boost/property_tree/ini_parser.hpp>
18 #include <core/storage/sframe_data/sarray_index_file.hpp>
19 #include <core/data/flexible_type/flexible_type.hpp>
20 #include <core/storage/sframe_data/sframe_rows.hpp>
21 namespace turi {
22 
23 /**
24  * \internal
25  * \ingroup sframe_physical
26  * \addtogroup sframe_internal SFrame Internal
27  * \{
28  */
29 
30 /**
31  * A generic \ref sarray file format reader interface. File format
32  * implementations should extend this.
33  *
34  * The sarray file layout should generally be a file set (collection of files)
35  * with a common prefix. File format implementations can create or use as many
36  * prefixes as required. There must be a [prefix].sidx file in the Microsoft
37  * INI format with the following sections:
38  *
39  * \verbatim
40  * [sarray]
41  * ; The version of the file format. Required.
42  * version=0
43  * \endverbatim
44  */
45 template <typename T>
47  public:
49 
50 
51  /**
52  * Open has to be called before any of the other functions are called.
53  * Throws a string exception if it is unable to open the file set, or if there
54  * is a format error in the sarray.
55  *
56  * Will throw an exception if a file set is already open.
57  */
58  virtual void open(index_file_information index) = 0;
59 
60  /**
61  * Open has to be called before any of the other functions are called.
62  * Throws a string exception if it is unable to open the file set, or if there
63  * is a format error in the sarray.
64  *
65  * Will throw an exception if a file set is already open.
66  */
67  virtual void open(std::string sidx_file) = 0;
68 
69  /**
70  * Closes an sarray file set. No-op if the array is already closed.
71  */
72  virtual void close() = 0;
73 
74  /**
75  * Return the number of segments in the sarray.
76  * Throws an exception if the array is not open.
77  */
78  virtual size_t num_segments() const = 0;
79 
80  /**
81  * Returns the number of elements in a given segment.
82  * should throw an exception if the segment ID does not exist,
83  */
84  virtual size_t segment_size(size_t segmentsid) const = 0;
85 
86  /**
87  * Reads a collection of rows, storing the result in out_obj.
88  * This function is independent of the open_segment/read_segment/close_segment
89  * functions, and can be called anytime. This function is also fully
90  * concurrent.
91  * \param row_start First row to read
92  * \param row_end one past the last row to read (i.e. EXCLUSIVE). row_end can
93  * be beyond the end of the array, in which case,
94  * fewer rows will be read.
95  * \param out_obj The output array
96  * \returns Actual number of rows read. Return (size_t)(-1) on failure.
97  */
98  virtual size_t read_rows(size_t row_start,
99  size_t row_end,
100  std::vector<T>& out_obj) = 0;
101 
102  /**
103  * Returns the file index of the array (the argument in \ref open)
104  */
105  virtual std::string get_index_file() const = 0;
106 
107  /**
108  * Gets the contents of the index file information read from the index file
109  */
110  virtual const index_file_information& get_index_info() const = 0;
111 };
112 
113 template <typename T>
114 class sarray_format_reader : public sarray_format_reader_common_base<T> {
115  public:
116  virtual ~sarray_format_reader() {}
117 };
118 
119 template <>
120 class sarray_format_reader<flexible_type>
121  : public sarray_format_reader_common_base<flexible_type> {
122  public:
123  virtual ~sarray_format_reader() {}
124 
125  /**
126  * Reads a collection of rows, storing the result in out_obj.
127  * This function is independent of the open_segment/read_segment/close_segment
128  * functions, and can be called anytime. This function is also fully
129  * concurrent.
130  * \param row_start First row to read
131  * \param row_end one past the last row to read (i.e. EXCLUSIVE). row_end can
132  * be beyond the end of the array, in which case,
133  * fewer rows will be read.
134  * \param out_obj The output array
135  * \returns Actual number of rows read. Return (size_t)(-1) on failure.
136  */
138 
139  virtual size_t read_rows(size_t row_start,
140  size_t row_end,
141  sframe_rows& out_obj) {
142  size_t ret = 0;
143  out_obj.resize(1);
144  ret = read_rows(row_start, row_end, *(out_obj.get_columns()[0]));
145  return ret;
146  }
147 };
148 
149 
150 
151 
152 /**
153  * A generic \ref sarray group file format writer interface. File format
154  * implementations should extend this.
155  *
156  * The sarray_group is a collection of sarrays in a single file set.
157  *
158  * The writer is assumed to always be writing to new file sets; we are
159  * never modifying an existing file set.
160  */
161 template <typename T>
163  public:
164  virtual ~sarray_group_format_writer() {}
165 
166  /**
167  * Open has to be called before any of the other functions are called.
168  * No files are actually opened at this point.
169  */
170  virtual void open(std::string index_file,
171  size_t segments_to_create,
172  size_t columns_to_create) = 0;
173 
174  /**
175  * Set write options.
176  * Available options are
177  * "disable_padding" = true or false
178  */
179  virtual void set_options(const std::string& option, int64_t value) = 0;
180 
181  /**
182  * Gets a modifiable reference to the index file information which will
183  * be written to the index file. Can only be called after close()
184  */
186 
187 
188  /** Closes all segments.
189  */
190  virtual void close() = 0;
191 
192  /**
193  * Flushes the index_file_information to disk
194  */
195  virtual void write_index_file() = 0;
196 
197 
198  /**
199  * Writes a row to the array group
200  */
201  virtual void write_segment(size_t segmentid,
202  const std::vector<T>&) = 0;
203 
204  /**
205  * Writes a row to the array group
206  */
207  virtual void write_segment(size_t segmentid,
208  std::vector<T>&&) = 0;
209 
210  /**
211  * Writes a row to the array group
212  */
213  virtual void write_segment(size_t columnid,
214  size_t segmentid,
215  const T&) = 0;
216 
217  /**
218  * Writes a row to the array group
219  */
220  virtual void write_segment(size_t columnid,
221  size_t segmentid,
222  T&&) = 0;
223 
224  /**
225  * Writes a bunch of rows to the array group
226  */
227  virtual void write_segment(size_t segmentid,
228  const sframe_rows& rows) = 0;
229 
230  /**
231  * Writes a collection of rows to a column
232  */
233  virtual void write_column(size_t columnid,
234  size_t segmentid,
235  const std::vector<T>& t) = 0;
236 
237  /**
238  * Writes a collection of rows to a column
239  */
240  virtual void write_column(size_t columnid,
241  size_t segmentid,
242  std::vector<T>&& t) = 0;
243 
244  /**
245  * Flush all writes for a particular segment
246  */
247  virtual void flush_segment(size_t segmentid) { }
248 
249  /**
250  * Return the number of segments in the sarray.
251  * Throws an exception if the array is not open.
252  */
253  virtual size_t num_segments() const = 0;
254 
255 
256  /**
257  * Return the number of columns in the sarray.
258  * Throws an exception if the array is not open.
259  */
260  virtual size_t num_columns() const = 0;
261 };
262 
263 
264 /// \}
265 } // namespace turi
266 
267 #endif
virtual size_t num_segments() const =0
virtual size_t segment_size(size_t segmentsid) const =0
std::vector< ptr_to_decoded_column_type > & get_columns()
virtual size_t read_rows(size_t row_start, size_t row_end, std::vector< T > &out_obj)=0
virtual std::string get_index_file() const =0
virtual void flush_segment(size_t segmentid)
void resize(size_t num_cols, ssize_t num_rows=-1)
virtual void open(index_file_information index)=0
virtual const index_file_information & get_index_info() const =0