Turi Create  4.0
sarray_index_file.hpp
1 /* Copyright © 2017 Apple Inc. All rights reserved.
2  *
3  * Use of this source code is governed by a BSD-3-clause license that can
4  * be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
5  */
6 #ifndef TURI_UNITY_SARRAY_INDEX_FILE_HPP
7 #define TURI_UNITY_SARRAY_INDEX_FILE_HPP
8 #include <cstdint>
9 #include <string>
10 #include <vector>
11 #include <map>
12 #include <memory>
13 namespace turi {
14 class oarchive;
15 class iarchive;
16 
17 
18 /**
19  * \internal
20  * \ingroup sframe_physical
21  * \addtogroup sframe_internal SFrame Internal
22  * \{
23  */
24 
25 /**
26  * Describes all the information in an sarray index file.
27  * The index_file_information struct contains all the information associated
28  * with a *single sarray column*. In a version 1 SArray, the index file
29  * describes a single column. As such index_file will point to the actual
30  * file location.
31  * In a version 2 SArray, the index file describes
32  * multiple columns. As such, index_file is of the form
33  * [file_location]:[column_number], and column_number may be non-zero.
34  * Column numbers are 0 indexed. segment_files are similar. In the v1 format,
35  * the segment_files point to the actual files. In the v2 format, the segment
36  * files are of the form [file_location]:[column_number].
37  */
39  /// Input file name
40  std::string index_file;
41  /// The format version of the sarray
42  int version = -1;
43  /// The number of segments in the array
44  size_t nsegments = 0;
45  /// block_size; Required for version 1.
46  size_t block_size = 0;
47  /// The datatype of the array (typeid(T).name()).
48  std::string content_type;
49  /// The length of each segment (number of entries).
50  std::vector<size_t> segment_sizes;
51  /// The file name of each segment
52  std::vector<std::string> segment_files;
53  /// Any additional metadata stored with the array
54  std::map<std::string, std::string> metadata;
55 
56  void save(oarchive& oarc) const;
57  void load(iarchive& iarc);
58 };
59 
60 /**
61  * Reads an sarray index file from disk.
62  * This will automatically adapt to v1 and v2 index file formats.
63  * - If index_file is "xxx.sidx", and is a v1 format, it will be read as normal
64  * - If index_file is "xxx.sidx", and is a v2 format (i.e. array group),
65  * it will return the 1st column (column 0) of the group.
66  * - If index_file is "xxx.sidx:n", and is a v2 format (i.e. array group),
67  * it will return column n of the group.
68  * All other conditions will fail.
69  * Raises an exception on failure.
70  *
71  * This function will also automatically de-relativize the
72  * \ref index_file_information::segment_files to get absolute paths
73  */
75 
76 
77 /**
78  * The group index file is the version 2 SArray index file format.
79  * It holds multiple columns in a single fileset. As such, the
80  * group_index_file_information struct basically comprises some common
81  * information (version, nsegments, etc), but also contains a vector
82  * of an index_file_information for each column in the group.
83  */
85  /// Input file name
86  std::string group_index_file;
87  /// The format version of the sarray
88  int version;
89  /// The number of segments in the array
90  size_t nsegments;
91  /// The file name of each segment
92  std::vector<std::string> segment_files;
93  /**
94  * The index file information for each column.
95  * The index_file_information basically has fields which mirror the
96  * fields here. For instance, version and segment_files
97  * are basically the same. The exceptions are:
98  * - columns[0].index_file = group_index_file + ":0"
99  * - columns[0].column_number = 0
100  * - columns[1].index_file = group_index_file + ":1"
101  * - columns[1].column_number = 1
102  */
103  std::vector<index_file_information> columns;
104 };
105 /**
106  * Reads an sarray group index file from disk.
107  * Raises an exception on failure.
108  *
109  * An array_group is a group of sarrays in a single collection of files.
110  */
111 group_index_file_information read_array_group_index_file(std::string group_index_file);
112 
113 
114 /**
115  * Writes an sarray v2 index file to disk.
116  * Raises an exception on failure.
117  *
118  * This function will also automatically relativize the
119  * \ref sframe_index_file_information::column_files to get relative paths
120  * when writing to disk
121  */
122 void write_array_group_index_file(std::string group_index_file,
123  const group_index_file_information& info);
124 
125 
126 /**
127  * Splits a filename of the form [filename]:N into a pair of {filename, N}.
128  * If the filename is not of that form, or cannot be interpreted as that form,
129  * {filename, 0} is returned.
130  */
131 std::pair<std::string, size_t> parse_v2_segment_filename(std::string fname);
132 
133 /// \}
134 } // namespace turi
135 #endif
std::string index_file
Input file name.
size_t block_size
block_size; Required for version 1.
void write_array_group_index_file(std::string group_index_file, const group_index_file_information &info)
std::vector< index_file_information > columns
std::vector< std::string > segment_files
The file name of each segment.
The serialization input archive object which, provided with a reference to an istream, will read from the istream, providing deserialization capabilities.
Definition: iarchive.hpp:60
std::vector< std::string > segment_files
The file name of each segment.
std::pair< std::string, size_t > parse_v2_segment_filename(std::string fname)
int version
The format version of the sarray.
size_t nsegments
The number of segments in the array.
std::string group_index_file
Input file name.
group_index_file_information read_array_group_index_file(std::string group_index_file)
int version
The format version of the sarray.
size_t nsegments
The number of segments in the array.
index_file_information read_index_file(std::string index_file)
The serialization output archive object which, provided with a reference to an ostream, will write to the ostream, providing serialization capabilities.
Definition: oarchive.hpp:80
std::vector< size_t > segment_sizes
The length of each segment (number of entries).
std::map< std::string, std::string > metadata
Any additional metadata stored with the array.
std::string content_type
The datatype of the array (typeid(T).name()).