Turi Create  4.0
internal_metadata.hpp
1 /* Copyright © 2017 Apple Inc. All rights reserved.
2  *
3  * Use of this source code is governed by a BSD-3-clause license that can
4  * be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
5  */
6 #ifndef TURI_ML2_DATA_COLUMN_METADATA_H_
7 #define TURI_ML2_DATA_COLUMN_METADATA_H_
8 
9 #include <toolkits/ml_data_2/indexing/column_indexer.hpp>
10 #include <toolkits/ml_data_2/statistics/column_statistics.hpp>
11 #include <core/storage/sframe_data/sarray.hpp>
12 
13 namespace turi {
14 
15 struct metadata_load;
16 
17 namespace v2 {
18 
19 class ml_metadata;
20 
21 namespace ml_data_internal {
22 
23 /** The metadata information for a single column. This is meant to
24  * be used internally to ml_data; there is no reason that the
25  * structures outside of ml_data need to access this; ml_metadata
26  * should be used instead.
27  *
28  * This structure is necessary as many of the internal processing
29  * routines use a vector of column metadata to handle all the
30  * processing. Having this structure, which organizes all the parts
31  * of the column metadata into one place, greatly simplifies this
32  * processing.
33  */
35 
36  ////////////////////////////////////////////////////////////////////////////////
37  // Public data members
38 
39  std::string name = "";
40  ml_column_mode mode;
41  flex_type_enum original_column_type;
42  std::shared_ptr<ml_data_internal::column_indexer> indexer = nullptr;
43  std::shared_ptr<ml_data_internal::column_statistics> statistics = nullptr;
44 
45 
46  ////////////////////////////////////////////////////////////////////////////////
47  // Construction
48 
49  /** Generates a new column_metadata class using the data arrays and
50  * the types.
51  */
52  void setup(bool is_target_column,
53  const std::string& name,
54  const std::shared_ptr<sarray<flexible_type> >& column,
55  const std::map<std::string, ml_column_mode>& mode_overrides,
56  const std::map<std::string, flexible_type>& options);
57 
58  /** Finalize training.
59  */
61  void set_training_index_offset(size_t previous_total);
62 
63  ////////////////////////////////////////////////////////////////////////////////
64  // Some data sizing stuff
65 
66  /** Returns true if the mode of this column has a fixed mode size
67  * and false otherwise.
68  *
69  */
70  bool mode_has_fixed_size() const {
71  bool has_fixed_size = (column_data_size_if_fixed != size_t(-1));
72 
73  DASSERT_TRUE(has_fixed_size == v2::mode_has_fixed_size(mode));
74 
75  return has_fixed_size;
76  }
77 
78  /** Returns true if this column is untranslated and false otherwise.
79  */
80  bool is_untranslated_column() const {
81  return (mode == ml_column_mode::UNTRANSLATED);
82  }
83 
84  /** Returns the size of the index at training time.
85  */
86  size_t index_size() const {
87  DASSERT_TRUE(index_size_at_train_time != size_t(-1));
88  return index_size_at_train_time;
89  }
90 
91  /** Returns the size of the index at training time.
92  */
93  size_t global_index_offset() const {
94 
95  // This should be set
96  DASSERT_TRUE(index_size_at_train_time != size_t(-1));
97  DASSERT_TRUE(global_index_offset_at_train_time != size_t(-1));
98  return global_index_offset_at_train_time;
99  }
100 
101  private:
102 
103  friend struct turi::metadata_load;
104 
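/** index_size_at_train_time holds the size of the index recorded at
 * training time; it is size_t(-1) until it has been set.
 * column_data_size_if_fixed holds the size of the numeric column if
 * it is fixed, and size_t(-1) otherwise.
 */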
108  size_t index_size_at_train_time = size_t(-1);
109  size_t column_data_size_if_fixed = size_t(-1);
110 
111  size_t global_index_offset_at_train_time = size_t(-1);
112 
113  public:
114  const size_t fixed_column_size() const {
116  return column_data_size_if_fixed;
117  }
118 
119  /** During loading, we need to verify that the columns indeed have
120  * the correct column sizes.
121  */
122  inline void check_fixed_column_size(const flexible_type& v) const GL_HOT_INLINE_FLATTEN;
123 
124  /** Returns the current size of the column.
125  *
126  */
127  inline size_t column_size() const {
128  if(mode_is_indexed(mode)) {
129  return indexer->indexed_column_size();
130  } else {
132  return column_data_size_if_fixed;
133  }
134  }
135 
136  /** Serialization -- save.
137  */
138  void save(turi::oarchive& oarc) const;
139 
140  /** Serialization -- load.
141  */
142  void load(turi::iarchive& iarc);
143 
144  /** Create a version of the metadata with all the indexing and statistics cleared.
145  *
146  */
147  std::shared_ptr<column_metadata> create_cleared_copy() const;
148 
149 };
150 
151 typedef std::shared_ptr<column_metadata> column_metadata_ptr;
152 
153 ////////////////////////////////////////////////////////////////////////////////
154 
155 /** This structure holds the main data being passed around
156  * internally. It contains all the information needed to quickly
157  * unpack a row from the internal data structure.
158  */
159 struct row_metadata {
160 
161  row_metadata(){}
162 
163  /** Constructs all the information from a vector of points.
164  */
165  void setup(const std::vector<std::shared_ptr<column_metadata> >& _metadata_vect, bool _has_target);
166 
167  /** Constructs all the information from a vector of points.
168  */
169  void set_index_sizes(const std::shared_ptr<ml_metadata>& m);
170 
171  bool has_target = false;
172  bool target_is_indexed = false;
173 
174  /** True if the data size is constant, and false otherwise.
175  */
176  bool data_size_is_constant = false;
177 
178  /** If the data size is constant, then this gives its
179  * size. Otherwise, it's set to 0.
180  */
181  size_t constant_data_size = 0;
182 
183  /** Number of columns, not including target.
184  */
185  size_t num_x_columns = 0;
186 
187  /** Total number of columns, including possible target.
188  */
189  size_t total_num_columns = 0;
190 
191  /** Pointers to the original metadata vectors.
192  */
193  std::vector<column_metadata_ptr> metadata_vect;
194 };
195 
196 ////////////////////////////////////////////////////////////////////////////////
197 // A few implementations of the above functions
198 
200 
201  // The only mode needed to be checked right now is this one.
202  if(mode == ml_column_mode::NUMERIC_VECTOR) {
203 
204  DASSERT_TRUE(original_column_type == flex_type_enum::VECTOR);
206 
207  size_t nv = f.get<flex_vec>().size();
208 
209  DASSERT_TRUE(column_data_size_if_fixed != size_t(-1));
210 
211  if(UNLIKELY(nv != column_data_size_if_fixed)) {
212 
213  auto throw_error = [&]() GL_GCC_ONLY(GL_COLD_NOINLINE) {
214  log_and_throw(
215  std::string("Dataset mismatch between training and prediction. Numeric feature '")
216  + name
217  + "' must contain lists of consistent size. (Found lists/arrays of sizes "
218  + std::to_string(nv) + " and "
219  + std::to_string(column_data_size_if_fixed) + ").");
220  };
221 
222  throw_error();
223  }
224  }
225 }
226 
227 }}}
228 
229 ////////////////////////////////////////////////////////////////////////////////
230 // Implement serialization for
231 // std::shared_ptr<column_metadata>
232 
233 BEGIN_OUT_OF_PLACE_SAVE(arc, std::shared_ptr<turi::v2::ml_data_internal::column_metadata>, m) {
234  if(m == nullptr) {
235  arc << false;
236  } else {
237  arc << true;
238  arc << (*m);
239  }
240 } END_OUT_OF_PLACE_SAVE()
241 
242 BEGIN_OUT_OF_PLACE_LOAD(arc, std::shared_ptr<turi::v2::ml_data_internal::column_metadata>, m) {
243  bool is_not_nullptr;
244  arc >> is_not_nullptr;
245  if(is_not_nullptr) {
247  arc >> (*m);
248  } else {
249  m = std::shared_ptr<turi::v2::ml_data_internal::column_metadata>(nullptr);
250  }
251 } END_OUT_OF_PLACE_LOAD()
252 
253 
254 #endif /* TURI_ML2_DATA_COLUMN_METADATA_H_ */
static GL_HOT_INLINE_FLATTEN bool mode_has_fixed_size(ml_column_mode mode)
#define BEGIN_OUT_OF_PLACE_LOAD(arc, tname, tval)
Macro to make it easy to define out-of-place loads.
Definition: iarchive.hpp:314
std::vector< double > flex_vec
The serialization input archive object which, provided with a reference to an istream, will read from the istream, providing deserialization capabilities.
Definition: iarchive.hpp:60
std::vector< column_metadata_ptr > metadata_vect
const T & get() const
std::shared_ptr< column_metadata > create_cleared_copy() const
flex_type_enum get_type() const
#define GL_HOT_INLINE_FLATTEN
void check_fixed_column_size(const flexible_type &v) const GL_HOT_INLINE_FLATTEN
void setup(bool is_target_column, const std::string &name, const std::shared_ptr< sarray< flexible_type > > &column, const std::map< std::string, ml_column_mode > &mode_overrides, const std::map< std::string, flexible_type > &options)
The serialization output archive object which, provided with a reference to an ostream, will write to the ostream, providing serialization capabilities.
Definition: oarchive.hpp:80
static GL_HOT_INLINE_FLATTEN bool mode_is_indexed(ml_column_mode mode)
void save(turi::oarchive &oarc) const
#define DASSERT_TRUE(cond)
Definition: assertions.hpp:364
#define BEGIN_OUT_OF_PLACE_SAVE(arc, tname, tval)
Macro to make it easy to define out-of-place saves.
Definition: oarchive.hpp:346