Turi Create  4.0
internal_metadata.hpp
1 /* Copyright © 2017 Apple Inc. All rights reserved.
2  *
3  * Use of this source code is governed by a BSD-3-clause license that can
4  * be found in the LICENSE.txt file or at
5  * https://opensource.org/licenses/BSD-3-Clause
6  */
7 #ifndef TURI_DML_DATA_COLUMN_METADATA_H_
8 #define TURI_DML_DATA_COLUMN_METADATA_H_
9 
10 #include <ml/ml_data/column_indexer.hpp>
11 #include <ml/ml_data/column_statistics.hpp>
12 #include <ml/ml_data/ml_data_column_modes.hpp>
13 #include <core/storage/sframe_data/sarray.hpp>
14 
15 namespace turi {
16 
17 struct metadata_load;
18 class ml_metadata;
19 
20 namespace ml_data_internal {
21 
22 /** The metadata information for a single column. This is meant to
23  * be used internally to ml_data; there is no reason that the
24  * structures outside of ml_data need to access this; ml_metadata
25  * should be used instead.
26  *
27  * This structure is necessary as many of the internal processing
28  * routines use a vector of column metadata to handle all the
29  * processing. Having this structure, which organizes all the parts
30  * of the column metadata into one place, greatly simplifies this
31  * processing.
32  */
34  ////////////////////////////////////////////////////////////////////////////////
35  // Public data members
36 
37  std::string name = "";
38  ml_column_mode mode;
39  flex_type_enum original_column_type;
40  std::shared_ptr<ml_data_internal::column_indexer> indexer = nullptr;
41  std::shared_ptr<ml_data_internal::column_statistics> statistics = nullptr;
42 
43  ////////////////////////////////////////////////////////////////////////////////
44  // Construction
45 
46  /** Generates a new column_metadata class using the data arrays and
47  * the types.
48  */
49  void setup(bool is_target_column, const std::string& name,
50  const std::shared_ptr<sarray<flexible_type> >& column,
51  const std::map<std::string, ml_column_mode>& mode_overrides);
52 
53  /** Finalize training.
54  */
56  void set_training_index_offset(size_t previous_total);
57 
58  ////////////////////////////////////////////////////////////////////////////////
59  // Some data sizing stuff
60 
61  /** Returns true if this column has a fixed data size recorded, i.e.
62  * column_data_size_if_fixed differs from the size_t(-1) sentinel,
63  * and false otherwise.
64  */
65  bool mode_has_fixed_size() const {
66  bool has_fixed_size = (column_data_size_if_fixed != size_t(-1));
67  // Consistency check: the stored size must agree with what the free function turi::mode_has_fixed_size reports for this mode.
68  DASSERT_TRUE(has_fixed_size == turi::mode_has_fixed_size(mode));
69 
70  return has_fixed_size;
71  }
72 
73  /** Returns true if this column is untranslated and false otherwise.
74  */
75  bool is_untranslated_column() const {
76  return (mode == ml_column_mode::UNTRANSLATED);
77  }
78 
79  /** Returns the size of the index recorded at training time. Asserts
80  * (DASSERT_TRUE) that the value is no longer the size_t(-1) sentinel. */
81  size_t index_size() const {
82  DASSERT_TRUE(index_size_at_train_time != size_t(-1));
83  return index_size_at_train_time;
84  }
85 
86  /** Returns the global index offset recorded at training time
87  * (global_index_offset_at_train_time). */
88  size_t global_index_offset() const {
89  // Both the index size and the global offset must already be set, i.e. not size_t(-1).
90  DASSERT_TRUE(index_size_at_train_time != size_t(-1));
91  DASSERT_TRUE(global_index_offset_at_train_time != size_t(-1));
92  return global_index_offset_at_train_time;
93  }
94 
95  /** For debug testing. Make sure things are equal. When NDEBUG is not
96  * defined this is only declared here; with NDEBUG it is an empty no-op. */
97 #ifndef NDEBUG
98  void _debug_is_equal(const column_metadata& other) const;
99 #else
100  void _debug_is_equal(const column_metadata& other) const {}
101 #endif
102 
103  private:
104  friend struct turi::metadata_load;
105 
106  /** index_size_at_train_time holds the size of the index at training
107  * time; column_data_size_if_fixed holds the size of the numeric column
108  * if it is fixed. Each is size_t(-1) when not set. */
109  size_t index_size_at_train_time = size_t(-1);
110  size_t column_data_size_if_fixed = size_t(-1);
111 
112  /** To be used only if it's an ndarray column type.
113  */
114  flex_nd_vec::index_range_type nd_array_size;
115 
116  size_t global_index_offset_at_train_time = size_t(-1);
117 
118  public:
119  const size_t fixed_column_size() const {
121  return column_data_size_if_fixed;
122  }
123 
124  /** During loading, we need to verify that the columns indeed have
125  * the correct column sizes.
126  */
127  inline void check_fixed_column_size(const flexible_type& v) const
129 
130  /** Returns the current size of the column.
131  *
132  */
133  inline size_t column_size() const {
134  if (mode_is_indexed(mode)) {
135  return indexer->indexed_column_size();
136  } else {
138  return column_data_size_if_fixed;
139  }
140  }
141 
142  /** Returns the current shape of the column as if it's an nd_vec.
143  * The result is a const reference to the stored nd_array_size member.
144  */
145  inline const flex_nd_vec::index_range_type& nd_column_shape() const {
147  return nd_array_size;
148  }
149 
150 
151  /** Serialization -- save.
152  */
153  void save(turi::oarchive& oarc) const;
154 
155  /** Serialization -- load.
156  */
157  void load(turi::iarchive& iarc);
158 };
159 
160 typedef std::shared_ptr<column_metadata> column_metadata_ptr;
161 
162 ////////////////////////////////////////////////////////////////////////////////
163 
164 /** This structure holds the main data being passed around
165  * internally. It contains all the information needed to quickly
166  * unpack a row from the internal data structure.
167  */
168 struct row_metadata {
169  row_metadata() {}
170 
171  /** Constructs all the information from a vector of column metadata
172  * pointers, plus a flag saying whether a target is present. */
173  void setup(
174  const std::vector<std::shared_ptr<column_metadata> >& _metadata_vect,
175  bool _has_target);
176 
177  /** Sets the index sizes from the given ml_metadata.
178  * NOTE(review): defined outside this header -- confirm exact behavior. */
179  void set_index_sizes(const std::shared_ptr<ml_metadata>& m);
180 
181  bool has_target = false;
182  bool target_is_indexed = false;
183 
184  /** True if the data size is constant, and false otherwise.
185  */
186  bool data_size_is_constant = false;
187 
188  /** If the data size is constant, then this gives its
189  * size. Otherwise, it's set to 0.
190  */
191  size_t constant_data_size = 0;
192 
193  /** To be used only if it's an ndarray column type.
194  */
195  flex_nd_vec::index_range_type nd_array_size;
196 
197  /** Number of columns, not including target.
198  */
199  size_t num_x_columns = 0;
200 
201  /** Total number of columns, including possible target.
202  */
203  size_t total_num_columns = 0;
204 
205  /** Pointers to the original metadata vectors.
206  */
207  std::vector<column_metadata_ptr> metadata_vect;
208 
209  /** Serialization -- save.
210  */
211  void save(turi::oarchive& oarc) const;
212 
213  /** Serialization -- load.
214  */
215  void load(turi::iarchive& iarc);
216 
217 #ifndef NDEBUG
218  void _debug_is_equal(const row_metadata& rm) const;
219 #else
220  void _debug_is_equal(const row_metadata& rm) const {}
221 #endif
222 };
223 
224 ////////////////////////////////////////////////////////////////////////////////
225 // A few implementations of the above functions
226 
228  const flexible_type& f) const {
229 
230  auto throw_error_1d = [&](size_t nv) GL_GCC_ONLY(GL_COLD_NOINLINE) {
231  log_and_throw(std::string("Dataset mismatch. Numeric feature '") + name +
232  "' must contain lists of consistent size. (Found "
233  "lists/arrays of sizes " +
234  std::to_string(nv) + " and " +
235  std::to_string(column_data_size_if_fixed) + ").");
236  };
237 
238  auto throw_error_nd =
239  [&](const flex_nd_vec::index_range_type& shape)
240  GL_GCC_ONLY(GL_COLD_NOINLINE) {
241 
242  if (shape.size() == 1 && nd_array_size.size() <= 1) {
243  throw_error_1d(shape[0]);
244  } else {
245  std::ostringstream error;
246 
247  error << "Dataset mismatch. Numeric feature '" << name
248  << "' must contain lists of consistent size. (Found "
249  "lists/arrays of sizes ";
250 
251  if (nd_array_size.empty()) {
252  error << "(" << column_data_size_if_fixed << ",)";
253  } else {
254  error << "(";
255  for (const auto& d : nd_array_size) {
256  error << d << ",";
257  }
258  error << ")";
259  }
260 
261  error << " and (";
262  for (const auto& d : shape) {
263  error << d << ",";
264  }
265  error << ").";
266  }
267  };
268 
269  // The only mode needed to be checked right now is this one.
270  if (mode == ml_column_mode::NUMERIC_VECTOR) {
271  DASSERT_TRUE(column_data_size_if_fixed != size_t(-1));
272  DASSERT_LE(nd_array_size.size(), 1);
273 
274  if (f.get_type() == flex_type_enum::VECTOR) {
275  const flex_vec& v = f.get<flex_vec>();
276  size_t nv = v.size();
277 
278  DASSERT_TRUE(column_data_size_if_fixed != size_t(-1));
279 
280  if (UNLIKELY(nv != column_data_size_if_fixed)) {
281  throw_error_1d(nv);
282  }
283  } else if (f.get_type() == flex_type_enum::ND_VECTOR) {
284  const flex_nd_vec& v = f.get<flex_nd_vec>();
285  const auto& shape = v.shape();
286 
287  if(UNLIKELY(shape.size() != 1 || shape[0] != column_data_size_if_fixed)) {
288  throw_error_nd(shape);
289  }
290  } else {
291  ASSERT_TRUE(false);
292  }
293 
294  } else if (mode == ml_column_mode::NUMERIC_ND_VECTOR) {
295  DASSERT_TRUE(column_data_size_if_fixed != size_t(-1));
296  DASSERT_FALSE(nd_array_size.empty());
297 
298  if (UNLIKELY(f.get_type() == flex_type_enum::VECTOR)) {
299  const flex_vec& v = f.get<flex_vec>();
300  if (nd_array_size.size() != 1) {
301  throw_error_nd({v.size()});
302  }
303  size_t nv = v.size();
304 
305  DASSERT_TRUE(column_data_size_if_fixed != size_t(-1));
306 
307  if (UNLIKELY(nv != column_data_size_if_fixed)) {
308  throw_error_1d(nv);
309  }
310 
311  } else if (f.get_type() == flex_type_enum::ND_VECTOR) {
312  const flex_nd_vec& v = f.get<flex_nd_vec>();
313  const auto& shape = v.shape();
314 
315  if(UNLIKELY(shape.size() != nd_array_size.size())) {
316  throw_error_nd(shape);
317  }
318 
319  for(size_t i = 0; i < shape.size(); ++i) {
320  if (UNLIKELY(shape[i] != nd_array_size[i])) {
321  throw_error_nd(shape);
322  }
323  }
324  } else {
325  ASSERT_TRUE(false);
326  }
327  }
328 }
329 
330 }}
331 
332 ////////////////////////////////////////////////////////////////////////////////
333 // Implement serialization for
334 // std::shared_ptr<column_metadata>
335 
337  arc, std::shared_ptr<turi::ml_data_internal::column_metadata>, m) {
338  if (m == nullptr) {
339  arc << false;
340  } else {
341  arc << true;
342  arc << (*m);
343  }
344 }
345 END_OUT_OF_PLACE_SAVE()
346 
348  arc, std::shared_ptr<turi::ml_data_internal::column_metadata>, m) {
349  bool is_not_nullptr;
350  arc >> is_not_nullptr;
351  if (is_not_nullptr) {
353  arc >> (*m);
354  } else {
355  m = std::shared_ptr<turi::ml_data_internal::column_metadata>(nullptr);
356  }
357 }
358 END_OUT_OF_PLACE_LOAD()
359 
360 #endif /* TURI_DML_DATA_COLUMN_METADATA_H_ */
static GL_HOT_INLINE_FLATTEN bool mode_has_fixed_size(ml_column_mode mode)
void save(turi::oarchive &oarc) const
#define BEGIN_OUT_OF_PLACE_LOAD(arc, tname, tval)
Macro to make it easy to define out-of-place loads.
Definition: iarchive.hpp:314
std::vector< double > flex_vec
The serialization input archive object which, provided with a reference to an istream, will read from the istream, providing deserialization capabilities.
Definition: iarchive.hpp:60
void _debug_is_equal(const column_metadata &other) const
const flex_nd_vec::index_range_type & nd_column_shape() const
flex_nd_vec::index_range_type nd_array_size
void check_fixed_column_size(const flexible_type &v) const GL_HOT_INLINE_FLATTEN
const index_range_type & shape() const
Definition: ndarray.hpp:312
void load(turi::iarchive &iarc)
const T & get() const
flex_type_enum get_type() const
#define DASSERT_FALSE(cond)
Definition: assertions.hpp:365
void setup(bool is_target_column, const std::string &name, const std::shared_ptr< sarray< flexible_type > > &column, const std::map< std::string, ml_column_mode > &mode_overrides)
#define GL_HOT_INLINE_FLATTEN
#define ASSERT_TRUE(cond)
Definition: assertions.hpp:309
The serialization output archive object which, provided with a reference to an ostream, will write to the ostream, providing serialization capabilities.
Definition: oarchive.hpp:80
static GL_HOT_INLINE_FLATTEN bool mode_is_indexed(ml_column_mode mode)
#define DASSERT_TRUE(cond)
Definition: assertions.hpp:364
#define BEGIN_OUT_OF_PLACE_SAVE(arc, tname, tval)
Macro to make it easy to define out-of-place saves.
Definition: oarchive.hpp:346
std::vector< column_metadata_ptr > metadata_vect