Turi Create  4.0
metadata.hpp
1 /* Copyright © 2017 Apple Inc. All rights reserved.
2  *
3  * Use of this source code is governed by a BSD-3-clause license that can
4  * be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
5  */
6 #ifndef TURI_ML2_METADATA_H_
7 #define TURI_ML2_METADATA_H_
8 
9 #include <toolkits/ml_data_2/ml_data_column_modes.hpp>
10 #include <toolkits/ml_data_2/data_storage/internal_metadata.hpp>
11 #include <core/export.hpp>
12 
13 namespace turi {
14 
15 struct metadata_load;
16 
17 namespace v2 {
18 class ml_data;
19 class ml_data_side_features;
20 
21 class EXPORT ml_metadata { // Metadata (indexers, statistics, modes, types, options) for the columns and target used by ml_data.
22  public:
23 
24  ml_metadata(){}
25  ml_metadata(const ml_metadata&) = delete;
26 
27  /** Returns true if there is a target column present and false
28  * otherwise.
29  */
30  inline bool has_target() const;
31 
32  ////////////////////////////////////////////////////////////////////////////////
33  // Accessing the indexers
34 
35  /** Returns true if the underlying column type is indexed, and false
36  * otherwise. This differs from is_categorical in that
37  * dictionaries are not treated as pure categorical variables, as
38  * they have values associated with them, but they are indexed.
39  *
40  * \param column_index The index of the column.
41  */
42  inline bool is_indexed(size_t column_index) const;
43 
44  /** Returns true if the underlying column type is indexed, and false
45  * otherwise. This differs from is_categorical in that
46  * dictionaries are not treated as pure categorical variables, as
47  * they have values associated with them, but they are indexed.
48  *
49  * \overload
50  *
51  * \param column_name The name of the column.
52  */
53  inline bool is_indexed(const std::string& column_name) const;
54 
55  /** Returns the indexer for a particular column, even if that
56  * column is a side feature.
57  *
58  * \param column_index The index of the column.
59  */
60  inline const std::shared_ptr<ml_data_internal::column_indexer>&
61  indexer(size_t column_index) const;
62 
63  /** Returns the indexer for a particular column, even if that
64  * column is a side feature.
65  *
66  * \overload
67  *
68  * \param column_name The name of the column.
69  */
70  inline const std::shared_ptr<ml_data_internal::column_indexer>&
71  indexer(const std::string& column_name) const;
72 
73  /** Returns true if the underlying target type is indexed, and false
74  * otherwise. This differs from target_is_categorical in that
75  * dictionaries are not treated as pure categorical variables, as
76  * they have values associated with them, but they are indexed.
77  */
78  inline bool target_is_indexed() const;
79 
80  /** Returns the indexer for the target column, as a shared_ptr to
81  * a column_indexer.
82  */
83  inline const std::shared_ptr<ml_data_internal::column_indexer>& target_indexer() const;
84 
85  ////////////////////////////////////////////////////////////////////////////////
86  // Accessing the statistics
87 
88  /** Returns the statistics for a particular column, even if that
89  * column is a side feature.
90  *
91  * \param column_index The index of the column.
92  */
93  inline const std::shared_ptr<ml_data_internal::column_statistics>&
94  statistics(size_t column_index) const;
95 
96  /** Returns the statistics for a particular column, even if that
97  * column is a side feature.
98  *
99  * \overload
100  *
101  * \param column_name The name of the column.
102  */
103  inline const std::shared_ptr<ml_data_internal::column_statistics>&
104  statistics(const std::string& column_name) const;
105 
106  /** Returns the statistics for the target column, as a shared_ptr
107  * to a column_statistics object.
108  */
109  inline const std::shared_ptr<ml_data_internal::column_statistics>& target_statistics() const;
110 
111 
112  ////////////////////////////////////////////////////////////////////////////////
113  // Aggregate statistics of the columns
114 
115  /** Returns the number of columns present, including any possible
116  * side columns.
117  */
118  inline size_t num_columns(bool include_side_columns_if_present = true,
119  bool include_untranslated_columns = true) const;
120 
121  /** Returns the number of untranslated columns present.
122  */
123  inline size_t num_untranslated_columns() const;
124 
125  /** Returns true if there are translated columns present, and false
126  * otherwise.
127  */
128  inline bool has_translated_columns() const;
129 
130  /** Returns true if there are untranslated columns present, and false
131  * otherwise.
132  */
133  inline bool has_untranslated_columns() const;
134 
135  /** Returns the name of the column at column_index.
136  *
137  * \param column_index The index of the column.
138  */
139  inline const std::string& column_name(size_t column_index) const;
140 
141  /** Returns all column names as a vector.
142  */
143  std::vector<std::string> column_names(bool include_side_columns_if_present = true) const;
144 
145  /** Returns the index of the column matching column_name, or throws
146  * an error if it does not exist.
147  *
148  * \param column_name The name of the column.
149  */
150  inline size_t column_index(const std::string& column_name) const;
151 
152  /**
153  * Returns true if the metadata contains the given column.
154  *
155  * \param column_name The name of the column.
156  */
157  inline bool contains_column(const std::string& column_name) const;
158 
159  /** Returns the name of the target column.
160  */
161  inline const std::string& target_column_name() const;
162 
163  /** Returns the current index size of the given column in the metadata.
164  *
165  * \param column_index The index of the column.
166  */
167  inline size_t column_size(size_t column_index) const;
168 
169  /** Returns the current index size of the target column in the metadata.
170  */
171  inline size_t target_column_size() const;
172 
173  ////////////////////////////////////////////////////////////////////////////////
174  // Index sizes
175 
176  /** Returns the index size of the column in the metadata that was
177  * present at train time. Index size differs from column size in
178  * that column_size may grow on test, but index_size is constant.
179  *
180  * \param column_index The index of the column.
181  */
182  inline size_t index_size(size_t column_index) const;
183 
184  /** Returns the index size of the column in the metadata that was
185  * present at train time. Index size differs from column size in
186  * that column_size may grow on test, but index_size is constant.
187  *
188  * \overload
189  *
190  * \param column_name The name of the column.
191  */
192  inline size_t index_size(const std::string& column_name) const;
193 
194 
195  /** Returns the global index offset of the columns in the metadata
196  * that were present at train time. This is fixed at setup time;
197  * global indices for the column c_idx are in the interval
198  * [global_index_offset(c_idx), global_index_offset(c_idx) + index_size(c_idx) - 1]
199  *
200  * \param column_index The index of the column.
201  */
202  inline size_t global_index_offset(size_t column_index) const;
203 
204  /** Returns the global index offset of the columns in the metadata
205  * that were present at train time. This is fixed at setup time;
206  * global indices for the column c_idx are in the interval
207  * [global_index_offset(c_idx), global_index_offset(c_idx) + index_size(c_idx) - 1]
208  *
209  * \overload
210  *
211  * \param column_name The name of the column.
212  */
213  inline size_t global_index_offset(const std::string& column_name) const;
214 
215  /** Returns the index size of the target column that was present
216  * at train time. Index size differs from column size in
217  * that column_size may grow on test, but index_size is constant.
218  */
219  inline size_t target_index_size() const;
220 
221  /** Returns the number of distinct dimensions, including all
222  * categorical features, side features, etc.
223  */
224  inline size_t num_dimensions() const;
225 
226  ////////////////////////////////////////////////////////////////////////////////
227  // Accessing flags of the columns
228 
229  /** Returns true if the underlying type is treated as a categorical
230  * variable, and false otherwise.
231  *
232  * \param column_index The index of the column.
233  */
234  inline bool is_categorical(size_t column_index) const;
235 
236  /** Returns true if the underlying type is treated as a categorical
237  * variable, and false otherwise.
238  *
239  * \overload
240  *
241  * \param column_name The name of the column.
242  */
243  inline bool is_categorical(const std::string& column_name) const;
244 
245  /** Returns true if the underlying target type is treated as a
246  * categorical variable, and false otherwise.
247  *
248  * This is the target-column counterpart of is_categorical(); it
249  * takes no arguments because it always refers to the target
250  * column.
251  */
252  inline bool target_is_categorical() const;
253 
254  /** Returns true if the underlying column is a side column handled
255  * by an implicit join, and false otherwise.
256  *
257  * \param column_index The index of the column.
258  */
259  inline bool is_side_column(size_t column_index) const;
260 
261  /** Returns true if the underlying column is a side column handled
262  * by an implicit join, and false otherwise.
263  *
264  * \overload
265  *
266  * \param column_name The name of the column.
267  */
268  inline bool is_side_column(const std::string& column_name) const;
269 
270  /** Returns true if the underlying column type is untranslated.
271  * This means it will only be available as flexible_type later on.
272  *
273  * \param column_index The index of the column.
274  */
275  inline bool is_untranslated_column(size_t column_index) const;
276 
277  /** Returns true if the underlying column type is untranslated.
278  * This means it will only be available as flexible_type later on.
279  *
280  * \overload
281  *
282  * \param column_name The name of the column.
283  */
284  inline bool is_untranslated_column(const std::string& column_name) const;
285 
286  /** Returns the mode of the column. See ml_data_column_modes.hpp
287  * for details on the column modes.
288  *
289  * \param column_index The index of the column.
290  */
291  inline ml_column_mode column_mode(size_t column_index) const;
292 
293  /** Returns the mode of the column. See ml_data_column_modes.hpp
294  * for details on the column modes.
295  *
296  * \overload
297  *
298  * \param column_name The name of the column.
299  */
300  inline ml_column_mode column_mode(const std::string& column_name) const;
301 
302  /** Returns the mode of the target column. See
303  * ml_data_column_modes.hpp for details on the column modes.
304  */
305  inline ml_column_mode target_column_mode() const;
306 
307  /** Returns the type of the column at column_index, as a
308  * flex_type_enum value.
309  *
310  * \param column_index The index of the column.
311  */
312  inline flex_type_enum column_type(size_t column_index) const;
313 
314  /** Returns the type of the column named column_name, as a
315  * flex_type_enum value.
316  *
317  * \overload
318  *
319  * \param column_name The name of the column.
320  */
321  inline flex_type_enum column_type(const std::string& column_name) const;
322 
323  /** Returns the type of the target column, as a flex_type_enum
324  * value.
325  */
326  inline flex_type_enum target_column_type() const;
327 
328  ////////////////////////////////////////////////////////////////////////////////
329  // Other information.
330 
331  /** Returns the current options.
332  */
333  inline const std::map<std::string, flexible_type>& get_current_options() const;
334 
335  /**
336  * Returns the feature name of a specific feature present in the metadata.
337  *
338  * Numeric columns are represented by the column name.
339  *
340  * Categorical / Categorical List / Dictionary columns are represented by
341  * "name[category]".
342  *
343  * Vectors are represented by "vector[index]", where index is numerical.
344  *
345  * \returns Name of the feature
346  */
347  std::string feature_name(size_t column_idx, size_t index) const;
348 
349  /**
350  * Returns a list of all the feature names present in the metadata.
351  *
352  * Numeric columns are represented by the column name.
353  *
354  * Categorical / Categorical List / Dictionary columns are represented by
355  * "name[category]".
356  *
357  * Vectors are represented by "vector[index]", where index is numerical.
358  *
359  * \returns Names of features
360  */
361  std::vector<std::string> feature_names(bool unpack_categorical_columns = true) const;
362 
363 
364  /** Serialization version.
365  */
366  size_t get_version() const { return 2; }
367 
368  /** Serialization -- save.
369  */
370  void save(turi::oarchive& oarc) const;
371 
372  /** Serialization -- load.
373  */
374  void load(turi::iarchive& iarc);
375 
376  /** Returns true if there is side data and false otherwise.
377  */
378  inline bool has_side_features() const;
379 
380  /**
381  * Set the missing value action at predict time. The value is stored
382  * in the options map under "missing_value_action_on_predict".
383  */
383  inline void set_missing_value_on_predict(const std::string& missing_value_action) {
384  options["missing_value_action_on_predict"] = missing_value_action;
385  }
386 
387  /** Returns the side feature storage class.
388  */
389  inline std::shared_ptr<ml_data_side_features> get_side_features() const;
390 
391  /** Sets the values of all future calls to index_size() to return
392  * the column_size values currently present in the indexers. This
393  * is done automatically at the end of fill(), but it can be useful
394  * if more is done to the indexers afterwards that is still
395  * considered part of training.
396  */
397  void set_training_index_sizes_to_current_column_sizes();
398 
399  /** Create a new metadata object that shares the same indexing as
400  * the previous one, but has possibly different and possibly
401  * subsetted columns.
402  *
403  * The indexing on the new columns is preserved from the original
404  * metadata. Thus all the index_size, global_index_offset,
405  * etc. remain the same. The indexer classes are shared between
406  * the two metadata objects.
407  *
408  * If columns_with_cleared_metadata is given, any specified columns will have
409  * their index, metadata and statistics cleared. These columns have the
410  * essential metadata -- column dimensions and type -- retained, but all indices
411  * and statistics are reset.
412  *
413  * Example:
414  *
415  * ml_data data_user_item({{"sort_by_first_two_columns_on_train", true}});
416  * data_user_item.fill(X);
417  *
418  * // data_user_item is now sorted by user, then by item.
419  *
420  * ml_data data_item_user(data_user_item.metadata()->select_columns({"item", "user"}));
421  * data_item_user.fill(X);
422  *
423  * // data_item_user is now sorted by item, then by user.
424  *
425  */
426  std::shared_ptr<ml_metadata> select_columns(
427  const std::vector<std::string>& columns, bool include_target = true,
428  const std::vector<std::string>& columns_with_cleared_metadata = {}) const;
429 
430  private:
431 
432  ////////////////////////////////////////////////////////////////////////////////
433  //
434  // Internal routines for the data stuff
435 
436 
437  /** Returns a pointer to the internal column metadata of column
438  * column_index. Useful for dealing with the column_index.
439  */
440  inline ml_data_internal::column_metadata_ptr get_column_metadata(size_t column_index) const;
441 
442  ////////////////////////////////////////////////////////////////////////////////
443  // Data
444 
445  friend class ml_data;
446  friend struct turi::metadata_load;
447 
448  // column-specific metadata
449  std::vector<ml_data_internal::column_metadata_ptr> columns;
450  ml_data_internal::column_metadata_ptr target;
451 
452  // Side features
453  std::shared_ptr<ml_data_side_features> side_features;
454 
455  // The original names of the columns. This may be in a different
456  // order than the columns above (e.g. user and items are moved to
457  // index 0 and 1 in the recommender). This allows us to reorder the
458  // columns as needed.
459  std::vector<std::string> original_column_names;
460 
461  // The options the model was created with.
462  std::map<std::string, flexible_type> options;
463 
464  // Cached values; this is a small optimization here to allow
465  // statistics to be used in time-sensitive places. The
466  // setup_cached_values function prepares these from the current
467  // metadata. This is only called by the load and creation routines.
468  void setup_cached_values();
469 
470  size_t _num_dimensions = size_t(-1); // starts at size_t(-1); presumably filled in by setup_cached_values()
471  size_t _num_untranslated_columns = size_t(-1); // starts at size_t(-1); presumably filled in by setup_cached_values()
472  std::map<std::string, size_t> _column_name_to_index_map;
473 };
474 
475 }}
476 
477 // Include the implementations of the above inline functions.
478 #include <toolkits/ml_data_2/metadata_impl.hpp>
479 
480 ////////////////////////////////////////////////////////////////////////////////
481 // Implement serialization for
482 // std::shared_ptr<ml_metadata>
483 
484 BEGIN_OUT_OF_PLACE_SAVE(arc, std::shared_ptr<turi::v2::ml_metadata>, m) {
485  if(m != nullptr) { // non-null: write a 'present' flag, then the pointed-to metadata
486  arc << true;
487  arc << (*m);
488  } else { // null: the 'absent' flag is the only thing written
489  arc << false;
490  }
491 } END_OUT_OF_PLACE_SAVE()
492 
493 BEGIN_OUT_OF_PLACE_LOAD(arc, std::shared_ptr<turi::v2::ml_metadata>, m) {
494  bool is_not_nullptr = false; // initialized so the branch below never reads an indeterminate value if the extraction leaves it unwritten
495  arc >> is_not_nullptr; // presence flag written first by the matching save
496  if(is_not_nullptr) { // an object follows in the archive: allocate a fresh one and load into it
497  m.reset(new turi::v2::ml_metadata);
498  arc >> (*m);
499  } else { // nothing follows the flag: the loaded pointer is null
500  m = std::shared_ptr<turi::v2::ml_metadata>(nullptr);
501  }
502 } END_OUT_OF_PLACE_LOAD()
503 
504 #endif
#define BEGIN_OUT_OF_PLACE_LOAD(arc, tname, tval)
Macro to make it easy to define out-of-place loads.
Definition: iarchive.hpp:314
The serialization input archive object which, provided with a reference to an istream, will read from the istream, providing deserialization capabilities.
Definition: iarchive.hpp:60
STL namespace.
The serialization output archive object which, provided with a reference to an ostream, will write to the ostream, providing serialization capabilities.
Definition: oarchive.hpp:80
#define BEGIN_OUT_OF_PLACE_SAVE(arc, tname, tval)
Macro to make it easy to define out-of-place saves.
Definition: oarchive.hpp:346