Turi Create  4.0
metadata.hpp
1 /* Copyright © 2017 Apple Inc. All rights reserved.
2  *
3  * Use of this source code is governed by a BSD-3-clause license that can
4  * be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
5  */
6 #ifndef TURI_DML_METADATA_H_
7 #define TURI_DML_METADATA_H_
8 
9 #include <ml/ml_data/ml_data_column_modes.hpp>
10 #include <ml/ml_data/data_storage/internal_metadata.hpp>
11 
12 namespace turi {
13 
14 struct metadata_load;
15 class ml_data;
16 
17 
18 /**
19  * \ingroup mldata
20  * ml_metadata provides all the column-wise statistics and column translation
21  * information for \ref ml_data.
22  */
23 class ml_metadata {
24  public:
25 
26  ml_metadata(){}
27  ml_metadata(const ml_metadata&) = delete;
28 
29  /** Returns true if there is a target column present and false
30  * otherwise.
31  */
32  inline bool has_target() const;
33 
34  ////////////////////////////////////////////////////////////////////////////////
35  // Accessing the indexers
36 
37  /** Returns true if the underlying column type is indexed, and false
38  * otherwise. This differs form the is_categorical in that
39  * dictionaries are not treated as pure categorical variables, as
40  * they have values associated with them, but they are indexed.
41  *
42  * \param column_index The index of the column.
43  */
44  inline bool is_indexed(size_t column_index) const;
45 
46  /** Returns true if the underlying column type is indexed, and false
47  * otherwise. This differs form the is_categorical in that
48  * dictionaries are not treated as pure categorical variables, as
49  * they have values associated with them, but they are indexed.
50  *
51  * \overload
52  *
53  * \param column_name The name of the column.
54  */
55  inline bool is_indexed(const std::string& column_name) const;
56 
57  /** Returns the indexer for a particular column.
58  *
59  * \param column_index The index of the column.
60  */
61  inline const std::shared_ptr<ml_data_internal::column_indexer>&
62  indexer(size_t column_index) const;
63 
64  /** Returns the indexer for a particular column.
65  *
66  * \overload
67  *
68  * \param column_name The name of the column.
69  */
70  inline const std::shared_ptr<ml_data_internal::column_indexer>&
71  indexer(const std::string& column_name) const;
72 
73  /** Returns true if the underlying target type is indexed, and false
74  * otherwise. This differs form the is_categorical in that
75  * dictionaries are not treated as pure categorical variables, as
76  * they have values associated with them, but they are indexed.
77  */
78  inline bool target_is_indexed() const;
79 
80  inline const std::shared_ptr<ml_data_internal::column_indexer>& target_indexer() const;
81 
82  ////////////////////////////////////////////////////////////////////////////////
83  // Accessing the statistics
84 
85  /** Returns the statistics for a particular column.
86  *
87  * \param column_index The index of the column.
88  */
89  inline const std::shared_ptr<ml_data_internal::column_statistics>&
90  statistics(size_t column_index) const;
91 
92  /** Returns the statistics for a particular column.
93  *
94  * \overload
95  *
96  * \param column_name The name of the column.
97  */
98  inline const std::shared_ptr<ml_data_internal::column_statistics>&
99  statistics(const std::string& column_name) const;
100 
101  inline const std::shared_ptr<ml_data_internal::column_statistics>& target_statistics() const;
102 
103 
104  ////////////////////////////////////////////////////////////////////////////////
105  // Aggregate statistics of the columns
106 
107  /** Returns the number of columns present.
108  */
109  inline size_t num_columns(bool include_untranslated_columns = true) const;
110 
111  /** Returns the number of untranslated columns present.
112  */
113  inline size_t num_untranslated_columns() const;
114 
115  /** Returns true if there are translated columns present, and false
116  * otherwise.
117  */
118  inline bool has_translated_columns() const;
119 
120  /** Returns true if there are untranslated columns present, and false
121  * otherwise.
122  */
123  inline bool has_untranslated_columns() const;
124 
125  /** Returns the name of the column at column_index.
126  *
127  * \param column_index The index of the column.
128  */
129  inline const std::string& column_name(size_t column_index) const;
130 
131  /** Returns all column names as a vector.
132  */
133  std::vector<std::string> column_names() const;
134 
135  /** Returns the index of the column matching column_name, or throws
136  * an error if it does not exist.
137  *
138  * \param column_name The name of the column.
139  *
140  * \param max_on_error If true, then size_t(-1) is returned if the
141  * column is not present.
142  */
143  inline size_t column_index(const std::string& column_name, bool max_on_error = false) const;
144 
145  /**
146  * Returns true if the metadata contains the given column.
147  *
148  * \param column_name The name of the column.
149  */
150  inline bool contains_column(const std::string& column_name) const;
151 
152  /** Returns the name of the column at column_index.
153  */
154  inline const std::string& target_column_name() const;
155 
156  /** Returns the current index size of the columns in the metadata.
157  *
158  * \param column_index The index of the column.
159  */
160  inline size_t column_size(size_t column_index) const;
161 
162  /** If the type of the column is an ND vector, returns the shape of the nd_vector
163  * held by that coulmn.
164  *
165  * \param column_index The index of the column.
166  */
167  inline const flex_nd_vec::index_range_type& nd_column_shape(size_t column_index) const;
168 
169  /** If the type of the column is an ND vector, returns the shape of the nd_vector
170  * held by that coulmn.
171  *
172  * \param column_index The index of the column.
173  */
174  inline const flex_nd_vec::index_range_type& nd_column_shape(const std::string& column_name) const;
175 
176  /** Returns the current index size of the columns in the metadata.
177  */
178  inline size_t target_column_size() const;
179 
180  ////////////////////////////////////////////////////////////////////////////////
181  // Index sizes
182 
183  /** Returns the index size of the column in the metadata that were
184  * present at train time. Index size differs from column size in
185  * that column_size may grow on test, but index_size is constant.
186  *
187  * \param column_index The index of the column.
188  */
189  inline size_t index_size(size_t column_index) const;
190 
191  /** Returns the index size of the column in the metadata that were
192  * present at train time. Index size differs from column size in
193  * that column_size may grow on test, but index_size is constant.
194  *
195  * \overload
196  *
197  * \param column_name The name of the column.
198  */
199  inline size_t index_size(const std::string& column_name) const;
200 
201  /** Returns the global index offset of the columns in the metadata
202  * that were present at train time. This is fixed at setup time;
203  * global indices for the column c_idx are in the interval
204  * [global_index_offset(c_idx), global_index_offset(c_idx) + index_size(c_idx) - 1]
205  *
206  * \param column_index The index of the column.
207  */
208  inline size_t global_index_offset(size_t column_index) const;
209 
210  /** Returns the global index offset of the columns in the metadata
211  * that were present at train time. This is fixed at setup time;
212  * global indices for the column c_idx are in the interval
213  * [global_index_offset(c_idx), global_index_offset(c_idx) + index_size(c_idx) - 1]
214  *
215  * \overload
216  *
217  * \param column_name The name of the column.
218  */
219  inline size_t global_index_offset(const std::string& column_name) const;
220 
221  /** Returns the index size of the columns in the metadata that were
222  * present at train time. Index size differs from column size in
223  * that column_size may grow on test, but index_size is constant.
224  */
225  inline size_t target_index_size() const;
226 
227  /** Returns the number of distinct dimensions, including all
228  * categorical features, etc.
229  */
230  inline size_t num_dimensions() const;
231 
232  ////////////////////////////////////////////////////////////////////////////////
233  // Accessing flags of the columns
234 
235  /** Returns true if the underlying type is treated as a categorical
236  * variable, and false otherwise.
237  *
238  * \param column_index The index of the column.
239  */
240  inline bool is_categorical(size_t column_index) const;
241 
242  /** Returns true if the underlying type is treated as a categorical
243  * variable, and false otherwise.
244  *
245  * \overload
246  *
247  * \param column_name The name of the column.
248  */
249  inline bool is_categorical(const std::string& column_name) const;
250 
251  /** Returns true if the underlying target type is treated as a
252  * categorical variable, and false otherwise.
253  *
254  * \overload
255  *
256  * \param column_name The name of the column.
257  */
258  inline bool target_is_categorical() const;
259 
260  /** Returns true if the underlying column type is untranslated.
261  * This means it will only be available as flexible_type later on.
262  *
263  * \param column_index The index of the column.
264  */
265  inline bool is_untranslated_column(size_t column_index) const;
266 
267  /** Returns true if the underlying column type is untranslated.
268  * This means it will only be available as flexible_type later on.
269  *
270  * \overload
271  *
272  * \param column_name The name of the column.
273  */
274  inline bool is_untranslated_column(const std::string& column_name) const;
275 
276  /** Returns the mode of the column. See ml_data_column_modes.hpp
277  * for details on the column modes.
278  *
279  * \param column_index The index of the column.
280  */
281  inline ml_column_mode column_mode(size_t column_index) const;
282 
283  /** Returns the mode of the column. See ml_data_column_modes.hpp
284  * for details on the column modes.
285  *
286  * \overload
287  *
288  * \param column_name The name of the column.
289  */
290  inline ml_column_mode column_mode(const std::string& column_name) const;
291 
292  /** Returns the mode of the target column. See
293  * ml_data_column_modes.hpp for details on the column modes.
294  */
295  inline ml_column_mode target_column_mode() const;
296 
297  /** Returns the size of the columns in the metadata that were
298  * present at train time.
299  *
300  * \param column_index The index of the column.
301  */
302  inline flex_type_enum column_type(size_t column_index) const;
303 
304  /** Returns the size of the columns in the metadata that were
305  * present at train time.
306  *
307  * \overload
308  *
309  * \param column_name The name of the column.
310  */
311  inline flex_type_enum column_type(const std::string& column_name) const;
312 
313  /** Returns the mode of the target column. See
314  * ml_data_column_modes.hpp for details on the column modes.
315  */
316  inline flex_type_enum target_column_type() const;
317 
318  ////////////////////////////////////////////////////////////////////////////////
319  // Other information.
320 
321  /** Serialization version.
322  */
323  size_t get_version() const { return 3; }
324 
325  /**
326  * Returns the feature name of a specific feature present in the metadata.
327  *
328  * Numeric columns are represented by the column name.
329  *
330  * Categorical / Categorical List / Dictionary columns are represented by
331  * "name[category]".
332  *
333  * Vectors are represented by "vector[index]", where index is numerical.
334  *
335  * ND vectors are represented by "nd_vector[idx1,idx2]" etc.
336  *
337  * \returns Names of features
338  */
339  std::string feature_name(size_t column_idx, size_t index, bool quote_string_values = false) const;
340 
341  /**
342  * Returns a list of all the feature names present in the metadata.
343  *
344  * Numeric columns are represented by the column name.
345  *
346  * Categorical / Categorical List / Dictionary columns are represented by
347  * "name[category]".
348  *
349  * Vectors are represented by "vector[index]", where index is numerical.
350  *
351  * ND vectors are represented by "nd_vector[idx1,idx2]" etc.
352  *
353  * \returns Names of features
354  */
355  std::vector<std::string> feature_names(bool unpack_categorical_columns = true) const;
356 
357  /** Serialization -- save.
358  */
359  void save(turi::oarchive& oarc) const;
360 
361  /** Serialization -- load.
362  */
363  void load(turi::iarchive& iarc);
364 
365  /** Sets the values of all future calls to index_size() to return
366  * the column_size values currently present in the indexers. This
367  * is done automatically at the end of fill(), but it can be useful
368  * if more is done to the indexers after that that is still
369  * considered part of training.
370  */
372 
373  /** Returns a pointer to the internal column metadata of column
374  * column_index. Useful for dealing with the column_index
375  */
376  inline ml_data_internal::column_metadata_ptr get_column_metadata(size_t column_index) const;
377 
378 
379 
380 #ifndef NDEBUG
381  void _debug_is_equal(const std::shared_ptr<ml_metadata>& m) const;
382 #else
383  void _debug_is_equal(const std::shared_ptr<ml_metadata>& m) const {}
384 #endif
385 
386  private:
387 
388  ////////////////////////////////////////////////////////////////////////////////
389  //
390  // Internal routines for the data stuff
391 
392  ////////////////////////////////////////////////////////////////////////////////
393  // Data
394 
395  friend class ml_data;
396  friend struct turi::metadata_load;
397 
398  // column-specific metadata
399  std::vector<ml_data_internal::column_metadata_ptr> columns;
400  ml_data_internal::column_metadata_ptr target;
401 
402  // The original names of the columns. This may be in a different
403  // order than the columns above (e.g. user and items are moved to
404  // index 0 and 1 in the recommender). this allows us to reorder the
405  // columns as needed
406  std::vector<std::string> original_column_names;
407 
408  // Cached values; this is a small optimization here to allow
409  // statistics to be used in time-sensitive places. The
410  // setup_cached_values function prepares these from the current
411  // metadata. This is only called by the load and creation routines.
412  void setup_cached_values();
413 
414  size_t _num_dimensions = size_t(-1);
415  size_t _num_untranslated_columns = size_t(-1);
416  std::map<std::string, size_t> _column_name_to_index_map;
417 
418  // Cached variables for the row metadata stuff. This allows us to
419  // quickly translate other things on the fly.
420  friend class ml_data_row_reference;
421  ml_data_internal::row_metadata cached_rm_with_target;
422  ml_data_internal::row_metadata cached_rm_without_target;
423 
424 };
425 
426 }
427 
428 // Include the implementations of the above inline functions.
429 #include <ml/ml_data/metadata_impl.hpp>
430 
431 ////////////////////////////////////////////////////////////////////////////////
432 // Implement serialization for
433 // std::shared_ptr<ml_metadata>
434 
435 BEGIN_OUT_OF_PLACE_SAVE(arc, std::shared_ptr<turi::ml_metadata>, m) {
436  if(m == nullptr) {
437  arc << false;
438  } else {
439  arc << true;
440  arc << (*m);
441  }
442 } END_OUT_OF_PLACE_SAVE()
443 
444 BEGIN_OUT_OF_PLACE_LOAD(arc, std::shared_ptr<turi::ml_metadata>, m) {
445  bool is_not_nullptr;
446  arc >> is_not_nullptr;
447  if(is_not_nullptr) {
448  m.reset(new turi::ml_metadata);
449  arc >> (*m);
450  } else {
451  m = std::shared_ptr<turi::ml_metadata>(nullptr);
452  }
453 } END_OUT_OF_PLACE_LOAD()
454 
455 #endif
const flex_nd_vec::index_range_type & nd_column_shape(size_t column_index) const
#define BEGIN_OUT_OF_PLACE_LOAD(arc, tname, tval)
Macro to make it easy to define out-of-place loads.
Definition: iarchive.hpp:314
std::vector< std::string > column_names() const
size_t target_column_size() const
void set_training_index_sizes_to_current_column_sizes()
std::string feature_name(size_t column_idx, size_t index, bool quote_string_values=false) const
bool has_translated_columns() const
The serialization input archive object which, provided with a reference to an istream, will read from the istream, providing deserialization capabilities.
Definition: iarchive.hpp:60
bool is_categorical(size_t column_index) const
size_t column_size(size_t column_index) const
bool has_untranslated_columns() const
bool is_indexed(size_t column_index) const
const std::string & column_name(size_t column_index) const
const std::string & target_column_name() const
bool target_is_indexed() const
size_t num_untranslated_columns() const
const std::shared_ptr< ml_data_internal::column_indexer > & indexer(size_t column_index) const
ml_column_mode target_column_mode() const
void save(turi::oarchive &oarc) const
const std::shared_ptr< ml_data_internal::column_statistics > & statistics(size_t column_index) const
size_t index_size(size_t column_index) const
ml_column_mode column_mode(size_t column_index) const
flex_type_enum target_column_type() const
size_t target_index_size() const
size_t column_index(const std::string &column_name, bool max_on_error=false) const
size_t get_version() const
Definition: metadata.hpp:323
size_t num_dimensions() const
std::vector< std::string > feature_names(bool unpack_categorical_columns=true) const
size_t num_columns(bool include_untranslated_columns=true) const
void load(turi::iarchive &iarc)
ml_data_internal::column_metadata_ptr get_column_metadata(size_t column_index) const
bool target_is_categorical() const
bool contains_column(const std::string &column_name) const
The serialization output archive object which, provided with a reference to an ostream, will write to the ostream, providing serialization capabilities.
Definition: oarchive.hpp:80
bool has_target() const
bool is_untranslated_column(size_t column_index) const
size_t global_index_offset(size_t column_index) const
#define BEGIN_OUT_OF_PLACE_SAVE(arc, tname, tval)
Macro to make it easy to define out-of-place saves.
Definition: oarchive.hpp:346
flex_type_enum column_type(size_t column_index) const