Turi Create  4.0
ml_data.hpp
1 /* Copyright © 2017 Apple Inc. All rights reserved.
2  *
3  * Use of this source code is governed by a BSD-3-clause license that can
4  * be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
5  */
6 #ifndef TURI_DML_DATA_H_
7 #define TURI_DML_DATA_H_
8 
9 #include <vector>
10 #include <memory>
11 #include <core/storage/sframe_data/sframe.hpp>
12 #include <ml/ml_data/metadata.hpp>
13 #include <ml/ml_data/ml_data_entry.hpp>
14 #include <ml/ml_data/ml_data_column_modes.hpp>
15 
16 #include <Eigen/SparseCore>
17 #include <Eigen/Core>
18 
19 namespace turi {
20 
21 class ml_data_iterator;  // Forward declaration: returned by ml_data::get_iterator() and befriended by ml_data; ml/ml_data/ml_data_iterator.hpp is included at the end of this header.
22 
23 namespace ml_data_internal {  // Forward declarations only: ml_data names these types as a friend or inside shared_ptr / sarray members.
24 class ml_data_block_manager;  // Held by ml_data through a std::shared_ptr (block_manager).
25 struct row_data_block;  // Element type of the data_blocks sarray.
26 class ml_data_reconciler;  // Declared a friend of ml_data.
27 }
28 
29 /**
30  * \defgroup mldata ML Data
31  * Data Normalization.
32  * See \ref ml_data for details
33  */
34 
35 /**
36  * \ingroup mldata
37  * Row based, SFrame-Like Data storage for Learning and Optimization tasks.
38  *
39  * `ml_data` is a data normalization datastructure that translates user input tables
40  * (which can contain arbitrary types like strings, lists, dictionaries, etc) into
41  * sparse and dense numeric vectors. This allows toolkits to be implemented in a
42  * way that operates on fully mathematical, numeric assumptions, but support a
43  * much richer surface area outside.
44  *
45  * To support this, `ml_data` is kind of a complicated datastructure that
46  * performs several things.
47  * - interpret string columns as categorical onto a sparse vector representation,
48  * using either one-hot encoding or reference encoding.
49  * - map list columns onto a sparse vector representation.
50  * - map dictionary columns onto a sparse vector representation.
51  * - map dense numeric arrays onto a dense vector representation.
52  * - etc.
53  * Each row of a user input table is hence translated into a mixed
54  * dense-sparse vector. This vector then has to be materialized as an SFrame
55  * (allowing it to scale to datasets larger than memory).
56  *
57  * This can then be used to train other Machine Learning models with.
58  *
59  * Finally, the `ml_data` datastructure has to remember and store the translation
60  * mappings so that the exact procedure can be performed later on new data
61  * (when using the trained model)
62  *
63  * Additionally, `ml_data` also implements strategies for automatic imputation of missing
64  * data. For instance, missing numeric columns can be imputed with the mean,
65  * missing categorical columns can be imputed with the most common value, etc.
66  *
67  *
68  * ml_data loads data from an existing sframe, indexes it by mapping
69  * all categorical values to unique indices in 0, 1,2,...,n, and
70  * records statistics about the values. It then puts it into an
71  * efficient row-based data storage structure for use in learning
72  * algorithms that need fast row-wise iteration through the features
73  * and target. The row based storage structure is designed for fast
74  * iteration through the rows and target. ml_data also speeds up data
75  * access via caching and a compact layout.
76  *
77  *
78  * Illustration of the API
79  * =======================
80  *
81  * Using ml_data
82  * -------------
83  *
84  * There are a number of use cases for ml_data. The following should
85  * address the current use cases.
86  *
87  * ### To construct the data at train time:
88  * \code
89  * // Constructs an empty ml_data object
90  * ml_data data;
91  *
92  * // Sets the data source from X, with target_column_name being the
93  * // target column. (Alternatively, target_column_name may be a
94  * // single-column SFrame giving the target. "" denotes no target
95  * // column present).
96  * data.fill(X, target_column_name);
97  *
98  * // After filling, a serializable shared pointer to the metadata
99  * // can be saved for the predict stage. this->metadata is of type
100  * // std::shared_ptr<ml_metadata>.
101  * this->metadata = data.metadata();
102  * \endcode
103  *
104  *
105  * ### To iterate through the data, single threaded.
106  * \code
107  * for(auto it = data.get_iterator(); !it.done(); ++it) {
108  * ....
109  * it->target_value();
110  * it->fill(...);
111  * }
112  * \endcode
113  *
114  *
115  * ### To iterate through the data, threaded.
116  *
117  * \code
118  * in_parallel([&](size_t thread_idx, size_t num_threads) {
119  *
120  * for(auto it = data.get_iterator(thread_idx, num_threads); !it.done(); ++it) {
121  * ....
122  * it->target_value();
123  * it->fill(...);
124  * }
125  * });
126  * \endcode
127  *
128  *
129  * ### To construct the data at predict time:
130  *
131  * \code
132  * // Constructs an empty ml_data object, takes construction options
133  * // from original ml_data.
134  * ml_data data(this->metadata);
135  *
136  * // Sets the data source from X, with no target column.
137  * data.fill(X);
138  * \endcode
139  *
140  *
141  * ### To serialize the metadata for model serialization
142  *
143  * \code
144  * // Type std::shared_ptr<ml_metadata> is fully serializable.
145  * oarc << this->metadata;
146  *
147  * iarc >> this->metadata;
148  * \endcode
149  *
150  * ### To access statistics at train/predict time.
151  *
152  * Statistics about each of the columns is fully accessible at any point
153  * after training time, and does not change. This is stored with the
154  * metadata.
155  *
156  *
157  * \code
158  * // The number of columns. column_index
159  * // below is between 0 and this value.
160  * this->metadata->num_columns();
161  *
162  * // This gives the number of index values at train time. Will never
163  * // change after training time. For categorical types, it gives
164  * // the number of categories at train time. For numerical it is 1
165  * // if scalar and the width of the vector if a numeric vector. feature_idx
166  * // below is between 0 and this value.
167  * this->metadata->index_size(column_index);
168  *
169  * // The number of rows having this feature.
170  * this->metadata->statistics(column_index)->count(feature_idx);
171  *
172  * // The mean of this feature. Missing is counted as 0.
173  * this->metadata->statistics(column_index)->mean(feature_idx);
174  *
175  * // The std dev of this feature. Missing is counted as 0.
176  * this->metadata->statistics(column_index)->stdev(feature_idx);
177  *
178  * // The number of rows in which the value of this feature is
179  * // strictly greater than 0.
180  * this->metadata->statistics(column_index)->num_positive(feature_idx);
181  *
182  * // The same methods above, but for the target.
183  * this->metadata->target_statistics()->count();
184  * this->metadata->target_statistics()->mean();
185  * this->metadata->target_statistics()->stdev();
186  * \endcode
187  *
188  *
189  *
190  * ### Forcing certain column modes.
191  *
192  * The different column modes control the behavior of each column. These
193  * modes are defined in ml_data_column_modes as an enum and currently
194  * allow NUMERIC, NUMERIC_VECTOR, CATEGORICAL, CATEGORICAL_VECTOR,
195  * DICTIONARY.
196  *
197  * In most cases, there is an obvious default. However, to force some
198  * columns to be set to a particular mode, a mode_overrides parameter is
199  * available to the fill functions as a map from
200  * column name to ml_column_mode. This overrides the default choice. The
201  * main use case for this is recsys, where user_id and item_id will
202  * always be categorical:
203  *
204  * \code
205  * data.fill(recsys_data, "rating",
206  * {{"user_id", ml_column_mode::CATEGORICAL},
207  * {"item_id", ml_column_mode::CATEGORICAL}});
208  * \endcode
209  *
210  * Untranslated Columns
211  * ----------------------------------------
212  *
213  * Untranslated columns can be specified with the fill(...)
214  * method. The untranslated columns are tracked alongside the regular
215  * ones, but are not themselves translated, indexed, or even loaded
216  * until iteration. These additional columns are then available using
217  * the iterator's fill_untranslated_values function.
218  *
219  * The way to mark a column as untranslated is to manually specify its
220  * type as ml_column_mode::UNTRANSLATED using the mode_overrides
221  * parameter in the fill method. The example code below
222  * illustrates this:
223  *
224  *
225  * \code
226  * sframe X = make_integer_testing_sframe( {"C1", "C2"}, { {0, 0}, {1, 1}, {2, 2}, {3, 3}, {4, 4} } );
227  *
228  * ml_data data;
229  *
230  * // Mark C2 as untranslated through the mode_overrides argument of fill().
231  *
232  * data.fill(X, "", { {"C2", ml_column_mode::UNTRANSLATED} });
233  *
234  *
235  * std::vector<ml_data_entry> x_d;
236  * std::vector<flexible_type> x_f;
237  *
238  * ////////////////////////////////////////
239  *
240  * for(auto it = data.get_iterator(); !it.done(); ++it) {
241  *
242  * it->fill(x_d);
243  *
244  * ASSERT_EQ(x_d.size(), 1);
245  * ASSERT_EQ(x_d[0].column_index, 0);
246  * ASSERT_EQ(x_d[0].index, 0);
247  * ASSERT_EQ(x_d[0].value, it.row_index());
248  *
249  * it->fill_untranslated(x_f);
250  *
251  * ASSERT_EQ(x_f.size(), 1);
252  * ASSERT_TRUE(x_f[0] == it.row_index());
253  * }
254  * \endcode
255  *
256  */
257 class ml_data {
258  public:
259 
260  ml_data(const ml_data&);
261  const ml_data& operator=(const ml_data&);
262  ml_data& operator=(ml_data&&) = default;
263 
264  ml_data(ml_data&&) = default;
265 
266  /**
267  * Construct an ml_data object based current options.
268  */
269  ml_data();
270 
271 
272  /**
273  * Construct an ml_data object based on previous ml_data metadata.
274  */
275  explicit ml_data(const std::shared_ptr<ml_metadata>& metadata);
276 
277  /// This is here to get around 2 clang bugs!
278  typedef std::map<std::string, ml_column_mode> column_mode_map;
279 
280  /*********************************************************************************
281  *
282  * Missing Value Action
283  * ================================================================================
284  *
285  * IMPUTE
286  * ------
287  * Imputes the data with the mean. Do not use this during creation time because
288  * the means will change over time. Imnputation only makes sense when you
289  * do it during predict/evaluate time.
290  *
291  *
292  * ERROR
293  * ------
294  * Error out when a missing value occurs in a numeric columns. Keys (categorical
295  * variables of dictionary keys) can accept missing values.
296  *
297  *
298  * USE_NAN
299  * ------
300  * Use NAN as value for missing values.
301  */
302 
303  /** Fills the data from an SFrame.
304  *
305  * \param data The data sframe.
306  *
307  * \param target_column If not reusing metadat, specifies the
308  * target column. If no target column is present, then use "".
309  *
310  * \param mode_overrides A dictionary of column-name to
311  * ml_column_mode mode overrides. These will be used instead of
312  * the default flex_type_enum -> ml_column_mode mappings. The main
313  * use is to specify integers as categorical or designate some
314  * columns as untranslated.
315  *
316  * \param immutable_metadata If true, then any new values in
317  * categorical columns will be mapped to size_t(-1) and not
318  * indexed.
319  *
320  * \param mva The behavior when missing values are present.
321  */
322  void fill(const sframe& data,
323  const std::string& target_column = "",
324  const column_mode_map mode_overrides = column_mode_map(),
325  bool immutable_metadata = false,
326  ml_missing_value_action mva = ml_missing_value_action::ERROR);
327 
328  /** Fills the data from an SFrame.
329  *
330  * \param data The data sframe.
331  *
332  * \param row_bounds The (lower, upper) bounds on which rows from
333  * the original data sframe are considered. It is as if the
334  * original sframe has only these rows.
335  *
336  * \param target_column If not reusing metadat, specifies the
337  * target column. If no target column is present, then use "".
338  *
339  * \param mode_overrides A dictionary of column-name to
340  * ml_column_mode mode overrides. These will be used instead of
341  * the default flex_type_enum -> ml_column_mode mappings. The main
342  * use is to specify integers as categorical or designate some
343  * columns as untranslated.
344  *
345  * \param immutable_metadata If true, then any new values in
346  * categorical columns will be mapped to size_t(-1) and not
347  * indexed.
348  *
349  * \param mva The behavior when missing values are present.
350  */
351  void fill(const sframe& data,
352  const std::pair<size_t, size_t>& row_bounds,
353  const std::string& target_column = "",
354  const column_mode_map mode_overrides = column_mode_map(),
355  bool immutable_metadata = false,
356  ml_missing_value_action _mva = ml_missing_value_action::ERROR);
357 
358 
359  ////////////////////////////////////////////////////////////////////////////////
360  //
361  // Metadata access
362  //
363  ////////////////////////////////////////////////////////////////////////////////
364 
365  /** Direct access to the metadata.
366  */
367  inline const std::shared_ptr<ml_metadata>& metadata() const {
368  return _metadata;
369  }
370 
371  /** Returns the number of columns present.
372  */
373  inline size_t num_columns() const {
374  return _metadata->num_columns();
375  }
376 
377  /** The number of rows present.
378  */
379  inline size_t num_rows() const {
380  return _row_end - _row_start;
381  }
382 
383  /** The number of rows present.
384  */
385  inline size_t size() const {
386  return num_rows();
387  }
388 
389  /** Returns true if there is no data in the container.
390  */
391  inline bool empty() const {
392  return _row_start == _row_end;
393  }
394 
395  ////////////////////////////////////////////////////////////////////////////////
396  //
397  // Iteration Access
398  //
399  ////////////////////////////////////////////////////////////////////////////////
400 
401  /** Return an iterator over part of the data. See
402  * iterators/ml_data_iterator.hpp for documentation on the returned
403  * iterator.
404  */
405  ml_data_iterator get_iterator(size_t thread_idx=0, size_t num_threads=1) const;
406 
407 
408  /** Returns true if a target column is present, and false otherwise.
409  */
410  bool has_target() const {
411  return rm.has_target;
412  }
413 
414  /** Returns true if there are untranslated columns present, and
415  * false otherwise.
416  */
418  return (!untranslated_columns.empty());
419  }
420 
421  /** Returns true if any of the non-target columns are translated.
422  */
423  bool has_translated_columns() const {
424  return (untranslated_columns.size() != metadata()->num_columns(false));
425  }
426 
427  /**
428  * Returns the maximum row size present in the data. This information is
429  * calculated when the data is indexed and the ml_data structure is filled.
430  * A buffer sized to this is guaranteed to hold any row encountered while
431  * iterating through the data.
432  */
433  size_t max_row_size() const {
434  return _max_row_size;
435  }
436 
437  ////////////////////////////////////////////////////////////////////////////////
438  //
439  // Utility routines to convert ml_data to other formats.
440  //
441  ////////////////////////////////////////////////////////////////////////////////
442 
443  /**
444  * Create a subsampled copy of the current ml_data structure. This
445  * allows us quickly create a subset of the data to be used for things
446  * like sgd, etc.
447  *
448  * If n_rows < size(), exactly n_rows are sampled IID from the
449  * dataset. Otherwise, a copy of the current ml_data is returned.
450  */
451  ml_data create_subsampled_copy(size_t n_rows, size_t random_seed) const;
452 
453  /**
454  * Create a copy of the current ml_data structure, selecting the rows
455  * given by selection_indices.
456  *
457  * \param selection_indices A vector of row indices that must be in
458  * sorted order. Duplicates are allowed. The returned ml_data
459  * contains all the rows given by selection_indices.
460  *
461  * \return A new ml_data object with containing only the rows given
462  * by selection_indices.
463  */
464  ml_data select_rows(const std::vector<size_t>& selection_indices) const;
465 
466  /**
467  * Create a sliced copy of the current ml_data structure. This
468  * copy is cheap.
469  */
470  ml_data slice(size_t start_row, size_t end_row) const;
471 
472  ////////////////////////////////////////////////////////////////////////////////
473  // Serialization routines
474 
475  /** Get the current serialization format.
476  */
477  size_t get_version() const { return 1; }
478 
479  /** Remap all the block indices.
480  */
481  void _reindex_blocks(const std::vector<std::vector<size_t> >& reindex_maps);
482 
483  private:
484 
485  friend class ml_data_iterator;
486  friend class ml_data_internal::ml_data_reconciler;
487  friend void reconcile_distributed_ml_data(ml_data& data, const std::vector<std::string>&);
488 
489  ////////////////////////////////////////////////////////////////////////////////
490  //
491  // Internal data
492  //
493  ////////////////////////////////////////////////////////////////////////////////
494 
495  std::shared_ptr<ml_metadata> _metadata = nullptr;
496 
497  size_t _row_start = 0;
498  size_t _row_end = 0;
499  size_t _original_num_rows = 0;
500  size_t _max_row_size = 0;
501 
502 
503  /** The row metadata. This is what is needed to interact with the
504  * raw data contained in this data set, and gives the schema for
505  * the data laid out in the data_blocks variable below.
506  */
508 
509  // The row block size. Set so that each row is at least 64K. This
510  // balances the buffering and sorting speed with not using too much
511  // memory at once. This value is set initially on fill.
512  size_t row_block_size = size_t(-1);
513 
514  /** The main storage container for the indexed, compactly
515  * represented rows.
516  */
517  std::shared_ptr<sarray<ml_data_internal::row_data_block> > data_blocks;
518 
519  /** The main storage container for untranslated columns. These
520  * columns are not put through the indexer or anything else.
521  */
522  std::vector<std::shared_ptr<sarray<flexible_type> > > untranslated_columns;
523 
524  /** The block manager -- holds the readers, as well as a cache of
525  * currently referenced blocks. Each block holds both the
526  * translated.
527  */
528  std::shared_ptr<ml_data_internal::ml_data_block_manager> block_manager;
529 
530  /** Convenience function to create the block manager given the
531  * current data in the model.
532  */
533  void _reset_block_manager();
534 
535  ////////////////////////////////////////////////////////////////////////////////
536  //
537  // Internal routines for setting up and filling the ml_data. These
538  // are defined in ml_data_setup.cpp.
539  //
540  ////////////////////////////////////////////////////////////////////////////////
541 
542  /** Sets the ml metadata for the whole class based on the options
543  * given.
544  */
545  void _setup_ml_metadata(const sframe& data,
546  const std::string& target_column_name,
547  const column_mode_map& mode_overrides);
548 
549  /**
550  * Fill the ml_data structure with the raw data in raw_data.
551  *
552  * Only call this function when metadata and optionally target metadata are
553  * set. It uses them to import the data.
554  *
555  * \param[in] raw_data Input SFrame (with target column)
556  * \param[in] track_statistics Tracks stats (mean, variance etc.)
557  * \param[in] allow_new_catgorical_values Modify metadata for new categories?
558  * \param[in] none_action Missing value action?
559  *
560  *
561  * \note track_statistics and allow_new_catgorical_values when set to true
562  * will modify the underlying metadata.
563  *
564  */
565  void _fill_data_blocks(const sframe& raw_data,
566  bool immutable_metadata,
567  bool track_statistics,
569  const std::pair<size_t, size_t>& row_bounds,
570  const std::set<std::string>& sorted_columns);
571 
572  /** Sets up the untranslated columns and column readers.
573  */
574  void _setup_untranslated_columns(const sframe& original_data, size_t row_lb, size_t row_ub);
575 
576  /** Set up the untranslated column readers.
577  */
578  void _setup_untranslated_column_readers();
579 
580  ////////////////////////////////////////////////////////////////////////////////
581  // Stuff for reconciling the ml_data stuff
582 
583  friend class ml_data_reconciler;
584 
585 
586 };
587 
588 }
589 
590 ////////////////////////////////////////////////////////////////////////////////
591 // Implement serialization for
592 // std::shared_ptr<sarray<turi::ml_data_internal::row_data_block> >
593 
594 BEGIN_OUT_OF_PLACE_SAVE(arc, std::shared_ptr<sarray<turi::ml_data_internal::row_data_block> >, m) {  // NOTE(review): header line reconstructed (missing from this listing) -- confirm the exact type spelling.
595  if(m == nullptr) {
596  arc << false;  // Null pointer: write only the presence flag.
597  } else {
598  arc << true;  // Presence flag first, then the pointee.
599  arc << (*m);
600  }
601 } END_OUT_OF_PLACE_SAVE()
602 
603 BEGIN_OUT_OF_PLACE_LOAD(arc, std::shared_ptr<sarray<turi::ml_data_internal::row_data_block> >, m) {  // NOTE(review): header line reconstructed (missing from this listing) -- confirm the exact type spelling.
604  bool is_not_nullptr = false;
605  arc >> is_not_nullptr;  // Presence flag written by the matching save.
606  if(is_not_nullptr) {
607  m = std::make_shared<sarray<turi::ml_data_internal::row_data_block> >();  // NOTE(review): allocation reconstructed (line missing from this listing); m must own an object before it is dereferenced on the next line.
608  arc >> (*m);
609  } else {
610  m = std::shared_ptr<sarray<turi::ml_data_internal::row_data_block> >(nullptr);
611  }
612 } END_OUT_OF_PLACE_LOAD()
613 
614 // A few includes for convenience.
615 
616 #include <ml/ml_data/ml_data_iterator.hpp>
617 
618 #endif
#define BEGIN_OUT_OF_PLACE_LOAD(arc, tname, tval)
Macro to make it easy to define out-of-place loads.
Definition: iarchive.hpp:314
bool has_untranslated_columns() const
Definition: ml_data.hpp:417
bool empty() const
Definition: ml_data.hpp:391
size_t size() const
Definition: ml_data.hpp:385
const std::shared_ptr< ml_metadata > & metadata() const
Definition: ml_data.hpp:367
bool has_translated_columns() const
Definition: ml_data.hpp:423
size_t num_rows() const
Definition: ml_data.hpp:379
std::map< std::string, ml_column_mode > column_mode_map
This is here to get around 2 clang bugs!
Definition: ml_data.hpp:278
size_t max_row_size() const
Definition: ml_data.hpp:433
size_t get_version() const
Definition: ml_data.hpp:477
size_t num_columns() const
Definition: ml_data.hpp:373
bool has_target() const
Definition: ml_data.hpp:410
#define BEGIN_OUT_OF_PLACE_SAVE(arc, tname, tval)
Macro to make it easy to define out-of-place saves.
Definition: oarchive.hpp:346