Turi Create  4.0
ml_data.hpp
1 /* Copyright © 2017 Apple Inc. All rights reserved.
2  *
3  * Use of this source code is governed by a BSD-3-clause license that can
4  * be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
5  */
6 #ifndef TURI_ML2_DATA_H_
7 #define TURI_ML2_DATA_H_
8 
9 #include <vector>
10 #include <memory>
11 #include <core/storage/sframe_data/sframe.hpp>
12 #include <model_server/lib/extensions/option_manager.hpp>
13 #include <toolkits/ml_data_2/metadata.hpp>
14 #include <toolkits/ml_data_2/ml_data_entry.hpp>
15 #include <toolkits/ml_data_2/ml_data_column_modes.hpp>
16 #include <toolkits/ml_data_2/data_storage/ml_data_row_format.hpp>
17 #include <toolkits/ml_data_2/side_features.hpp>
18 
19 #include <Eigen/SparseCore>
20 #include <Eigen/Core>
21 
22 namespace turi { namespace v2 {
23 
24 class ml_data_iterator;
25 class ml_data_block_iterator;
26 class ml_data_iterator_base;
27 
28 namespace ml_data_internal {
29 class ml_data_block_manager;
30 }
31 
32 /*********************************************************************************
33  *
34  * Row based, SFrame-Like Data storage for Learning and Optimization tasks.
35  * ================================================================================
36  *
37  * ml_data loads data from an existing sframe, indexes it by mapping
38  * all categorical values to unique indices in 0, 1,2,...,n, and
39  * records statistics about the values. It then puts it into an
40  * efficient row-based data storage structure for use in learning
41  * algorithms that need fast row-wise iteration through the features
42  * and target. The row based storage structure is designed for fast
43  * iteration through the rows and target. ml_data also speeds up data
44  * access via caching and a compact layout.
45  *
46  * ml_data v2 design.
47  *
48  * ml_data loads data from an existing sframe, indexes it by mapping all
49  * categorical values to numerical index values, and records statistics
50  * about the values. It then puts it into an efficient row-based data
51  * storage structure for use in learning algorithms that need fast
52  * row-wise iteration through the features and target. The row based
53  * storage structure is designed for fast iteration through the rows and
54  * target. ml_data also speeds up data access via caching and a compact
55  * layout.
56  *
57  * Since ml_data is now used extensively in the different toolkits, a
58  * redesign of the interface is needed.
59  *
60  * The current ml_data design has a number of issues:
61  * - Confusing to construct.
62  * - Metadata is confusing to work with.
63  * - Not easily extendible (e.g. with other indexing strategies).
64  * - The code is difficult to dive into.
65  *
66  * The new design addresses some of these:
67  *
68  * - API for construction is greatly simplified.
69  * - API for saving and working with the metadata is greatly simplified.
70  * - Indexing and Statistics tracking are easy to extend.
71  *
72  * Illustration of the new API:
73  * ================================================================================
74  *
75  * Using ml_data.
76  * --------------------------------------------------------------------------------
77  *
78  * There are a number of use cases for ml_data. The following should
79  * address the current use cases.
80  *
81  * To construct the data at train time:
82  * ++++++++++++++++++++++++++++++++++++++++++++++++++
83  *
84  * // Constructs an empty ml_data object
85  * ml_data data(options);
86  *
87  * // Sets the data source from X, with target_column_name being the
88  * // target column. (Alternatively, target_column_name may be a
89  * // single-column SFrame giving the target. "" denotes no target
90  * // column present).
91  * data.set_data(X, target_column_name);
92  *
93  * // Finalize the filling.
94  * data.fill();
95  *
96  * // After filling, a serializable shared pointer to the metadata
97  * // can be saved for the predict stage. this->metadata is of type
98  * // std::shared_ptr<ml_metadata>.
99  * this->metadata = data.metadata();
100  *
101  *
102  * To iterate through the data, single threaded.
103  * ++++++++++++++++++++++++++++++++++++++++++++++++++
104  *
105  * for(auto it = data.get_iterator(); !it.done(); ++it) {
106  * ....
107  * it.target_value();
108  * it.fill_observation(...);
109  * }
110  *
111  *
112  * To iterate through the data, threaded.
113  * ++++++++++++++++++++++++++++++++++++++++++++++++++
114  *
115  * in_parallel([&](size_t thread_idx, size_t num_threads) {
116  *
117  * for(auto it = data.get_iterator(thread_idx, num_threads); !it.done(); ++it) {
118  * ....
119  * it.target_value();
120  * it.fill_observation(...);
121  * }
122  * });
123  *
124  *
125  * To construct the data at predict time:
126  * ++++++++++++++++++++++++++++++++++++++++++++++++++
127  *
128  * // Constructs an empty ml_data object, takes construction options
129  * // from original ml_data.
130  * ml_data data(this->metadata);
131  *
132  * // Sets the data source from X, with no target column.
133  * data.set_data(X, "");
134  *
135  * // Finalize the filling.
136  * data.fill();
137  *
138  * To construct the data at predict time, with tracking of new
139  * categorical variables.
140  * ++++++++++++++++++++++++++++++++++++++++++++++++++
141  *
142  * There is currently no use case for the data statistics (column means,
143  * std dev, count, etc.) to change after training. However, some models,
144  * e.g. recsys, need to change parts of the metadata -- e.g. track new
145  * categories. Thus we allow this part of the metadata to change.
146  *
147  * // Constructs an empty ml_data object, takes construction options
148  * // from original ml_data. The second argument controls whether the
149  * // metadata indexing may track new categories (needed for recsys).
150  * // NOTE: the constructor names this flag immutable_metadata -- confirm its sense.
151  * ml_data data(this->metadata, true);
152  *
153  * // Sets the data source from X, with no target column.
154  * data.set_data(X, "");
155  *
156  * // Finalize the filling.
157  * data.fill();
158  *
159  *
160  * To serialize the metadata for model serialization
161  * ++++++++++++++++++++++++++++++++++++++++++++++++++
162  *
163  * // Type std::shared_ptr<ml_metadata> is fully serializable.
164  * oarc << this->metadata;
165  *
166  * iarc >> this->metadata;
167  *
168  * To add side data at construction
169  * ++++++++++++++++++++++++++++++++++++++++++++++++++
170  *
171  * // Constructs an empty ml_data object
172  * ml_data data(options);
173  *
174  * // Sets the data source from X, with target_column_name being the
175  * // target column.
176  * data.set_data(X, target_column_name);
177  *
178  * // Sets the data source from X2
179  * data.add_side_data(X2);
180  *
181  * // Finalize the filling.
182  * data.fill();
183  *
184  * // After filling, a serializable shared pointer to the metadata
185  * // can be saved for the predict stage. This metadata contains the
186  * // side features.
187  * this->metadata = data.metadata();
188  *
189  *
190  * To access statistics at train/predict time.
191  * ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
192  *
193  * Statistics about each of the columns is fully accessible at any point
194  * after training time, and does not change. This is stored with the
195  * metadata.
196  *
197  *
198  * // The number of columns, including side features. column_index
199  * // below is between 0 and this value.
200  * this->metadata->num_columns();
201  *
202  * // This gives the size of the column at train time. Will never
203  * // change after training time. For categorical types, it gives
204  * // the number of categories at train time. For numerical it is 1
205  * // if scalar and the width of the vector if a numeric vector. feature_idx
206  * // below is between 0 and this value.
207  * this->metadata->column_size(column_index);
208  *
209  * // The number of rows having this feature.
210  * this->metadata->statistics(column_index)->count(feature_idx);
211  *
212  * // The mean of this feature. Missing is counted as 0.
213  * this->metadata->statistics(column_index)->mean(idx);
214  *
215  * // The std dev of this feature. Missing is counted as 0.
216  * this->metadata->statistics(column_index)->stdev(idx);
217  *
218  * // The number of rows in which the value of this feature is
219  * // strictly greater than 0.
220  * this->metadata->statistics(column_index)->num_positive(idx);
221  *
222  * // The same methods above, but for the target.
223  * this->metadata->target_statistics()->count();
224  * this->metadata->target_statistics()->mean();
225  * this->metadata->target_statistics()->stdev();
226  *
227  * Forcing the ordering of certain columns
228  * ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
229  *
230  * As the SFrame is intended to work with column names, ml_data may
231  * reorder the columns in the original SFrame for optimization or
232  * convenience reasons. This ordering will always be consistent, even if
233  * the column orderings in the data SFrame change between train and
234  * test. To force ml_data to put some columns at the start, a partial
235  * column ordering may be passed to set_data(...) to force certain
236  * columns to come first. For example, to force the "user_id" column to
237  * come first, and the "item_id" column to come second, do
238  *
239  * data.set_data(recsys_data, "rating", {"user_id", "item_id"});
240  *
241  * These columns are guaranteed to be first.
242  *
243  *
244  * Forcing certain column modes.
245  * ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
246  *
247  * The different column modes control the behavior of each column. These
248  * modes are defined in ml_data_column_modes as an enum and currently
249  * allow NUMERIC, NUMERIC_VECTOR, CATEGORICAL, CATEGORICAL_VECTOR,
250  * DICTIONARY.
251  *
252  * In most cases, there is an obvious default. However, to force some
253  * columns to be set to a particular mode, a mode_override parameter is
254  * available to the set_data and add_side_data functions as a map from
255  * column name to column_mode. This overrides the default choice. The
256  * main use case for this is recsys, where user_id and item_id will
257  * always be categorical:
258  *
259  * data.set_data(recsys_data, "rating",
260  * {"user_id", "item_id"},
261  * {{"user_id", column_mode::CATEGORICAL},
262  * {"item_id", column_mode::CATEGORICAL}});
263  *
264  *
265  * Customizing the behavior of ml_data
266  * ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
267  *
268  * The options parameter of the constructor provides a set of possible
269  * options that can get passed in to the ml_data class and govern how the
270  * model is created, which in turn control the functionality available
271  * later on.
272  *
273  *
274  * Separating out Train and Predict Modes
275  * ---------------------------------------
276  *
277  * In practical use of the current ml_data, it seems wise to distinguish
278  * between “train” and “predict” modes. Train mode is when the ml_data
279  * class creates the metadata class as part of its construction/filling.
280  * Predict mode is when the ml_data class uses an existing ml_metadata
281  * class obtained from the ml_data structure after it was filled. This
282  * ml_metadata class can be saved/loaded or used for multiple training.
283  *
284  * The reason it is important to distinguish between these two cases is
285  * based on the following observations about the current usage and the
286  * design decision.
287  *
288  * First, the main practical way the training mode and predict mode are
289  * different is that in predict mode, all the rows in the original SFrame
290  * are expected to be represented in the output SFrame in the same order
291  * as the original SFrame. Thus the ml_data structure must also preserve
292  * this ordering. However, reordering rows at train time is often
293  * needed. SGD needs the data shuffled, and recsys needs it sorted by
294  * user/item.
295  *
296  * Second, for simplicity, the options are set once at ml_data creation,
297  * at train time. Following that, the current options for the ml_data
298  * structure are stored with the metadata. Practically, this means that
299  * the options for setting up the ml_data class are consolidated into one
300  * place, but has the side effect that some options are specific for the
301  * training time and others for the predict time, as noted in the first
302  * point.
303  *
304  * Thus, some of the options apply only at train time and some only at
305  * predict time. Options labeled with _on_train or _on_predict only
306  * apply at train or predict time -- the rest apply to both modes.
307  *
308  * Data ordering options
309  * ----------------------------------------
310  *
311  * - "sort_by_first_two_columns_on_train":
312  *
313  * If true (default = false), then for the training data set, sort the
314  * rows by the feature indices of the first two columns. The first two
315  * columns must be categorical. This ensures that all rows with equal
316  * first column are in a group. (Used by recsys, matrix factorization
317  * for ranking, etc.).
318  *
319  * This option is only relevant at train time; data for predict/test is
320  * not reordered.
321  *
322  * - "sort_by_first_two_columns":
323  *
324  * If true (default = false), then always sort the data by the first
325  * two columns in similar fashion to that above.
326  *
327  * - "shuffle_rows_on_train":
328  *
329  * If true (default = false), then for the training data set, do a
330  * simple random shuffle of the input rows. If sort is also on, then
331  * the order of the index mapping is random.
332  *
333  * This option is only relevant at train time; data for predict/test is
334  * not reordered.
335  *
336  * - "shuffle_rows":
337  *
338  * If true (default = false), then always do a simple random shuffle of
339  * the input rows. If sort is also on, then the order of the index
340  * mapping is random.
341  *
342  *
343  * Indexing options
344  * ----------------------------------------
345  *
346  * - "column_indexer_type".
347  *
348  * Gives the type of the indexer to use on the columns (default =
349  * "unique"). Currently, only "unique" is available, but "hash" will
350  * be supported in the future. (See Extending Column Indexing below to
351  * create your own indexer).
352  *
353  * - "target_column_indexer_type".
354  *
355  * Gives the type of the indexer to use on the target columns (default = "unique").
356  *
357  * - "integer_columns_categorical_by_default".
358  *
359  * By default, integer columns are treated as numeric. If this option
360  * is true (default = false), then they are treated as categorical.
361  *
362  * Missing value options
363  * ----------------------------------------
364  *
365  * - "missing_value_action_on_train"
366  *
367  * This option controls what the default missing value behavior will be
368  * at training time (default = "error"). Currently, only "error" is
369  * supported at train time, but other options, e.g. "NAN", will be
370  * supported in the future.
371  *
372  * - "missing_value_action_on_predict",
373  *
374  * This option controls what the action on missing value after the
375  * train stage should be (default = "impute"). Currently, only
376  * "impute" and "error" are supported.
377  *
378  * Error checking options
379  * ----------------------------------------
380  *
381  * - "target_column_always_numeric"
382  *
383  * If true (default = false), then the target column must be a numeric scalar
384  * column. If not, then an error is raised.
385  *
386  * Extending Indexing and Statistics
387  * =============================================================================
388  *
389  * The current design is set up to make extending the indexer and the
390  * statistics trackers easy.
391  *
392  * To extend the indexer:
393  *
394  * 1. Subclass from column_indexer, given in indexing/column_indexer.hpp.
395  * Implement the appropriate virtual functions.
396  *
397  * 2. Register the class by adding a line to
398  * indexing/column_indexer_factory.cpp so it can get instantiated by
399  * name.
400  *
401  * The same can be done with statistics -- inherit column_statistics, as
402  * given in statistics/column_statistics.hpp, and modify
403  * statistics/column_statistics_factory.cpp.
404  *
405  * Untranslated Columns
406  * ----------------------------------------
407  *
408  * Untranslated columns can be specified with the set_data(...)
409  * method. The untranslated columns are tracked alongside the regular
410  * ones, but are not themselves translated, indexed, or even loaded
411  * until iteration. These additional columns are then available using
412  * the iterator's fill_untranslated_values function.
413  *
414  * The way to mark a column as untranslated is to manually specify its
415  * type as ml_column_mode::UNTRANSLATED using the mode_overrides
416  * parameter in the set_data method. The example code below
417  * illustrates this:
418  *
419  *
420  * sframe X = make_integer_testing_sframe( {"C1", "C2"}, { {0, 0}, {1, 1}, {2, 2}, {3, 3}, {4, 4} } );
421  *
422  * v2::ml_data data;
423  *
424  * data.set_data(X, "", {}, { {"C2", v2::ml_column_mode::UNTRANSLATED} });
425  *
426  * data.fill();
427  *
428  *
429  * std::vector<v2::ml_data_entry> x_d;
430  * std::vector<flexible_type> x_f;
431  *
432  * ////////////////////////////////////////
433  *
434  * for(auto it = data.get_iterator(); !it.done(); ++it) {
435  *
436  * it.fill_observation(x_d);
437  *
438  * ASSERT_EQ(x_d.size(), 1);
439  * ASSERT_EQ(x_d[0].column_index, 0);
440  * ASSERT_EQ(x_d[0].index, 0);
441  * ASSERT_EQ(x_d[0].value, it.row_index());
442  *
443  * it.fill_untranslated_values(x_f);
444  *
445  * ASSERT_EQ(x_f.size(), 1);
446  * ASSERT_TRUE(x_f[0] == it.row_index());
447  * }
448  *
449  */
450 class ml_data {
451  public:
452 
453  // ml_data is cheap to copy. However, it cannot be copied before
454  // fill() is called.
455 
456  ml_data(const ml_data&);
457  const ml_data& operator=(const ml_data&);
458  ml_data& operator=(ml_data&&) = default;
459 
460  ml_data(ml_data&&) = default;
461 
462  /** Default option list: maps every option name to its default value. See the class
463  * documentation above; some entries, e.g. "uniquify_side_column_names", are not covered there. */
464  static std::map<std::string, flexible_type> default_options() {
465  return {
466  /**
467  * Data ordering options.
468  */
469  {"sort_by_first_two_columns_on_train", false},
470  {"sort_by_first_two_columns", false},
471 
472  {"shuffle_rows_on_train", false},
473  {"shuffle_rows", false},
474  // Indexing and statistics types for the feature columns.
475  {"column_indexer_type", "unique"},
476  {"column_statistics_type", "basic-dense"},
477  // Missing value actions at train and at predict time.
478  {"missing_value_action_on_train", "error"},
479  {"missing_value_action_on_predict", "impute"},
480  // Integer columns are treated as numeric unless this is true.
481  {"integer_columns_categorical_by_default", false},
482  // Target column type options.
483  {"target_column_always_numeric", false},
484  {"target_column_always_categorical", false},
485  // Indexing and statistics types for the target column.
486  {"target_column_indexer_type", "unique"},
487 
488  {"target_column_statistics_type", "basic-dense"},
489 
490  {"uniquify_side_column_names", false},
491 
492  {"ignore_new_columns_after_train", false}
493 
494  };
495  }
496 
497  /**
498  * Construct an ml_data object based on previous ml_data metadata.
499  */
500  explicit ml_data(const std::shared_ptr<ml_metadata>& metadata,
501  bool immutable_metadata = false);
502 
503  /// Alias for the options map, used in default arguments to work around a clang parsing bug.
504  typedef std::map<std::string, flexible_type> flex_map;
505 
506  /**
507  * Construct an ml_data object based on the given options.
508  */
509  explicit ml_data(const std::map<std::string, flexible_type>& options = flex_map());
510 
511  /**
512  * Constructs from a braced list of (option name, value) pairs: the list is turned into
513  * an options map and construction is delegated to the map-based constructor. */
514  explicit ml_data(std::initializer_list<std::map<std::string, flexible_type>::value_type> l)
515  : ml_data(std::map<std::string, flexible_type>(l))
516  {}
517 
518  /// This is here to get around 2 clang bugs!
519  typedef std::map<std::string, ml_column_mode> column_mode_map;
520 
521  /** Sets the data source.
522  *
523  * If target_column is empty (""), then there is no target column.
524  */
525  void set_data(const sframe& data,
526  const std::string& target_column = "",
527  const std::vector<std::string>& partial_column_ordering = std::vector<std::string>(),
528  const column_mode_map mode_overrides = column_mode_map());
529 
530  /** Sets the data source.
531  *
532  * An overload of the previous one. Here, the target is supplied separately
533  * as a one-column sframe.
534  *
535  */
536  void set_data(const sframe& data,
537  const sframe& target,
538  const std::vector<std::string>& partial_column_ordering = std::vector<std::string>(),
539  const column_mode_map mode_overrides = column_mode_map());
540 
541  /** Add in the side data to the mix. If forced_join_column is
542  * given, that column must be present and the one to join on.
543  * Otherwise, there must be exactly one column in common between
544  * the main data and the side data.
545  */
546  void add_side_data(const sframe& data,
547  const std::string& forced_join_column = "",
548  const column_mode_map mode_overrides = column_mode_map());
549 
550 
551  /** Convenience function -- short for calling set_data(data,
552  * target_column), then fill().
553  */
554  void fill(const sframe& data,
555  const std::string& target_column = "");
556 
557 
558  /** Convenience function -- short for calling set_data(data,
559  * target), then fill().
560  */
561  void fill(const sframe& data,
562  const sframe& target);
563 
564 
565  /** Call this function when all the data is added. This executes
566  * the filling process based on everything given.
567  */
568  void fill();
569 
570 
571  ////////////////////////////////////////////////////////////////////////////////
572  //
573  // Metadata access
574  //
575  ////////////////////////////////////////////////////////////////////////////////
576 
577  /** Returns true if the ml_data structure has been created completely and is
578  * ready to use, i.e. no temporary filling parameters (incoming_data) are held.
579  */
580  inline bool creation_complete() const { return incoming_data == nullptr; }
581 
582  /** Direct access to the metadata. Returns the stored shared pointer (_metadata)
583  * by const reference. */
584  inline const std::shared_ptr<ml_metadata>& metadata() const {
585  return _metadata;
586  }
587 
588  /** Returns the number of columns present, including any possible
589  * side columns. Forwards to _metadata->num_columns(); _metadata is
590  * dereferenced without a null check. */
591  inline size_t num_columns() const {
592  return _metadata->num_columns();
593  }
594 
595  /** The number of rows present, computed as _row_end - _row_start.
596  */
597  inline size_t num_rows() const {
598  return _row_end - _row_start;
599  }
600 
601  /** The number of rows present; a synonym for num_rows().
602  */
603  inline size_t size() const {
604  return num_rows();
605  }
606 
607  /** Returns true if there is no data in the container, i.e. when
608  * _row_start equals _row_end. */
609  inline bool empty() const {
610  return _row_start == _row_end;
611  }
612 
613  /**
614  * Returns the maximum row size present in the data. This information is
615  * calculated when the data is indexed and the ml_data structure is filled.
616  * A buffer sized to this is guaranteed to hold any row encountered while
617  * iterating through the data.
618  */
619  size_t max_row_size() const;
620 
621  ////////////////////////////////////////////////////////////////////////////////
622  //
623  // Iteration Access
624  //
625  ////////////////////////////////////////////////////////////////////////////////
626 
627  /** Return an iterator over part of the data. See
628  * iterators/ml_data_iterator.hpp for documentation on the returned
629  * iterator.
630  */
631  ml_data_iterator get_iterator(
632  size_t thread_idx=0, size_t num_threads=1,
633  bool add_side_information_if_present = true,
634  bool use_reference_encoding = false) const;
635 
636 
637  /** Return a block iterator over part of the data. See
638  * iterators/ml_data_block_iterator.hpp for documentation on the returned
639  * iterator.
640  */
641  ml_data_block_iterator get_block_iterator(
642  size_t thread_idx=0, size_t num_threads=1,
643  bool add_side_information_if_present = true,
644  bool use_reference_encoding = false) const;
645 
646  ////////////////////////////////////////////////////////////////////////////////
647  //
648  // Utility routines to handle side data
649  //
650  ////////////////////////////////////////////////////////////////////////////////
651 
652  /** Returns the current side features that work with this class. Asserts via
653  * DASSERT_TRUE that side_features is non-null; has_side_features() tests for presence. */
654  std::shared_ptr<ml_data_side_features> get_side_features() const {
655 
656  DASSERT_TRUE(side_features != nullptr);
657  return side_features;
658  }
659 
660  /** Returns true if a target column is present, and false otherwise.
661  * Reads the has_target field of the row metadata (rm). */
662  bool has_target() const {
663  return rm.has_target;
664  }
665 
666  /** Returns true if there are side features, and false otherwise;
667  * that is, whether the side_features pointer is non-null. */
668  bool has_side_features() const {
669  return (side_features != nullptr);
670  }
671 
672  /** Returns true if there are untranslated columns present, and
673  * false otherwise, i.e. whether untranslated_columns is non-empty.
674  */
675  bool has_untranslated_columns() const {
676  return (!untranslated_columns.empty());
677  }
678 
679  /** Returns true if any of the non-target columns are translated: the number of untranslated
680  * columns differs from metadata()->num_columns(false). (Meaning of `false` -- TODO confirm in ml_metadata.) */
681  bool has_translated_columns() const {
682  return (untranslated_columns.size() != metadata()->num_columns(false));
683  }
684 
685  typedef Eigen::Matrix<double, Eigen::Dynamic,1> DenseVector;
686  typedef Eigen::SparseVector<double> SparseVector;
687 
688  /** Translates the ml_data_entry row format to the original flexible
689  * types.
690  */
691  std::vector<flexible_type> translate_row_to_original(const std::vector<ml_data_entry>& v) const;
692 
693  /** Translates the ml_data_entry_global_index row format to the original flexible
694  * types.
695  */
696  std::vector<flexible_type> translate_row_to_original(const std::vector<ml_data_entry_global_index>& v) const;
697 
698  /** Translates the original dense row format to the original flexible
699  * types.
700  */
701  std::vector<flexible_type> translate_row_to_original(const DenseVector& v) const;
702 
703  /** Translates the original sparse row format to the original flexible
704  * types.
705  */
706  std::vector<flexible_type> translate_row_to_original(const SparseVector& v) const;
707 
708 
709  ////////////////////////////////////////////////////////////////////////////////
710  // Direct access to creating and working with the indexers
711 
712 
713  typedef std::shared_ptr<ml_data_internal::column_indexer> indexer_type;
714 
715  /** Occasionally, we need to create a temporary indexer for a
716  * specific column. This allows us to do just that.
717  *
718  */
719  static indexer_type create_indexer(
720  const std::string& column_name,
721  ml_column_mode mode,
722  flex_type_enum column_type,
723  const std::string& indexer_type = "unique",
724  const std::map<std::string, flexible_type>& options = flex_map());
725 
726 
727  private:
728  void _check_is_iterable() const;
729 
730  public:
731 
732 
733  ////////////////////////////////////////////////////////////////////////////////
734  //
735  // Utility routines to convert ml_data to other formats.
736  //
737  ////////////////////////////////////////////////////////////////////////////////
738 
739  /**
740  * Create a subsampled copy of the current ml_data structure. This
741  * allows us to quickly create a subset of the data to be used for things
742  * like sgd, etc.
743  *
744  * If n_rows < size(), exactly n_rows are sampled IID from the
745  * dataset. Otherwise, a copy of the current ml_data is returned.
746  */
747  ml_data create_subsampled_copy(size_t n_rows, size_t random_seed) const;
748 
749  /**
750  * Create a copy of the current ml_data structure, selecting the rows
751  * given by selection_indices.
752  *
753  * \param selection_indices A vector of row indices that must be in
754  * sorted order. Duplicates are allowed. The returned ml_data
755  * contains all the rows given by selection_indices.
756  *
757  * \return A new ml_data object containing only the rows given
758  * by selection_indices.
759  */
760  ml_data select_rows(const std::vector<size_t>& selection_indices) const;
761 
762  /**
763  * Create a sliced copy of the current ml_data structure. This
764  * copy is cheap.
765  */
766  ml_data slice(size_t start_row, size_t end_row) const;
767 
768  /**
769  * Create a sliced copy of the current ml_data structure, with the
770  * slice indices referenced from the original structure
771  */
772  ml_data absolute_slice(size_t start_row, size_t end_row) const;
773 
774 
775  ////////////////////////////////////////////////////////////////////////////////
776  // Serialization routines
777 
778  /** Get the current serialization format version; this always returns 1.
779  */
780  size_t get_version() const { return 1; }
781 
782  /**
783  * Serialize the object (save).
784  */
785  void save(turi::oarchive& oarc) const;
786 
787  /**
788  * Load the object.
789  */
790  void load(turi::iarchive& iarc);
791 
792  private:
793 
794  friend class ml_data_iterator_base;
795 
796  ////////////////////////////////////////////////////////////////////////////////
797  //
798  // Internal data
799  //
800  ////////////////////////////////////////////////////////////////////////////////
801 
802  std::shared_ptr<ml_metadata> _metadata = nullptr;
803 
804  size_t _row_start = 0;
805  size_t _row_end = 0;
806  size_t _original_num_rows = 0;
807  size_t _max_row_size = 0;
808 
809 
810  /** The row metadata. This is what is needed to interact with the
811  * raw data contained in this data set, and gives the schema for
812  * the data laid out in the data_blocks variable below.
813  */
814  ml_data_internal::row_metadata rm;
815 
816  /** The current side features. This may be different from the
817  * original side features if additional data has been provided.
818  *
819  */
820  std::shared_ptr<ml_data_side_features> side_features = nullptr;
821 
822 
823  // The row block size. Set so that each row is at least 64K. This
824  // balances the buffering and sorting speed with not using too much
825  // memory at once. This value is set initially on fill.
826  size_t row_block_size = size_t(-1);
827 
828  /** The main storage container for the indexed, compactly
829  * represented rows.
830  */
831  std::shared_ptr<sarray<ml_data_internal::row_data_block> > data_blocks;
832 
833  /** The main storage container for untranslated columns. These
834  * columns are not put through the indexer or anything else.
835  */
836  std::vector<std::shared_ptr<sarray<flexible_type> > > untranslated_columns;
837 
838  /** The block manager -- holds the readers, as well as a cache of
839  * currently referenced blocks. Each block holds both the
840  * translated and untranslated columns.
841  */
842  std::shared_ptr<ml_data_internal::ml_data_block_manager> block_manager;
843 
844  /** Convenience function to create the block manager given the
845  * current data in the model.
846  */
847  void _create_block_manager();
848 
849 
850 
851  ////////////////////////////////////////////////////////////////////////////////
852  //
853  // Temporary variables to hold the filling parameters.
854  //
855  ////////////////////////////////////////////////////////////////////////////////
856 
857  struct _data_for_filling {
858  // Temporary holder for the parameters used to fill the ml_data.
859  // This is moved to the metadata at creation time.
860  std::map<std::string, flexible_type> options;
861  // Presumably the constructor's immutable_metadata argument (TODO confirm); no default initializer.
862  bool immutable_metadata;
863  // Main data source and target column name; presumably as given to set_data (TODO confirm).
864  sframe data;
865  std::string target_column_name;
866 
867  /** Column ordering holds a partial ordering of the incoming
868  * columns. Can be empty, in which case the columns are chosen
869  * arbitrarily.
870  */
871  std::vector<std::string> column_ordering;
872 
873  // Maps a column name to the column mode forced for it.
874  typedef std::map<std::string, ml_column_mode> mode_override_map;
875 
876  mode_override_map mode_overrides;
877  // One pending side data source; fields are named after add_side_data's parameters.
878  struct incoming_side_feature {
879  sframe data;
880  std::string forced_join_column;
881  mode_override_map mode_overrides;
882  };
883 
884  std::vector<incoming_side_feature> incoming_side_features;
885  };
886 
887  std::unique_ptr<_data_for_filling> incoming_data = nullptr;
888 
889 
890  ////////////////////////////////////////////////////////////////////////////////
891  //
892  // Internal routines for setting up and filling the ml_data. These
893  // are defined in ml_data_setup.cpp.
894  //
895  ////////////////////////////////////////////////////////////////////////////////
896 
897  /** Sets the ml metadata for the whole class based on the options
898  * given.
899  *
900  */
901  void _setup_ml_metadata();
902 
903  /**
904  * Fill the ml_data structure with the raw data in raw_data.
905  *
906  * Only call this function when metadata and optionally target metadata are
907  * set. It uses them to import the data.
908  *
909  * \param[in] raw_data Input SFrame (with target column)
910  * \param[in] track_statistics Tracks stats (mean, variance etc.)
911  * \param[in] allow_new_categorical_values Modify metadata for new categories?
912  * \param[in] none_action Missing value action?
913  *
914  *
915  * \note track_statistics and allow_new_categorical_values when set to true
916  * will modify the underlying metadata.
917  *
918  */
919  void _fill_data_blocks(bool in_training_mode);
920 
921  /** Sets up the untranslated columns and column readers.
922  */
923  void _setup_untranslated_columns(const sframe& original_data);
924 
925  /** Set up the untranslated column readers.
926  */
927  void _setup_untranslated_column_readers();
928 
929  ////////////////////////////////////////////////////////////////////////////////
930  //
931  // Internal routines for sorting the ml_data. These are defined in
932  // ml_data_sorting.cpp.
933  //
934  ////////////////////////////////////////////////////////////////////////////////
935 
936  std::unique_ptr<ml_data_iterator> _merge_sorted_ml_data_sources(
937  const std::vector<std::unique_ptr<ml_data_iterator> >& sources);
938 
939  void _sort_user_item_data_blocks();
940 };
941 
942 }}
943 
944 ////////////////////////////////////////////////////////////////////////////////
945 // Implement serialization for
946 // std::shared_ptr<sarray<turi::v2::ml_data_internal::row_data_block> >
947 // Save: writes a bool presence flag, followed by the sarray itself when the pointer is non-null.
948 BEGIN_OUT_OF_PLACE_SAVE(arc, std::shared_ptr<sarray<turi::v2::ml_data_internal::row_data_block> >, m) {
949  if(m == nullptr) {
950  arc << false;  // Null pointer: only the flag is written.
951  } else {
952  arc << true;
953  arc << (*m);  // Flag first, then the pointed-to sarray.
954  }
955 } END_OUT_OF_PLACE_SAVE()
956 // Load: reads the bool presence flag first, then the sarray contents only if the flag is true.
957 BEGIN_OUT_OF_PLACE_LOAD(arc, std::shared_ptr<sarray<turi::v2::ml_data_internal::row_data_block> >, m) {
958  bool is_not_nullptr;
959  arc >> is_not_nullptr;
960  if(is_not_nullptr) {
961  m.reset(new sarray<turi::v2::ml_data_internal::row_data_block>);  // Fresh sarray to deserialize into.
962  arc >> (*m);
963  } else {
964  m = std::shared_ptr<sarray<turi::v2::ml_data_internal::row_data_block> >(nullptr);  // Flag was false: null pointer.
965  }
966 } END_OUT_OF_PLACE_LOAD()
967 
968 
969 #endif
#define BEGIN_OUT_OF_PLACE_LOAD(arc, tname, tval)
Macro to make it easy to define out-of-place loads.
Definition: iarchive.hpp:314
The serialization input archive object which, provided with a reference to an istream, will read from the istream, providing deserialization capabilities.
Definition: iarchive.hpp:60
STL namespace.
The serialization output archive object which, provided with a reference to an ostream, will write to the ostream, providing serialization capabilities.
Definition: oarchive.hpp:80
#define DASSERT_TRUE(cond)
Definition: assertions.hpp:364
#define BEGIN_OUT_OF_PLACE_SAVE(arc, tname, tval)
Macro to make it easy to define out-of-place saves.
Definition: oarchive.hpp:346