Turi Create  4.0
factorization_model.hpp
1 /* Copyright © 2017 Apple Inc. All rights reserved.
2  *
3  * Use of this source code is governed by a BSD-3-clause license that can
4  * be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
5  */
6 #ifndef TURI_FACTORIZATION_MODEL_BASE_H_
7 #define TURI_FACTORIZATION_MODEL_BASE_H_
8 
9 #include <Eigen/Core>
10 #include <cmath>
11 #include <vector>
12 #include <string>
13 #include <map>
14 #include <memory>
15 #include <toolkits/factorization/loss_model_profiles.hpp>
16 #include <core/storage/serialization/serialization_includes.hpp>
17 #include <core/data/flexible_type/flexible_type.hpp>
18 #include <model_server/lib/variant.hpp>
19 #include <model_server/lib/variant_deep_serialize.hpp>
20 
21 
22 namespace turi {
23 
24 class sframe;
25 class oarchive;
26 class iarchive;
27 class option_manager;
28 
29 namespace v2 {
30 class ml_data;
31 class ml_metadata;
32 class ml_data_side_features;
33 struct ml_data_entry;
34 }
35 
36 namespace factorization {
37 
38 ////////////////////////////////////////////////////////////////////////////////
39 
40 /** Factorization model class.
41  *
42  * This class is the base model for all the factorization models.
43  * All interaction with these models should go through this class.
44  *
45  * This class is intended to be embedded within other models as the
46  * matrix factorization interface. For example, the recommender
47  * model holds a std::shared_ptr<factorization_model> pointer. The
48  * matrix factorization class exposed to the user as a standalone
49  * model will also embed this class.
50  *
51  * The details of the model are implemented in a subclass of
52  * factorization_model with template parameters controlling some
53  * aspects of the model's functionality. In particular, if the model
54  * is a matrix factorization model, only the first two dimensions
55  * have latent factors, whereas a factorization model has latent
56  * factors for all dimensions. This is in
57  * factorization_model_impl.hpp.
58  *
59  * To train a model, use the static factory_train(...) function
60  * below. Similarly, to load such a model, use the static
61  * factory_load(...) function below. These instantiate the correct
62  * type of subclass, then return a pointer to this class.
63  *
64  */
66  public:
67 
68  ////////////////////////////////////////////////////////////////////////////////
69  //
70  // Part 1: Model Training
71 
72  // Constants dictating how the training data is laid out.
73  static constexpr size_t USER_COLUMN_INDEX = 0;
74  static constexpr size_t ITEM_COLUMN_INDEX = 1;
75 
76 
77  /** Create and train a factorization model with the given options.
78  * Uses the factory method pattern.
79  *
80  * The training method constructs an instance of the appropriate subclass and returns it as a pointer to this base class.
81  *
82  * \param[in] factor_mode One of "linear_model",
83  * "matrix_factorization", or "factorization_model". If
84  * "linear_model", then the class is essentially linear regression;
85  * if "matrix_factorization", then only the first two columns have
86  * latent factors, and if "factorization_model", then the full
87  * factorization machine model is used.
88  *
89  * \param[in] train_data The training data for the model.
90  *
91  * \param[in] options The options used in the current model as well
92  * as training parameters.
93  */
94  static std::shared_ptr<factorization_model> factory_train(
95  const std::string& factor_mode,
96  const v2::ml_data& train_data,
97  std::map<std::string, flexible_type> options);
98 
99  /** Returns a map of the training statistics of the model.
100  */
101  std::map<std::string, variant_type> get_training_stats() const;
102 
103  ////////////////////////////////////////////////////////////////////////////////
104  //
105  // Part 3: Model options.
106 
107  /** Call the following function to insert the option definitions
108  * needed for the factorization_model class into an option manager.
109  *
110  * The option_flags parameter is used to control what options are
111  * enabled and what the factorization_model class is expected to
112  * support. Possible flags are as follows:
113  *
114  * ranking: Include options for ranking-based optimization. This
115  * is required for implicit rating optimization.
116  *
117  * \param[in,out] options The option manager to add the options to.
118  *
119  * \param[in] option_flags The functionality that the
120  * factorization_model class is expected to support.
121  *
122  * This function is defined in factorization_model_options.cpp.
123  */
124  static void add_options(
125  option_manager& options,
126  const std::vector<std::string>& option_flags);
127 
128  ////////////////////////////////////////////////////////////////////////////////
129  //
130  // Part 4: Interface methods to use the model.
131 
132  /** Calculate the value of the objective function as determined by
133  * the loss function, for a full data set, minus the regularization
134  * penalty.
135  */
136  double calculate_loss(const v2::ml_data& data) const;
137 
138 
139  /** Make a prediction for every observation in test_data. Returns a
140  * single-column SFrame with a prediction for every observation.
141  */
142  sframe predict(const v2::ml_data& test_data) const;
143 
144  /** Scores all the items in scores, updating the score. Used by the
145  * recommender system.
146  */
147  virtual void score_all_items(
148  std::vector<std::pair<size_t, double> >& scores,
149  const std::vector<v2::ml_data_entry>& query_row,
150  size_t top_k,
151  const std::shared_ptr<v2::ml_data_side_features>& known_side_features) const = 0;
152 
153  /** Resets the state with an initial random seed and standard
154  * deviation.
155  */
156  virtual void reset_state(size_t random_seed, double sd) = 0;
157 
158 
159  /** Returns a map of all the different coefficients of the model, as
160  * given by the current model.
161  */
162  virtual std::map<std::string, variant_type> get_coefficients() const = 0;
163 
164  protected:
165 
166  ////////////////////////////////////////////////////////////////////////////////
167  // These functions need to be implemented by the child class.
168 
169  /** Calculate the linear function value at the given point wrt the
170  * current state.
171  */
172  virtual double calculate_fx(size_t thread_idx, const std::vector<v2::ml_data_entry>& x) const = 0;
173 
174  /** \overload
175  */
176  virtual double calculate_fx(const std::vector<v2::ml_data_entry>& x) const = 0;
177 
178  public:
179  // NOTE(review): undocumented here; presumably fills sim_scores with (index, similarity) pairs for `item` -- confirm in the implementing subclass.
180  virtual void get_item_similarity_scores(
181  size_t item, std::vector<std::pair<size_t, double> >& sim_scores) const = 0;
182 
183  typedef Eigen::Matrix<float, Eigen::Dynamic, 1> vector_type;
184 
185  /** Computes the cosine similarity between a particular factor
186  * within a column and all the other factors within that column.
187  */
188  virtual void calculate_intracolumn_similarity(vector_type& dest, size_t column_index, size_t ref_index) const = 0;
189 
190  /** Set up the model with the correct index sizes, etc.
191  *
192  * This should only be called from the training functions in
193  * factorization_model_training. However, these are implemented
194  * outside of this class, so we have to keep this method public.
195  */
196  void setup(
197  const std::string& loss_model_name,
198  const v2::ml_data& train_data,
199  const std::map<std::string, flexible_type>& opts);
200 
201  protected:
202  // Overridable setup hook taking the training data; this default implementation does nothing. NOTE(review): presumably invoked from setup() -- confirm.
203  virtual void internal_setup(const v2::ml_data& train_data) {}
204 
205  // Unfortunately, the sgd interface needs these right now :-P, so
206  // keep them public for that.
207  public:
208 
209  // All the options for this model
210  std::map<std::string, flexible_type> options;
211 
212  // These end up storing the original index blocks that the model was
213  // trained on. The length of these is equal to the number of
214  // columns, with index_sizes storing the number of indices
215  // (features) used at test time and index_offsets storing the offset
216  // needed to easily make the local feature unique. feature_index +
217  // index_offsets[column] gives a unique global index, and
218  // index_sizes allows us to detect new features.
219  size_t n_total_dimensions = 0;
220 
221  std::vector<size_t> index_sizes;
222  std::vector<size_t> index_offsets;
223 
224  // This gives the amount to shift and scale the columns by. Only
225  // numerical columns are shifted by default.
226  std::vector<std::pair<double, double> > column_shift_scales;
227 
228  std::shared_ptr<v2::ml_metadata> metadata;
229 
230  double target_mean=0;
231  double target_sd=1;
232 
233  size_t random_seed = 0;
234 
235  ////////////////////////////////////////////////////////////////////////////////
236 
237  std::string loss_model_name;
238  std::shared_ptr<loss_model_profile> loss_model;
239 
240  std::map<std::string, variant_type> _training_stats;
241 
242  ////////////////////////////////////////////////////////////////////////////////
243  // Part X: Serialization.
244  //
245  // Implementing the serialization is less trivial here, as we are
246  // dealing with a base class that has a number of possible
247  // subclasses. The expectation is that everything outside this
248  // class will not have to interact with the templated subclass of
249  // this but rather use std::shared_ptr<factorization_model>. The
250  // serialization methods for std::shared_ptr<factorization_model>
251  // should just work.
252  //
253  // The implementation of these functions is in
254  // factorization_model_serialization.cpp.
255 
256  public:
257 
258  /** Return the serialization version.
259  */
260  virtual size_t get_version() const = 0;
261 
262  /** Serialization in factorization_model_impl. These methods allow
263  * the child class to have specific parameters that need to be
264  * serialized and deserialized.
265  */
266  virtual void save_impl(turi::oarchive& oarc) const = 0;
267  virtual void load_version(turi::iarchive& iarc, size_t version) = 0;
268 
269  /** Serialization of this base class.
270  */
271  void local_save_impl(turi::oarchive& oarc) const;
272  void local_load_version(turi::iarchive& iarc, size_t version);
273 
274  /** Return all the parameters needed by factory_load to determine
275  * what model to instantiate.
276  */
277  virtual std::map<std::string, variant_type> get_serialization_parameters() const = 0;
278 
279  /** Instantiate and load a factorization model from a stream.
280  */
281  static std::shared_ptr<factorization_model> factory_load(
282  size_t version,
283  const std::map<std::string, variant_type>& serialization_parameters,
284  turi::iarchive& iarc);
285 };
286 
287 }}
288 
289 ////////////////////////////////////////////////////////////////////////////////
290 // Implement serialization for
291 // std::shared_ptr<factorization::factorization_model>.
292 // Out-of-place save for std::shared_ptr<factorization_model>: presence flag, then version, parameter map, and base-class state.
293 BEGIN_OUT_OF_PLACE_SAVE(arc, std::shared_ptr<factorization::factorization_model>, m) {
294  if(m == nullptr) {
295  arc << false;  // A null pointer is recorded as a lone `false` flag.
296  } else {
297  arc << true;
298  // Everything below is read back in the same order by the matching load routine.
299  // Save the version number reported by the model.
300  size_t version = m->get_version();
301  arc << version;
302 
303  // Save the parameters the model reports via get_serialization_parameters(), as a map.
304  std::map<std::string, variant_type> serialization_parameters =
305  m->get_serialization_parameters();
306  // The map is wrapped in a variant and written with variant_deep_save.
307  variant_deep_save(to_variant(serialization_parameters), arc);
308  // Finally, write the model state through local_save_impl().
309  m->local_save_impl(arc);
310  }
311 
312 } END_OUT_OF_PLACE_SAVE()
313 // Out-of-place load for std::shared_ptr<factorization_model>; reads fields in the order the save routine above writes them.
314 BEGIN_OUT_OF_PLACE_LOAD(arc, std::shared_ptr<factorization::factorization_model>, m) {
315  bool is_not_nullptr;  // NOTE(review): no initializer; relies on the read below assigning it -- confirm.
316  arc >> is_not_nullptr;
317  if(is_not_nullptr) {
318  // Read the serialization version.
319  size_t version;
320  arc >> version;
321  // Read the serialization parameters back as a variant.
322  variant_type data_v;
323  variant_deep_load(data_v, arc);
324  // Unpack the variant into the string -> variant_type map.
325  std::map<std::string, variant_type> data;
326  data = variant_get_value<decltype(data)>(data_v);
327  // factory_load instantiates the model and loads it from arc.
328  m = factorization::factorization_model::factory_load(version, data, arc);
329 
330  } else {
331  m = std::shared_ptr<factorization::factorization_model>();  // A false flag yields an empty pointer.
332  }
333 } END_OUT_OF_PLACE_LOAD()
334 
335 
336 #endif
#define BEGIN_OUT_OF_PLACE_LOAD(arc, tname, tval)
Macro to make it easy to define out-of-place loads.
Definition: iarchive.hpp:314
The serialization input archive object which, provided with a reference to an istream, will read from the istream, providing deserialization capabilities.
Definition: iarchive.hpp:60
void variant_deep_load(variant_type &v, iarchive &iarc)
void variant_deep_save(const variant_type &v, oarchive &oarc)
boost::make_recursive_variant< flexible_type, std::shared_ptr< unity_sgraph_base >, dataframe_t, std::shared_ptr< model_base >, std::shared_ptr< unity_sframe_base >, std::shared_ptr< unity_sarray_base >, std::map< std::string, boost::recursive_variant_ >, std::vector< boost::recursive_variant_ >, boost::recursive_wrapper< function_closure_info > >::type variant_type
Definition: variant.hpp:24
static std::shared_ptr< factorization_model > factory_load(size_t version, const std::map< std::string, variant_type > &serialization_parameters, turi::iarchive &iarc)
variant_type to_variant(const T &f)
Definition: variant.hpp:308
The serialization output archive object which, provided with a reference to an ostream, will write to the ostream, providing serialization capabilities.
Definition: oarchive.hpp:80
#define BEGIN_OUT_OF_PLACE_SAVE(arc, tname, tval)
Macro to make it easy to define out-of-place saves.
Definition: oarchive.hpp:346