Turi Create  4.0
itemcf.hpp
1 /* Copyright © 2017 Apple Inc. All rights reserved.
2  *
3  * Use of this source code is governed by a BSD-3-clause license that can
4  * be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
5  */
6 #ifndef TURI_RECSYS_MODEL_ITEMCF_H_
7 #define TURI_RECSYS_MODEL_ITEMCF_H_
8 
9 #include <vector>
10 #include <string>
11 #include <model_server/lib/extensions/option_manager.hpp>
12 #include <toolkits/coreml_export/mlmodel_wrapper.hpp>
13 #include <toolkits/recsys/recsys_model_base.hpp>
14 #include <core/generics/symmetric_2d_array.hpp>
15 
16 #include <Eigen/Core>
17 #include <limits>
18 
19 namespace turi {
20 
21 namespace v2 {
22 class ml_data;
23 }
24 
25 class sframe;
26 class sgraph;
27 class flexible_type;
28 class sparse_similarity_lookup;
29 
30 namespace recsys {
31 
32 /**
33  * This provides an implementation of a collaborative filtering algorithm.
34  * The premise is to compute similarities (or distances) between all pairs
35  * of items. Several choices of similarity will be available, and these
36  * are functions of the list of users that were observed with the pair of
37  * items. Some choices of similarity can also leverage a score that the
38  * item was given by the user, e.g. a rating.
39  *
40  * In the following, let u(a) be the set of users who rated item a, let
41  * E be the set of all (user, item) pairs, and let \f$ r_{u,i} \f$ be the
42  * rating that user \f$ u \f$ gave to item \f$ i \f$.
43  *
44  * The three functions currently implemented are:
45  * Jaccard similarity:
46  * Let \f$u(a) = {k: (k,a) in E}\f$. Then Jaccard similarity is defined by:
47  * \f[J(a,b) = \frac{| u(a) \cap u(b) |}{ | u(a) \cup u(b) |} \f]
48  *
49  * Cosine similarity:
50  * This compares the ratings given by all users who rated both items (where all
51  * unobserved ratings \f$ r_{ua}\f$ are considered to be 0.
52  * \f[ d(a,b) = \frac{\sum_{k} r_{ka} * r_{kb}}
53  * {\sqrt{\sum_{k} r_{ka}^2} \sqrt{\sum_{k} r_{kb}^2}} \f]
54  *
55  * Pearson Correlation similarity:
56  * A problem with Cosine similarity measure is that it does not consider the
57  * differences in the mean and variance of the ratings of items a and b.
58  * Pearson Correlation is a popular measure where the effects of mean and variance
59  * have been removed.
60  * Let \f$ u(a,b) = \{k: (k,a) \in E and (k,b) \in E\} \f$ denote the set of users who
61  * rated both items a and b.
62  * \f[ d(a,b) = \frac{\sum_{k \in u(a,b)} (r_{ka} - \bar{r}_a) * (r_{kb} - \bar{r}_b)}
63  * {\sqrt{\sum_{k \in u(a,b)} (r_{ka} - \bar{r}_a)^2} \sqrt{\sum_{k \in u(a,b)} (r_{kb} - \bar{r}_b)^2}} \f]
64  *
65  * =============================================================
66  * Implementation details:
67  *
68  * - Jaccard is implemented using two sufficient statistics:
69  * -# C(i): The number of times item i was rated.
70  * -# C(i,j): The number of times item i and j were rated by the same user.
71  * .
72  * The final similarity is computed by: \f$ C(i,j)/(C(i) + C(j) - C(i,j)) \f$
73  *
74  * - Cosine similarity is implemented using two statistics.
75  * -# C(i): The sum of squared ratings given to item i.
76  * -# C(i,j): The sum of products of the ratings for all users who rated both i and j.
77  * .
78  * The distance is computed as: \f$ d(i,j) = \frac{C(i,j)}{\sqrt{C(i)C(j)}} \f$
79  *
80  * - Pearson Correlation similarity is implemented using three statistics.
81  * -# C(i) : the variance of ratings given to item i.
82  * -# C(i,j) : the sum of correlation score by all users who rated both items i and j
83  * .
84  * The final similarity is computed by: \f$ d(i,j) = \frac{C(i,j)}{\sqrt{C(i)C(j)}} \f$
85  *
86  * Details of computing item similarities:
87  *
88  * 1. Get the individual statistics of items C(i)
89  * 2. For each pair of items i and j that both rated by user u, update the overlapping
90  * statistics C(i,j).
91  * 3. Get the final score matrix by normalizing C(i,j) with individual statistics
92  * 4. Sort each row of the score matrix to get top-k similar items
93  *
94  */
95 
97  public:
98  bool use_target_column(bool target_is_present) const override {
99  return target_is_present;
100  }
101 
102  static constexpr size_t ITEMCF_VERSION = 2;
103 
104  void init_options(const std::map<std::string, flexible_type>& options) override;
105 
106  private:
107  /** Handling extra data given by the
108  *
109  */
110  void set_extra_data(const std::map<std::string, variant_type>& extra_data) override;
111  void load_user_provided_data();
112 
113  struct user_provided_data_struct {
114  sframe nearest_items;
115  };
116 
117  std::shared_ptr<user_provided_data_struct> user_provided_data;
118 
119  public:
120  /**
121  * When the number of items is less than 20k, it uses in memory computations train_in_memory().
122  * Otherwise, it uses the implementation based on SGraph train_using_sgraph().
123  */
124  std::map<std::string, flexible_type> train(const v2::ml_data& data) override;
125 
126  /**
127  * During the predict phase, we perform the "vector matrix product"
128  * where we compute a score for a particular (user, item) pair.
129  * This score is a sum of similarities between an item and all the items
130  * observed for the given user. For similarity functions that incorporate some
131  * target value for each (user, item) pair, this prediction also
132  * multiples each similarity by that value, e.g. a rating they gave the
133  * item in question.
134  */
135  sframe predict(const v2::ml_data& test_data) const override;
136 
137  std::vector<double> predict_all_items(
138  const std::vector<flexible_type>& base_observation) const;
139 
140 private:
141  mutable atomic<int> user_item_buffers_setup;
142  mutable std::vector<std::vector<std::pair<size_t, double> > > user_item_buffers_by_thread;
143  mutable mutex init_user_item_buffers_lock;
144 
145  // The internal function for calling the score.
146  void _score_items(std::vector<std::pair<size_t, double> >& item_scores,
147  const std::vector<std::pair<size_t, double> >& user_scores) const;
148 
149 public:
150  /** For a given base observation, predict the score for all the
151  * items with all non-item columns replaced by the values in the
152  * base observation.
153  *
154  * The base_observation vector is used to generate all the
155  * observations predicted. New observations are generated by
156  * repeatedly copying template_observation, then replacing the
157  * values in item_column_index by each possible item value.
158  */
159  void score_all_items(
160  std::vector<std::pair<size_t, double> >& scores,
161  const std::vector<v2::ml_data_entry>& query_row,
162  size_t top_k,
163  const std::vector<std::pair<size_t, double> >& user_item_list,
164  const std::vector<std::pair<size_t, double> >& new_user_item_data,
165  const std::vector<v2::ml_data_row_reference>& new_observation_data,
166  const std::shared_ptr<v2::ml_data_side_features>& known_side_features) const override;
167 
168  /**
169  * Utilities
170  */
171  std::string response_column_name() const;
172 
173  inline size_t internal_get_version() const override {
174  return ITEMCF_VERSION;
175  }
176 
177  void internal_save(turi::oarchive& oarc) const override;
178  void internal_load(turi::iarchive& iarc, size_t version) override;
179 
180  protected:
181 
182  /** The primary tool for the item similarity modeling part.
183  */
184  std::shared_ptr<sparse_similarity_lookup> item_sim;
185 
186  // For completely new users, keep track of some of the popular items
187  // and use these to seed predictions for them. in addition, the
188  // mean score is also dealt with.
189  std::vector<std::pair<size_t, double> > new_user_seed_items;
190  std::vector<double> item_mean_score;
191  double item_mean_min=0, item_mean_max=0;
192 
193  public:
194 
195  /**
196  * Get the nearest neighbors of a set of items.
197  *
198  * \param[in] indexed_items A SArray of items in flexible_type
199  * \param[in] topk Number of neighbors returned for each item
200  * \returns A SFrame with columns {"item", "similar", "score", "rank"}
201  */
202  sframe get_similar_items(
203  std::shared_ptr<sarray<flexible_type> > items, size_t topk=0) const override;
204 
205  /**
206  * Get the nearest neighbors of a set of users.
207  *
208  * \param[in] indexed_users A SArray of users in flexible_type
209  * \param[in] topk Number of neighbors returned for each item
210  * \returns A SFrame with columns {"user", "similar", "score", "rank"}
211  */
213  std::shared_ptr<sarray<flexible_type> > items, size_t topk=0) const override {
214  log_and_throw("get_similar_users currently not supported for item similarity models. "
215  "To get the neighborhood of users, train a model with the items and users reversed, "
216  "then call get_similar_items.");
217 
218  return sframe();
219  }
220 
221  virtual std::shared_ptr<coreml::MLModelWrapper> export_to_coreml(
222  const std::string& filename,
223  const std::map<std::string, flexible_type>& additional_user_defined)
224  override;
225 
226  private:
227 
228  void make_user_item_graph(const v2::ml_data& data,
229  const std::shared_ptr<sarray<flex_dict> >& user_item_lists,
230  sgraph& g);
231 
232  std::shared_ptr<sparse_similarity_lookup> create_similarity_lookup() const;
233 
234  public:
235 
236  BEGIN_CLASS_MEMBER_REGISTRATION("item_similarity")
239 };
240 
241 }}
242 
243 #endif
#define BEGIN_CLASS_MEMBER_REGISTRATION(python_facing_classname)
The serialization input archive object which, provided with a reference to an istream, will read from the istream, providing deserialization capabilities.
Definition: iarchive.hpp:60
std::shared_ptr< sparse_similarity_lookup > item_sim
Definition: itemcf.hpp:184
#define IMPORT_BASE_CLASS_REGISTRATION(base_class)
sframe get_similar_users(std::shared_ptr< sarray< flexible_type > > items, size_t topk=0) const override
Definition: itemcf.hpp:212
#define END_CLASS_MEMBER_REGISTRATION
The serialization output archive object which, provided with a reference to an ostream, will write to the ostream, providing serialization capabilities.
Definition: oarchive.hpp:80