Turi Create  4.0
sparse_similarity_lookup.hpp
1 /* Copyright © 2017 Apple Inc. All rights reserved.
2  *
3  * Use of this source code is governed by a BSD-3-clause license that can
4  * be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
5  */
6 #ifndef TURI_UNITY_ITEM_SIMILARITY_LOOKUP_H_
7 #define TURI_UNITY_ITEM_SIMILARITY_LOOKUP_H_
8 
9 #include <core/data/flexible_type/flexible_type.hpp>
10 #include <model_server/lib/extensions/option_manager.hpp>
11 
12 namespace turi {
13 
14 class sframe;
15 template <typename T> class sarray;
16 
17 /** A model that can be used for sparse similarity lookup.
18  *
19  * A trained version of this model contains a lookup table of the
20  * nearest items to each item along with a similarity score. This
21  * allows both retrieval of the most similar items to a given item,
22  * and to generate a list of the items most similar to a collection
23  * of items. The latter is used to recommend items, e.g. by
24  * providing a list of the items and ratings for a particular user.
25  *
26  * The similarity metrics are an implementation of a class that
27  * implements a number of methods dictating the math used in the
28  * accumulation. See similarities.hpp for details.
29  *
30  * This model is creating using the create(...) methods below, which
31  * takes the name of the similarity and the current options. The
32  * options given in add_options(...) must be present in the options
33  * map.
34  *
35  * The model can be trained by either providing the similarities of
36  * the items directly, or by training the model on a sarray of
37  * user-item-ratings. See the below functions for details.
38  *
39  *
40  * Code Structure:
41  *
42  * - This model is intended to be encapsulated by other user-facing
43  * models such as item similarity. In this case, item similarity
44  * provides the user facing API, creates this model and then uses
45  * it. Some item cf specific features, like how to handle new
46  * users, are punted to that model -- this one only can be queried
47  * with a list of items and ratings which then produce the output.
48  *
49  * - The similarity class defines the metric used, and then how the
50  * averaging at prediction time is done.
51  *
52  * - The similarity class is given as a template to the implementation
53  * part of this class, sparse_similarity_lookup_impl, which inherits
54  * from this one. This class does the training (if it's not farmed
55  * out to different helper functions), and takes care of the
56  * prediction and item scoring. Basically, the core of the
57  * algorithm is in sparse_similarity_lookup_impl.hpp.
58  *
59  * - A number of accompaning utilities -- for example, nearest
60  * neighbors functions, utilities to generate the per-item
61  * statistics, and
62  *
63  * - A dense matrix class that stores only the upper diaganol part of
64  * a matrix is provided in sliced_itemitem_matrix.hpp. Included in
65  * that header are tools for estimating the number of row-slices and
66  * passes through the data to make given constraints on the memory
67  * usage.
68  *
69  */
71  public:
72 
73  virtual ~sparse_similarity_lookup() = default;
74 
75  /** Returns the name of the similarity this version uses.
76  */
77  virtual std::string similarity_name() const = 0;
78 
79  /** Adds in all of the options needed for this class to the option
80  * manager.
81  */
82  static void add_options(option_manager& options);
83 
84  /** Factory method: Call this to create or load a model from one of
85  * the existing similarities by name.
86  */
87  static std::shared_ptr<sparse_similarity_lookup>
88  create(const std::string& similarity, const std::map<std::string, flexible_type>& options);
89 
90  /** Trains the model from an sarray of vectors of (index, score)
91  * pairs. Each row is assumed to be the user, and each index in
92  * the score is an item that the user rated. The similarity of a
93  * given item to another item is given by treating the user ratings
94  * of each item as a sparse vector and then measuring the
95  * similarity between them. This calculation is done as
96  * efficiently as possible using a combination of nearest neighbors
97  * search and lookup tables.
98  */
99  virtual std::map<std::string, flexible_type>
101  size_t num_items,
102  const std::shared_ptr<sarray<std::vector<std::pair<size_t, double> > > >& data) = 0;
103 
104  /** Sets the lookup tables directly from an sframe of interaction
105  * data. The interaction data is an sframe containing columns
106  * item_column, similar_item_column, and similarity. The items and
107  * similar items must be indices in {0, ..., num_items-1}.
108  *
109  * An error is raised if the similarity value does not conform to
110  * the similarity chosen -- e.g. jaccard similarity must be between
111  * 0 and 1.
112  *
113  * If add_reverse is true, then all (i,j, rating) entries are also
114  * interpreted as (j, i, rating). Note that no duplicates can be
115  * present.
116  */
117  virtual void setup_by_raw_similarity(
118  size_t num_items,
119  const flex_list& item_data,
120  const sframe& _interaction_data,
121  const std::string& item_column,
122  const std::string& similar_item_column,
123  const std::string& similarity,
124  bool add_reverse = false) = 0;
125 
126 
127  ////////////////////////////////////////////////////////////////////////////////
128  // Prediction and item scoring.
129 
130  /** Score all items in a list of item predictions given a list of
131  * user-item interactions. This method fills in the second part
132  * of the tuple for all the items in the item_predictions
133  * container.
134  *
135  * This is also the way to generate values for predict --
136  *
137  * Returns the number of item similarity pairs that were
138  * considered. (In some corner cases, such as when an item had no
139  * users that also rated other items, we want to recommend by
140  * popularity or some other metric).
141  */
142  virtual size_t score_items(std::vector<std::pair<size_t, double> >& item_predictions,
143  const std::vector<std::pair<size_t, double> >& user_item_data) const = 0;
144 
145  /** Fills an array with the most similar items to a given item.
146  * This is read directly from the lookup table. If fewer than
147  * top_k items are present in the lookup table, then only those in
148  * the lookup table are returned.
149  */
150  virtual void get_similar_items(
151  std::vector<std::pair<size_t, flexible_type> >& similar_items_dest,
152  size_t item, size_t top_k) const = 0;
153 
154  ////////////////////////////////////////////////////////////////////////////////
155  // Routines for loading and serialization.
156 
157  virtual size_t get_version() const = 0;
158 
159  /** Serialization in sparse_similarity_lookup_impl.
160  */
161  virtual void save(turi::oarchive& oarc) const = 0;
162  virtual void load(turi::iarchive& iarc) = 0;
163 
164  /** The current options.
165  */
166  const std::map<std::string, flexible_type>& current_options() const {
167  return options;
168  }
169 
170  /** A method to detect if two sparse_similarity_lookup classes are
171  * essentially the same.
172  *
173  */
174  virtual bool _debug_check_equal(const sparse_similarity_lookup& other) const = 0;
175 
176  protected:
177 
178  /** The stored options.
179  */
180  std::map<std::string, flexible_type> options;
181 
182 };
183 
184 }
185 
186 BEGIN_OUT_OF_PLACE_SAVE(arc, std::shared_ptr<sparse_similarity_lookup>, m) {
187  if(m == nullptr) {
188  arc << false;
189  } else {
190  arc << true;
191 
192  static const size_t verification_number = 0x36fe3812b00eddb0ULL;
193 
194  // Save the similarity name
195  arc << m->similarity_name();
196 
197  // Save the options
198  arc << m->current_options();
199 
200  // Save the model.
201  m->save(arc);
202 
203  arc << verification_number;
204 
205  }
206 
207 } END_OUT_OF_PLACE_SAVE()
208 
209 BEGIN_OUT_OF_PLACE_LOAD(arc, std::shared_ptr<sparse_similarity_lookup>, m) {
210  bool is_not_nullptr;
211  arc >> is_not_nullptr;
212  if(is_not_nullptr) {
213 
214  static const size_t verification_number = 0x36fe3812b00eddb0ULL;
215 
216  std::string similarity_name;
217  arc >> similarity_name;
218 
219  std::map<std::string, flexible_type> options;
220  arc >> options;
221 
222  std::shared_ptr<sparse_similarity_lookup> new_m
223  = sparse_similarity_lookup::create(similarity_name, options);
224 
225  new_m->load(arc);
226 
227  size_t test_verification_number = 0;
228  arc >> test_verification_number;
229 
230  ASSERT_MSG(test_verification_number == verification_number,
231  "Unknown error loading similarity model.");
232 
233  m = new_m;
234 
235  } else {
236  m = std::shared_ptr<sparse_similarity_lookup>();
237  }
238 } END_OUT_OF_PLACE_LOAD()
239 
240 #endif /* _ITEM_SIMILARITY_LOOKUP_H_ */
#define BEGIN_OUT_OF_PLACE_LOAD(arc, tname, tval)
Macro to make it easy to define out-of-place loads.
Definition: iarchive.hpp:314
The serialization input archive object which, provided with a reference to an istream, will read from the istream, providing deserialization capabilities.
Definition: iarchive.hpp:60
static std::shared_ptr< sparse_similarity_lookup > create(const std::string &similarity, const std::map< std::string, flexible_type > &options)
const std::map< std::string, flexible_type > & current_options() const
virtual void save(turi::oarchive &oarc) const =0
virtual std::map< std::string, flexible_type > train_from_sparse_matrix_sarray(size_t num_items, const std::shared_ptr< sarray< std::vector< std::pair< size_t, double > > > > &data)=0
std::map< std::string, flexible_type > options
virtual void setup_by_raw_similarity(size_t num_items, const flex_list &item_data, const sframe &_interaction_data, const std::string &item_column, const std::string &similar_item_column, const std::string &similarity, bool add_reverse=false)=0
static void add_options(option_manager &options)
The serialization output archive object which, provided with a reference to an ostream, will write to the ostream, providing serialization capabilities.
Definition: oarchive.hpp:80
virtual std::string similarity_name() const =0
virtual size_t score_items(std::vector< std::pair< size_t, double > > &item_predictions, const std::vector< std::pair< size_t, double > > &user_item_data) const =0
std::vector< flexible_type > flex_list
virtual void get_similar_items(std::vector< std::pair< size_t, flexible_type > > &similar_items_dest, size_t item, size_t top_k) const =0
#define BEGIN_OUT_OF_PLACE_SAVE(arc, tname, tval)
Macro to make it easy to define out-of-place saves.
Definition: oarchive.hpp:346
virtual bool _debug_check_equal(const sparse_similarity_lookup &other) const =0