Turi Create  4.0
similarities.hpp
1 /* Copyright © 2017 Apple Inc. All rights reserved.
2  *
3  * Use of this source code is governed by a BSD-3-clause license that can
4  * be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
5  */
6 #ifndef TURI_UNITY_ITEM_SIMILARITY_LOOKUP_AGGREGATORS_H_
7 #define TURI_UNITY_ITEM_SIMILARITY_LOOKUP_AGGREGATORS_H_
8 
9 #include <core/data/flexible_type/flexible_type.hpp>
10 
11 namespace turi { namespace sparse_sim {
12 
13 /** Generic classes for implementing similarities. All the math of
14  * calculating the actual similarity goes here. Denote the rating of
15  * a user/item pair is r_ui.
16  *
17  *
18  * The classes must all define the following types:
19  *
20  * item_data_type;
21  * interaction_data_type;
22  * final_item_data_type;
23  * final_interaction_data_type;
24  *
25  *
26  * And the following methods that do the math computation:
27  *
28  * void update_item(item_data_type&, double)
29  * void finalize_item(final_item_data_type&, item_data_type&)
30  *
31  * void update_interaction(interaction_data_type& e,
32  * const item_data_type& v1, const item_data_type& v2,
33  * double new_v1, double new_v2)
34  *
35  * void finalize_interaction(final_interaction_data_type& e_out,
36  * const final_item_data_type&,
37  * const final_item_data_type&,
38  * const interaction_data_type& e,
39  * const item_data_type& v1,
40  * const item_data_type& v2)
41  *
42  *
43  * With these functions, the similarity of two items is calculated using the following
44  * algorithm:
45  *
46  *
47  * for i1 in items:
48  * vb[i1] <- item_data_type()
49  *
50  * for u in users(i1):
51  * update_item(vb[i1], rating[u, i1])
52  *
53  * v[i1] <- final_item_data_type()
54  * finalize_item(v[i1], vb[i1])
55  *
56  *
57  *
58  * for i1 in items:
59  * for i2 in items:
60  * eb[i1, i2] <- interaction_data_type()
61  * for u in (users(i1) & users(i2)):
62  * update_interaction(eb[i1, i2], vb[i1], vb[i2], rating[u, i1], rating[u, i2])
63  *
64  * e[i1, i2] <- final_interaction_data_type()
65  * finalize_interaction(e[i1, i2], v[i1], v[i2], eb[i1, i2], vb[i1], vb[i2])
66  *
67  *
68  * Then, the top edge values for each item are saved as the nearest
69  * neighbors, where "top" is determined by compare_interactions:
70  *
71  * // Returns true if e1 is better than e2, and false otherwise.
72  * bool compare_interaction_values(const final_interaction_data_type& e1,
73  * const final_interaction_data_type& e2,
74  * const final_item_data_type& common_item_data,
75  * const final_item_data_type& item_data_1,
76  * const final_item_data_type& item_data_2)
77  *
78  * ////////////////////////////////////////////////////////////////////////////////
79  *
80  * Additional methods:
81  *
82  * // True if vertex operations are not atomic.
83  * static constexpr bool require_item_locking();
84  *
85  * // True if edge operations are not atomic.
86  * static constexpr bool require_interaction_locking();
87  *
88  * // True if missing elements can be treated as zero, and false otherwise.
89  * static constexpr bool missing_values_are_zero();
90  */
91 
92 
93 // A struct used to indicate that a particular type (typically the
94 // final vertex type) is unused.
95 struct unused_value_type : public IS_POD_TYPE {};
96 
97 // For speed and parallel processing with atomics, we use size_t for
98 // accumulating the recommendations as we can use efficient atomic
99 // operations with them. This then uses fixed point math, scaled by
100 // the following.
101 static constexpr int64_t _fixed_precision_scale_factor = (size_t(1) << 24);
102 
103 typedef int64_t _fixed_precision_type;
104 
105 static inline _fixed_precision_type _to_fixed(double v) {
106  return int64_t(std::round(v * _fixed_precision_scale_factor));
107 }
108 
109 static inline double _from_fixed(_fixed_precision_type v) {
110  return double(v) / _fixed_precision_scale_factor;
111 }
112 
113 class jaccard {
114  public:
115 
116  static std::string name() { return "jaccard"; }
117 
118  // At the end, we have
119  typedef size_t item_data_type;
120  typedef size_t interaction_data_type;
121  typedef _fixed_precision_type final_interaction_data_type;
122  typedef unused_value_type final_item_data_type;
123 
124  static constexpr bool require_item_locking() { return false; }
125  static constexpr bool require_interaction_locking() { return false; }
126  static constexpr bool missing_values_are_zero() { return true; }
127 
128  void update_item(item_data_type& v, double target) const {
129  if(LIKELY(target != 0)) {
130  atomic_increment(v);
131  }
132  }
133 
134  void update_item_unsafe(item_data_type& v, double target) const {
135  if(LIKELY(target != 0)) {
136  ++v;
137  }
138  }
139 
140  void finalize_item(final_item_data_type&, item_data_type&) const { }
141  void import_final_item_value(final_item_data_type& it, const flexible_type& src) const { }
142 
143  void update_interaction(interaction_data_type& e,
144  const item_data_type& v1, const item_data_type& v2,
145  double new_v1, double new_v2) const {
146  if(LIKELY((new_v1 != 0) && (new_v2 != 0)) ) {
147  atomic_increment(e);
148  }
149  }
150 
151  void update_interaction_unsafe(interaction_data_type& e,
152  const item_data_type& v1, const item_data_type& v2,
153  double new_v1, double new_v2) const {
154 
155  // Increment if both e and the double values are non-zero. This
156  // version is easier for the compiler to vectorize.
157  e += ((new_v1 != 0) & (new_v2 != 0));
158  }
159 
160  void finalize_interaction(final_interaction_data_type& e_out,
161  const final_item_data_type&,
162  const final_item_data_type&,
163  const interaction_data_type& e,
164  const item_data_type& v1,
165  const item_data_type& v2) const {
166 
167  // The intersection count should be less than the size of either one.
168  DASSERT_LE(e, v1);
169  DASSERT_LE(e, v2);
170 
171  double _e_out = ( (v1 == 0) || (v2 == 0) ) ? 0.0 : double(e) / (v1 + v2 - e);
172 
173  DASSERT_GE(_e_out, -1e-3);
174  DASSERT_LE(_e_out, 1.0 + 1e-3);
175 
176  e_out = _to_fixed(_e_out);
177  }
178 
179  ////////////////////////////////////////////////////////////////////////////////
180 
181  bool compare_interaction_values(const final_interaction_data_type& e1,
182  const final_interaction_data_type& e2,
183  const final_item_data_type& common_item_data,
184  const final_item_data_type& item_data_1,
185  const final_item_data_type& item_data_2) const {
186  return e1 > e2;
187  }
188 
189  ////////////////////////////////////////////////////////////////////////////////
190 
191  // Some methods need this for deserialization or setting things directly.
192  void import_final_interaction_value(
193  final_interaction_data_type& e, const flexible_type& src) const {
194 
195  double v = src.get<flex_float>();
196 
197  if(v < -1e-3 || v > 1 + 1e-3) {
198  auto error_out = [&]() GL_GCC_ONLY(GL_COLD_NOINLINE) {
199  std::ostringstream ss;
200  ss << "Values for jaccard similarity type must be between 0 and 1; "
201  << "Encountered " << v << ". Please choose an appropriate "
202  << "similarity type or transform your values."
203  << std::endl;
204  log_and_throw(ss.str().c_str());
205  };
206 
207  error_out();
208  }
209 
210  e = _to_fixed(v);
211  }
212 
213  double export_similarity_score(const final_interaction_data_type& e) const {
214  return std::max<double>(0, std::min<double>(1, _from_fixed(e)));
215  }
216 
217  ////////////////////////////////////////////////////////////////////////////////
218 
219  // Finally, accumulation data types.
220  typedef size_t prediction_accumulation_type;
221 
222  void update_prediction(prediction_accumulation_type& p,
223  const final_interaction_data_type& item_interaction_data,
224  const final_item_data_type& prediction_item_item_data,
225  const final_item_data_type& neighbor_item_item_data,
226  double prediction_item_score) const {
227 
228  if(LIKELY(prediction_item_score != 0)) {
229  atomic_increment(p, item_interaction_data);
230  }
231  }
232 
233  void update_prediction_unsafe(prediction_accumulation_type& p,
234  const final_interaction_data_type& item_interaction_data,
235  const final_item_data_type& prediction_item_item_data,
236  const final_item_data_type& neighbor_item_item_data,
237  double prediction_item_score) const {
238 
239  p += (prediction_item_score != 0) ? item_interaction_data : 0;
240  }
241 
242  double finalize_prediction(const prediction_accumulation_type& p,
243  const final_item_data_type& prediction_item_data,
244  size_t n_user_ratings) const {
245  return _from_fixed(p) / std::max<size_t>(1, n_user_ratings);
246  }
247 
248 };
249 
250 ////////////////////////////////////////////////////////////////////////////////
251 //
252 
253 class cosine {
254  public:
255 
256  static std::string name() { return "cosine"; }
257 
258  // The long here is so we can use atomics.
259  typedef _fixed_precision_type item_data_type;
260  typedef _fixed_precision_type interaction_data_type;
261 
262  typedef _fixed_precision_type final_interaction_data_type;
263  typedef unused_value_type final_item_data_type;
264 
265  static constexpr bool require_item_locking() { return false; }
266  static constexpr bool require_interaction_locking() { return false; }
267  static constexpr bool missing_values_are_zero() { return true; }
268 
269  void update_item(item_data_type& v, double target) const {
270  atomic_increment(v, _to_fixed(target * target));
271  }
272 
273  void update_item_unsafe(item_data_type& v, double target) const {
274  v += _to_fixed(target * target);
275  }
276 
277  // no item final data.
278  void finalize_item(final_item_data_type&, item_data_type&) const { }
279  void import_final_item_value(final_item_data_type& it, const flexible_type& src) const { }
280 
281  void update_interaction(interaction_data_type& e,
282  const item_data_type& v1, const item_data_type& v2,
283  double new_v1, double new_v2) const {
284 
285  atomic_increment(e, _to_fixed(new_v1 * new_v2));
286  }
287 
288  void update_interaction_unsafe(interaction_data_type& e,
289  const item_data_type& v1, const item_data_type& v2,
290  double new_v1, double new_v2) const {
291  e += _to_fixed(new_v1 * new_v2);
292  }
293 
294  void finalize_interaction(final_interaction_data_type& e_out,
295  const final_item_data_type&,
296  const final_item_data_type&,
297  const interaction_data_type& e,
298  const item_data_type& v1,
299  const item_data_type& v2) const {
300 
301  // e, v1, and v2 all use fixed point math, but the ratio is the
302  // same, so just cast to double.
303  double _e_out = ((v1 == 0) || (v2 == 0)) ? 0.0 : double(e) / std::sqrt(double(v1) * v2);
304 
305  DASSERT_LT(_e_out, 1.0 + 1e-3);
306  DASSERT_GT(_e_out, -1.0 - 1e-3);
307 
308  e_out = _to_fixed(_e_out);
309  }
310 
311  ////////////////////////////////////////////////////////////////////////////////
312 
313  bool compare_interaction_values(const final_interaction_data_type& e1,
314  const final_interaction_data_type& e2,
315  const final_item_data_type& common_item_data,
316  const final_item_data_type& item_data_1,
317  const final_item_data_type& item_data_2) const {
318  return e1 > e2;
319  }
320 
321  // Some methods need this for deserialization or setting things directly.
322  void import_final_interaction_value(
323  final_interaction_data_type& e, const flexible_type& src) const {
324 
325  double v = src.get<flex_float>();
326 
327  if(v < (-1 - 1e-3) || v > 1 + 1e-3) {
328  auto error_out = [&]() GL_GCC_ONLY(GL_COLD_NOINLINE) {
329  std::ostringstream ss;
330  ss << "Values for cosine similarity type must be between -1 and 1; "
331  << "Encountered " << v << ". Please choose an appropriate "
332  << "similarity type or transform your values."
333  << std::endl;
334  log_and_throw(ss.str().c_str());
335  };
336 
337  error_out();
338  }
339 
340  e = _to_fixed(v);
341  }
342 
343  double export_similarity_score(const final_interaction_data_type& e) const {
344  return std::max<double>(-1, std::min<double>(1, _from_fixed(e)));
345  }
346 
347  ////////////////////////////////////////////////////////////////////////////////
348 
349  // Finally, accumulation data types.
350  typedef _fixed_precision_type prediction_accumulation_type;
351 
352  void update_prediction_unsafe(prediction_accumulation_type& p,
353  const final_interaction_data_type& item_interaction_data,
354  const final_item_data_type& prediction_item_item_data,
355  const final_item_data_type& neighbor_item_item_data,
356  double prediction_item_score) const {
357 
358  _fixed_precision_type delta_prediction
359  = _fixed_precision_type(item_interaction_data * prediction_item_score);
360 
361  p += delta_prediction;
362  }
363 
364  void update_prediction(prediction_accumulation_type& p,
365  const final_interaction_data_type& item_interaction_data,
366  const final_item_data_type& prediction_item_item_data,
367  const final_item_data_type& neighbor_item_item_data,
368  double prediction_item_score) const {
369 
370  _fixed_precision_type delta_prediction
371  = _fixed_precision_type(item_interaction_data * prediction_item_score);
372 
373  atomic_increment(p, delta_prediction);
374  }
375 
376  double finalize_prediction(const prediction_accumulation_type& p,
377  const final_item_data_type& prediction_item_item_data,
378  size_t n_user_ratings) const {
379  if(n_user_ratings == 0) {
380  return 0;
381  } else {
382  return _from_fixed(p) / n_user_ratings;
383  }
384  }
385 };
386 
387 
388 ////////////////////////////////////////////////////////////////////////////////
389 //
390 
391 class pearson {
392  public:
393 
394  static std::string name() { return "pearson"; }
395 
396  // At the end, we have
397  struct item_data_type {
398  size_t count = 0;
399  double mean = 0;
400  double var_sum = 0;
401  };
402 
403  typedef double interaction_data_type;
404 
405  typedef _fixed_precision_type final_interaction_data_type;
406  typedef double final_item_data_type;
407 
408  static constexpr bool require_item_locking() { return true; }
409  static constexpr bool require_interaction_locking() { return true; }
410  static constexpr bool missing_values_are_zero() { return false; }
411 
412  void update_item(item_data_type& v, double target) const {
413  double old_mean = v.mean;
414 
415  // Do the efficient and stable mean+stddev computation.
416  v.mean += (target - old_mean) / (v.count + 1);
417 
418  v.var_sum += (target - old_mean) * (target - v.mean);
419  ++v.count;
420  }
421 
422  void finalize_item(final_item_data_type& fv, item_data_type& v) const {
423  v.var_sum *= double(v.count) / std::max<size_t>(1, v.count - 1);
424  fv = v.mean;
425  }
426 
427  void import_final_item_value(final_item_data_type& it, const flexible_type& src) const {
428  it = src.get<flex_float>();
429  }
430 
431  void update_interaction(interaction_data_type& e,
432  const item_data_type& v1,
433  const item_data_type& v2,
434  double new_v1, double new_v2) const {
435  e += (new_v1 - v1.mean) * (new_v2 - v2.mean);
436  }
437 
438  void update_interaction_unsafe(
439  interaction_data_type& e, const item_data_type& v1, const item_data_type& v2,
440  double new_v1, double new_v2) const {
441 
442  e += (new_v1 - v1.mean) * (new_v2 - v2.mean);
443  }
444 
445 
446  void finalize_interaction(final_interaction_data_type& e_out,
447  const final_item_data_type&,
448  const final_item_data_type&,
449  const interaction_data_type& e,
450  const item_data_type& v1,
451  const item_data_type& v2) const {
452 
453  double denominator_2 = v1.var_sum * v2.var_sum;
454 
455  double _e_out = (denominator_2 > 0) ? e / std::sqrt(denominator_2) : 0.0;
456 
457  DASSERT_LT(_e_out, 1.0 + 1e-3);
458  DASSERT_GT(_e_out, -1.0 - 1e-3);
459 
460  e_out = _to_fixed(_e_out);
461  }
462 
463  // Some methods need this for deserialization or setting things directly.
464  void import_final_interaction_value(
465  final_interaction_data_type& e, const flexible_type& src) const {
466 
467  double v = src.get<flex_float>();
468 
469  if(v < (-1 - 1e-3) || v > 1 + 1e-3) {
470  auto error_out = [&]() GL_GCC_ONLY(GL_COLD_NOINLINE) {
471  std::ostringstream ss;
472  ss << "Values for pearson correlation similarity type must be between -1 and 1; "
473  << "Encountered " << v << ". Please choose an appropriate "
474  << "similarity type or transform your values."
475  << std::endl;
476  log_and_throw(ss.str().c_str());
477  };
478 
479  error_out();
480  }
481 
482  e = _to_fixed(v);
483  }
484 
485  double export_similarity_score(const final_interaction_data_type& e) const {
486  return std::max<double>(-1, std::min<double>(1, _from_fixed(e)));
487  }
488 
489  ////////////////////////////////////////////////////////////////////////////////
490 
491  // Finally, accumulation data types.
492  typedef _fixed_precision_type prediction_accumulation_type;
493 
494  bool compare_interaction_values(const final_interaction_data_type& e1,
495  const final_interaction_data_type& e2,
496  const final_item_data_type& common_item_data,
497  const final_item_data_type& item_data_1,
498  const final_item_data_type& item_data_2) const {
499  return e1 > e2;
500  }
501 
502  void update_prediction(prediction_accumulation_type& p,
503  const final_interaction_data_type& item_interaction_data,
504  const final_item_data_type& prediction_item_item_data,
505  const final_item_data_type& neighbor_item_item_data,
506  double prediction_item_score) const {
507 
508  _fixed_precision_type delta_prediction = _fixed_precision_type(std::round(
509  item_interaction_data * (prediction_item_score - prediction_item_item_data) ) );
510 
511  atomic_increment(p, delta_prediction);
512  }
513 
514  void update_prediction_unsafe(prediction_accumulation_type& p,
515  const final_interaction_data_type& item_interaction_data,
516  const final_item_data_type& prediction_item_item_data,
517  const final_item_data_type& neighbor_item_item_data,
518  double prediction_item_score) const {
519 
520  // Note that this is all done at scale
521  _fixed_precision_type delta_prediction = _fixed_precision_type(std::round(
522  item_interaction_data * (prediction_item_score - prediction_item_item_data) ) );
523 
524  p += delta_prediction;
525  }
526 
527  double finalize_prediction(const prediction_accumulation_type& p,
528  const final_item_data_type& prediction_item_item_data,
529  size_t n_user_ratings) const {
530  if(n_user_ratings <= 0) {
531  return 0;
532  } else {
533  return prediction_item_item_data + _from_fixed(p) / n_user_ratings;
534  }
535  }
536 };
537 
538 ////////////////////////////////////////////////////////////////////////////////
539 // Utility functions for things
540 
541 /** Not all of the aggregators use or need to store the
542  * final_item_data, so don't work with it if we don't need to.
543  * This allows us to selectively access it.
544  */
545 template <typename SimilarityType>
546 static constexpr bool use_final_item_data() {
547  return !std::is_same<unused_value_type,
548  typename SimilarityType::final_item_data_type>::value;
549 }
550 
551 
552 }}
553 
554 #endif /* _AGGREGATORS_H_ */
static T atomic_increment(T &value, const U &increment=1, typename std::enable_if< std::is_integral< T >::value &&std::is_integral< U >::value >::type *=0)
Definition: atomic_ops.hpp:292
Inheriting from this type will force the serializer to treat the derived type as a POD type...
Definition: is_pod.hpp:16
const T & get() const