Turi Create  4.0
distance_functions.hpp
1 /* Copyright © 2017 Apple Inc. All rights reserved.
2  *
3  * Use of this source code is governed by a BSD-3-clause license that can
4  * be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
5  */
6 #ifndef TURI_NN_DISTANCE_FUNCTIONS_H_
7 #define TURI_NN_DISTANCE_FUNCTIONS_H_
8 
9 #include <string>
10 #include <Eigen/SparseCore>
11 #include <Eigen/Core>
12 #include <memory>
13 #include <core/util/logit_math.hpp>
14 #include <toolkits/util/algorithmic_utils.hpp>
15 #include <core/data/flexible_type/flexible_type_base_types.hpp>
16 #include <model_server/lib/toolkit_function_macros.hpp>
17 #include <boost/algorithm/string.hpp>
18 
19 namespace turi {
20 namespace nearest_neighbors {
21 
22 typedef Eigen::VectorXd DenseVector;
23 typedef Eigen::MatrixXd DenseMatrix;
24 typedef Eigen::SparseVector<double> SparseVector;
25 typedef Eigen::SparseMatrix<double, 1> SparseMatrix; // row-major
26 
27 /**
28  * Compute the Euclidean distance between the Cartesian product of rows in two
29  * matrices.
30  */
31 void inline all_pairs_squared_euclidean(const DenseMatrix& A,
32  const DenseMatrix& B,
33  DenseMatrix& dists) {
34 
35  DASSERT_EQ(A.cols(), B.cols());
36  DASSERT_EQ(A.rows(), dists.rows());
37  DASSERT_EQ(B.rows(), dists.cols());
38 
39  dists = -2 * A * B.transpose();
40 
41  for (size_t i = 0; i < (size_t)A.rows(); ++i) {
42  dists.row(i).array() += A.row(i).squaredNorm();
43  }
44 
45  for (size_t j = 0; j < (size_t)B.rows(); ++j) {
46  dists.col(j).array() += B.row(j).squaredNorm();
47  }
48 }
49 
50 
51 void inline all_pairs_cosine(const DenseMatrix& A, const DenseMatrix& B,
52  DenseMatrix& dists) {
53 
54  DASSERT_EQ(A.cols(), B.cols());
55  DASSERT_EQ(A.rows(), dists.rows());
56  DASSERT_EQ(B.rows(), dists.cols());
57 
58  dists = -1 * A * B.transpose();
59 
60  for (size_t i = 0; i < (size_t)A.rows(); ++i) {
61  double row_norm = std::max(1e-16, A.row(i).norm());
62  dists.row(i).array() /= row_norm;
63  }
64 
65  for (size_t j = 0; j < (size_t)B.rows(); ++j) {
66  double col_norm = std::max(1e-16, B.row(j).norm());
67  dists.col(j).array() /= col_norm;
68  }
69 
70  dists.array() += 1;
71 }
72 
73 
74 void inline all_pairs_transformed_dot_product(const DenseMatrix& A,
75  const DenseMatrix& B,
76  DenseMatrix& dists) {
77 
78  DASSERT_EQ(A.cols(), B.cols());
79  DASSERT_EQ(A.rows(), dists.rows());
80  DASSERT_EQ(B.rows(), dists.cols());
81 
82  dists = A * B.transpose();
83  dists = dists.unaryExpr([](double x) { return log1pen(x); });
84 }
85 
86 
87 struct distance_metric {
88 
89  virtual ~distance_metric() = default;
90 
91  // factory methods
92  static inline std::shared_ptr<distance_metric> make_dist_instance(const std::string& dist_name);
93 
94  static inline std::shared_ptr<distance_metric> make_distance_metric(
95  function_closure_info fn);
96 
97  virtual double distance(const DenseVector& a, const DenseVector& b) const {
98  ASSERT_MSG(false, "Dense vector type not supported by this distance metric.");
99  ASSERT_UNREACHABLE();
100  }
101 
102  virtual double distance(const SparseVector& a, const SparseVector& b) const {
103  ASSERT_MSG(false, "Sparse vector type not supported by this distance metric.");
104  ASSERT_UNREACHABLE();
105  }
106 
107  virtual double distance(const std::string& a, const std::string& b) const {
108  ASSERT_MSG(false, "String type not supported by this distance metric.");
109  ASSERT_UNREACHABLE();
110  }
111 
112  virtual double distance(const std::vector<double>& a, const std::vector<double>& b) const {
113  ASSERT_MSG(false, "Vector of double type not supported by this distance metric.");
114  ASSERT_UNREACHABLE();
115  }
116 
117 };
118 
119 /** squared_euclidean distance.
120  */
121 struct gaussian_kernel final : public distance_metric {
122 
123  double distance(const DenseVector& a, const DenseVector& b) const {
124  return 1 - std::exp( - ( (a - b).squaredNorm() ) );
125  }
126 
127  double distance(const SparseVector& a, const SparseVector& b) const {
128  return 1 - std::exp(-( (a.squaredNorm() + b.squaredNorm() - 2 * (a.dot(b))) ));
129  }
130 
131 };
132 
133 /** squared_euclidean distance.
134  */
135 struct squared_euclidean final : public distance_metric {
136 
137  double distance(const DenseVector& a, const DenseVector& b) const {
138  return (a - b).squaredNorm();
139  }
140 
141  double distance(const SparseVector& a, const SparseVector& b) const {
142  return (a.squaredNorm() + b.squaredNorm() - 2 * (a.dot(b)));
143  }
144 
145 };
146 
147 /** euclidean distance.
148  */
149 struct euclidean final : public distance_metric {
150 
151  double distance(const DenseVector& a, const DenseVector& b) const {
152  DASSERT_TRUE(a.size() == b.size());
153  DASSERT_TRUE(a.size() > 0);
154  return std::sqrt((a - b).squaredNorm());
155  }
156 
157  double distance(const SparseVector& a, const SparseVector& b) const {
158  return std::sqrt(a.squaredNorm() + b.squaredNorm() - 2 * (a.dot(b)));
159  }
160 
161 };
162 
163 /** manhattan distance.
164  */
165 struct manhattan final : public distance_metric {
166 
167  double distance(const DenseVector& a, const DenseVector& b) const {
168  DASSERT_TRUE(a.size() == b.size());
169  DASSERT_TRUE(a.size() > 0);
170  return (a - b).cwiseAbs().sum();
171  }
172 
173  double distance(const SparseVector& a, const SparseVector& b) const {
174  return (a - b).cwiseAbs().sum();
175  }
176 
177 };
178 
179 /** cosine distance.
180  */
181 struct cosine final : public distance_metric {
182 
183  double distance(const DenseVector& a, const DenseVector& b) const {
184  DASSERT_TRUE(a.size() == b.size());
185  DASSERT_TRUE(a.size() > 0);
186 
187  double similarity = (double)a.dot(b) / std::max(1e-16, a.norm() * b.norm());
188  return 1 - similarity;
189  }
190 
191  double distance(const SparseVector& a, const SparseVector& b) const {
192  double similarity = (double)a.dot(b) / std::max(1e-16, a.norm() * b.norm());
193  return 1 - similarity;
194  }
195 
196 };
197 
198 /* transformed_dot_product distance
199  */
200 struct transformed_dot_product final : public distance_metric {
201 
202  double distance(const DenseVector& a, const DenseVector& b) const {
203  DASSERT_TRUE(a.size() == b.size());
204  DASSERT_TRUE(a.size() > 0);
205 
206  double dot_product = (double)a.dot(b);
207  return log1pen(dot_product);
208  }
209 
210  double distance(const SparseVector& a, const SparseVector& b) const {
211  double dot_product = (double)a.dot(b);
212  return log1pen(dot_product);
213  }
214 };
215 
216 /* jaccard distance
217  */
218 struct jaccard final : public distance_metric {
219  using distance_metric::distance;
220 
221  double distance(const DenseVector& a, const DenseVector& b) const {
222  DASSERT_EQ(a.size(), b.size());
223  double intersection_size = 0.;
224  double union_size = 0.;
225  for(size_t idx = 0; idx < std::min<size_t>(a.size(), b.size()); ++idx) {
226  if (a(idx) > 0. || b(idx) > 0.) {
227  union_size += 1.;
228  if (a(idx) > 0. && b(idx) > 0.) {
229  intersection_size += 1.;
230  }
231  }
232  }
233  if (union_size == 0.) return 1.;
234  return 1. - intersection_size / union_size;
235  }
236 
237  double distance(const SparseVector& a, const SparseVector& b) const GL_HOT_FLATTEN {
238 
239  size_t intersection_size = 0;
240 
241  SparseVector::InnerIterator it_a(a, 0);
242  SparseVector::InnerIterator it_b(b, 0);
243 
244  while(it_a && it_b) {
245  if(it_a.index() < it_b.index()) {
246  ++it_a;
247  } else if(it_a.index() > it_b.index()) {
248  ++it_b;
249  } else {
250  ++intersection_size;
251  ++it_a, ++it_b;
252  }
253  }
254 
255  size_t d = a.nonZeros() + b.nonZeros() - intersection_size;
256 
257  return 1.0 - double(intersection_size) / d;
258  }
259 
260  double distance(std::vector<size_t>& av,
261  std::vector<size_t>& bv) const {
262  DASSERT_TRUE(av.size() > 0);
263  DASSERT_TRUE(bv.size() > 0);
264 
265  std::sort(av.begin(), av.end());
266  std::sort(bv.begin(), bv.end());
267 
268  // Use an efficient accumulate function
269  size_t n = count_intersection(av.begin(), av.end(),
270  bv.begin(), bv.end());
271  size_t d = av.size() + bv.size() - n;
272  DASSERT_TRUE(d > 0);
273 
274  return 1 - double(n) / d;
275  }
276 
277 };
278 
279 /* weighted jaccard distance
280  */
281 struct weighted_jaccard final : public distance_metric {
282 
283  double distance(const SparseVector& a, const SparseVector& b) const {
284 
285  SparseVector::InnerIterator it_a(a, 0);
286  SparseVector::InnerIterator it_b(b, 0);
287 
288  double cwise_min_sum = 0;
289  double cwise_max_sum = 0;
290 
291  while(it_a && it_b) {
292  if(it_a.index() < it_b.index()) {
293  cwise_max_sum += it_a.value();
294  ++it_a;
295  } else if(it_a.index() > it_b.index()) {
296  cwise_max_sum += it_b.value();
297  ++it_b;
298  } else {
299  cwise_min_sum += std::min(it_a.value(), it_b.value());
300  cwise_max_sum += std::max(it_a.value(), it_b.value());
301  ++it_a, ++it_b;
302  }
303  }
304 
305  while(it_a) {
306  cwise_max_sum += it_a.value();
307  ++it_a;
308  }
309 
310  while(it_b) {
311  cwise_max_sum += it_b.value();
312  ++it_b;
313  }
314 
315  double similarity = cwise_min_sum / cwise_max_sum;
316  return 1 - similarity;
317  }
318 
319 };
320 
321 
322 /* levenshtein distance
323  */
324 struct levenshtein final : public distance_metric {
325 
326  double distance(const std::string& a, const std::string& b) const {
327 
328  std::string s;
329  std::string t;
330  size_t len_s;
331  size_t len_t;
332 
333  // if 't' is not the longer string, switch them so 't' is the longer string
334  if (a.length() > b.length()) {
335  t = a;
336  s = b;
337  } else {
338  s = a;
339  t = b;
340  }
341 
342  // trim common prefix - these cannot add anything to the distance
343  size_t idx_start = 0;
344  while ((s[idx_start] == t[idx_start]) && (idx_start < s.length())) {
345  idx_start++;
346  }
347 
348  if (idx_start == t.length()) { // if the entire strings match, distance is 0
349  return 0;
350  }
351 
352  s = s.substr(idx_start);
353  t = t.substr(idx_start);
354 
355  len_s = s.length();
356  len_t = t.length();
357 
358  // if either trimmed string has length 0, the distance is the length of the
359  // other string. Since 's' is the shorter string, this should capture all
360  // cases.
361  if (len_s == 0)
362  return len_t;
363 
364  // initialize the rows
365  std::vector<size_t> v0(len_t + 1, 0);
366  std::vector<size_t> v1(len_t + 1, 0);
367 
368  for (size_t i = 0; i < v0.size(); i++) {
369  v0[i] = i;
370  }
371 
372  size_t cost;
373 
374  // For each letter in the shorter string...
375  for (size_t i = 0; i < len_s; i++) {
376  v1[0] = i + 1;
377 
378  // Fill in the second row
379  for (size_t j = 0; j < len_t; j++) {
380  cost = (s[i] == t[j]) ? 0 : 1;
381  v1[j+1] = std::min({v0[j] + cost, v0[j+1] + 1, v1[j] + 1});
382  }
383 
384  // Copy the second row into the first row
385  for (size_t j = 0; j < v0.size(); j++) {
386  v0[j] = v1[j];
387  }
388  }
389 
390  return v1[t.length()];
391  }
392 };
393 
394 struct custom_distance final : public distance_metric {
395 
396  // std::function<double(const std::vector<double>, const std::vector<double>)> fn;
397  std::function<double(const flexible_type, const flexible_type)> fn;
398 
399  double distance(const std::vector<double>& a, const std::vector<double>& b) const {
400  return fn(a, b);
401  }
402 
403  double distance(const std::string& a, const std::string& b) const {
404  return fn(a, b);
405  }
406 };
407 
408 std::shared_ptr<distance_metric> inline distance_metric::make_distance_metric(
410  auto fn_name = fn.native_fn_name;
411  distance_metric d;
412  if (boost::algorithm::ends_with(fn_name, ".euclidean")) {
413  return distance_metric::make_dist_instance("euclidean");
414  } else if (boost::algorithm::ends_with(fn_name, ".squared_euclidean")) {
415  return distance_metric::make_dist_instance("squared_euclidean");
416  } else if (boost::algorithm::ends_with(fn_name, ".gaussian_kernel")) {
417  return distance_metric::make_dist_instance("gaussian_kernel");
418  } else if (boost::algorithm::ends_with(fn_name, ".manhattan")) {
419  return distance_metric::make_dist_instance("manhattan");
420  } else if (boost::algorithm::ends_with(fn_name, ".cosine")) {
421  return distance_metric::make_dist_instance("cosine");
422  } else if (boost::algorithm::ends_with(fn_name, ".dot_product")) {
423  return distance_metric::make_dist_instance("dot_product");
424  } else if (boost::algorithm::ends_with(fn_name, ".transformed_dot_product")) {
425  return distance_metric::make_dist_instance("transformed_dot_product");
426  } else if (boost::algorithm::ends_with(fn_name, ".jaccard")) {
427  return distance_metric::make_dist_instance("jaccard");
428  } else if (boost::algorithm::ends_with(fn_name, ".weighted_jaccard")) {
429  return distance_metric::make_dist_instance("weighted_jaccard");
430  } else if (boost::algorithm::ends_with(fn_name, ".levenshtein")) {
431  return distance_metric::make_dist_instance("levenshtein");
432  } else {
433  // Create a distance metric that uses the user-provided function.
434  // Only functions that take dense vectors are currently supported.
435  auto actual_fn = variant_get_value< std::function<double(const std::vector<double>, const std::vector<double>)> >(fn);
436  auto d = nearest_neighbors::custom_distance();
437  d.fn = actual_fn;
438  auto sp = std::make_shared<distance_metric>(d);
439  return sp;
440  }
441 }
442 
443 std::shared_ptr<distance_metric> inline distance_metric::make_dist_instance(
444  const std::string& dist_name) {
445 
446  std::shared_ptr<distance_metric> dist_ptr;
447 
448  if (dist_name == "euclidean")
449  dist_ptr.reset(new euclidean);
450  else if(dist_name == "squared_euclidean")
451  dist_ptr.reset(new squared_euclidean);
452  else if(dist_name == "gaussian_kernel")
453  dist_ptr.reset(new gaussian_kernel);
454  else if (dist_name == "manhattan")
455  dist_ptr.reset(new manhattan);
456  else if (dist_name == "cosine")
457  dist_ptr.reset(new cosine);
458  else if (dist_name == "transformed_dot_product")
459  dist_ptr.reset(new transformed_dot_product);
460  else if (dist_name == "jaccard")
461  dist_ptr.reset(new jaccard);
462  else if (dist_name == "weighted_jaccard")
463  dist_ptr.reset(new weighted_jaccard);
464  else if (dist_name == "levenshtein")
465  dist_ptr.reset(new levenshtein);
466  else
467  log_and_throw("Unrecognized distance: " + dist_name);
468 
469  return dist_ptr;
470 }
471 
472 
473 }}
474 
475 #endif // TURI_NN_DISTANCE_FUNCTIONS_H_
#define GL_HOT_FLATTEN
std::shared_ptr< sframe > sort(std::shared_ptr< planner_node > sframe_planner_node, const std::vector< std::string > column_names, const std::vector< size_t > &sort_column_indices, const std::vector< bool > &sort_orders)
static size_t count_intersection(InputIterator1 first1, const InputIterator1 &last1, InputIterator2 first2, const InputIterator2 &last2)
static GL_HOT_INLINE_FLATTEN double log1pen(double x)
Definition: logit_math.hpp:49
#define DASSERT_TRUE(cond)
Definition: assertions.hpp:364