6 #ifndef TURI_NN_DISTANCE_FUNCTIONS_H_ 7 #define TURI_NN_DISTANCE_FUNCTIONS_H_ 10 #include <Eigen/SparseCore> 13 #include <core/util/logit_math.hpp> 14 #include <toolkits/util/algorithmic_utils.hpp> 15 #include <core/data/flexible_type/flexible_type_base_types.hpp> 16 #include <model_server/lib/toolkit_function_macros.hpp> 17 #include <boost/algorithm/string.hpp> 20 namespace nearest_neighbors {
22 typedef Eigen::VectorXd DenseVector;
23 typedef Eigen::MatrixXd DenseMatrix;
24 typedef Eigen::SparseVector<double> SparseVector;
25 typedef Eigen::SparseMatrix<double, 1> SparseMatrix;
31 void inline all_pairs_squared_euclidean(
const DenseMatrix& A,
35 DASSERT_EQ(A.cols(), B.cols());
36 DASSERT_EQ(A.rows(), dists.rows());
37 DASSERT_EQ(B.rows(), dists.cols());
39 dists = -2 * A * B.transpose();
41 for (
size_t i = 0; i < (size_t)A.rows(); ++i) {
42 dists.row(i).array() += A.row(i).squaredNorm();
45 for (
size_t j = 0; j < (size_t)B.rows(); ++j) {
46 dists.col(j).array() += B.row(j).squaredNorm();
51 void inline all_pairs_cosine(
const DenseMatrix& A,
const DenseMatrix& B,
54 DASSERT_EQ(A.cols(), B.cols());
55 DASSERT_EQ(A.rows(), dists.rows());
56 DASSERT_EQ(B.rows(), dists.cols());
58 dists = -1 * A * B.transpose();
60 for (
size_t i = 0; i < (size_t)A.rows(); ++i) {
61 double row_norm = std::max(1e-16, A.row(i).norm());
62 dists.row(i).array() /= row_norm;
65 for (
size_t j = 0; j < (size_t)B.rows(); ++j) {
66 double col_norm = std::max(1e-16, B.row(j).norm());
67 dists.col(j).array() /= col_norm;
74 void inline all_pairs_transformed_dot_product(
const DenseMatrix& A,
78 DASSERT_EQ(A.cols(), B.cols());
79 DASSERT_EQ(A.rows(), dists.rows());
80 DASSERT_EQ(B.rows(), dists.cols());
82 dists = A * B.transpose();
83 dists = dists.unaryExpr([](
double x) {
return log1pen(x); });
87 struct distance_metric {
89 virtual ~distance_metric() =
default;
92 static inline std::shared_ptr<distance_metric> make_dist_instance(
const std::string& dist_name);
94 static inline std::shared_ptr<distance_metric> make_distance_metric(
95 function_closure_info fn);
97 virtual double distance(
const DenseVector& a,
const DenseVector& b)
const {
98 ASSERT_MSG(
false,
"Dense vector type not supported by this distance metric.");
102 virtual double distance(
const SparseVector& a,
const SparseVector& b)
const {
103 ASSERT_MSG(
false,
"Sparse vector type not supported by this distance metric.");
104 ASSERT_UNREACHABLE();
107 virtual double distance(
const std::string& a,
const std::string& b)
const {
108 ASSERT_MSG(
false,
"String type not supported by this distance metric.");
109 ASSERT_UNREACHABLE();
112 virtual double distance(
const std::vector<double>& a,
const std::vector<double>& b)
const {
113 ASSERT_MSG(
false,
"Vector of double type not supported by this distance metric.");
114 ASSERT_UNREACHABLE();
123 double distance(
const DenseVector& a,
const DenseVector& b)
const {
124 return 1 - std::exp( - ( (a - b).squaredNorm() ) );
127 double distance(
const SparseVector& a,
const SparseVector& b)
const {
128 return 1 - std::exp(-( (a.squaredNorm() + b.squaredNorm() - 2 * (a.dot(b))) ));
137 double distance(
const DenseVector& a,
const DenseVector& b)
const {
138 return (a - b).squaredNorm();
141 double distance(
const SparseVector& a,
const SparseVector& b)
const {
142 return (a.squaredNorm() + b.squaredNorm() - 2 * (a.dot(b)));
151 double distance(
const DenseVector& a,
const DenseVector& b)
const {
154 return std::sqrt((a - b).squaredNorm());
157 double distance(
const SparseVector& a,
const SparseVector& b)
const {
158 return std::sqrt(a.squaredNorm() + b.squaredNorm() - 2 * (a.dot(b)));
167 double distance(
const DenseVector& a,
const DenseVector& b)
const {
170 return (a - b).cwiseAbs().sum();
173 double distance(
const SparseVector& a,
const SparseVector& b)
const {
174 return (a - b).cwiseAbs().sum();
181 struct cosine final :
public distance_metric {
183 double distance(
const DenseVector& a,
const DenseVector& b)
const {
187 double similarity = (double)a.dot(b) / std::max(1e-16, a.norm() * b.norm());
188 return 1 - similarity;
191 double distance(
const SparseVector& a,
const SparseVector& b)
const {
192 double similarity = (double)a.dot(b) / std::max(1e-16, a.norm() * b.norm());
193 return 1 - similarity;
200 struct transformed_dot_product final :
public distance_metric {
202 double distance(
const DenseVector& a,
const DenseVector& b)
const {
206 double dot_product = (double)a.dot(b);
210 double distance(
const SparseVector& a,
const SparseVector& b)
const {
211 double dot_product = (double)a.dot(b);
218 struct jaccard final :
public distance_metric {
219 using distance_metric::distance;
221 double distance(
const DenseVector& a,
const DenseVector& b)
const {
222 DASSERT_EQ(a.size(), b.size());
223 double intersection_size = 0.;
224 double union_size = 0.;
225 for(
size_t idx = 0; idx < std::min<size_t>(a.size(), b.size()); ++idx) {
226 if (a(idx) > 0. || b(idx) > 0.) {
228 if (a(idx) > 0. && b(idx) > 0.) {
229 intersection_size += 1.;
233 if (union_size == 0.)
return 1.;
234 return 1. - intersection_size / union_size;
237 double distance(
const SparseVector& a,
const SparseVector& b)
const GL_HOT_FLATTEN {
239 size_t intersection_size = 0;
241 SparseVector::InnerIterator it_a(a, 0);
242 SparseVector::InnerIterator it_b(b, 0);
244 while(it_a && it_b) {
245 if(it_a.index() < it_b.index()) {
247 }
else if(it_a.index() > it_b.index()) {
255 size_t d = a.nonZeros() + b.nonZeros() - intersection_size;
257 return 1.0 - double(intersection_size) / d;
260 double distance(std::vector<size_t>& av,
261 std::vector<size_t>& bv)
const {
270 bv.begin(), bv.end());
271 size_t d = av.size() + bv.size() - n;
274 return 1 - double(n) / d;
281 struct weighted_jaccard final :
public distance_metric {
283 double distance(
const SparseVector& a,
const SparseVector& b)
const {
285 SparseVector::InnerIterator it_a(a, 0);
286 SparseVector::InnerIterator it_b(b, 0);
288 double cwise_min_sum = 0;
289 double cwise_max_sum = 0;
291 while(it_a && it_b) {
292 if(it_a.index() < it_b.index()) {
293 cwise_max_sum += it_a.value();
295 }
else if(it_a.index() > it_b.index()) {
296 cwise_max_sum += it_b.value();
299 cwise_min_sum += std::min(it_a.value(), it_b.value());
300 cwise_max_sum += std::max(it_a.value(), it_b.value());
306 cwise_max_sum += it_a.value();
311 cwise_max_sum += it_b.value();
315 double similarity = cwise_min_sum / cwise_max_sum;
316 return 1 - similarity;
324 struct levenshtein final :
public distance_metric {
326 double distance(
const std::string& a,
const std::string& b)
const {
334 if (a.length() > b.length()) {
343 size_t idx_start = 0;
344 while ((s[idx_start] == t[idx_start]) && (idx_start < s.length())) {
348 if (idx_start == t.length()) {
352 s = s.substr(idx_start);
353 t = t.substr(idx_start);
365 std::vector<size_t> v0(len_t + 1, 0);
366 std::vector<size_t> v1(len_t + 1, 0);
368 for (
size_t i = 0; i < v0.size(); i++) {
375 for (
size_t i = 0; i < len_s; i++) {
379 for (
size_t j = 0; j < len_t; j++) {
380 cost = (s[i] == t[j]) ? 0 : 1;
381 v1[j+1] = std::min({v0[j] + cost, v0[j+1] + 1, v1[j] + 1});
385 for (
size_t j = 0; j < v0.size(); j++) {
390 return v1[t.length()];
394 struct custom_distance final :
public distance_metric {
397 std::function<double(const flexible_type, const flexible_type)> fn;
399 double distance(
const std::vector<double>& a,
const std::vector<double>& b)
const {
403 double distance(
const std::string& a,
const std::string& b)
const {
408 std::shared_ptr<distance_metric>
inline distance_metric::make_distance_metric(
410 auto fn_name = fn.native_fn_name;
412 if (boost::algorithm::ends_with(fn_name,
".euclidean")) {
413 return distance_metric::make_dist_instance(
"euclidean");
414 }
else if (boost::algorithm::ends_with(fn_name,
".squared_euclidean")) {
415 return distance_metric::make_dist_instance(
"squared_euclidean");
416 }
else if (boost::algorithm::ends_with(fn_name,
".gaussian_kernel")) {
417 return distance_metric::make_dist_instance(
"gaussian_kernel");
418 }
else if (boost::algorithm::ends_with(fn_name,
".manhattan")) {
419 return distance_metric::make_dist_instance(
"manhattan");
420 }
else if (boost::algorithm::ends_with(fn_name,
".cosine")) {
421 return distance_metric::make_dist_instance(
"cosine");
422 }
else if (boost::algorithm::ends_with(fn_name,
".dot_product")) {
423 return distance_metric::make_dist_instance(
"dot_product");
424 }
else if (boost::algorithm::ends_with(fn_name,
".transformed_dot_product")) {
425 return distance_metric::make_dist_instance(
"transformed_dot_product");
426 }
else if (boost::algorithm::ends_with(fn_name,
".jaccard")) {
427 return distance_metric::make_dist_instance(
"jaccard");
428 }
else if (boost::algorithm::ends_with(fn_name,
".weighted_jaccard")) {
429 return distance_metric::make_dist_instance(
"weighted_jaccard");
430 }
else if (boost::algorithm::ends_with(fn_name,
".levenshtein")) {
431 return distance_metric::make_dist_instance(
"levenshtein");
435 auto actual_fn = variant_get_value< std::function<double(const std::vector<double>,
const std::vector<double>)> >(fn);
436 auto d = nearest_neighbors::custom_distance();
438 auto sp = std::make_shared<distance_metric>(d);
443 std::shared_ptr<distance_metric>
inline distance_metric::make_dist_instance(
444 const std::string& dist_name) {
446 std::shared_ptr<distance_metric> dist_ptr;
448 if (dist_name ==
"euclidean")
450 else if(dist_name ==
"squared_euclidean")
452 else if(dist_name ==
"gaussian_kernel")
454 else if (dist_name ==
"manhattan")
456 else if (dist_name ==
"cosine")
457 dist_ptr.reset(
new cosine);
458 else if (dist_name ==
"transformed_dot_product")
459 dist_ptr.reset(
new transformed_dot_product);
460 else if (dist_name ==
"jaccard")
461 dist_ptr.reset(
new jaccard);
462 else if (dist_name ==
"weighted_jaccard")
463 dist_ptr.reset(
new weighted_jaccard);
464 else if (dist_name ==
"levenshtein")
465 dist_ptr.reset(
new levenshtein);
467 log_and_throw(
"Unrecognized distance: " + dist_name);
475 #endif // TURI_NN_DISTANCE_FUNCTIONS_H_
std::shared_ptr< sframe > sort(std::shared_ptr< planner_node > sframe_planner_node, const std::vector< std::string > column_names, const std::vector< size_t > &sort_column_indices, const std::vector< bool > &sort_orders)
static GL_HOT_INLINE_FLATTEN double log1pen(double x)
#define DASSERT_TRUE(cond)