Turi Create  4.0
distances.hpp
1 /* Copyright © 2017 Apple Inc. All rights reserved.
2  *
3  * Use of this source code is governed by a BSD-3-clause license that can
4  * be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
5  */
6 #ifndef TURI_DISTANCES_H_
7 #define TURI_DISTANCES_H_
8 
9 #include <model_server/lib/toolkit_function_macros.hpp>
10 #include <toolkits/util/algorithmic_utils.hpp>
11 #include <toolkits/nearest_neighbors/distance_functions.hpp>
12 #include <core/data/sframe/gl_sarray.hpp>
13 
14 namespace turi { namespace distances {
15 
16 typedef Eigen::VectorXd dense_vector;
17 typedef Eigen::SparseVector<double> sparse_vector;
18 
19 /**
20  * Utility function for taking a pair of dictionaries whose keys are
21  * flexible_type and returning a sparse vector representation of each.
22  * The index value of each key is determined by the order in which it appears.
23  */
24 std::pair<sparse_vector, sparse_vector>
25  convert_dict_pair_to_sparse(const flex_dict& a, const flex_dict& b) {
26  sparse_vector av(a.size() + b.size());
27  sparse_vector bv(a.size() + b.size());
28  av.reserve(a.size());
29  bv.reserve(b.size());
30 
31  // Initialize the map from value to the assigned index
32  size_t current_index = 0;
33  auto value_to_index = std::unordered_map<flexible_type, size_t>();
34 
35  // Iterate through the first dictionary, filling in the sparse_vector
36  for (const auto& kv : a) {
37  if (value_to_index.count(kv.first) == 0) {
38  value_to_index[kv.first] = current_index;
39  current_index++;
40  }
41  if (kv.second.get_type() == flex_type_enum::STRING)
42  log_and_throw("At least one of the dictionary values could not be converted to a number.");
43 
44  size_t index = value_to_index.at(kv.first);
45  av.coeffRef(index) = kv.second;
46  }
47 
48  // Iterate through the second dictionary, filling in the sparse_vector
49  for (const auto& kv : b) {
50  if (value_to_index.count(kv.first) == 0) {
51  value_to_index[kv.first] = current_index;
52  current_index++;
53  }
54  if (kv.second.get_type() == flex_type_enum::STRING)
55  log_and_throw("At least one of the dictionary values could not be converted to a number.");
56 
57  size_t index = value_to_index.at(kv.first);
58  bv.coeffRef(index) = kv.second;
59  }
60  return std::make_pair(av, bv);
61 }
62 
63 /**
64  * Utility function for taking a pair of lists whose values are
65  * flexible_type and returning a sparse vector representation of each.
66  */
67 std::pair<sparse_vector, sparse_vector>
68  convert_list_pair_to_sparse(const flex_list& a, const flex_list& b) {
69  sparse_vector av(a.size() + b.size());
70  sparse_vector bv(a.size() + b.size());
71  av.reserve(a.size());
72  bv.reserve(b.size());
73 
74  // Initialize the map from value to the assigned index
75  size_t current_index = 0;
76  auto value_to_index = std::unordered_map<flexible_type, size_t>();
77 
78  // Iterate through the first list, filling in the sparse_vector
79  for (const auto& v : a) {
80  if (value_to_index.count(v) == 0) {
81  value_to_index[v] = current_index;
82  av.coeffRef(current_index) = 0;
83  current_index++;
84  }
85  size_t index = value_to_index.at(v);
86  av.coeffRef(index) += 1;
87  }
88 
89  // Iterate through the second list, filling in the sparse_vector
90  for (const auto& v : b) {
91  if (value_to_index.count(v) == 0) {
92  value_to_index[v] = current_index;
93  bv.coeffRef(current_index) = 0;
94  current_index++;
95  }
96  size_t index = value_to_index.at(v);
97  bv.coeffRef(index) += 1;
98  }
99  return std::make_pair(av, bv);
100 }
101 
102 
103 
104 /**
105  * For each of the following distances, we take a pair of flexible_type
106  * objects and dispatch to one of two implementations:
107  * - VECTOR: for several of the distances, it is sensible to compute the
108  * distance between two vectors of equal length
109  * - DICT: for many distances we expose an implementation that can compute
110  * a distance between two dictionaries.
111  * - LIST: treated like a dict containing the counts of each unique item.
112  */
113 
114 double compute_distance(std::string distance_name, const flexible_type& a, const flexible_type& b) {
115 
116  // Check that types match
117  auto a_t = a.get_type();
118  auto b_t = b.get_type();
119  if (a_t != b_t) log_and_throw("Argument types must match.");
120 
121  // Make a distance metric struct that contains both sparse and dense
122  // implementations
123  auto d = nearest_neighbors::distance_metric::make_dist_instance(distance_name);
124 
125  if (a_t == flex_type_enum::VECTOR) {
126  DASSERT_TRUE(a.size() == b.size());
127  DASSERT_TRUE(a.size() > 0);
128  const auto& a_vec = a.get<std::vector<double>>();
129  const auto& b_vec = b.get<std::vector<double>>();
130  Eigen::Map<const dense_vector> av(a_vec.data(), a.size());
131  Eigen::Map<const dense_vector> bv(b_vec.data(), b.size());
132 
133  return d->distance(av, bv);
134 
135  } else if (a_t == flex_type_enum::DICT) {
136 
137  // Convert into sparse vectors
138  auto ab = convert_dict_pair_to_sparse(a, b);
139 
140  // If both sparse vectors are empty, then return 0
141  if ((ab.first.size() == 0) && (ab.second.size() == 0))
142  return 0.0;
143 
144  // Compute the distance
145  return d->distance(ab.first, ab.second);
146 
147  } else if (a_t == flex_type_enum::LIST) {
148 
149  // Convert into sparse vectors
150  auto ab = convert_list_pair_to_sparse(a, b);
151 
152  // If both sparse vectors are empty, then return 0
153  if ((ab.first.size() == 0) && (ab.second.size() == 0))
154  return 0.0;
155 
156  // Compute the distance
157  return d->distance(ab.first, ab.second);
158 
159  } else {
160  log_and_throw("This distance does not support the provided type.");
161  }
162 }
163 
164 double gaussian_kernel(const flexible_type& a, const flexible_type& b) {
165  return compute_distance("gaussian_kernel", a, b);
166 }
167 
168 double euclidean(const flexible_type& a, const flexible_type& b) {
169  return compute_distance("euclidean", a, b);
170 }
171 
172 double squared_euclidean(const flexible_type& a, const flexible_type& b) {
173  return compute_distance("squared_euclidean", a, b);
174 }
175 
176 double manhattan(const flexible_type& a, const flexible_type& b) {
177  return compute_distance("manhattan", a, b);
178 }
179 
180 double cosine(const flexible_type& a, const flexible_type& b) {
181  return compute_distance("cosine", a, b);
182 }
183 
184 double transformed_dot_product(const flexible_type& a, const flexible_type& b) {
185  return compute_distance("transformed_dot_product", a, b);
186 }
187 
188 double levenshtein(const std::string& a, std::string& b) {
189  return nearest_neighbors::levenshtein().distance(a, b);
190 }
191 
192 double jaccard(const flexible_type& a, const flexible_type& b) {
193  auto a_t = a.get_type();
194  auto b_t = b.get_type();
195  if (a_t != b_t) log_and_throw("Argument types must match.");
196  if (a_t == flex_type_enum::DICT) {
197  auto ab = convert_dict_pair_to_sparse(a, b);
198 
199  if ((ab.first.size() == 0) && (ab.second.size() == 0))
200  return 0.0;
201 
202  return nearest_neighbors::jaccard().distance(ab.first, ab.second);
203 
204  } else if (a_t == flex_type_enum::LIST) {
205  auto ab = convert_list_pair_to_sparse(a, b);
206 
207  if ((ab.first.size() == 0) && (ab.second.size() == 0))
208  return 0.0;
209 
210  return nearest_neighbors::jaccard().distance(ab.first, ab.second);
211 
212  } else {
213 
214  log_and_throw("This distance does not support the provided type.");
215  }
216 }
217 
218 
219 double weighted_jaccard(const flexible_type& a, const flexible_type& b) {
220  auto a_t = a.get_type();
221  auto b_t = b.get_type();
222  if (a_t != b_t) log_and_throw("Argument types must match.");
223  if (a_t == flex_type_enum::DICT) {
224  auto ab = convert_dict_pair_to_sparse(a, b);
225  if ((ab.first.size() == 0) && (ab.second.size() == 0))
226  return 0.0;
227  return nearest_neighbors::weighted_jaccard().distance(ab.first, ab.second);
228 
229  } else if (a_t == flex_type_enum::LIST) {
230  auto ab = convert_list_pair_to_sparse(a, b);
231 
232  if ((ab.first.size() == 0) && (ab.second.size() == 0))
233  return 0.0;
234 
235  return nearest_neighbors::weighted_jaccard().distance(ab.first, ab.second);
236 
237  } else {
238  log_and_throw("This distance does not support the provided type.");
239  }
240 }
241 
242 double apply_w_custom(function_closure_info fn,
243  const std::vector<double>& a,
244  const std::vector<double>& b) {
245  auto actual_fn = variant_get_value< std::function<double(const std::vector<double>, const std::vector<double>)> >(fn);
246  auto d = nearest_neighbors::custom_distance();
247  d.fn = actual_fn;
248  return d.distance(a, b);
249 }
250 
251 gl_sarray apply(gl_sarray a, gl_sarray b, function_closure_info fn) {
252  if (a.dtype() != b.dtype())
253  log_and_throw("Types of both SArrays must match.");
254 
255  auto actual_fn = variant_get_value< std::function<double(const flexible_type, const flexible_type)> >(fn);
256 
257  gl_sarray_writer writer(flex_type_enum::FLOAT, 1);
258  auto ar = a.range_iterator();
259  auto br = b.range_iterator();
260  for (auto ita = ar.begin(), itb = br.begin(); ita != ar.end(); ++ita, ++itb) {
261  writer.write(actual_fn(*ita, *itb), 0);
262  }
263  return writer.close();
264 }
265 
266 
268 REGISTER_FUNCTION(euclidean, "x", "y")
269 REGISTER_DOCSTRING(euclidean, "Compute the Euclidean distance between two dictionaries or two lists of equal length.");
270 REGISTER_FUNCTION(squared_euclidean, "x", "y")
271 REGISTER_DOCSTRING(squared_euclidean, "Compute the squared Euclidean distance between two dictionaries or two lists of equal length.");
272 REGISTER_FUNCTION(cosine, "x", "y")
273 REGISTER_DOCSTRING(cosine, "Compute the cosine distance between two dictionaries or two lists of equal length.");
274 REGISTER_FUNCTION(transformed_dot_product, "x", "y")
275 REGISTER_DOCSTRING(transformed_dot_product, "Compute the dot product between two dictionaries or two lists of equal length.");
276 REGISTER_FUNCTION(manhattan, "x", "y")
277 REGISTER_DOCSTRING(manhattan, "Compute the Manhattan distance between two dictionaries or two lists of equal length.");
278 REGISTER_FUNCTION(levenshtein, "x", "y")
279 REGISTER_DOCSTRING(levenshtein, "Compute the Levenshtein distance between two strings.");
280 REGISTER_FUNCTION(jaccard, "x", "y")
281 REGISTER_DOCSTRING(jaccard, "Compute the Jaccard distance between two dictionaries.");
282 REGISTER_FUNCTION(gaussian_kernel, "x", "y")
283 REGISTER_DOCSTRING(gaussian_kernel, "Compute the Gaussian distance between two dictionaries.");
284 REGISTER_FUNCTION(weighted_jaccard, "x", "y")
285 REGISTER_DOCSTRING(weighted_jaccard, "Compute the weighted Jaccard distance between two dictionaries.");
286 
287 REGISTER_FUNCTION(apply_w_custom, "f", "x", "y")
288 
289 REGISTER_FUNCTION(apply, "a", "b", "fn")
290 
292 
293 std::vector<turi::toolkit_function_specification> get_toolkit_function_registration();
294 }}
295 
296 
297 
298 
299 #endif /* TURI_DISTANCES_H_ */
STL namespace.
#define REGISTER_DOCSTRING(function, docstring)
#define BEGIN_FUNCTION_REGISTRATION
#define END_FUNCTION_REGISTRATION
#define REGISTER_FUNCTION(function,...)
std::vector< std::pair< flexible_type, flexible_type > > flex_dict
std::vector< flexible_type > flex_list
#define DASSERT_TRUE(cond)
Definition: assertions.hpp:364