6 #ifndef TURI_DISTANCES_H_ 7 #define TURI_DISTANCES_H_ 9 #include <model_server/lib/toolkit_function_macros.hpp> 10 #include <toolkits/util/algorithmic_utils.hpp> 11 #include <toolkits/nearest_neighbors/distance_functions.hpp> 12 #include <core/data/sframe/gl_sarray.hpp> 14 namespace turi {
namespace distances {
// Dense double-precision vector; used below to view std::vector<double>
// inputs through Eigen::Map.
16 typedef Eigen::VectorXd dense_vector;
// Sparse double-precision vector; the pair-conversion helpers below return
// their results in this type.
17 typedef Eigen::SparseVector<double> sparse_vector;
// Builds two sparse vectors over one shared index space from the key/value
// entries of `a` and `b`: each distinct key is assigned one coordinate, and an
// entry's value is stored at its key's coordinate in the vector belonging to
// the input it came from. Returns the pair (vector for a, vector for b).
// NOTE(review): the name and parameter list are not visible in this excerpt;
// presumably this is the convert_dict_pair_to_sparse(a, b) called further
// down -- confirm.
24 std::pair<sparse_vector, sparse_vector>
// Both vectors get dimension a.size() + b.size(), presumably so that every key
// has a slot even when the two inputs share no keys.
26 sparse_vector av(a.size() + b.size());
27 sparse_vector bv(a.size() + b.size());
// Key -> coordinate table shared by both passes below.
32 size_t current_index = 0;
33 auto value_to_index = std::unordered_map<flexible_type, size_t>();
// First pass: entries of `a`.
36 for (
const auto& kv : a) {
// A key that has not been seen yet is given the coordinate in current_index.
// NOTE(review): the statement that advances current_index is not visible here.
37 if (value_to_index.count(kv.first) == 0) {
38 value_to_index[kv.first] = current_index;
// NOTE(review): the condition guarding this throw is not visible here.
42 log_and_throw(
"At least one of the dictionary values could not be converted to a number.");
// Store the entry's value at the coordinate assigned to its key.
44 size_t index = value_to_index.at(kv.first);
45 av.coeffRef(index) = kv.second;
// Second pass: entries of `b`, reusing coordinates already assigned to keys
// that also appeared in `a`.
49 for (
const auto& kv : b) {
50 if (value_to_index.count(kv.first) == 0) {
51 value_to_index[kv.first] = current_index;
// NOTE(review): the condition guarding this throw is not visible here.
55 log_and_throw(
"At least one of the dictionary values could not be converted to a number.");
57 size_t index = value_to_index.at(kv.first);
58 bv.coeffRef(index) = kv.second;
60 return std::make_pair(av, bv);
// Builds two sparse count vectors over one shared index space from the
// elements of `a` and `b`: each distinct element is assigned one coordinate,
// and the coordinate's value in a vector is the number of times the element
// occurs in the corresponding input. Returns the pair (counts for a, counts
// for b).
// NOTE(review): the name and parameter list are not visible in this excerpt;
// presumably this is the convert_list_pair_to_sparse(a, b) called further
// down -- confirm.
67 std::pair<sparse_vector, sparse_vector>
// Both vectors get dimension a.size() + b.size(), presumably so that every
// distinct element has a slot even when the inputs share none.
69 sparse_vector av(a.size() + b.size());
70 sparse_vector bv(a.size() + b.size());
// Element -> coordinate table shared by both passes below.
75 size_t current_index = 0;
76 auto value_to_index = std::unordered_map<flexible_type, size_t>();
// First pass: count the elements of `a`.
79 for (
const auto& v : a) {
// A new element gets the coordinate in current_index, initialised to zero.
// NOTE(review): the statement that advances current_index is not visible here.
80 if (value_to_index.count(v) == 0) {
81 value_to_index[v] = current_index;
82 av.coeffRef(current_index) = 0;
// Count this occurrence.
85 size_t index = value_to_index.at(v);
86 av.coeffRef(index) += 1;
// Second pass: count the elements of `b`, reusing coordinates of elements
// already seen in `a`.
90 for (
const auto& v : b) {
91 if (value_to_index.count(v) == 0) {
92 value_to_index[v] = current_index;
93 bv.coeffRef(current_index) = 0;
96 size_t index = value_to_index.at(v);
97 bv.coeffRef(index) += 1;
99 return std::make_pair(av, bv);
// Computes the distance named `distance_name` between `a` and `b`.
// The metric object is created by
// nearest_neighbors::distance_metric::make_dist_instance(distance_name).
// Three input representations are handled below: std::vector<double> (viewed
// as dense Eigen vectors), and two sparse paths built with
// convert_dict_pair_to_sparse and convert_list_pair_to_sparse.
// Throws via log_and_throw when the argument types differ, and at the end of
// the function with "This distance does not support the provided type."
// NOTE(review): the conditions selecting each branch are not visible in this
// excerpt.
114 double compute_distance(std::string distance_name,
const flexible_type& a,
const flexible_type& b) {
// Both arguments must carry the same flexible_type type tag.
117 auto a_t = a.get_type();
118 auto b_t = b.get_type();
119 if (a_t != b_t) log_and_throw(
"Argument types must match.");
123 auto d = nearest_neighbors::distance_metric::make_dist_instance(distance_name);
// Dense path: wrap the underlying double buffers without copying them.
128 const auto& a_vec = a.get<std::vector<double>>();
129 const auto& b_vec = b.get<std::vector<double>>();
130 Eigen::Map<const dense_vector> av(a_vec.data(), a.size());
131 Eigen::Map<const dense_vector> bv(b_vec.data(), b.size());
133 return d->distance(av, bv);
// Sparse path via the dictionary-pair conversion.
138 auto ab = convert_dict_pair_to_sparse(a, b);
// NOTE(review): the statement executed when both vectors have size 0 is not
// visible here.
141 if ((ab.first.size() == 0) && (ab.second.size() == 0))
145 return d->distance(ab.first, ab.second);
// Sparse path via the list-pair conversion.
150 auto ab = convert_list_pair_to_sparse(a, b);
// NOTE(review): the statement executed when both vectors have size 0 is not
// visible here.
153 if ((ab.first.size() == 0) && (ab.second.size() == 0))
157 return d->distance(ab.first, ab.second);
// Reached when none of the paths above returned.
160 log_and_throw(
"This distance does not support the provided type.");
// Returns compute_distance("gaussian_kernel", a, b).
164 double gaussian_kernel(
const flexible_type& a,
const flexible_type& b) {
165 return compute_distance(
"gaussian_kernel", a, b);
// Returns compute_distance("euclidean", a, b).
168 double euclidean(
const flexible_type& a,
const flexible_type& b) {
169 return compute_distance(
"euclidean", a, b);
// Returns compute_distance("squared_euclidean", a, b).
172 double squared_euclidean(
const flexible_type& a,
const flexible_type& b) {
173 return compute_distance(
"squared_euclidean", a, b);
// Returns compute_distance("manhattan", a, b).
176 double manhattan(
const flexible_type& a,
const flexible_type& b) {
177 return compute_distance(
"manhattan", a, b);
// Returns compute_distance("cosine", a, b).
180 double cosine(
const flexible_type& a,
const flexible_type& b) {
181 return compute_distance(
"cosine", a, b);
// Returns compute_distance("transformed_dot_product", a, b).
184 double transformed_dot_product(
const flexible_type& a,
const flexible_type& b) {
185 return compute_distance(
"transformed_dot_product", a, b);
// Returns nearest_neighbors::levenshtein().distance(a, b) for two strings.
// NOTE(review): `b` is taken by non-const reference while `a` is const, so
// callers cannot pass a temporary or a const string as `b`. Nothing visible
// here modifies `b`; consider `const std::string&` after confirming that
// distance() accepts it.
188 double levenshtein(
const std::string& a, std::string& b) {
189 return nearest_neighbors::levenshtein().distance(a, b);
// Computes nearest_neighbors::jaccard().distance on the sparse vectors built
// from `a` and `b`. Two paths are visible: one through
// convert_dict_pair_to_sparse and one through convert_list_pair_to_sparse.
// Throws via log_and_throw when the argument types differ, and at the end of
// the function with "This distance does not support the provided type."
// NOTE(review): the conditions selecting each path are not visible in this
// excerpt.
192 double jaccard(
const flexible_type& a,
const flexible_type& b) {
// Both arguments must carry the same flexible_type type tag.
193 auto a_t = a.get_type();
194 auto b_t = b.get_type();
195 if (a_t != b_t) log_and_throw(
"Argument types must match.");
// Path via the dictionary-pair conversion.
197 auto ab = convert_dict_pair_to_sparse(a, b);
// NOTE(review): the statement executed when both vectors have size 0 is not
// visible here.
199 if ((ab.first.size() == 0) && (ab.second.size() == 0))
202 return nearest_neighbors::jaccard().distance(ab.first, ab.second);
// Path via the list-pair conversion.
205 auto ab = convert_list_pair_to_sparse(a, b);
// NOTE(review): the statement executed when both vectors have size 0 is not
// visible here.
207 if ((ab.first.size() == 0) && (ab.second.size() == 0))
210 return nearest_neighbors::jaccard().distance(ab.first, ab.second);
// Reached when none of the paths above returned.
214 log_and_throw(
"This distance does not support the provided type.");
// Computes nearest_neighbors::weighted_jaccard().distance on the sparse
// vectors built from `a` and `b`. Two paths are visible: one through
// convert_dict_pair_to_sparse and one through convert_list_pair_to_sparse.
// Throws via log_and_throw when the argument types differ, and at the end of
// the function with "This distance does not support the provided type."
// NOTE(review): the conditions selecting each path are not visible in this
// excerpt.
219 double weighted_jaccard(
const flexible_type& a,
const flexible_type& b) {
// Both arguments must carry the same flexible_type type tag.
220 auto a_t = a.get_type();
221 auto b_t = b.get_type();
222 if (a_t != b_t) log_and_throw(
"Argument types must match.");
// Path via the dictionary-pair conversion.
224 auto ab = convert_dict_pair_to_sparse(a, b);
// NOTE(review): the statement executed when both vectors have size 0 is not
// visible here.
225 if ((ab.first.size() == 0) && (ab.second.size() == 0))
227 return nearest_neighbors::weighted_jaccard().distance(ab.first, ab.second);
// Path via the list-pair conversion.
230 auto ab = convert_list_pair_to_sparse(a, b);
// NOTE(review): the statement executed when both vectors have size 0 is not
// visible here.
232 if ((ab.first.size() == 0) && (ab.second.size() == 0))
235 return nearest_neighbors::weighted_jaccard().distance(ab.first, ab.second);
// Reached when none of the paths above returned.
238 log_and_throw(
"This distance does not support the provided type.");
// Evaluates a nearest_neighbors::custom_distance on two double vectors.
// `fn` is unpacked into a std::function taking two std::vector<double>
// arguments and returning double.
// NOTE(review): in this excerpt `actual_fn` is never attached to `d`; the
// statement that does so is presumably on a line not shown (the embedded
// numbering skips from 246 to 248) -- confirm against the full file.
242 double apply_w_custom(function_closure_info fn,
243 const std::vector<double>& a,
244 const std::vector<double>& b) {
245 auto actual_fn = variant_get_value< std::function<double(const std::vector<double>,
const std::vector<double>)> >(fn);
246 auto d = nearest_neighbors::custom_distance();
248 return d.distance(a, b);
// Applies the closure `fn` to corresponding elements of the SArrays `a` and
// `b` and returns the results as a new gl_sarray.
// Throws via log_and_throw when the two arrays have different dtypes.
// NOTE(review): the declaration of `writer` is not visible in this excerpt.
251 gl_sarray apply(gl_sarray a, gl_sarray b, function_closure_info fn) {
252 if (a.dtype() != b.dtype())
253 log_and_throw(
"Types of both SArrays must match.");
// Unpack the closure into a callable taking two flexible_type values.
255 auto actual_fn = variant_get_value< std::function<double(const flexible_type, const flexible_type)> >(fn);
// Walk both arrays in lockstep.
258 auto ar = a.range_iterator();
259 auto br = b.range_iterator();
// NOTE(review): only `ita` is compared against its end; if `b` yields fewer
// elements than `a`, `itb` would be advanced and dereferenced past its end.
// No length check is visible here -- confirm one exists or add it.
260 for (
auto ita = ar.begin(), itb = br.begin(); ita != ar.end(); ++ita, ++itb) {
// Each result is written with 0 as the second argument to writer.write.
261 writer.write(actual_fn(*ita, *itb), 0);
263 return writer.close();
// Docstring registrations for the distance functions defined above.
// NOTE(review): the docstrings for jaccard, weighted_jaccard and
// gaussian_kernel mention only dictionaries, yet those functions also call
// convert_list_pair_to_sparse (gaussian_kernel via compute_distance, which
// additionally has a std::vector<double> path). The text may be narrower than
// the supported inputs -- confirm and widen if so.
269 REGISTER_DOCSTRING(euclidean, "Compute the Euclidean distance between two dictionaries or two lists of equal length.");
271 REGISTER_DOCSTRING(squared_euclidean, "Compute the squared Euclidean distance between two dictionaries or two lists of equal length.");
273 REGISTER_DOCSTRING(cosine, "Compute the cosine distance between two dictionaries or two lists of equal length.");
275 REGISTER_DOCSTRING(transformed_dot_product, "Compute the dot product between two dictionaries or two lists of equal length.");
277 REGISTER_DOCSTRING(manhattan, "Compute the Manhattan distance between two dictionaries or two lists of equal length.");
279 REGISTER_DOCSTRING(levenshtein, "Compute the Levenshtein distance between two strings.");
281 REGISTER_DOCSTRING(jaccard, "Compute the Jaccard distance between two dictionaries.");
283 REGISTER_DOCSTRING(gaussian_kernel, "Compute the Gaussian distance between two dictionaries.");
285 REGISTER_DOCSTRING(weighted_jaccard, "Compute the weighted Jaccard distance between two dictionaries.");
// Declaration only: returns a vector of turi::toolkit_function_specification.
// The definition is not visible in this excerpt.
293 std::vector<
turi::toolkit_function_specification> get_toolkit_function_registration();
// The registration macros below are defined with empty replacement lists here.
// NOTE(review): these definitions come after the REGISTER_DOCSTRING uses
// above, so they look like reference stubs rather than part of the header --
// confirm.
#define REGISTER_DOCSTRING(function, docstring)
#define BEGIN_FUNCTION_REGISTRATION
#define END_FUNCTION_REGISTRATION
#define REGISTER_FUNCTION(function,...)
// NOTE(review): the next two lines have no `typedef`/`using` keyword and no
// terminating semicolon; presumably they sketch the shapes of flex_dict
// (vector of flexible_type pairs) and flex_list (vector of flexible_type).
std::vector< std::pair< flexible_type, flexible_type > > flex_dict
std::vector< flexible_type > flex_list
// Debug assertion macro, defined with an empty replacement list here.
#define DASSERT_TRUE(cond)