Turi Create  4.0
lsh_family.hpp
1 /* Copyright © 2017 Apple Inc. All rights reserved.
2  *
3  * Use of this source code is governed by a BSD-3-clause license that can
4  * be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
5  */
6 #ifndef TURI_LSH_FAMILY_H_
7 #define TURI_LSH_FAMILY_H_
8 
9 #include <memory>
10 #include <toolkits/nearest_neighbors/nearest_neighbors.hpp>
11 #include <toolkits/nearest_neighbors/hash_map_container.hpp>
12 
13 namespace turi {
14 namespace nearest_neighbors {
15 
16 
17 class EXPORT lsh_family {
18  public:
19 
20  lsh_family() = default;
21  virtual ~lsh_family() {}
22 
23  // create a lsh_family pointer using dist_name ("euclidean", "cosine", ect.)
24  inline static std::shared_ptr<lsh_family> create_lsh_family(const std::string& dist_name);
25 
26  // indicator function: whether it is an asymmetric LSH or not
27  virtual bool is_asymmetric() const = 0;
28 
29  // distance type name
30  virtual std::string distance_type_name() const = 0;
31 
32  // initialize options
33  virtual void init_options(const std::map<std::string, flexible_type>& _opts);
34 
35  // One pass over the data to get some information about the data
36  // For example, for lsh_dot_product, we need to know the max norm of all the
37  // reference data.
38  virtual void pre_lsh(const v2::ml_data& mld_ref, bool is_sparse) {}
39 
40  // initialize the model. num_input_dimensions is needed
41  virtual void init_model(size_t num_dimensions) = 0;
42 
43  // add reference data one by one
44  // Only DenseVector and SparseVector are supported
45  template <typename T>
46  void add_reference_data(size_t ref_id, const T& t);
47 
48  // Return a set of candidates for the query vector
49  // Only DenseVector and SparseVector are supported
50  template <typename T>
51  std::vector<size_t> query(const T& t) const;
52 
53  // save & load
54  virtual void save(turi::oarchive& oarc) const;
55  virtual void load(turi::iarchive& iarc);
56 
57  protected:
58  // model initialized when add_reference_data is called in the first time
59  virtual std::vector<int> hash_vector_to_codes(const DenseVector& vec,
60  bool is_reference_data) const;
61  virtual std::vector<int> hash_vector_to_codes(const SparseVector& vec,
62  bool is_reference_data) const;
63 
64  protected:
65  size_t num_input_dimensions;
66  size_t num_tables;
67  size_t num_projections_per_table;
68  size_t num_projections;
69  std::vector<hash_map_container<size_t, std::vector<size_t>>> lookup_table;
70 };
71 
72 
73 /**
74  * LSH for Euclidean distance
75  */
76 class EXPORT lsh_euclidean : public lsh_family {
77  public:
78 
79  virtual bool is_asymmetric() const { return false; }
80 
81  virtual std::string distance_type_name() const {
82  return "euclidean";
83  }
84 
85  // Sample a subset of data points, get the average euclidean distance to
86  // initilize w
87  virtual void pre_lsh(const v2::ml_data& mld_ref, bool is_sparse);
88 
89  virtual void init_model(size_t num_dimensions);
90 
91  virtual void save(turi::oarchive& oarc) const;
92  virtual void load(turi::iarchive& iarc);
93 
94  protected:
95  virtual std::vector<int> hash_vector_to_codes(const DenseVector& vec,
96  bool is_reference_data) const;
97  virtual std::vector<int> hash_vector_to_codes(const SparseVector& vec,
98  bool is_reference_data) const;
99 
100  protected:
101  size_t w;
102  DenseMatrix rand_mat;
103  DenseVector rand_vec;
104 };
105 
106 /**
107  * LSH for squared_euclidean distance
108  * The only difference from euclidean happens when calculating the real
109  * distances
110  */
111 class EXPORT lsh_squared_euclidean final : public lsh_euclidean {
112  public:
113  std::string distance_type_name() const {
114  return "squared_euclidean";
115  }
116 };
117 
118 /**
119  * LSH for manhattan distance
120  * The only difference from euclidean is initialization
121  */
122 class EXPORT lsh_manhattan final : public lsh_euclidean {
123  public:
124 
125  std::string distance_type_name() const {
126  return "manhattan";
127  }
128 
129  // Sample a subset of data points, get the average manhattan distance to
130  // initilize w
131  void pre_lsh(const v2::ml_data& mld_ref, bool is_sparse);
132 
133  void init_model(size_t num_dimensions);
134 };
135 
136 /**
137  * LSH for Cosine distance
138  */
139 class EXPORT lsh_cosine final : public lsh_family {
140  public:
141 
142  bool is_asymmetric() const { return false; }
143 
144  std::string distance_type_name() const {
145  return "cosine";
146  }
147 
148  void init_model(size_t num_dimensions);
149 
150  void save(turi::oarchive& oarc) const;
151  void load(turi::iarchive& iarc);
152 
153  protected:
154  std::vector<int> hash_vector_to_codes(const DenseVector& vec,
155  bool is_reference_data) const;
156  std::vector<int> hash_vector_to_codes(const SparseVector& vec,
157  bool is_reference_data) const;
158 
159  private:
160  DenseMatrix rand_mat;
161 };
162 
163 /**
164  * LSH for Jaccard similarity
165  */
166 class EXPORT lsh_jaccard final : public lsh_family {
167  public:
168 
169  bool is_asymmetric() const { return false; }
170 
171  std::string distance_type_name() const {
172  return "jaccard";
173  }
174 
175  void init_model(size_t num_dimensions);
176 
177  void save(turi::oarchive& oarc) const;
178  void load(turi::iarchive& iarc);
179 
180  protected:
181  std::vector<int> hash_vector_to_codes(const DenseVector& vec,
182  bool is_reference_data) const;
183  std::vector<int> hash_vector_to_codes(const SparseVector& vec,
184  bool is_reference_data) const;
185 
186  // helper function
187  void fill_empty_bins(std::vector<int>& vec) const;
188 
189  private:
190  // This implements the one permutation MinHash
191  std::vector<size_t> rand_permutation;
192  std::vector<size_t> rand_sign;
193 };
194 
195 /**
196  * LSH for dot product
197  */
198 class EXPORT lsh_dot_product : public lsh_family {
199  public:
200 
201  bool is_asymmetric() const { return true; }
202 
203  virtual std::string distance_type_name() const {
204  return "dot_product";
205  }
206 
207  void init_model(size_t num_dimensions);
208 
209  void pre_lsh(const v2::ml_data& mld_ref, bool is_sparse);
210 
211  void save(turi::oarchive& oarc) const;
212  void load(turi::iarchive& iarc);
213 
214  protected:
215  std::vector<int> hash_vector_to_codes(const DenseVector& vec,
216  bool is_reference_data) const;
217  std::vector<int> hash_vector_to_codes(const SparseVector& vec,
218  bool is_reference_data) const;
219 
220  private:
221  double max_vec_norm;
222  DenseMatrix rand_mat;
223  DenseVector rand_vec; // rand_vec for the extra column
224 };
225 
226 class EXPORT lsh_transformed_dot_product : public lsh_dot_product {
227  std::string distance_type_name() const {
228  return "transformed_dot_product";
229  }
230 };
231 
232 std::shared_ptr<lsh_family> lsh_family::create_lsh_family(const std::string& dist_name) {
233  if (dist_name == "euclidean") {
234  return std::shared_ptr<lsh_family>(new lsh_euclidean);
235  } else if (dist_name == "squared_euclidean") {
236  return std::shared_ptr<lsh_family>(new lsh_squared_euclidean);
237  } else if (dist_name == "manhattan") {
238  return std::shared_ptr<lsh_family>(new lsh_manhattan);
239  } else if (dist_name == "cosine") {
240  return std::shared_ptr<lsh_family>(new lsh_cosine);
241  } else if (dist_name == "jaccard") {
242  return std::shared_ptr<lsh_family>(new lsh_jaccard);
243  } else if (dist_name == "dot_product") {
244  return std::shared_ptr<lsh_family>(new lsh_dot_product);
245  } else if (dist_name == "transformed_dot_product") {
246  return std::shared_ptr<lsh_family>(new lsh_transformed_dot_product);
247  } else {
248  log_and_throw(dist_name + std::string(" is not supported by LSH! Try another distance or method!"));
249  return nullptr;
250  }
251 }
252 
253 template <typename T>
254 void lsh_family::add_reference_data(size_t ref_id, const T& vec) {
255 
256  ASSERT_MSG(size_t(vec.size()) == num_input_dimensions,
257  "The input dimension does not match the previous ones!");
258 
259  auto hash_vec = hash_vector_to_codes(vec, true);
260  DASSERT_TRUE(hash_vec.size() == num_projections);
261 
262  parallel_for (0, num_tables, [&](size_t table_idx) {
263  auto hash_bucket_id = boost::hash_range(
264  hash_vec.begin() + table_idx * num_projections_per_table,
265  hash_vec.begin() + std::min((table_idx + 1) * num_projections_per_table, num_projections));
266 
267  lookup_table[table_idx].update(hash_bucket_id, [ref_id](std::vector<size_t>& v){
268  v.push_back(ref_id);
269  });
270  });
271 }
272 
273 template <typename T>
274 std::vector<size_t> lsh_family::query(const T& vec) const {
275 
276  ASSERT_MSG(size_t(vec.size()) == num_input_dimensions,
277  "The input num_dimensions does not match the reference data!");
278 
279  std::unordered_set<size_t> ret;
280  auto hash_vec = hash_vector_to_codes(vec, false);
281  DASSERT_TRUE(hash_vec.size() == num_projections);
282  for (size_t table_idx = 0; table_idx < num_tables; ++table_idx) {
283  auto hash_bucket_id = boost::hash_range(
284  hash_vec.begin() + table_idx * num_projections_per_table,
285  hash_vec.begin() + std::min((table_idx + 1) * num_projections_per_table, num_projections));
286 
287  const auto& candidates = lookup_table[table_idx].get(hash_bucket_id);
288  ret.insert(candidates.cbegin(), candidates.cend());
289  }
290 
291  std::vector<size_t> ret_vec(ret.begin(), ret.end());
292  return ret_vec;
293 }
294 
295 
296 } // namespace nearest_neighbors
297 } // namespace turi
298 
299 #endif
void parallel_for(size_t begin, size_t end, const FunctionType &fn)
Definition: lambda_omp.hpp:93
The serialization input archive object which, provided with a reference to an istream, will read from the istream, providing deserialization capabilities.
Definition: iarchive.hpp:60
The serialization output archive object which, provided with a reference to an ostream, will write to the ostream, providing serialization capabilities.
Definition: oarchive.hpp:80
#define DASSERT_TRUE(cond)
Definition: assertions.hpp:364