Turi Create  4.0
nearest_neighbors.hpp
1 /* Copyright © 2017 Apple Inc. All rights reserved.
2  *
3  * Use of this source code is governed by a BSD-3-clause license that can
4  * be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
5  */
6 #ifndef TURI_NEAREST_NEIGHBORS_H_
7 #define TURI_NEAREST_NEIGHBORS_H_
8 
9 // Types
10 #include <core/storage/sframe_data/sframe.hpp>
11 #include <core/data/flexible_type/flexible_type.hpp>
12 #include <model_server/lib/variant_deep_serialize.hpp>
13 
14 // Data structure utils
15 #include <core/storage/sframe_data/sframe_iterators.hpp>
16 #include <toolkits/ml_data_2/ml_data.hpp>
17 #include <toolkits/ml_data_2/metadata.hpp>
18 #include <toolkits/ml_data_2/row_slicing_utilities.hpp>
19 
20 // Interfaces
21 #include <model_server/lib/toolkit_function_specification.hpp>
22 #include <model_server/lib/variant.hpp>
23 #include <model_server/lib/unity_base_types.hpp>
24 #include <model_server/lib/extensions/ml_model.hpp>
25 #include <toolkits/util/algorithmic_utils.hpp>
26 #include <toolkits/supervised_learning/supervised_learning_utils-inl.hpp>
27 
28 #include <toolkits/nearest_neighbors/distance_functions.hpp>
29 
30 #include <core/export.hpp>
31 #include <Eigen/Core>
32 #include <Eigen/SparseCore>
33 
34 namespace turi {
35 namespace nearest_neighbors {
36 
37 typedef std::tuple<std::vector<std::string>, function_closure_info, double> dist_component_type;
38 
39 }
40 }
41 
42 BEGIN_OUT_OF_PLACE_SAVE(arc, turi::nearest_neighbors::dist_component_type, d) {
43  std::map<std::string, turi::variant_type> data;
44  data["column_names"] = turi::to_variant(std::get<0>(d));
45  data["weight"] = turi::to_variant(std::get<2>(d));
46  variant_deep_save(data, arc);
47  arc << std::get<1>(d);
48 } END_OUT_OF_PLACE_SAVE()
49 
50 BEGIN_OUT_OF_PLACE_LOAD(arc, turi::nearest_neighbors::dist_component_type, d) {
51  std::map<std::string, turi::variant_type> data;
52  variant_deep_load(data, arc);
53  std::vector<std::string> column_names;
54  function_closure_info distance_info;
55  double weight;
56 #define __EXTRACT(var) var = variant_get_value<decltype(var)>(data.at(#var));
57  __EXTRACT(column_names);
58  __EXTRACT(weight);
59 #undef __EXTRACT
60  arc >> distance_info;
61  d = std::make_tuple(column_names, distance_info, weight);
62 
63 } END_OUT_OF_PLACE_LOAD()
64 
65 namespace turi {
66 namespace nearest_neighbors {
67 
68 static constexpr size_t NONE_FLAG = (size_t) -1;
69 
70 enum class row_type {dense, sparse, flex_type};
71 
72 struct dist_component {
73  std::vector<std::string> column_names;
74  std::shared_ptr<distance_metric> distance;
75  double weight;
76  v2::row_slicer slicer;
77  row_type row_sparsity;
78 };
79 
80 class neighbor_candidates;
81 
82 
83 // -----------------------------------------------------------------------------
84 // NEAREST NEIGHBORS HELPER FUNCTIONS
85 // -----------------------------------------------------------------------------
86 
87 
88 /**
89  * Convert the index of a flat array into row and column indices for an upper
90  * triangular matrix. The general idea for the algorithm is from this
91  * StackOverflow thread:
92  * http://stackoverflow.com/questions/242711/algorithm-for-index-numbers-of-triangular-matrix-coefficients
93  *
94  * \param i size_t Index in the flat array.
95  * \param n size_t Number of rows and columns in the upper triangular matrix.
96  *
97  * \return std::pair<size_t, size_t> Row and column index in the upper
98  * triangular matrix.
99  */
100  std::pair<size_t, size_t> upper_triangular_indices(const size_t i,
101  const size_t n);
102 
103 /**
104  * Extract a distance function's name.
105  */
106 std::string extract_distance_function_name(
107  const function_closure_info distance_fn);
108 
109 /**
110  * Figure out how many memory blocks to break the reference and query datasets
111  * into, based on the number of data points and the maximum number of points in
112  * a memory block.
113  *
114  * Assume that each block has the same number of query and reference rows (r).
115  * Each thread loads into memory a reference block with 8 * dimension * r bytes
116  * and a distance matrix of 8 * r^2 bytes. This function simply uses to
117  * quadratic formula to figure out the upper bound on r.
118  *
119  * One copy of each query block is also loaded into memory sequentially, but
120  * this is ignored.
121  *
122  * \param num_ref_examples size_t Number of total reference data points.
123  * \param num_query_examples size_t Number of total reference query points.
124  * \param dimension size_t Number of (unpacked) features in each data point.
125  * \param max_thread_memory Max memory to use in each thread (in bytes).
126  * \param min_ref_blocks Lower bound on the number of reference blocks to use.
127  * \param min_query_blocks Lower bound on the number of reference blocks to use.
128  *
129  * \return num_blocks A pair of block sizes, (num_ref_blocks, num_query_blocks)
130  */
131 std::pair<size_t, size_t> calculate_num_blocks(const size_t num_ref_examples,
132  const size_t num_query_examples,
133  const size_t dimension,
134  const size_t max_thread_memory,
135  const size_t min_ref_blocks,
136  const size_t min_query_blocks);
137 
138 /**
139  * Read data from an ml_data object into a dense matrix, in parallel.
140  */
141 void parallel_read_data_into_matrix(const v2::ml_data& dataset, DenseMatrix& A,
142  const size_t block_start,
143  const size_t block_end);
144 
145 /**
146  * Read data from an ml_data object into a dense matrix, single threaded.
147  */
148 void read_data_into_matrix(const v2::ml_data& dataset, DenseMatrix& A,
149  const size_t block_start, const size_t block_end);
150 
151 /**
152  * Find the query nearest neighbors for a block of queries and a block of
153  * reference data.
154  *
155  * \param R DenseMatrix reference data. Each row is a reference example.
156  * \param Q DenseMatrix query data. Each row is a query example.
157  * \param neighbors std::vector<neighbor_candidates> output container for the
158  * results.
159  * \param ref_offset size_t The index value where the reference data starts,
160  * assuming the reference data is one block from a larger set.
161  * \param query_offset size_t The index value where the query data starts,
162  * assuming the query data is one block from a larger set.
163  */
164 void find_block_neighbors(const DenseMatrix& R, const DenseMatrix& Q,
165  std::vector<neighbor_candidates>& neighbors,
166  const std::string& dist_name,
167  const size_t ref_offset, const size_t query_offset);
168 
169 /**
170  * Find the nearest neighbors for each point in a block of reference data.
171  * Update the nearest neighbor heaps for both the rows and columns in the
172  * resulting distance matrix (unlike the blockwise query, which only worries
173  * about the rows).
174  *
175  * \param R DenseMatrix Block of reference data corresponding to distance matrix
176  * rows.
177  * \param C DenseMatrix Block of reference data corresponding to distance matrix
178  * columns.
179  * \param neighbors std::vector<neighbor_candidates> output container for the
180  * results.
181  * \param row_offset size_t The index value where the row data starts,
182  * assuming the data is one block from a larger set.
183  * \param col_offset size_t The index value where the column data starts,
184  * assuming the data is one block from a larger set.
185  */
186 void off_diag_block_similarity_graph(const DenseMatrix& R, const DenseMatrix& C,
187  std::vector<neighbor_candidates>& neighbors,
188  const std::string& dist_name,
189  const size_t row_offset,
190  const size_t col_offset);
191 
192 /**
193  * Write nearest neighbors results stored in a vector of heaps to a stacked
194  * SFrame.
195  */
196 sframe write_neighbors_to_sframe(
197  std::vector<nearest_neighbors::neighbor_candidates>& neighbors,
198  const std::vector<flexible_type>& reference_labels,
199  const std::vector<flexible_type>& query_labels);
200 
201 /**
202  * Append nearest neighbors results stored in a vector of heaps to a sframe.
203  */
204 void append_neighbors_to_sframe(
205  sframe& result,
206  std::vector<nearest_neighbors::neighbor_candidates>& neighbors,
207  const std::vector<flexible_type>& reference_labels,
208  const std::vector<flexible_type>& query_labels);
209 
210 
211 
212 // -----------------------------------------------------------------------------
213 // NEAREST NEIGHBORS MODEL CLASS
214 // -----------------------------------------------------------------------------
215 
216 /**
217  * Nearest neighbors model base class
218  * -----------------------------------------------------------------------------
219  *
220  * Base class for computing k-nearest neighbors queries, inherited by both the
221  * ball tree and LSH structure. Each nearest neighbors model contains the
222  * following;
223  *
224  * - metadata:
225  * A globally consistent object with column wise metadata. This metadata
226  * changes with time (even after training). If you want to freeze the
227  * metadata after training, you have to do so yourself.
228  *
229  * - label_metadata:
230  * A globally consistent object with target metadata. For nearest neighbors
231  * this is merely used to index reference and query rows; it is not
232  * predicted as in supervised learning.
233  *
234  * - num_examples:
235  * Number of rows in the reference data.
236  *
237  * - num_features:
238  * Number of features in the reference and query data. This counts dense
239  * and sparse vectors as single features.
240  *
241  * - num_variables:
242  * Number of variables in the reference and query data. Dense and sparse
243  * vectors are counted according to their lengths.
244  *
245  *
246  * The following functions should always be implemented in a
247  * nearest_neighbors_model.
248  *
249  * - supervised_learning_clone:
250  * Clone objects to this base class type.
251  *
252  * - name:
253  * Get the name of this model. The unity_server can construct
254  * model_base objects and they can be cast to a model of this type. The name
255  * determine how the casting happens. The init_models() function in
256  * unity_server.cpp will give you an idea of how this interface happens.
257  *
258  * - train:
259  * A train function for the model. The result of this function should be a
260  * model object that is trained and has state updated so that a
261  * caller can use get_training_stats() to get back the stats that were
262  * collected during training.
263  *
264  * - predict:
265  * A predict function for the model for batch predictions. The result of
266  * this function can be an SArray of predictions. One for each value of the
267  * input SFrame.
268  *
269  * - evaluate:
270  * An evaluattion function for the model for evaluations. The result of this
271  * function must be an updated evaluation_stats map which can be queried
272  * with the get_evaluation_stats().
273  *
274  * - save:
275  * Save the model with the turicreate iarc. Turi is a server-client
276  * module. DO NOT SAVE ANYTHING in the client side. Make sure that
277  * everything is in the server side. For example: You might be tempted do
278  * keep options that the user provides into the server side but DO NOT do
279  * that because save and load will break things for you!
280  *
281  * - load:
282  * Load the model with the turicreate oarc.
283  *
284  * - init_options:
285  * Initialize the options with the option manager.
286  *
287 */
288 class EXPORT nearest_neighbors_model : public ml_model_base {
289 
290  protected:
291 
292  std::map<std::string, flexible_type> train_stats;
293  std::shared_ptr<v2::ml_metadata> metadata;
294  v2::ml_data mld_ref;
295  bool is_dense;
296  size_t num_examples = 0; // Number of records in the reference set
297  std::vector<dist_component> composite_distances = {};
298  std::vector<dist_component_type> composite_params = {};
299  std::map<std::string, v2::ml_column_mode> untranslated_cols; // Map of columns that should not be translated by ml_data
300  std::vector<flexible_type> reference_labels;
301 
302  /**
303  * Methods that may be overriden in derived classes.
304  * ---------------------------------------------------------------------------
305  */
306  public:
307 
308  nearest_neighbors_model();
309 
310  virtual ~nearest_neighbors_model(){}
311 
312  /**
313  * Create a nearest neighbors reference object without input reference labels.
314  */
315  virtual void train(const sframe& X,
316  const std::vector<dist_component_type>& composite_distance_params,
317  const std::map<std::string, flexible_type>& opts);
318 
319  /**
320  * Create a nearest neighbors reference object.
321  */
322  virtual void train(const sframe& X, const sframe& ref_labels,
323  const std::vector<dist_component_type>& composite_distance_params,
324  const std::map<std::string, flexible_type>& opts);
325 
326  /**
327  * Create a nearest neighbors reference object.
328  */
329  virtual void train(const sframe& X, const std::vector<flexible_type>& ref_labels,
330  const std::vector<dist_component_type>& composite_distance_params,
331  const std::map<std::string, flexible_type>& opts) = 0;
332 
333  /**
334  * Search a nearest neighbors reference object for neighbors to a set of query
335  * points, without input query row labels.
336  *
337  * \param[in] X query data (features only)
338  * \param[in] k number of neighbors to return for each query
339  * \param[in] radius distance threshold to call a reference point a neighbor
340  *
341  * \returns ret Shared pointer to an SFrame containing query results.
342  *
343  * \note Already assumes that data is of the right shape.
344  */
345  virtual sframe query(const sframe& X, const size_t k,
346  const double radius) const;
347 
348  /**
349  * Search a nearest neighbors reference object for neighbors to a set of query
350  * points.
351  *
352  * \param[in] X query data (features only)
353  * \param[in] query labels row labels for the query dataset
354  * \param[in] k number of neighbors to return for each query
355  * \param[in] radius distance threshold to call a reference point a neighbor
356  *
357  * \returns ret Shared pointer to an SFrame containing query results.
358  *
359  * \note Already assumes that data is of the right shape.
360  */
361  virtual sframe query(const sframe& X, const sframe& query_labels,
362  const size_t k, const double radius) const;
363 
364  /**
365  * Search a nearest neighbors reference object for neighbors to a set of query
366  * points.
367  *
368  * \param[in] X query data (features only)
369  * \param[in] query labels row labels for the query dataset
370  * \param[in] k number of neighbors to return for each query
371  * \param[in] radius distance threshold to call a reference point a neighbor
372  *
373  * \returns ret Shared pointer to an SFrame containing query results.
374  *
375  * \note Already assumes that data is of the right shape.
376  */
377  virtual sframe query(const sframe& X,
378  const std::vector<flexible_type>& query_labels,
379  const size_t k, const double radius) const;
380 
381  /**
382  * Search a nearest neighbors reference object for neighbors to a set of query
383  * points (in ml_data format).
384  *
385  * \param[in] mld_queries query data in ml_data format
386  * \param[in] query labels row labels for the query dataset
387  * \param[in] k number of neighbors to return for each query
388  * \param[in] radius distance threshold to call a reference point a neighbor
389  * \param[in] include_self_edges if false, don't include results where the
390  * query index and the reference index are the same.
391  *
392  * \returns ret Shared pointer to an SFrame containing query results.
393  *
394  * \note Already assumes that data is of the right shape.
395  */
396  virtual sframe query(const v2::ml_data& mld_queries,
397  const std::vector<flexible_type>& query_labels,
398  const size_t k, const double radius,
399  const bool include_self_edges) const = 0;
400 
401 
402  /**
403  * Search a nearest neighbors reference object for the neighbors of every
404  * point.
405  *
406  * \param[in] k number of neighbors to return for each query.
407  * \param[in] radius distance threshold to call a reference point a neighbor.
408  * \param[in] include_self_edges if false, don't include results where the
409  * query index and the reference index are the same.
410  *
411  * \returns ret Shared pointer to an SFrame containing query results.
412  */
413  virtual sframe similarity_graph(const size_t k, const double radius,
414  const bool include_self_edges) const;
415 
416  /**
417  * Set the model options. Use the option manager to set these options. The
418  * option manager should throw errors if the options do not satisfy the option
419  * manager's conditions.
420  *
421  * \param[in] opts Options to set
422  */
423  virtual void init_options(const std::map<std::string,flexible_type>& _opts) = 0;
424 
425 
426  /**
427  * Gets the model version number
428  */
429  virtual size_t get_version() const = 0;
430 
431  /**
432  * Serialize the model object.
433  */
434  virtual void save_impl(turi::oarchive& oarc) const = 0;
435 
436  /**
437  * Load the model object.
438  */
439  virtual void load_version(turi::iarchive& iarc, size_t version) = 0;
440 
441 
442 
443  /**
444  * Methods that should not be overriden in derived classes.
445  * -------------------------------------------------------------------------
446  */
447  public:
448 
449  /**
450  * Get training stats.
451  *
452  * \returns The training stat map.
453  */
454  std::map<std::string, flexible_type> get_training_stats() const;
455 
456  /**
457  * Get names of predictor variables.
458  *
459  * \returns Names of predictors (Vector of string names).
460  */
461  std::vector<std::string> get_feature_names() const;
462 
463  /**
464  * Get metadata object.
465  *
466  * \returns Metadata object.
467  */
468  std::shared_ptr<v2::ml_metadata> get_metadata() const;
469 
470  /**
471  * Check the query schema against the create schema.
472  * \param[in] X SFrame
473  *
474  */
475  void check_schema_for_query(const sframe& X) const;
476 
477  /**
478  * Check if input data is empty.
479  * \param[in] X SFrame
480  */
481  void check_empty_data(const sframe& X) const;
482 
483  /*
484  * Check for missing values in the untranslated columns, aka string features.
485  * Assumes the training data is already set in the model, as 'mld_ref'.
486  * \param[in] X SFrame
487  */
488  void check_missing_strings(const sframe& X) const;
489 
490  /**
491  * Initialize the reference ml_data object in the model, and set metadata in
492  * the model's state for visibility to Python.
493  */
494  void initialize_model_data(const sframe& X,
495  const std::vector<flexible_type>& ref_labels);
496 
497  /**
498  * Initialize each distance function in the set of distance
499  * components.
500  */
501  void initialize_distances();
502 
503  /**
504  *
505  * Validates feature types for each distance function in the set of distance
506  * components.
507 
508  * \param[in] composite_params std::vector<dist_component_type> Specifications
509  * for each component of a composite distance.
510  *
511  * \param[in] X SFrame All input features, i.e. the union over features
512  * specified in composite params.
513  *
514  * \param[in] y SFrame Label data.
515  */
516  void validate_distance_components(const std::vector<dist_component_type>& composite_params,
517  const sframe& X);
518 
519  /**
520  * Check that the feature types are valid for a particular distance component.
521  * Return the row data type.
522  *
523  * \param[in] column_names std::vector<std::string>
524  * \param[in] X SFrame
525  * \param[in] distance_name std::string
526  * \param[in] weight double
527  *
528  */
529  void validate_distance_component(const std::vector<std::string> column_names,
530  const sframe& X,
531  const function_closure_info distance_name,
532  const double weight);
533 
534  /**
535  * Populates the distance field for the summary struct.
536  */
537  void populate_distance_for_summary_struct(
538  const std::vector<dist_component_type>& composite_distance_params);
539 
540  /**
541  * Get reference data as a vector of vectors
542  * \returns Reference data as a vector of vectors (in ml-data form)
543  */
544  flexible_type get_reference_data() const;
545 
546 };
547 
548 // -----------------------------------------------------------------------------
549 // CANDIDATE NEIGHBORS CLASS
550 // -----------------------------------------------------------------------------
551 /**
552  * Class that holds nearest neighbors candidates
553  * -----------------------------------------------------------------------------
554  * Users may specify a maximum number of neighbors to return (i.e. k) or a
555  * maximum radius within which all neighbors should be returned (i.e. radius),
556  * or neigther, or both. Each of these four cases has slightly different
557  * behavior, which this class encapsulates to make the nearest neighbor models
558  * and methods easier to write and use.
559  *
560  * The model contains the following attributes:
561  * - k:
562  * Maximum number of neighbors to return.
563  *
564  * - radius:
565  * Max distance for a query point to be considered a neighbor of the
566  * reference point.
567  *
568  * - include_self_edges:
569  * If set to 'false', neighbors with the same index as the object's label
570  * are excluded from the results.
571  *
572  * - candidates:
573  * Data structure that holds candidate neighbors. The baseline structure is
574  * a vector of pairs. Each pair contains a double distance to the query
575  * point, and an int index of the candidate neighbor. If 'k' is specified,
576  * a heap is constructed on top of this vector.
577  *
578  * The model contains the following methods:
579  * - evaluate_point:
580  * Evaluate a new point as a neighbor candidate. Each of the four settings
581  * for 'k' and 'radius' yield different decisions on when to add a point as
582  * a candidate. If 'k' is specified and the heap is full, this also pops
583  * off the furthest point in the candidates vector.
584  *
585  * - print_candidates:
586  * Print all of the candidates with logprogress_stream.
587  *
588  * - sort_candidates:
589  * Sort the candidates, from smallest to largest distances (the first
590  * element of each pair in the candidates vector/heap).
591  *
592  * - get_max_dist:
593  * Return the maximum distance in the current set of candidates.
594  */
595 class neighbor_candidates {
596 
597  protected:
598 
599  size_t label = (size_t) -1;
600  bool include_self_edges = true;
601  size_t k = (size_t) -1;
602  double radius = -1.0;
603  simple_spinlock heap_lock;
604 
605  public:
606 
607  // each candidate is both an index and distance
608  std::vector<std::pair<double, size_t>> candidates;
609 
610  neighbor_candidates(size_t lbl, size_t a, double b, bool c);
611 
612  ~neighbor_candidates();
613 
614  /**
615  * Set the label.
616  */
617  void set_label(size_t label);
618 
619  /**
620  * Get the label
621  */
622  size_t get_label() const;
623 
624  /**
625  * Accessor for the max number of neighbors (i.e. k).
626  */
627  size_t get_max_neighbors() const;
628 
629  /**
630  * Accessor for the radius.
631  */
632  double get_radius() const;
633 
634  /**
635  * Evaluate a specified reference point as a nearest neighbor candidate.
636  *
637  * \param[in] point std::pair<double, int> Reference point to consider.
638  * Consists of the distance to the query point and the index of the reference
639  * point.
640  */
641  void evaluate_point(const std::pair<double, size_t>& point) GL_HOT_FLATTEN;
642 
643  /**
644  * Print all of the current candidates.
645  */
646  void print_candidates() const;
647 
648  /**
649  * Sort candidates.
650  */
651  void sort_candidates();
652 
653  /**
654  * Return the max distance of the current candidates. Note: returns -1.0 if the
655  * candidates vector/heap is empty.
656  */
657  double get_max_dist() const;
658 };
659 
660 /**
661  * Function to get the reference data from the NN model
662  *
663  * \param[in] model Nearest neighbour model.
664  */
665 flexible_type _nn_get_reference_data(std::shared_ptr<nearest_neighbors_model> model);
666 
667 } // namespace nearest_neighbors
668 } // namespace turi
669 
670 #endif
#define BEGIN_OUT_OF_PLACE_LOAD(arc, tname, tval)
Macro to make it easy to define out-of-place loads.
Definition: iarchive.hpp:314
#define GL_HOT_FLATTEN
The serialization input archive object which, provided with a reference to an istream, will read from the istream, providing deserialization capabilities.
Definition: iarchive.hpp:60
void variant_deep_load(variant_type &v, iarchive &iarc)
void variant_deep_save(const variant_type &v, oarchive &oarc)
variant_type to_variant(const T &f)
Definition: variant.hpp:308
The serialization output archive object which, provided with a reference to an ostream, will write to the ostream, providing serialization capabilities.
Definition: oarchive.hpp:80
#define BEGIN_OUT_OF_PLACE_SAVE(arc, tname, tval)
Macro to make it easy to define out-of-place saves.
Definition: oarchive.hpp:346