6 #ifndef TURI_NEAREST_NEIGHBORS_H_ 7 #define TURI_NEAREST_NEIGHBORS_H_ 10 #include <core/storage/sframe_data/sframe.hpp> 11 #include <core/data/flexible_type/flexible_type.hpp> 12 #include <model_server/lib/variant_deep_serialize.hpp> 15 #include <core/storage/sframe_data/sframe_iterators.hpp> 16 #include <toolkits/ml_data_2/ml_data.hpp> 17 #include <toolkits/ml_data_2/metadata.hpp> 18 #include <toolkits/ml_data_2/row_slicing_utilities.hpp> 21 #include <model_server/lib/toolkit_function_specification.hpp> 22 #include <model_server/lib/variant.hpp> 23 #include <model_server/lib/unity_base_types.hpp> 24 #include <model_server/lib/extensions/ml_model.hpp> 25 #include <toolkits/util/algorithmic_utils.hpp> 26 #include <toolkits/supervised_learning/supervised_learning_utils-inl.hpp> 28 #include <toolkits/nearest_neighbors/distance_functions.hpp> 30 #include <core/export.hpp> 32 #include <Eigen/SparseCore> 35 namespace nearest_neighbors {
37 typedef std::tuple<std::vector<std::string>, function_closure_info,
double> dist_component_type;
// Tail of an out-of-place save routine (closed by END_OUT_OF_PLACE_SAVE).
// NOTE(review): the opening BEGIN_OUT_OF_PLACE_SAVE(...) line and listing
// lines 44-46 are not present in this text; the leading integers are listing
// line numbers fused into the code and are not valid C++ tokens.
43 std::map<std::string, turi::variant_type> data;
// Streams tuple element 1 of `d` directly into the archive `arc`.
47 arc << std::get<1>(d);
48 } END_OUT_OF_PLACE_SAVE()
// Tail of an out-of-place load routine (closed by END_OUT_OF_PLACE_LOAD).
// NOTE(review): the opening BEGIN_OUT_OF_PLACE_LOAD(...) line and several
// listing lines (52, 55, 58-60, 62) are not present in this text; the leading
// integers are listing line numbers fused into the code.
51 std::map<std::string, turi::variant_type> data;
53 std::vector<std::string> column_names;
54 function_closure_info distance_info;
// __EXTRACT(var) assigns `var` from data.at("var"), converted with
// variant_get_value to the declared type of `var`.
56 #define __EXTRACT(var) var = variant_get_value<decltype(var)>(data.at(#var)); 57 __EXTRACT(column_names);
// Rebuilds the tuple from the three locals.
// NOTE(review): `weight` is used here but its declaration is not visible.
61 d = std::make_tuple(column_names, distance_info, weight);
63 } END_OUT_OF_PLACE_LOAD()
66 namespace nearest_neighbors {
/**
 * Sentinel value: -1 converted to size_t, i.e. the largest size_t value.
 */
static constexpr size_t NONE_FLAG = static_cast<size_t>(-1);
/**
 * Row representation tag with three enumerators: dense, sparse, flex_type.
 * Used as the type of dist_component::row_sparsity.
 */
enum class row_type { dense, sparse, flex_type };
// Aggregate describing one distance component: the columns it applies to,
// a shared distance_metric, a row slicer, and a row_type tag.
// NOTE(review): listing lines 75 and 78 are absent from this text, so a
// member and the closing `};` may be missing; the leading integers are
// listing line numbers fused into the code.
72 struct dist_component {
73 std::vector<std::string> column_names;
74 std::shared_ptr<distance_metric> distance;
76 v2::row_slicer slicer;
77 row_type row_sparsity;
/** Forward declaration; the class body appears later in this file. */
class neighbor_candidates;
// Declaration returning a pair of size_t values.
// NOTE(review): the parameter list is cut off after `i,` in this text; the
// remaining parameters and the closing `);` are not visible. The leading
// integer is a listing line number fused into the code.
100 std::pair<size_t, size_t> upper_triangular_indices(
const size_t i,
106 std::string extract_distance_function_name(
107 const function_closure_info distance_fn);
/**
 * Computes a pair of block counts from the data sizes and a memory limit.
 * Presumably (reference blocks, query blocks), judging by the parameter
 * names; confirm the ordering against the definition.
 *
 * \param num_ref_examples    number of reference examples
 * \param num_query_examples  number of query examples
 * \param dimension           dimension of the data
 * \param max_thread_memory   memory limit per thread (units not shown here)
 * \param min_ref_blocks      lower bound on the number of reference blocks
 * \param min_query_blocks    lower bound on the number of query blocks
 */
std::pair<size_t, size_t> calculate_num_blocks(
    const size_t num_ref_examples,
    const size_t num_query_examples,
    const size_t dimension,
    const size_t max_thread_memory,
    const size_t min_ref_blocks,
    const size_t min_query_blocks);
141 void parallel_read_data_into_matrix(
const v2::ml_data& dataset, DenseMatrix& A,
142 const size_t block_start,
143 const size_t block_end);
148 void read_data_into_matrix(
const v2::ml_data& dataset, DenseMatrix& A,
149 const size_t block_start,
const size_t block_end);
164 void find_block_neighbors(
const DenseMatrix& R,
const DenseMatrix& Q,
165 std::vector<neighbor_candidates>& neighbors,
166 const std::string& dist_name,
167 const size_t ref_offset,
const size_t query_offset);
186 void off_diag_block_similarity_graph(
const DenseMatrix& R,
const DenseMatrix& C,
187 std::vector<neighbor_candidates>& neighbors,
188 const std::string& dist_name,
189 const size_t row_offset,
190 const size_t col_offset);
196 sframe write_neighbors_to_sframe(
197 std::vector<nearest_neighbors::neighbor_candidates>& neighbors,
198 const std::vector<flexible_type>& reference_labels,
199 const std::vector<flexible_type>& query_labels);
// Declaration taking neighbor candidates plus reference and query labels.
// NOTE(review): listing line 205 is absent from this text, so a leading
// parameter may be missing from the visible list; the leading integers are
// listing line numbers fused into the code.
204 void append_neighbors_to_sframe(
206 std::vector<nearest_neighbors::neighbor_candidates>& neighbors,
207 const std::vector<flexible_type>& reference_labels,
208 const std::vector<flexible_type>& query_labels);
288 class EXPORT nearest_neighbors_model :
public ml_model_base {
292 std::map<std::string, flexible_type> train_stats;
293 std::shared_ptr<v2::ml_metadata> metadata;
296 size_t num_examples = 0;
297 std::vector<dist_component> composite_distances = {};
298 std::vector<dist_component_type> composite_params = {};
299 std::map<std::string, v2::ml_column_mode> untranslated_cols;
300 std::vector<flexible_type> reference_labels;
308 nearest_neighbors_model();
310 virtual ~nearest_neighbors_model(){}
315 virtual void train(
const sframe& X,
316 const std::vector<dist_component_type>& composite_distance_params,
317 const std::map<std::string, flexible_type>& opts);
322 virtual void train(
const sframe& X,
const sframe& ref_labels,
323 const std::vector<dist_component_type>& composite_distance_params,
324 const std::map<std::string, flexible_type>& opts);
329 virtual void train(
const sframe& X,
const std::vector<flexible_type>& ref_labels,
330 const std::vector<dist_component_type>& composite_distance_params,
331 const std::map<std::string, flexible_type>& opts) = 0;
345 virtual sframe query(
const sframe& X,
const size_t k,
346 const double radius)
const;
361 virtual sframe query(
const sframe& X,
const sframe& query_labels,
362 const size_t k,
const double radius)
const;
377 virtual sframe query(
const sframe& X,
378 const std::vector<flexible_type>& query_labels,
379 const size_t k,
const double radius)
const;
396 virtual sframe query(
const v2::ml_data& mld_queries,
397 const std::vector<flexible_type>& query_labels,
398 const size_t k,
const double radius,
399 const bool include_self_edges)
const = 0;
413 virtual sframe similarity_graph(
const size_t k,
const double radius,
414 const bool include_self_edges)
const;
423 virtual void init_options(
const std::map<std::string,flexible_type>& _opts) = 0;
429 virtual size_t get_version()
const = 0;
439 virtual void load_version(
turi::iarchive& iarc,
size_t version) = 0;
454 std::map<std::string, flexible_type> get_training_stats()
const;
461 std::vector<std::string> get_feature_names()
const;
468 std::shared_ptr<v2::ml_metadata> get_metadata()
const;
475 void check_schema_for_query(
const sframe& X)
const;
481 void check_empty_data(
const sframe& X)
const;
488 void check_missing_strings(
const sframe& X)
const;
494 void initialize_model_data(
const sframe& X,
495 const std::vector<flexible_type>& ref_labels);
/** Initialises the distance state; takes no arguments (non-const member). */
void initialize_distances();
// Declaration whose first parameter is the vector of distance components.
// NOTE(review): the text is cut off after the first parameter (trailing
// comma, no closing `);`); the remaining parameters are not visible. The
// leading integer is a listing line number fused into the code.
516 void validate_distance_components(
const std::vector<dist_component_type>& composite_params,
// Declaration taking one component's column names, distance closure and weight.
// NOTE(review): listing line 530 is absent from this text, so a parameter
// between `column_names` and `distance_name` may be missing; the leading
// integers are listing line numbers fused into the code.
529 void validate_distance_component(
const std::vector<std::string> column_names,
531 const function_closure_info distance_name,
532 const double weight);
537 void populate_distance_for_summary_struct(
538 const std::vector<dist_component_type>& composite_distance_params);
544 flexible_type get_reference_data()
const;
595 class neighbor_candidates {
599 size_t label = (size_t) -1;
600 bool include_self_edges =
true;
601 size_t k = (size_t) -1;
602 double radius = -1.0;
603 simple_spinlock heap_lock;
608 std::vector<std::pair<double, size_t>> candidates;
610 neighbor_candidates(
size_t lbl,
size_t a,
double b,
bool c);
612 ~neighbor_candidates();
/**
 * Sets the label.
 *
 * \param label  new label value
 */
void set_label(size_t label);
622 size_t get_label()
const;
627 size_t get_max_neighbors()
const;
632 double get_radius()
const;
641 void evaluate_point(
const std::pair<double, size_t>& point)
GL_HOT_FLATTEN;
646 void print_candidates()
const;
/** Sorts the candidates; the ordering is not shown here. Non-const. */
void sort_candidates();
657 double get_max_dist()
const;
665 flexible_type _nn_get_reference_data(std::shared_ptr<nearest_neighbors_model> model);
#define BEGIN_OUT_OF_PLACE_LOAD(arc, tname, tval)
Macro to make it easy to define out-of-place loads.
The serialization input archive object which, provided with a reference to an istream, will read from the istream, providing deserialization capabilities.
void variant_deep_load(variant_type &v, iarchive &iarc)
void variant_deep_save(const variant_type &v, oarchive &oarc)
variant_type to_variant(const T &f)
The serialization output archive object which, provided with a reference to an ostream, will write to the ostream, providing serialization capabilities.
#define BEGIN_OUT_OF_PLACE_SAVE(arc, tname, tval)
Macro to make it easy to define out-of-place saves.