11 #include <Eigen/SparseCore> 12 #include <core/storage/sframe_data/sframe.hpp> 13 #include <core/data/sframe/gl_sarray.hpp> 14 #include <core/parallel/atomic.hpp> 15 #include <core/parallel/pthread_tools.hpp> 16 #include <core/parallel/lambda_omp.hpp> 17 #include <core/data/flexible_type/flexible_type.hpp> 18 #include <core/generics/symmetric_2d_array.hpp> 19 #include <core/globals/globals.hpp> 22 #include <toolkits/ml_data_2/ml_data.hpp> 23 #include <toolkits/ml_data_2/ml_data_iterators.hpp> 26 #include <model_server/lib/extensions/ml_model.hpp> 27 #include <model_server/lib/extensions/option_manager.hpp> 28 #include <model_server/lib/variant_deep_serialize.hpp> 29 #include <core/globals/globals.hpp> 30 #include <toolkits/supervised_learning/supervised_learning_utils-inl.hpp> 33 #include <model_server/lib/toolkit_util.hpp> 34 #include <core/storage/sframe_interface/unity_sframe.hpp> 35 #include <core/logging/table_printer/table_printer.hpp> 36 #include <core/export.hpp> 43 typedef Eigen::Matrix<double, Eigen::Dynamic, 1> dense_vector;
/// Alias for an Eigen sparse vector with `double` entries.
44 typedef Eigen::SparseVector<double> sparse_vector;
/// Declaration only; the definition is not part of this listing.
///
/// \param X  Input table, taken by const reference.
///
/// Returns nothing. Presumably rejects an empty dataset (by name) --
/// TODO confirm the failure mode against the definition.
58 void check_empty_data(
const sframe& X);
/// Declaration only; the definition is not part of this listing.
///
/// \param X  Input table, taken by const reference.
///
/// Returns nothing. Presumably validates the column types of `X` (by name) --
/// TODO confirm which types are accepted and how violations are reported.
65 void check_column_types(
const sframe& X);
/// Atomic counter member, default-initialized to 0. Both visible `cluster`
/// constructors also set it explicitly (to 0, or to the source object's value).
/// Presumably the number of points currently assigned to the cluster --
/// TODO confirm against the code that updates it.
78 atomic<size_t> count = 0;
83 cluster(
size_t dimension): center(dense_vector(dimension)), count(0) {
/// Copy constructor: copies `center` from `other` and initializes `count`
/// from `other.count`.
///
/// NOTE(review): presumably user-provided because the atomic `count` member
/// cannot be copied implicitly -- confirm against the project's `atomic` type.
/// NOTE(review): the `;` after the closing brace is a redundant empty
/// declaration; harmless, but may warn under pedantic flags.
88 cluster(
const cluster& other): center(other.center), count(other.count) {};
// Model state members. The meanings noted below are inferred from names and
// types only; confirm each against the code that reads and writes the member.

/// Shared handle to the `v2::ml_metadata` object associated with the model.
183 std::shared_ptr<v2::ml_metadata> metadata;
/// Defaults to 0; presumably the number of rows in the training data.
184 size_t num_examples = 0;
/// One `size_t` per entry; presumably the cluster index for each row -- TODO confirm.
187 std::vector<size_t> assignments;
/// The `cluster` objects held by the model.
188 std::vector<cluster> clusters;
/// Defaults to 0; presumably the requested number of clusters.
189 size_t num_clusters = 0;
/// Defaults to 0; presumably the cap on training iterations.
190 size_t max_iterations = 0;
/// Defaults to 1; presumably the mini-batch size used by the minibatch method.
191 size_t batch_size = 1;
/// Row labels stored as `flexible_type` values.
192 std::vector<flexible_type> row_labels;
/// Name associated with the row labels (presumably the label column name).
193 std::string row_label_name;
/// Single-precision values; presumably per-point distance upper bounds used by
/// the Elkan variant -- TODO confirm.
196 std::vector<float> upper_bounds;
/// Declaration only; the definition is not part of this listing.
///
/// \param X               Input table, by const reference.
/// \param row_labels      Row labels, by const reference.
/// \param row_label_name  Name for the row labels, passed by const value.
///
/// Returns nothing. Presumably populates the model's data-dependent members
/// from these arguments -- TODO confirm.
///
/// NOTE(review): `row_label_name` is taken by const value, which copies the
/// string on every call. Switching to `const std::string&` would avoid the
/// copy but changes the function type, so it must be done together with the
/// out-of-line definition.
205 void initialize_model_data(
const sframe& X,
206 const std::vector<flexible_type>& row_labels,
207 const std::string row_label_name);
/// Declaration only. Takes no arguments and returns nothing, so any effect is
/// on model state. Presumably the initial point-to-cluster assignment for the
/// Elkan variant (by name) -- TODO confirm.
214 void assign_initial_clusters_elkan();
/// Declaration only. Takes no arguments and returns nothing. Presumably picks
/// initial cluster centers at random (by name) -- TODO confirm the sampling
/// scheme against the definition.
224 void choose_random_centers();
/// Declaration only. Takes no arguments and returns a `size_t`.
/// Presumably runs the Elkan training loop; the meaning of the return value
/// (e.g. iterations performed) is not visible here -- TODO confirm.
236 size_t compute_clusters_elkan();
/// Declaration only. Takes no arguments and returns a `size_t`.
/// Presumably runs the mini-batch training loop; the meaning of the return
/// value is not visible here -- TODO confirm.
244 size_t compute_clusters_minibatch();
/// Declaration only. Takes no arguments and returns a `size_t`.
/// Presumably runs the Lloyd training loop; the meaning of the return value
/// is not visible here -- TODO confirm.
251 size_t compute_clusters_lloyd();
/// Declaration only; the definition is not part of this listing.
///
/// \param init_centers  Table of caller-supplied centers, by const reference.
///
/// Returns nothing. Presumably loads `init_centers` into the model's clusters
/// (by name) -- TODO confirm the expected schema of the table.
259 void process_custom_centers(
const sframe& init_centers);
/// Declaration only. Takes no arguments and returns nothing. Presumably
/// computes distances between cluster centers (by name) -- TODO confirm where
/// the results are stored.
265 void compute_center_distances();
/// Declaration only. Takes no arguments and returns nothing. Presumably
/// recomputes each cluster's center from the current assignments (by name) --
/// TODO confirm.
270 void update_cluster_centers();
/// Declaration only; the definition is not part of this listing.
///
/// \param previous_clusters  Clusters from an earlier state, by const reference.
///
/// Returns nothing. Presumably updates stored distance bounds to account for
/// center movement relative to `previous_clusters` -- TODO confirm.
278 void adjust_distance_bounds(
const std::vector<cluster>& previous_clusters);
/// Declaration only. Takes no arguments and returns nothing. Presumably
/// replaces bounded distances with exact point-to-center distances (by name)
/// -- TODO confirm.
285 void set_exact_point_distances();
/// Declaration only. Takes no arguments and returns a `size_t`.
/// Presumably reassigns points using the Elkan bounds and reports how many
/// assignments changed -- TODO confirm the return value's meaning.
294 size_t update_assignments_elkan();
/// Declaration only. Takes no arguments and returns a `size_t`.
/// Presumably reassigns every point to its nearest center and reports how many
/// assignments changed -- TODO confirm the return value's meaning.
306 size_t update_assignments_lloyd();
/// Compile-time version constant for this model; `get_version()` returns it.
/// Presumably bumped whenever the serialized format changes -- TODO confirm.
310 static constexpr
size_t KMEANS_VERSION = 4;
/// Declaration only; overrides a virtual from a base class that is not
/// visible in this listing.
///
/// \param _opts  Map from option name to `flexible_type` value, by const
///               reference.
///
/// Returns nothing. The set of recognized option names is not visible here.
326 void init_options(
const std::map<std::string, flexible_type>& _opts)
override;
/// Declaration only; overload without row labels.
///
/// \param X                  Input table, by const reference.
/// \param init_centers       Table of initial centers, by const reference.
/// \param method             Method selector string, passed by value; the
///                           accepted values are not visible here.
/// \param allow_categorical  Defaults to `false`; presumably permits
///                           categorical columns in `X` -- TODO confirm.
///
/// Returns nothing.
338 void train(
const sframe& X,
const sframe& init_centers, std::string method,
339 bool allow_categorical =
false);
/// Declaration only; overload that also accepts row labels.
///
/// \param X                  Input table, by const reference.
/// \param init_centers       Table of initial centers, by const reference.
/// \param row_labels         Row labels, by const reference.
/// \param row_label_name     Name for the row labels, passed by const value.
/// \param allow_categorical  Defaults to `false`; presumably permits
///                           categorical columns in `X` -- TODO confirm.
///
/// Returns nothing.
///
/// NOTE(review): the listing numbers embedded in this declaration jump from
/// 354 to 356, so a parameter line may have been dropped by the extraction
/// (the sibling overload takes a `std::string method` after `init_centers`).
/// Verify the full parameter list against the real header.
/// NOTE(review): `row_label_name` is copied on every call because it is taken
/// by const value; changing it requires updating the definition as well.
353 void train(
const sframe& X,
354 const sframe& init_centers,
356 const std::vector<flexible_type>& row_labels,
357 const std::string row_label_name,
358 bool allow_categorical =
false);
/// Declaration only. Takes no arguments and returns an `sframe` by value.
/// Presumably one row per training point with its assigned cluster -- TODO
/// confirm the column schema against the definition.
377 sframe get_cluster_assignments();
/// Declaration only. Takes no arguments and returns an `sframe` by value.
/// Presumably one row per cluster describing it -- TODO confirm the column
/// schema against the definition.
388 sframe get_cluster_info();
/// Returns the model's version constant, `KMEANS_VERSION`.
/// Overrides a virtual from a base class that is not visible in this listing.
402 inline size_t get_version()
const override {
return KMEANS_VERSION; }
#define BEGIN_CLASS_MEMBER_REGISTRATION(python_facing_classname)
#define REGISTER_CLASS_MEMBER_FUNCTION(function,...)
The serialization input archive object which, provided with a reference to an istream, will read from the istream, providing deserialization capabilities.
void safe_update_center(const dense_vector &u)
#define END_CLASS_MEMBER_REGISTRATION
std::vector< std::string > list_fields()
The serialization output archive object which, provided with a reference to an ostream, will write to the ostream, providing serialization capabilities.
size_t get_version() const override