Turi Create  4.0
kmeans.hpp
1 /* Copyright © 2017 Apple Inc. All rights reserved.
2  *
3  * Use of this source code is governed by a BSD-3-clause license that can
4  * be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
5  */
6 #ifndef TURI_KMEANS
7 #define TURI_KMEANS
8 
9 // Types
10 #include <Eigen/Core>
11 #include <Eigen/SparseCore>
12 #include <core/storage/sframe_data/sframe.hpp>
13 #include <core/data/sframe/gl_sarray.hpp>
14 #include <core/parallel/atomic.hpp>
15 #include <core/parallel/pthread_tools.hpp>
16 #include <core/parallel/lambda_omp.hpp>
17 #include <core/data/flexible_type/flexible_type.hpp>
18 #include <core/generics/symmetric_2d_array.hpp>
19 #include <core/globals/globals.hpp>
20 
21 // ML Data utils
22 #include <toolkits/ml_data_2/ml_data.hpp>
23 #include <toolkits/ml_data_2/ml_data_iterators.hpp>
24 
25 // Interfaces
26 #include <model_server/lib/extensions/ml_model.hpp>
27 #include <model_server/lib/extensions/option_manager.hpp>
28 #include <model_server/lib/variant_deep_serialize.hpp>
29 #include <core/globals/globals.hpp>
30 #include <toolkits/supervised_learning/supervised_learning_utils-inl.hpp>
31 
32 // Miscellaneous
33 #include <model_server/lib/toolkit_util.hpp>
34 #include <core/storage/sframe_interface/unity_sframe.hpp>
35 #include <core/logging/table_printer/table_printer.hpp>
36 #include <core/export.hpp>
37 
38 
39 namespace turi {
40 namespace kmeans {
41 
42 
43 typedef Eigen::Matrix<double, Eigen::Dynamic, 1> dense_vector;
44 typedef Eigen::SparseVector<double> sparse_vector;
45 
46 
47 /**
48  * ----------------
49  * Helper functions
50  * ----------------
51  */
52 
53 /**
54  * Make sure the dataset is not empty.
55  *
56  * \param X Input dataset.
57  */
58 void check_empty_data(const sframe& X);
59 
60 /**
61  * Check that the feature types are valid for the kmeans model.
62  *
63  * \param X Input dataset.
64  */
65 void check_column_types(const sframe& X);
66 
67 
68 
69 /**
70  * ----------------------
71  * Definition of clusters
72  * ----------------------
73  */
74 struct cluster {
75 
76  // Data
77  dense_vector center;
78  atomic<size_t> count = 0;
79  turi::mutex m;
80 
81 
82  // Methods
83  cluster(size_t dimension): center(dense_vector(dimension)), count(0) {
84  center.setZero();
85  };
86 
87  cluster() = delete;
88  cluster(const cluster& other): center(other.center), count(other.count) {};
89  cluster& operator=(const cluster& other);
90 
91  /**
92  * Safe mean update that avoids overflow. See
93  * http://www.johndcook.com/standard_deviation.html
94  */
95  void safe_update_center(const dense_vector& u);
96 };
97 
98 
99 /**
100  * ------------------------
101  * Kmeans clustering model
102  * ------------------------
103  *
104  * Kmeans clustering model. By default, the model uses the KMeans++ algorithm to
105  * choose initial cluster centers, although users may also pass custom initial
106  * centers. This implementation uses the implementation of Elkan (2003), which
107  * takes advantage of the triangle inequality to reduce the number of distance
108  * computations. In addition to storing the n x 1 vectors of cluster assignments
109  * and distances from each point to its assigned cluster center (necessary for
110  * any Kmeans implementation), the Elkan algorithm also requires computation and
111  * storage of all pairwise distances between cluster centers.
112  *
113  * \note This implementation does *not* currently use the second lemma from the
114  * Elkan (2003) paper, which further reduces the number of exact distance
115  * computations by storing the lower bound on the distance between every point
116  * and every cluster center. This n x K matrix is generally too big to store in
117  * memory and too slow to write to as an SFrame.
118  *
119  * The Kmeans model contains the following private data objects:
120  *
121  * mldata: The data, in ml_data_2 form. Each row is an observation. After the
122  * model is trained, this member is no longer needed, so it is not serialized
123  * when the model is saved.
124  *
125  * num_examples: Number of points.
126  *
127  * assignments: Cluster assignment for each data point.
128  *
129  * clusters: Vector of cluster structs, each of which contains a center vector,
130  * a count of assigned points, and a mutex lock, so the cluster can safely be
131  * updated in parallel.
132  *
133  * num_clusters: Number of clusters, set by the user.
134  *
135  * max_iterations: Maximum iterations of the main Kmeans loop, excluding
136  * initial selection of the centers.
137  *
138  * upper_bounds: For every point, an upper bound on the distance from the point
139  * to its currently assigned center. Whenever the distance between a point
140  * and its assigned center is computed exactly, then this bound is tight, but
141  * the bounds also must be adjusted at the end of each iteration to account
142  * for the movement of the assigned center. Despite this adjustment, the
143  * upper bound often remains small enough to avoid computing the exact
144  * distance to other candidate centers.
145  *
146  * center_dists: Exact distance between all pairs of cluster centers. Computing
147  * this K x K matrix allows us to use the triangle inequality to avoid
148  * computing all n x K point-to-center distances in every iteration.
149  *
150  *
151  * The Kmeans model stores the following members in its *state* object, which is
152  * exposed to the Python API:
153  *
154  * options: Option manager which keeps track of default options, current
155  * options, option ranges, type etc. This must be initialized only once in
156  * the init_options() function.
157  *
158  * training_time: Run time of the algorithm, in seconds.
159  *
160  * training_iterations: Number of iterations of the main Kmeans loop. If the
161  * algorithm does not converge, this is equal to 'max_iterations'.
162  *
163  *
164  * In addition, the following public methods return information from a trained
165  * Kmeans model:
166  *
167  * get_cluster_assignments: Returns an SFrame with the fields "row_id",
168  * "cluster_id", and "distance" for input data point. The "cluster_id" field
169  * (integer) contains the cluster assignment of the data point, and the
170  * "distance" field (float) contains the Euclidean distance of the data point
171  * to the assigned cluster's center.
172  *
173  * get_cluster_info: Returns an SFrame with metadata about each cluster. For
174  * each cluster, the output SFrame includes the features describing the
175  * center, the number of points assigned to the cluster, and the
176  * within-cluster sum of squared Euclidean distances from points to the
177  * center.
178  */
179 class EXPORT kmeans_model : public ml_model_base {
180 
181  // Data objects and attributes
182  v2::ml_data mldata;
183  std::shared_ptr<v2::ml_metadata> metadata;
184  size_t num_examples = 0;
185 
186  // Model items
187  std::vector<size_t> assignments;
188  std::vector<cluster> clusters;
189  size_t num_clusters = 0;
190  size_t max_iterations = 0;
191  size_t batch_size = 1;
192  std::vector<flexible_type> row_labels;
193  std::string row_label_name;
194 
195  // Training objects
196  std::vector<float> upper_bounds;
197  turi::symmetric_2d_array<float> center_dists;
198 
199  /**
200  * Initialize the model's members.
201  *
202  * \param X Input dataset.
203  * \param row_labels Row labels, either flex_type string or flex_type int.
204  */
205  void initialize_model_data(const sframe& X,
206  const std::vector<flexible_type>& row_labels,
207  const std::string row_label_name);
208 
209  /**
210  * Initialize the point assignments and the bounds on distances between points
211  * and cluster centers. Uses the triangle inequality and pairwise cluster
212  * center distances to eliminate unnecessary distance computations.
213  */
214  void assign_initial_clusters_elkan();
215 
216  /**
217  * Choose random initial cluster centers, with a modified version of the
218  * k-means++ method. For the first cluster center, sample uniformly at random
219  * from data set. For the remaining clusters, sample proportional to the
220  * distance between each point the distance to its nearest existing center.
221  * For this toolkit we actually read a sample of the data into memory first,
222  * for much faster random access of the chosen centers.
223  */
224  void choose_random_centers();
225 
226  /**
227  * High-memory version of main Kmeans iterations, using Elkan's algorithm.
228  *
229  * \returns iter Number of iterations in the main training loop.
230  *
231  * \note: Uses only lemma 1 from Elkan (2003), which uses the triangle
232  * inequality and distances between cluster centers to avoid distance
233  * computations. Does *not* use the lower bounds between all points and all
234  * distances.
235  */
236  size_t compute_clusters_elkan();
237 
238  /**
239  * Minibatch version of main Kmeans iterations, using the D. Sculley
240  * algorithm (http://www.eecs.tufts.edu/~dsculley/papers/fastkmeans.pdf).
241  *
242  * \returns iter Number of iterations in the main training loop.
243  */
244  size_t compute_clusters_minibatch();
245 
246  /**
247  * Low-memory version of main Kmeans iterations, using Lloyd's algorithm.
248  *
249  * \returns iter Number of iterations in the main training loop.
250  */
251  size_t compute_clusters_lloyd();
252 
253  /**
254  * Set custom initial centers in the model.
255  *
256  * \param init_centers Custom initial centers, in an SFrame with the same
257  * metadata as the input dataset.
258  */
259  void process_custom_centers(const sframe& init_centers);
260 
261  /**
262  * Compute distances between all pairs of cluster centers. The result is
263  * stored in the model's 'center_dists' member.
264  */
265  void compute_center_distances();
266 
267  /**
268  * Update cluster centers to be the means of the currently assigned points.
269  */
270  void update_cluster_centers();
271 
272  /**
273  * Update distance bounds based on the distance between each cluster center's
274  * previous and current locations.
275  *
276  * \param previous_clusters Previous iteration's clusters.
277  */
278  void adjust_distance_bounds(const std::vector<cluster>& previous_clusters);
279 
280  /**
281  * Compute the exact distance between each point and its assigned cluster.
282  * Store in the 'upper_bounds' member, which for low-memory clustering is the
283  * exact distance anyway.
284  */
285  void set_exact_point_distances();
286 
287  /**
288  * Update the cluster assignments based on the current cluster means and
289  * return the number of assignments that changed. Uses cluster center
290  * distances to eliminate many exact point-to-center distance computations.
291  *
292  * \return num_chaged Number of points whose cluster assignment changed.
293  */
294  size_t update_assignments_elkan();
295 
296  /**
297  * Update cluster assignments based on current cluster means and return the
298  * number of assignments that changed. Computes all point-to-center distances.
299  *
300  * \return num_chaged Number of points whose cluster assignment changed.
301  *
302  * \note: 'upper_bounds' for the low-memory version (i.e. Lloyd's algorithm) are
303  * actually exact distances between each point and its assigned cluster's
304  * center.
305  */
306  size_t update_assignments_lloyd();
307 
308 
309 public:
310  static constexpr size_t KMEANS_VERSION = 4;
311 
312  /**
313  * Constructor
314  */
315  kmeans_model();
316 
317  /**
318  * Destructor.
319  */
320  ~kmeans_model();
321 
322  /**
323  * Set the model options. The option manager should throw errors if the
324  * options do not satisfy the option manager's conditions.
325  */
326  void init_options(const std::map<std::string, flexible_type>& _opts) override;
327 
328  /**
329  * Train the kmeans model, without row labels.
330 
331  * \param X Input data. Each row is an observation.
332  * \param init_centers Custom initial centers provided by the user.
333  * \param method Indicate if Lloyd's algorithm should be used
334  * instead of Elkan's. Lloyd's is generally substantially slower, but uses
335  * very little memory, while Elkan's requires storage of all pairwise cluster
336  * center distances.
337  */
338  void train(const sframe& X, const sframe& init_centers, std::string method,
339  bool allow_categorical = false);
340 
341  /**
342  * Train the kmeans model, with row labels.
343 
344  * \param X Input data. Each row is an observation.
345  * \param init_centers Custom initial centers provided by the user.
346  * \param use_naive_method Indicate if Lloyd's algorithm should be used
347  * instead of Elkan's. Lloyd's is generally substantially slower, but uses
348  * very little memory, while Elkan's requires storage of all pairwise cluster
349  * center distances.
350  * \param row_labels Flexible type row labels.
351  * \param row_label_name Name of the row label column.
352  */
353  void train(const sframe& X,
354  const sframe& init_centers,
355  std::string method,
356  const std::vector<flexible_type>& row_labels,
357  const std::string row_label_name,
358  bool allow_categorical = false);
359 
360  /**
361  * Predict the cluster assignment for new data, according to a trained Kmeans
362  * model.
363  *
364  * \param X Input dataset.
365  * \return out SFrame with row index, cluster assignment and distance to
366  * assigned cluster center (similar to `get_cluster_assignments`).
367  */
368  sframe predict(const sframe& X);
369 
370  /**
371  * Write cluster assigments to an SFrame and return. Also records the row
372  * index of the point and the distance from the point to its assigned cluster.
373  *
374  * \returns out SFrame with row index, cluster assignment, and distance to
375  * assigned cluster center for each input data point. cluster's center.
376  */
377  sframe get_cluster_assignments();
378 
379  /**
380  * Write cluster metadata to an SFrame and return. Records the features for
381  * each cluster, the count of assigned points, and the within-cluster sum of
382  * squared distances.
383  *
384  * \returns out SFrame with metadata about each cluster, including the center
385  * vector, count of assigned points, and within-cluster sum of squared
386  * Euclidean distances.
387  */
388  sframe get_cluster_info();
389 
390  /**
391  * Get the model version number.
392  *
393  * Version map
394  * ===========
395  * GLC version -> Kmeans version
396  * ----------- --------------
397  * <= 1.3 1
398  * 1.4 2
399  * 1.5 3
400  * 1.9 4
401  */
402  inline size_t get_version() const override { return KMEANS_VERSION; }
403 
404  /**
405  * Serialize the model.
406  */
407  void save_impl(turi::oarchive& oarc) const override;
408 
409  /**
410  * De-serialize the model.
411  */
412  void load_version(turi::iarchive& iarc, size_t version) override;
413 
414  // TODO: convert interface above to use the extensions methods here
418 
419 }; // kmeans_model class
420 
421 
422 
423 } // namespace kmeans
424 } // namespace turi
425 #endif
#define BEGIN_CLASS_MEMBER_REGISTRATION(python_facing_classname)
#define REGISTER_CLASS_MEMBER_FUNCTION(function,...)
The serialization input archive object which, provided with a reference to an istream, will read from the istream, providing deserialization capabilities.
Definition: iarchive.hpp:60
void safe_update_center(const dense_vector &u)
#define END_CLASS_MEMBER_REGISTRATION
std::vector< std::string > list_fields()
The serialization output archive object which, provided with a reference to an ostream, will write to the ostream, providing serialization capabilities.
Definition: oarchive.hpp:80
size_t get_version() const override
Definition: kmeans.hpp:402