Turi Create  4.0
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
categorical_imputer.hpp
1 /* Copyright © 2017 Apple Inc. All rights reserved.
2  *
3  * Use of this source code is governed by a BSD-3-clause license that can
4  * be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
5  */
6 #ifndef categorical_imputer_INDEXER_H_
7 #define categorical_imputer_INDEXER_H_
8 #include <string>
9 #include <unordered_map>
10 #include <unordered_set>
11 
12 #include <core/export.hpp>
13 
14 #include <model_server/lib/toolkit_class_macros.hpp>
15 #include <toolkits/feature_engineering/transformer_base.hpp>
16 
17 namespace turi {
18 namespace sdk_model {
19 namespace feature_engineering {
20 
/**
 * Feature-engineering transformer (derives from transformer_base) that
 * imputes a single categorical feature column.
 *
 * Judging by the helper declarations below, the approach is: cluster the
 * rows with kmeans, give each cluster a label by argmax, and run label
 * propagation over a centroid distance graph when some centroids remain
 * unlabeled. The orchestration itself lives in the implementation file,
 * which is not visible here -- confirm the exact order there.
 *
 * Everything declared before `public:` is private (class default access).
 */
21 class EXPORT categorical_imputer : public transformer_base {
22 
23  // Version of the imputer (presumably the value reported by get_version();
    // confirm in the implementation file)
24  static constexpr size_t CATEGORICAL_IMPUTER_VERSION = 0;
    // Names of working columns; the "__internal__" prefix marks them as
    // internal to the imputer.
25  static constexpr char CLUSTER_ID[] = "__internal__cluster_id";
26  static constexpr char CLUSTER_DISTANCE[] = "__internal__cluster_centroid_distance";
27  static constexpr char LABEL_COUNT[] = "__internal__label_count";
28  static constexpr char MAX_LABEL[] = "__internal__max_label";
29  static constexpr char FIXED_LABEL[] = "__internal__fixed_label";
30  static constexpr char COUNT_OF_LABELS[] = "__internal__count_of_labels";
31  static constexpr char MAX_OF_LABELS[] = "__internal__max_of_labels";
    // Column-name prefixes; presumably prepended to the imputed column's name
    // to form the output columns -- confirm in the implementation file.
32  static constexpr char PREDICTED_COLUMN_PREFIX[] = "predicted_feature_";
33  static constexpr char PROBABILITY_COLUMN_PREFIX[] = "feature_probability_";
34 
35  // Map from internal-label to user-label
36  std::unordered_map<int64_t, flexible_type> label_map;
37 
38  // Map from user-label to internal-label
39  std::unordered_map<flexible_type, int64_t> reverse_label_map;
40 
41  // Features valid for label propagation
42  std::unordered_set<std::string> label_propagation_features_set;
43 
44  // Was fit() called?
45  bool fitted = false;
46 
47  // User-provided inputs
48  flexible_type dependent_feature_columns; // Columns to use as features
49  std::string feature_column; // Column to impute
50  flex_type_enum feature_column_type; // Type of the column to impute
51  bool exclude = false; // Are some columns to be excluded?
52  bool verbose = false; // Verbose output?
53 
54  /**
55  * Utility method to convert an sframe into a gl_sframe
56  *
57  * \param sframe An SFrame
58  *
59  * \returns a gl_sframe wrapping the SFrame
60  */
61  gl_sframe from_sframe(const sframe& sframe);
62 
63  /**
64  * Utility method to retrieve the index of a column in an SFrame
65  *
66  * \param sframe An SFrame
67  * \param column_name Column to get the index of
68  * \returns The index of the column_name column if found in the SFrame,
69  * otherwise (size_t)(-1).
70  */
71  size_t get_column_index(const gl_sframe& sframe,
72  const std::string& column_name);
73 
74  /**
75  * Calls the kmeans toolkit and assigns a cluster ID to each user-provided
76  * row of data.
77  *
78  * \param data The user-supplied SFrame of data, containing all the columns
79  * \param use_centroids Use the provided centroids as starting points
80  * \param gl_clustered_user_data (output) The user data with cluster IDs
81  * \param gl_centroids (output) Centroid IDs and features
82  */
83  void call_kmeans(
84  gl_sframe data,
85  bool use_centroids,
86  gl_sframe* gl_clustered_user_data,
87  gl_sframe* gl_centroids);
88 
89  /**
90  * Use ARGMAX to assign a label to each cluster computed by kmeans
91  *
92  * \param gl_clustered_user_data User-provided data with cluster_id column
93  * \param gl_centroids Centroids with features
94  * \param gl_centroid_with_label (output) Centroids with computed label
95  */
96  void compute_cluster_argmax_label(
97  gl_sframe gl_clustered_user_data,
98  gl_sframe gl_centroids,
99  gl_sframe* gl_centroid_with_label);
100 
101  /**
102  * Returns whether all the centroids have an assigned label. If they
103  * all do, we don't need to perform label propagation.
104  *
105  * \param gl_centroid_with_label Centroids marked with labels
106  * \returns True if all centroids have an assigned label
107  */
108  bool all_centroids_labeled(gl_sframe gl_centroid_with_label);
109 
110  /**
111  * Renames the cluster labels from the user-provided labels to
112  * numbers from [0, N) as required by the label_propagation toolkit
113  *
114  * \param gl_centroid_with_label A new column "fixed_label" is added
115  */
116  void rename_labels(gl_sframe* gl_centroid_with_label);
117 
118  /**
119  * Builds the distance graph between every centroid, allowing us to run
120  * label propagation between the vertices
121  *
122  * \param gl_centroid_with_label Centroids with fixed labels
123  * \returns An SGraph with every centroid-centroid distance computed
124  */
125  gl_sgraph build_distance_graph(gl_sframe gl_centroid_with_label);
126 
127  /**
128  * Calls the label_propagation toolkit to fill in the missing labels
129  * for all centroids.
130  *
131  * \param centroid_graph Centroid graph, output of build_distance_graph
132  * \returns An SFrame with centroid and propagated labels
133  */
134  gl_sframe call_label_propagation(gl_sgraph centroid_graph);
135 
136  /**
137  * The output of the label propagation module is one probability for
138  * every possible label for each row; we really just want the probability
139  * for the selected label. This method generates that column with the
140  * probability of the chosen label.
141  *
142  * \param label_propagation_output Output of label propagation, a new
143  * column (label_probability) will be added
144  * \returns An SFrame with Cluster Id, Predicted Label and Label Probability
145  */
146  gl_sframe get_prediction_probability(gl_sframe* label_propagation_output);
147 
148  /**
149  * Joins the user-provided data with the computed labels, making sure to
150  * keep the user labels instead of centroid labels as well as translating
151  * the labels back into user-provided label-space (instead of [0, N) ).
152  *
153  * \param gl_clustered_user_data User data with Cluster ID column
154  * \param clusters_with_predictions Output of get_prediction_probability
155  * \returns The User's SFrame with the predicted_label and label_probability
156  * columns added.
157  */
158  gl_sframe join_user_data_and_predictions(
159  gl_sframe gl_clustered_user_data,
160  gl_sframe clusters_with_predictions);
161 
162  /**
163  * In the case where every centroid has a label attached, we can skip the
164  * label propagation step and join the user data with the kmeans output.
165  *
166  * \param gl_clustered_user_data User data with Cluster IDs
167  * \param gl_centroid_with_label Cluster IDs with labels
168  * \returns The User's SFrame with the predicted_label and label_probability
169  * columns added.
170  */
171  gl_sframe join_user_data_and_kmeans_output(
172  gl_sframe gl_clustered_user_data,
173  gl_sframe gl_centroid_with_label);
174 
175  public:
176 
177  /**
178  * Methods that must be implemented in a new transformer model.
179  * -------------------------------------------------------------------------
180  */
181 
182  virtual inline ~categorical_imputer() {}
183 
184  /**
185  * Set one of the options in the model. Use the option manager to set
186  * these options. If the option does not satisfy the conditions that the
187  * option manager has imposed on it, errors will be thrown.
188  *
189  * \param[in] _options Options to set
190  */
191  void init_options(const std::map<std::string, flexible_type>&_options) override;
192 
193  /**
194  * Get a version for the object.
195  */
196  size_t get_version() const override;
197 
198  /**
199  * Save the object using Turi's oarc.
200  */
201  void save_impl(turi::oarchive& oarc) const override;
202 
203  /**
204  * Load the object using Turi's iarc.
205  */
206  void load_version(turi::iarchive& iarc, size_t version) override;
207 
208 
209  /**
210  * Initialize the transformer.
211  */
212  void init_transformer(const std::map<std::string, flexible_type>& _options) override;
213 
214  /**
215  * Fit the transformer to the given data.
216  *
217  * \param[in] data (SFrame of data)
218  */
219  void fit(gl_sframe data) override;
220 
221  /**
222  * Transform the given data.
223  *
224  * \param[in] data (SFrame of data)
225  *
226  * Python side interface
227  * ------------------------
228  * This function directly interfaces with "transform" in python.
229  *
230  */
231  gl_sframe transform(gl_sframe data) override;
232 
233  /**
234  * Fit and transform the given data. Intended as an optimization because
235  * fit and transform are usually called together. This implementation
236  * materializes the data, calls fit, and then returns transform(data).
237  *
238  * \param[in] data (SFrame of data)
239  */
240  gl_sframe fit_transform(gl_sframe data) {
241  data.materialize();
242  fit(data);
243  return transform(data);
244  }
245 
246  // Functions that all transformers need to register. Can be copied verbatim
247  // for other classes.
248  // --------------------------------------------------------------------------
249  BEGIN_CLASS_MEMBER_REGISTRATION("_CategoricalImputer")
250  REGISTER_CLASS_MEMBER_FUNCTION(categorical_imputer::init_transformer, "_options");
251  REGISTER_CLASS_MEMBER_FUNCTION(categorical_imputer::fit, "data");
252  REGISTER_CLASS_MEMBER_FUNCTION(categorical_imputer::fit_transform, "data");
253  REGISTER_CLASS_MEMBER_FUNCTION(categorical_imputer::transform, "data");
254  REGISTER_CLASS_MEMBER_FUNCTION(categorical_imputer::get_current_options);
255  REGISTER_CLASS_MEMBER_FUNCTION(categorical_imputer::list_fields);
256  REGISTER_NAMED_CLASS_MEMBER_FUNCTION("_get_default_options",
257  categorical_imputer::get_default_options);
    // NOTE(review): listing lines 258 and 260 are absent from this extract, so
    // the fragment below has lost its opening macro call and no
    // END_CLASS_MEMBER_REGISTRATION is visible before the class closes.
    // Restore both from the original header before compiling.
259  categorical_imputer::get_value_from_state, "key");
261 
262 };
263 
264 
265 } // feature_engineering
266 } // sdk_model
267 } // turi
268 #endif
#define BEGIN_CLASS_MEMBER_REGISTRATION(python_facing_classname)
#define REGISTER_CLASS_MEMBER_FUNCTION(function,...)
The serialization input archive object which, provided with a reference to an istream, will read from the istream, providing deserialization capabilities.
Definition: iarchive.hpp:60
#define END_CLASS_MEMBER_REGISTRATION
#define REGISTER_NAMED_CLASS_MEMBER_FUNCTION(name, function,...)
The serialization output archive object which, provided with a reference to an ostream, will write to the ostream, providing serialization capabilities.
Definition: oarchive.hpp:80
void transform(S &&input, T &&output, TransformFn transformfn, std::set< size_t > constraint_segments=std::set< size_t >())
Definition: algorithm.hpp:64