6 #ifndef categorical_imputer_INDEXER_H_ 7 #define categorical_imputer_INDEXER_H_ 9 #include <unordered_map> 10 #include <unordered_set> 12 #include <core/export.hpp> 14 #include <model_server/lib/toolkit_class_macros.hpp> 15 #include <toolkits/feature_engineering/transformer_base.hpp> 19 namespace feature_engineering {
21 class EXPORT categorical_imputer :
public transformer_base {
// Version number of this transformer; currently 0.
// NOTE(review): presumably the value reported by get_version() -- confirm in
// the implementation file.
24 static constexpr
size_t CATEGORICAL_IMPUTER_VERSION = 0;
// String constants carrying the "__internal__" prefix.
// NOTE(review): these look like names of intermediate SFrame columns created
// while imputing (cluster id, distance to centroid, label counts, ...) --
// confirm against the implementation before relying on that reading.
// NOTE(review): prior to C++17, a static constexpr data member that is
// odr-used also needs an out-of-class definition; confirm one exists if this
// header must build with an older standard.
25 static constexpr
char CLUSTER_ID[] =
"__internal__cluster_id";
26 static constexpr
char CLUSTER_DISTANCE[] =
"__internal__cluster_centroid_distance";
27 static constexpr
char LABEL_COUNT[] =
"__internal__label_count";
28 static constexpr
char MAX_LABEL[] =
"__internal__max_label";
29 static constexpr
char FIXED_LABEL[] =
"__internal__fixed_label";
30 static constexpr
char COUNT_OF_LABELS[] =
"__internal__count_of_labels";
31 static constexpr
char MAX_OF_LABELS[] =
"__internal__max_of_labels";
// Prefix strings without the "__internal__" marker.
// NOTE(review): presumably prepended to the imputed feature's name to form
// the user-visible output column names -- confirm in transform().
32 static constexpr
char PREDICTED_COLUMN_PREFIX[] =
"predicted_feature_";
33 static constexpr
char PROBABILITY_COLUMN_PREFIX[] =
"feature_probability_";
// Lookup from a 64-bit integer key to a flexible_type value.
// NOTE(review): presumably integer label id -> original label value; confirm.
36 std::unordered_map<int64_t, flexible_type> label_map;
// Lookup in the opposite direction: flexible_type key to 64-bit integer.
// NOTE(review): presumably the inverse of label_map; confirm both are kept
// in sync wherever they are populated.
39 std::unordered_map<flexible_type, int64_t> reverse_label_map;
// Set of strings.
// NOTE(review): judging by the name, feature/column names used during label
// propagation -- confirm.
42 std::unordered_set<std::string> label_propagation_features_set;
// Stored as a flexible_type rather than a concrete container.
// NOTE(review): presumably the list of columns the imputed feature depends
// on; the expected flexible_type variant is not visible here -- confirm.
48 flexible_type dependent_feature_columns;
// NOTE(review): presumably the name of the column being imputed -- confirm.
49 std::string feature_column;
// Declaration only: takes an sframe by const reference and returns a
// gl_sframe by value.
// NOTE(review): presumably a conversion/wrapping helper -- confirm in the
// implementation. Note that the parameter is named `sframe`, the same
// identifier as its type.
61 gl_sframe from_sframe(
const sframe& sframe);
// Declaration only: takes a gl_sframe and a column name, both by const
// reference, and returns a size_t.
// NOTE(review): presumably the position of `column_name` within the frame;
// behavior when the column is absent is not visible here -- confirm.
71 size_t get_column_index(
const gl_sframe& sframe,
72 const std::string& column_name);
86 gl_sframe* gl_clustered_user_data,
87 gl_sframe* gl_centroids);
// Declaration only: two gl_sframe inputs passed by value and one gl_sframe
// pointer; returns void.
// NOTE(review): `gl_centroid_with_label` looks like an output parameter that
// receives, per cluster, the most frequent label -- confirm, and confirm
// whether a null pointer is tolerated.
96 void compute_cluster_argmax_label(
97 gl_sframe gl_clustered_user_data,
98 gl_sframe gl_centroids,
99 gl_sframe* gl_centroid_with_label);
// Declaration only: takes a gl_sframe by value and returns bool.
// NOTE(review): by its name, reports whether every centroid row already has
// a label -- confirm in the implementation.
108 bool all_centroids_labeled(gl_sframe gl_centroid_with_label);
// Declaration only: takes a gl_sframe pointer and returns void, so any
// result must be delivered through the pointed-to frame or member state.
// NOTE(review): presumably remaps labels to integer ids using label_map /
// reverse_label_map -- confirm.
116 void rename_labels(gl_sframe* gl_centroid_with_label);
// Declaration only: takes a gl_sframe by value and returns a gl_sgraph.
// NOTE(review): presumably builds a graph over centroids weighted by
// distance -- confirm vertex/edge schema in the implementation.
125 gl_sgraph build_distance_graph(gl_sframe gl_centroid_with_label);
// Declaration only: takes a gl_sgraph by value and returns a gl_sframe.
// NOTE(review): presumably runs the label propagation toolkit on the
// centroid graph and returns its per-vertex output -- confirm.
134 gl_sframe call_label_propagation(gl_sgraph centroid_graph);
// Declaration only: takes a gl_sframe pointer and returns a gl_sframe.
// NOTE(review): the pointer parameter suggests the input frame may be
// modified in place in addition to the returned frame -- confirm.
146 gl_sframe get_prediction_probability(gl_sframe* label_propagation_output);
// Declaration only: takes two gl_sframe values and returns a gl_sframe.
// NOTE(review): presumably joins the clustered user rows with the
// per-cluster predictions; join key and join type are not visible here --
// confirm.
158 gl_sframe join_user_data_and_predictions(
159 gl_sframe gl_clustered_user_data,
160 gl_sframe clusters_with_predictions);
// Declaration only: takes two gl_sframe values and returns a gl_sframe.
// NOTE(review): presumably the path that joins user rows directly with the
// labeled centroids, without label propagation -- confirm when it is used.
171 gl_sframe join_user_data_and_kmeans_output(
172 gl_sframe gl_clustered_user_data,
173 gl_sframe gl_centroid_with_label);
// Virtual destructor with an empty body; no explicit cleanup is performed
// here.
182 virtual inline ~categorical_imputer() {}
// Declaration only: overrides a base-class virtual. Receives a map from
// option name to flexible_type value.
// NOTE(review): presumably registers/validates the user-supplied options;
// accepted keys are not visible in this header -- confirm.
191 void init_options(
const std::map<std::string, flexible_type>&_options)
override;
// Declaration only: const override of a base-class virtual returning size_t.
// NOTE(review): presumably returns CATEGORICAL_IMPUTER_VERSION -- confirm.
196 size_t get_version()
const override;
// Declaration only: overrides a base-class virtual. Receives a map from
// option name to flexible_type value.
// NOTE(review): presumably the entry point that sets up the transformer's
// state from the given options -- confirm how it relates to init_options().
212 void init_transformer(
const std::map<std::string, flexible_type>& _options)
override;
// Declaration only: overrides a base-class virtual; takes the data frame by
// value and returns nothing, so any result is kept in object state.
// NOTE(review): which members are populated by fitting is not visible in
// this header -- confirm.
219 void fit(gl_sframe data)
override;
// Declaration only: overrides a base-class virtual; takes a gl_sframe by
// value and returns a gl_sframe.
// NOTE(review): presumably returns the input with imputed-value and
// probability columns added (see the *_COLUMN_PREFIX constants) -- confirm.
231 gl_sframe
transform(gl_sframe data)
override;
240 gl_sframe fit_transform(gl_sframe data) {
257 categorical_imputer::get_default_options);
259 categorical_imputer::get_value_from_state, "key");
#define BEGIN_CLASS_MEMBER_REGISTRATION(python_facing_classname)
#define REGISTER_CLASS_MEMBER_FUNCTION(function,...)
The serialization input archive object. Given a reference to an istream, it reads from that istream to provide deserialization capabilities.
#define END_CLASS_MEMBER_REGISTRATION
#define REGISTER_NAMED_CLASS_MEMBER_FUNCTION(name, function,...)
The serialization output archive object. Given a reference to an ostream, it writes to that ostream to provide serialization capabilities.
void transform(S &&input, T &&output, TransformFn transformfn, std::set< size_t > constraint_segments=std::set< size_t >())