Turi Create  4.0
feature_binner.hpp
1 /* Copyright © 2017 Apple Inc. All rights reserved.
2  *
3  * Use of this source code is governed by a BSD-3-clause license that can
4  * be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
5  */
6 #ifndef _FEATURE_BINNER_H_
7 #define _FEATURE_BINNER_H_
8 #include <string>
9 #include <model_server/lib/toolkit_class_macros.hpp>
10 #include <toolkits/feature_engineering/transformer_base.hpp>
11 #include <toolkits/feature_engineering/topk_indexer.hpp>
12 #include <core/storage/serialization/serialization_includes.hpp>
13 #include <core/export.hpp>
14 namespace turi {
15 struct bin {
16  double left;
17  double right;
18  size_t bin_id;
19 };
20 }
21 SERIALIZABLE_POD(turi::bin);
22 
23 
24 namespace turi{
25 namespace sdk_model {
26 namespace feature_engineering {
27 
28 
29 /**
30  *
31  * This transformation 1) creates a set of named bins and a mapping from
32  * the reals to each bin, and 2) for each value returns the name of
33  * the assigned bin.
34  *
35  * int/reals: returns the name of the bin for which bin_l < x < bin_r.
36  *
37  *
38  * Bin creation options include:
39  * - quantile: the bins are defined by the quantiles of all of the values
40  * seen by the transformer
41  * - exponential: the bins are defined in a logarithmic scale...
42  * [0, 1), [1, 10), [10, 100), ..., [1e6, Inf)
43  *
44  *
45  * Options:
46  * bins = 'quantile'=> creates quantile bins?
47  * bins = 'exponential'=>
48  * bins = [0, 1, 5] => creates 5 bins: (-Inf, 0), [0, 1), [1, 5), [5,Inf)
49  */
50 class EXPORT feature_binner : public transformer_base {
51 
52  static constexpr size_t FEATURE_BINNER_VERSION = 1;
53  //VERSION 1 disabled support for array, list, and dict types
54  // VERSION 1 also enumerates bins as <column_name>_1 instead
55  // of making them the string representation of the range
56  std::map<std::string, flex_type_enum> feature_types;
57  std::vector<std::string> feature_columns;
58  flexible_type unprocessed_features; // Input provided by the user.
59  bool fitted = false;
60  bool exclude = false;
61  std::map<std::string, std::vector<bin>> bins;
62 
63  public:
64 
65  /**
66  * Methods that must be implemented in a new transformer model.
67  * -------------------------------------------------------------------------
68  */
69 
70  virtual inline ~feature_binner() {}
71 
72  /**
73  * Set one of the options in the model. Use the option manager to set
74  * these options. If the option does not satisfy the conditions that the
75  * option manager has imposed on it. Errors will be thrown.
76  *
77  * \param[in] options Options to set
78  */
79  void init_options(const std::map<std::string, flexible_type>&_options) override;
80 
81  /**
82  * Get a version for the object.
83  */
84  size_t get_version() const override;
85 
86  /**
87  * Save the object using Turi's oarc.
88  */
89  void save_impl(turi::oarchive& oarc) const override;
90 
91  /**
92  * Load the object using Turi's iarc.
93  */
94  void load_version(turi::iarchive& iarc, size_t version) override;
95 
96 
97  /**
98  * Initialize the transformer.
99  */
100  void init_transformer(const std::map<std::string,
101  flexible_type>& _options) override;
102 
103  /**
104  * Set constant.
105  *
106  * \param[in] data (SFrame of data)
107  */
108  void fit(gl_sframe data) override;
109 
110  /**
111  * Transform the given data.
112  *
113  * \param[in] data (SFrame of data)
114  *
115  * Python side interface
116  * ------------------------
117  * This function directly interfaces with "transform" in python.
118  *
119  */
120  gl_sframe transform(gl_sframe data) override;
121 
122  /**
123  * Fit and transform the given data. Intended as an optimization because
124  * fit and transform are usually always called together. The default
125  * implementaiton calls fit and then transform.
126  *
127  * \param[in] data (SFrame of data)
128  */
130  data.materialize();
131  fit(data);
132  return transform(data);
133  }
134 
135 
136  // Functions that all transformers need to register. Can be copied verbatim
137  // for other classes.
138  // --------------------------------------------------------------------------
139  BEGIN_CLASS_MEMBER_REGISTRATION("_FeatureBinner")
140  REGISTER_CLASS_MEMBER_FUNCTION(feature_binner::init_transformer, "_options");
141  REGISTER_CLASS_MEMBER_FUNCTION(feature_binner::fit, "data");
142  REGISTER_CLASS_MEMBER_FUNCTION(feature_binner::fit_transform, "data");
143  REGISTER_CLASS_MEMBER_FUNCTION(feature_binner::transform, "data");
144  REGISTER_CLASS_MEMBER_FUNCTION(feature_binner::get_current_options);
145  REGISTER_CLASS_MEMBER_FUNCTION(feature_binner::list_fields);
146  REGISTER_NAMED_CLASS_MEMBER_FUNCTION("_get_default_options",
147  feature_binner::get_default_options);
149  feature_binner::get_value_from_state,
150  "key");
152 
153 };
154 
155 
156 } // feature_engineering
157 } // sdk_model
158 } // turicreate
159 
160 #endif
#define BEGIN_CLASS_MEMBER_REGISTRATION(python_facing_classname)
#define REGISTER_CLASS_MEMBER_FUNCTION(function,...)
The serialization input archive object which, provided with a reference to an istream, will read from the istream, providing deserialization capabilities.
Definition: iarchive.hpp:60
#define END_CLASS_MEMBER_REGISTRATION
#define REGISTER_NAMED_CLASS_MEMBER_FUNCTION(name, function,...)
The serialization output archive object which, provided with a reference to an ostream, will write to the ostream, providing serialization capabilities.
Definition: oarchive.hpp:80
void transform(S &&input, T &&output, TransformFn transformfn, std::set< size_t > constraint_segments=std::set< size_t >())
Definition: algorithm.hpp:64