Turi Create  4.0
count_featurizer.hpp
1 /* Copyright © 2017 Apple Inc. All rights reserved.
2  *
3  * Use of this source code is governed by a BSD-3-clause license that can
4  * be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
5  */
6 #ifndef TURI_FEATURE_ENGINEERING_COUNT_FEATURIZER_HPP
7 #define TURI_FEATURE_ENGINEERING_COUNT_FEATURIZER_HPP
8 #include <model_server/lib/toolkit_class_macros.hpp>
9 #include <toolkits/feature_engineering/transformer_base.hpp>
10 #include <ml/sketches/countmin.hpp>
11 #include <core/export.hpp>
12 using namespace turi;
13 
14 
15 namespace turi {
16 namespace sdk_model {
17 namespace feature_engineering {
18 
19 /**
20  * An approximate, limited memory implementation of the feature engineering
21  * mechanism by Misha Bilenko in
22  * https://blogs.technet.microsoft.com/machinelearning/2015/02/17/big-learning-made-easy-with-counts/
23  * .
24  *
25  * For some k-ary classification task assuming we are going to try to predict
26  * column Y. Then for every column, X, we replace it with 2k-1 numeric features:
27  * Assuming the value X in row i is x_i.
28  * We replace it two columns:
29  * count_X: a list of the following values
30  * - #(Y = 0 & X = x_i) the number of times Y = 0 when X has the value x_i
31  * - #(Y = 1 & X = x_i)
32  * - ...
33  * - #(Y = k-1 & X = x_i)
34  * prob_X: a list of the following values
35  * - P(Y = 0 | X = x_i) the probability Y = 0 when X has the value x_i
36  * - P(Y = 1 | X = x_i)
37  * - ...
38  * - P(Y = k-2 | X = x_i)
39  *
40  * This procedure is generally quite memory intensive, requiring the
41  * count table #(Y=y, X=x) to be built up for every column, requiring
42  * O(k * N(X)) memory per column where N(X) is the number of unique values of X.
43  *
44  * Instead, we will approximate the count table this way:
45  * For each input column X,
46  * - We maintain k count tables. i.e. one count for each Y value, where
47  * each count table is represented by a count-min-sketch.
48  * http://www.cs.rutgers.edu/~muthu/cmz-sdm.pdf provides some analysis on the
49  * use of the count-min sketch and empirical validation as to why the
50  * count-min sketch is preferred over the count sketch in the power-law
51  * distribution setting.
52  *
53  * This provides counts which are inherently upper bounds on the actual
54  * counts. Therefore, estimating
55  * P(Y = 0 | X = x_i) with #(Y = 0 & X = x_i) / #(X = x_i)
56  * is problematic since if #(X = x_i) is estimated indepedendently,
57  * the probabilities may not sum to 1.
58  * Instead #(X = x_i) should be estimated with \sum_y #(Y = y & X = x_i)
59  */
60 class EXPORT count_featurizer : public transformer_base {
61  public:
62  static constexpr size_t count_featurizer_VERSION = 0;
63  static constexpr char COUNTS_PREFIX[] = "count_";
64  static constexpr char PROBABILITY_PREFIX[] = "prob_";
65 
66  count_featurizer() = default;
67 
68  // serializers
69  size_t get_version() const override;
70  void save_impl(oarchive& oarc) const override;
71  void load_version(iarchive& iarc, size_t version) override;
72 
73  // transformer
74  void init_options(const std::map<std::string,
75  flexible_type>& _options) override;
76  void init_transformer(const std::map<std::string,
77  flexible_type>& _options) override;
78  void fit(gl_sframe data) override;
79  gl_sframe transform(gl_sframe data) override;
80  gl_sframe fit_transform(gl_sframe data);
81 
82  // collect the state in a single shared pointer so that this can be
83  // shared across to the lazy apply.
84  struct transform_state {
85  // random seed
86  size_t seed = 0;
87  double laplace_smearing = 0.0;
88  std::string count_column_prefix;
89  std::string prob_column_prefix;
90  std::vector<flexible_type> y_values;
91  // counters[i][colnumber] contains the sketch for column colnumber
92  // and y value y_values[i].
93  std::vector<
94  std::vector<sketches::countmin<flexible_type>>
95  > counters;
96  };
97 
98  BEGIN_CLASS_MEMBER_REGISTRATION("_CountFeaturizer")
102  REGISTER_CLASS_MEMBER_FUNCTION(count_featurizer::fit_transform, "data")
105  REGISTER_NAMED_CLASS_MEMBER_FUNCTION("_get_default_options",
109  "key");
110 
112  private:
113  std::vector<std::string> feature_columns;
114  flexible_type unprocessed_features;
115  bool exclude = false;
116  bool fitted = false;
117 
118  turi::mutex m_lock;
119 
120  std::shared_ptr<transform_state> m_state;
121 };
122 
123 } // namespace feature_engineering
124 } // namespace sdk_model
125 } // namespace turi
126 #endif
#define BEGIN_CLASS_MEMBER_REGISTRATION(python_facing_classname)
#define REGISTER_CLASS_MEMBER_FUNCTION(function,...)
The serialization input archive object which, provided with a reference to an istream, will read from the istream, providing deserialization capabilities.
Definition: iarchive.hpp:60
const std::map< std::string, flexible_type > & get_current_options() const
#define END_CLASS_MEMBER_REGISTRATION
gl_sframe transform(gl_sframe data) override
const variant_type & get_value_from_state(std::string key)
std::vector< std::string > list_fields()
#define REGISTER_NAMED_CLASS_MEMBER_FUNCTION(name, function,...)
std::map< std::string, flexible_type > get_default_options() const
The serialization output archive object which, provided with a reference to an ostream, will write to the ostream, providing serialization capabilities.
Definition: oarchive.hpp:80
void transform(S &&input, T &&output, TransformFn transformfn, std::set< size_t > constraint_segments=std::set< size_t >())
Definition: algorithm.hpp:64
void init_transformer(const std::map< std::string, flexible_type > &_options) override