Turi Create  4.0
dimension_reduction.hpp
1 /* Copyright © 2017 Apple Inc. All rights reserved.
2  *
3  * Use of this source code is governed by a BSD-3-clause license that can
4  * be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
5  */
6 #ifndef TURI_DIMENSION_REDUCTION_H_
7 #define TURI_DIMENSION_REDUCTION_H_
8 #include <model_server/lib/toolkit_class_macros.hpp>
9 #include <toolkits/feature_engineering/transformer_base.hpp>
10 #include <Eigen/Core>
11 #include <core/export.hpp>
12 
13 namespace turi {
14 namespace sdk_model {
15 namespace feature_engineering {
16 
17 typedef Eigen::MatrixXd dense_matrix;  // Dynamically sized matrix of doubles.
18 typedef Eigen::Matrix<double, Eigen::Dynamic, 1> dense_vector;  // Column vector of doubles; row count set at run time.
19 
20 
21 /**
22  * Create a random projection matrix if the dimensions were already set in the
23  * constructor, in which case no data is needed.
24  *
25  * The Gaussian random projection is Y = (1 / sqrt(k)) * X * R, where:
26  * - X is the original data (n x d)
27  * - R is the projection matrix (d x k)
28  * - Y is the output data (n x k)
29  * - k is the embedding dimension, i.e. `embedding_dim`.
30  *
31  * See Achlioptas (2003) and Li, Hastie, and Church (2006) for details. Our
32  * naming convention is to call d the "ambient dimension" and k the "embedded
33  * dimension".
34  * -----------------------------------------------------------------------------
35  *
36  * The private members of a random_projection instance are:
37  *
38  * unprocessed_features:
39  * Column names before feature validation and preprocessing, particularly
40  * whether the names are to be included or excluded.
41  *
42  * feature_columns:
43  * Feature column names after validation and pre-processing. These are the
44  * actual columns that we will work with.
45  *
46  * original_dimension:
47  * Dimension of the data input to the `transform` function, as determined by
48  * data passed to the `fit` function.
49  *
50  * projection_matrix:
51  * Where the rubber meets the road. This is post-multiplied by the data to
52  * produce the output data.
53  *
54  * fitted:
55  * Indicates if the model has been fitted yet.
56  *
57  * exclude:
58  * Indicates if the `unprocessed_features` should be included or excluded.
59  * -----------------------------------------------------------------------------
60  *
61  * Several items are added to the model's state (in addition to the options
62  * defined in `init_options`) so they will be visible to the Python user. More
63  * information about these can be found in the Python documentation. The items
64  * are:
65  *
66  * - original_dimension: dimension of the input data, unpacked.
67  * - features: list of column names to project.
68  * - excluded_features: list of column names to exclude.
69  * - random_seed: seed for generating the projection matrix.
70  */
71 class EXPORT random_projection : public transformer_base {
72 
73  static constexpr size_t RANDOM_PROJECTION_VERSION = 0;  // NOTE(review): presumably the value reported by get_version() - confirm.
74 
75  flexible_type unprocessed_features;  // Column names before validation and preprocessing.
76  std::vector<std::string> feature_columns;  // Column names after validation and preprocessing.
77  std::map<std::string, flex_type_enum> feature_types;  // NOTE(review): presumably the type of each feature column, keyed by name - confirm.
78 
79  size_t original_dimension = 0;  // Dimension of the data input to `transform`; 0 until set.
80  std::shared_ptr<dense_matrix> projection_matrix;  // Matrix post-multiplied by the data to produce the output.
81 
82  bool fitted = false;  // Whether the model has been fitted yet.
83  bool exclude = false;  // Whether `unprocessed_features` are to be excluded rather than included.
84 
85  public:
86 
87  virtual inline ~random_projection() {}
88 
89  /**
90  * Define the options manager and set the initial options from `user_opts`.
91  */
92  void init_options(const std::map<std::string, flexible_type>& user_opts) override;
93 
94  /**
95  * Get the version number for a `random_projection` object.
96  */
97  size_t get_version() const override;
98 
99  /**
100  * Save a `random_projection` object using Turi's oarc (the output archive parameter is named `iarc`).
101  */
102  void save_impl(oarchive& iarc) const override;
103 
104  /**
105  * Load a `random_projection` object using Turi's iarc.
106  */
107  void load_version(iarchive & iarc, size_t version) override;
108 
109  /**
110  * Initialize the transformer. This is the primary entry point for C++ users.
111  */
112  void init_transformer(const std::map<std::string, flexible_type>& user_opts) override;
113 
114  /**
115  * Fit the random projection. There is no real logic to write here, right?
116  */
117  void fit(gl_sframe data) override;
118 
119  /**
120  * Transform data into a low-dimensional space.
121  */
122  gl_sframe transform(gl_sframe data) override;
123 
124  /**
125  * Fit and transform the given data. Intended as an optimization because
126  * fit and transform are usually called together. This implementation
127  * materializes `data`, calls fit, and then calls transform on the same data.
128  * \param data Data used to fit the model and then transformed.
129  * \return The result of calling `transform` on `data`.
130  */
132  data.materialize();
133  fit(data);
134  return transform(data);
135  }
136 
137  BEGIN_CLASS_MEMBER_REGISTRATION("_RandomProjection")
144  REGISTER_NAMED_CLASS_MEMBER_FUNCTION("_get_default_options",
148  "key");
149 
151 };
152 
153 } //namespace feature_engineering
154 } //namespace sdk_model
155 } // namespace turi
156 #endif
#define BEGIN_CLASS_MEMBER_REGISTRATION(python_facing_classname)
#define REGISTER_CLASS_MEMBER_FUNCTION(function,...)
The serialization input archive object which, provided with a reference to an istream, will read from the istream, providing deserialization capabilities.
Definition: iarchive.hpp:60
const std::map< std::string, flexible_type > & get_current_options() const
#define END_CLASS_MEMBER_REGISTRATION
const variant_type & get_value_from_state(std::string key)
std::vector< std::string > list_fields()
#define REGISTER_NAMED_CLASS_MEMBER_FUNCTION(name, function,...)
std::map< std::string, flexible_type > get_default_options() const
gl_sframe transform(gl_sframe data) override
The serialization output archive object which, provided with a reference to an ostream, will write to the ostream, providing serialization capabilities.
Definition: oarchive.hpp:80
void init_transformer(const std::map< std::string, flexible_type > &user_opts) override
void transform(S &&input, T &&output, TransformFn transformfn, std::set< size_t > constraint_segments=std::set< size_t >())
Definition: algorithm.hpp:64