// Turi Create 4.0 — standardization-inl.hpp
1 /* Copyright © 2017 Apple Inc. All rights reserved.
2  *
3  * Use of this source code is governed by a BSD-3-clause license that can
4  * be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
5  */
6 #ifndef TURI_STANDARDIZATION_H_
7 #define TURI_STANDARDIZATION_H_
8 
9 #include <string>
10 #include <core/data/flexible_type/flexible_type.hpp>
11 
12 // Eigen
13 #include <Eigen/Core>
14 #include <Eigen/SparseCore>
15 
// Optimization
17 #include <ml/optimization/optimization_interface.hpp>
18 
19 // ML-Data
20 #include <ml/ml_data/ml_data.hpp>
21 #include <ml/ml_data/metadata.hpp>
22 
23 // TODO: List of todo's for this file
24 //------------------------------------------------------------------------------
25 //
26 
27 namespace turi {
28 namespace supervised {
29 
30 /**
31  *
32  * Interface for affine transformation of data for machine learning and
33  * optimization purposes.
34  *
35  *
36  * Background: Feature Scaling
37  * --------------------------------------------------------------------------
38  *
39  * Feature scaling performs standardization of data for supervised learning
40  * methods. Since the range of values of raw data varies widely, in some
41  * machine learning algorithms, objective functions will not work properly
42  * without normalization. Therefore, the range of all features should be
43  * normalized so that each feature contributes approximately equally.
44  *
45  * What we need for a standardization scheme.
46  * ---------------------------------------------------------------
47  *
48  * The standardization interface makes sure that you can implement various
 * types of data standardization methods without affecting much of the code
50  * base.
51  *
52  *
53  * Each standardization scheme requires the following methods:
54  *
55  * *) Construction based on metadata: Given a complete metadata object,
56  * we can construct the standardization object.
57  *
58  * *) Transform: Perform a transformation from the original space to the
59  * standardized space.
60  *
61  * *) Inverse-Transform: Perform a transformation from the standardized space
62  * to the original space.
63  *
64  * Comparison of various methods for standardization
65  * ---------------------------------------------------------------
66  *
67  * 1) Norm-Rescaling: Given a column of data x, the norm re-scaling changes
68  * the column to:
69  * x' = x / ||x||
70  *
71  * where ||x|| can be the L1, L2, or L-Inf norm.
72  *
73  * PROS: Sparsity preserving.
74  * CONS: May not be the right thing to do for regularized problems.
75  *
76  * 2) Mean-Stdev: Given a column of data x, the norm re-scaling changes
77  * the column to:
78  * x' = (x - mean) / stdev
79  *
80  * PROS: Statistically well documented.
81  * CONS: Sparsity breaking
82  *
83  * 3) Min-Max: Given a column of data x, the norm re-scaling changes
84  * the column to:
85  * x' = (x - min(x)) / (max(x) - min(x))
86  *
87  * PROS: Well documented for SVM.
88  * CONS: Sparsity breaking
89  *
90  * \note The important part is for us to get something that helps with
91  * numerical issues and is sparsity preserving. The interface here allows
92  * us to try many things and see what works best.
93  *
94 */
96 
97  protected:
98 
99  size_t total_size; /**< # Total size */
100 
101  public:
102 
103  /**
104  * Default destructor.
105  */
106  virtual ~standardization_interface() = default;
107 
108  /**
109  * Default constructor.
110  */
111  standardization_interface() = default;
112 
113  // Dense Vectors
114  // --------------------------------------------------------------------------
115 
116  /**
117  * Transform a point from the original space to the standardized space.
118  *
119  * \param[in,out] point(DenseVector) Point to be transformed.
120  *
121  */
122  virtual void transform(DenseVector &point) const = 0;
123 
124 
125  /**
126  * Inverse transform a point from the standardized space to the original space.
127  *
128  * \param[in,out] point(DenseVector) Point to be transformed.
129  *
130  */
131  virtual void inverse_transform(DenseVector &point) const = 0;
132 
133 
134  // Sparse Vectors
135  // --------------------------------------------------------------------------
136 
137  /**
138  * Inverse transform a point from the standardized space to the original space.
139  *
140  * \param[in,out] point(SparseVector) Point to be transformed.
141  *
142  */
143  virtual void inverse_transform(SparseVector &point) const = 0;
144 
145  /**
146  * Transform a point from the original space to the standardized space.
147  *
148  * \param[in,out] point(SparseVector) Point to be transformed.
149  *
150  */
151  virtual void transform(SparseVector &point) const = 0;
152 
153 
154  /**
155  * Serialization -- Save object
156  *
157  * Save this class to a Turi oarc object.
158  * \param[in] oarc Turi oarc object
159  */
160  virtual void save(turi::oarchive& oarc) const = 0;
161 
162  /**
163  * Serialization -- Load object
164  *
165  * Load this class from a Turi iarc object.
166  * \param[in] iarc Turi iarc object
167  */
168  virtual void load(turi::iarchive& iarc) = 0;
169 
170 
171  /**
172  * Return the total size of all the variables in the space.
173  *
174  * \param[out] total_size Size of all the variables in the space.
175  *
176  * \note This is the sum of the sizes of the individual features that created
177  * this object. They are
178  *
179  * Numeric : 1
180  * Categorical : # Unique categories
181  * Vector : Size of the vector.
182  * CategoricalVector : # Unique categories.
183  * Dictionary : # Keys
184  *
185  * For reference encoding, subtract 1 from the Categorical and
186  * Categorical-Vector types.
187  *
188  * \return Column size.
189  *
190  */
191  size_t get_total_size() const {
192  return total_size;
193  }
194 
195 
196 };
197 
198 
199 /**
200  * Rescale columns by L2-norm
201  * x >= 0
202  */
204 
205  protected:
206 
207  DenseVector scale; /**< Scale */
208  bool use_reference; /**< Reference encoding */
209 
210  public:
211 
212 
213  /**
214  * Default destructor.
215  */
216  virtual ~l2_rescaling() {}
217 
218  /**
219  * Default constructor.
220  *
221  * \param[in] metadata Metadata object for the features.
222  * \param[in] index_size Sizes of each of the features.
223  * \param[in] use_reference Reference encoding of categorical?
224  *
225  * \note The index_size refers to the size of each of the features. The
226  * sizes of each type of features are:
227  *
228  * Numeric : 1
229  * String : # categories
230  * List : Size
231  * Categorical Vector : Total number of categories
232  * Dictionary : # keys
233  *
234  * \note Although the metadata keeps a copy of these sizes, they may not be
235  * consistent with what was seen during training (because of new categories).
236  * Hence, you would need both the metadata for the column stats collected
237  * during training and the index_size for feature sizes captured at the
238  * end of training.
239  *
240  */
242  const std::shared_ptr<ml_metadata> & ml_mdata,
243  bool _use_reference = true) {
244 
245  // Make sure the size is set
246  use_reference = _use_reference;
247  total_size = 1;
248  for(size_t i = 0; i < ml_mdata->num_columns(); i++){
249  if (ml_mdata->is_categorical(i)) {
250  total_size += ml_mdata->index_size(i) - use_reference;
251  } else {
252  total_size += ml_mdata->index_size(i);
253  }
254  }
255 
256  // Init the scale
257  scale.resize(total_size);
258  scale.setZero();
259  size_t idx = 0;
260 
261  for(size_t i = 0; i < ml_mdata->num_columns(); i++) {
262 
263  const auto& stats = ml_mdata->statistics(i);
264  bool skip_first = (use_reference && ml_mdata->is_categorical(i));
265 
266  // For each column in the metadata
267  // \note: Computing the L2 norm (averaged over example)
268  // Here, we compute the scale using the variance and means are follows:
269  //
270  // scale = sqrt(E[X^2]) = sqrt(Var(x) + E[X]^2)
271  //
272  // The stdev is the L2 norm of the data shifted by the mean. This undoes
273  // this shift. There could be an multiplication by an "N" to get the
274  // L2 norm but that multiple doesn't quite help.
275 
276  for (size_t k = skip_first ? 1 : 0; k < ml_mdata->index_size(i); ++k) {
277  double r = std::pow(stats->mean(k), 2) + std::pow(stats->stdev(k), 2);
278  scale(idx) = std::sqrt(std::max(r, optimization::OPTIMIZATION_ZERO));
279  ++idx;
280  }
281  }
282  scale(total_size-1) = 1;
283 
284  } // End of constructor
285 
286  // Dense Vectors
287  // --------------------------------------------------------------------------
288 
289  /**
290  * Transform a point from the original space to the standardized space.
291  *
292  * \param[in,out] point(DenseVector) Point to be transformed.
293  *
294  */
295  void transform(DenseVector &point) const {
296  DASSERT_EQ(point.size(), total_size);
297  point = point.cwiseQuotient(scale);
298 
299  }
300 
301  /**
302  * Transform a row of points from the original space to the standardized space.
303  *
304  * \param[in,out] point(DenseVector) Point to be transformed.
305  *
306  */
307  void transform(DenseMatrix &points) const {
308  DASSERT_EQ(points.cols(), total_size);
309  for (size_t i = 0; i < size_t(points.rows()); i++) {
310  points.row(i) = points.row(i).cwiseQuotient(scale.transpose());
311  }
312  }
313 
314  /**
315  * Inverse transform a point from the standardized space to the original space.
316  *
317  * \param[in,out] point(DenseVector) Point to be transformed.
318  *
319  */
320  void inverse_transform(DenseVector &point) const {
321  DASSERT_EQ(point.size(), total_size);
322  point = point.cwiseProduct(scale);
323  }
324 
325  // Sparse Vectors
326  // --------------------------------------------------------------------------
327 
328  /**
329  * Inverse transform a point from the standardized space to the original space.
330  *
331  * \param[in,out] point(SparseVector) Point to be transformed.
332  *
333  */
334  void inverse_transform(SparseVector &point) const {
335  DASSERT_EQ(point.size(), total_size);
336  for (SparseVector::InnerIterator i(point); i; ++i){
337  i.valueRef() = i.value() * scale(i.index());
338  }
339 
340  }
341 
342  /**
343  * Transform a point from the original space to the standardized space.
344  *
345  * \param[in,out] point(SparseVector) Point to be transformed.
346  *
347  */
348  void transform(SparseVector &point) const {
349  DASSERT_EQ(point.size(), total_size);
350  for (SparseVector::InnerIterator i(point); i; ++i){
351  i.valueRef() = i.value() / scale(i.index());
352  }
353  }
354 
355  /**
356  * Serialization -- Save object
357  *
358  * Save this class to a Turi oarc object.
359  * \param[in] oarc Turi oarc object
360  */
361  void save(turi::oarchive& oarc) const{
362  oarc << total_size
363  << scale
364  << use_reference;
365  }
366 
367  /**
368  * Serialization -- Load object
369  *
370  * Load this class from a Turi iarc object.
371  * \param[in] iarc Turi iarc object
372  */
373  void load(turi::iarchive& iarc){
374  iarc >> total_size
375  >> scale
376  >> use_reference;
377  }
378 
379 
380 };
381 
382 
383 } // supervised
384 } // turicreate
385 #endif
// NOTE(review): The lines that followed `#endif` here were Doxygen
// cross-reference residue from an HTML extraction (duplicated member
// signatures of this header). They are not part of the compilable source.
// The only informative entries are preserved below:
//   - turi::iarchive: serialization input archive which, given a reference to
//     an istream, reads from it, providing deserialization (iarchive.hpp:60).
//   - turi::oarchive: serialization output archive which, given a reference to
//     an ostream, writes to it, providing serialization (oarchive.hpp:80).
//   - optimization::OPTIMIZATION_ZERO: the optimization method's zero
//     tolerance, used above to clamp the scale away from zero.
//   - l2_rescaling(const std::shared_ptr<ml_metadata>& ml_mdata,
//                  bool _use_reference = true): constructor signature.