Turi Create  4.0
standardization-inl.hpp
1 /* Copyright © 2017 Apple Inc. All rights reserved.
2  *
3  * Use of this source code is governed by a BSD-3-clause license that can
4  * be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
5  */
6 #ifndef TURI_STANDARDIZATION_H_
7 #define TURI_STANDARDIZATION_H_
8 
9 #include <string>
10 #include <core/data/flexible_type/flexible_type.hpp>
11 
12 // Eigen
13 #include <Eigen/Core>
14 #include <Eigen/SparseCore>
15 
16 // Optimization
17 #include <ml/optimization/optimization_interface.hpp>
18 
19 // ML-Data
20 #include <toolkits/ml_data_2/ml_data.hpp>
21 #include <toolkits/ml_data_2/metadata.hpp>
22 
23 // TODO: List of todo's for this file
24 //------------------------------------------------------------------------------
25 //
26 
27 namespace turi {
28 
29 /**
30  *
31  * Interface for affine transformation of data for machine learning and
32  * optimization purposes.
33  *
34  *
35  * Background: Feature Scaling
36  * --------------------------------------------------------------------------
37  *
38  * Feature scaling performs standardization of data for supervised learning
39  * methods. Since the range of values of raw data varies widely, in some
40  * machine learning algorithms, objective functions will not work properly
41  * without normalization. Therefore, the range of all features should be
42  * normalized so that each feature contributes approximately equally.
43  *
44  * What we need for a standardization scheme.
45  * ---------------------------------------------------------------
46  *
47  * The standardization interface makes sure that you can implement various
48  * types of data standardization methods without affecting much of the code
49  * base.
50  *
51  *
52  * Each standardization scheme requires the following methods:
53  *
54  * *) Construction based on metadata: Given a complete metadata object,
55  * we can construct the standardization object.
56  *
57  * *) Transform: Perform a transformation from the original space to the
58  * standardized space.
59  *
60  * *) Inverse-Transform: Perform a transformation from the standardized space
61  * to the original space.
62  *
63  * Comparison of various methods for standardization
64  * ---------------------------------------------------------------
65  *
66  * 1) Norm-Rescaling: Given a column of data x, the norm re-scaling changes
67  * the column to:
68  * x' = x / ||x||
69  *
70  * where ||x|| can be the L1, L2, or L-Inf norm.
71  *
72  * PROS: Sparsity preserving.
73  * CONS: May not be the right thing to do for regularized problems.
74  *
75  * 2) Mean-Stdev: Given a column of data x, the mean-stdev scaling changes
76  * the column to:
77  * x' = (x - mean) / stdev
78  *
79  * PROS: Statistically well documented.
80  * CONS: Sparsity breaking
81  *
82  * 3) Min-Max: Given a column of data x, the min-max scaling changes
83  * the column to:
84  * x' = (x - min(x)) / (max(x) - min(x))
85  *
86  * PROS: Well documented for SVM.
87  * CONS: Sparsity breaking
88  *
89  * \note The important part is for us to get something that helps with
90  * numerical issues and is sparsity preserving. The interface here allows
91  * us to try many things and see what works best.
92  *
93 */
95 
96  protected:
97 
98  size_t total_size; /**< # Total size */
99 
100  public:
101 
102  /**
103  * Default destructor.
104  */
106 
107  /**
108  * Default constructor.
109  */
111 
112  // Dense Vectors
113  // --------------------------------------------------------------------------
114 
115  /**
116  * Transform a point from the original space to the standardized space.
117  *
118  * \param[in,out] point(DenseVector) Point to be transformed.
119  *
120  */
121  virtual void transform(DenseVector &point) const = 0;
122 
123 
124  /**
125  * Inverse transform a point from the standardized space to the original space.
126  *
127  * \param[in,out] point(DenseVector) Point to be transformed.
128  *
129  */
130  virtual void inverse_transform(DenseVector &point) const = 0;
131 
132 
133  // Sparse Vectors
134  // --------------------------------------------------------------------------
135 
136  /**
137  * Inverse transform a point from the standardized space to the original space.
138  *
139  * \param[in,out] point(SparseVector) Point to be transformed.
140  *
141  */
142  virtual void inverse_transform(SparseVector &point) const = 0;
143 
144  /**
145  * Transform a point from the original space to the standardized space.
146  *
147  * \param[in,out] point(SparseVector) Point to be transformed.
148  *
149  */
150  virtual void transform(SparseVector &point) const = 0;
151 
152 
153  /**
154  * Serialization -- Save object
155  *
156  * Save this class to a Turi oarc object.
157  * \param[in] oarc Turi oarc object
158  */
159  virtual void save(turi::oarchive& oarc) const = 0;
160 
161  /**
162  * Serialization -- Load object
163  *
164  * Load this class from a Turi iarc object.
165  * \param[in] iarc Turi iarc object
166  */
167  virtual void load(turi::iarchive& iarc) = 0;
168 
169 
170  /**
171  * Return the total size of all the variables in the space.
172  *
173  * \param[out] total_size Size of all the variables in the space.
174  *
175  * \note This is the sum of the sizes of the individual features that created
176  * this object. They are
177  *
178  * Numeric : 1
179  * Categorical : # Unique categories
180  * Vector : Size of the vector.
181  * CategoricalVector : # Unique categories.
182  * Dictionary : # Keys
183  *
184  * For reference encoding, subtract 1 from the Categorical and
185  * Categorical-Vector types.
186  *
187  * \return Total size of all the variables in the space.
188  *
189  */
190  size_t get_total_size() const {
191  return this->total_size;  // size stored in the protected total_size member
192  }
193 
194 
195 };
196 
197 
198 /**
199  * Rescale columns by L2-norm
200  * x >= 0
201  */
203 
204  protected:
205 
206  DenseVector scale; /**< Scale */
207  bool use_reference; /**< Reference encoding */
208 
209  public:
210 
211 
212  /**
213  * Default destructor.
214  */
215  virtual ~l2_rescaling() = default;  // no manual cleanup is performed here
216 
217  /**
218  * Default constructor.
219  *
220  * \param[in] ml_metadata Metadata object for the features; the size of
221  * each feature is read from ml_metadata->index_size(i).
222  * \param[in] _use_reference Reference encoding of categorical?
223  *
224  * \note The index_size refers to the size of each of the features. The
225  * sizes of each type of features are:
226  *
227  * Numeric : 1
228  * String : # categories
229  * List : Size
230  * Categorical Vector : Total number of categories
231  * Dictionary : # keys
232  *
233  * \note Although the metadata keeps a copy of these sizes, they may not be
234  * consistent with what was seen during training (because of new categories).
235  * Hence, you would need both the metadata for the column stats collected
236  * during training and the index_size for feature sizes captured at the
237  * end of training.
238  *
239  */
241  const std::shared_ptr<v2::ml_metadata> & ml_metadata,
242  bool _use_reference = true){
243 
244  // Make sure the size is set
245  use_reference = _use_reference;
246  total_size = 1;
247  for(size_t i = 0; i < ml_metadata->num_columns(); i++){
248  if (ml_metadata->is_categorical(i)) {
249  total_size += ml_metadata->index_size(i) - use_reference;
250  } else {
251  total_size += ml_metadata->index_size(i);
252  }
253  }
254 
255  // Init the scale
256  scale.resize(total_size);
257  scale.setZero();
258  size_t idx = 0;
259 
260  for(size_t i = 0; i < ml_metadata->num_columns(); i++){
261 
262  const auto& stats = ml_metadata->statistics(i);
263 
264  // For each column in the metadata
265  // \note: Computing the L2 norm (averaged over example)
266  // Here, we compute the scale using the variance and mean as follows:
267  //
268  // scale = sqrt(E[X^2]) = sqrt(Var(x) + E[X]^2)
269  //
270  // The stdev is the L2 norm of the data shifted by the mean. This undoes
271  // this shift. There could be a multiplication by an "N" to get the
272  // L2 norm but that multiple doesn't quite help.
273  switch(ml_metadata->column_mode(i)) {
274 
275  // Numeric
276  case v2::ml_column_mode::NUMERIC: {
277  scale(idx) = stats->mean(0) * stats->mean(0) +
278  stats->stdev(0) * stats->stdev(0);
279  idx += 1;
280  break;
281  }
282 
283  // Categorical
284  case v2::ml_column_mode::CATEGORICAL: {
285  for (size_t c = 0; c < ml_metadata->index_size(i); c++){
286  if(c >= use_reference){
287  scale(idx) = stats->mean(c) * stats->mean(c) +
288  stats->stdev(c) * stats->stdev(c);
289  idx++;
290  }
291  }
292  break;
293  }
294 
295  // Numeric vector
296  case v2::ml_column_mode::NUMERIC_VECTOR: {
297  for (size_t c = 0; c < ml_metadata->index_size(i); c++){
298  scale(idx) = stats->mean(c) * stats->mean(c) +
299  stats->stdev(c) * stats->stdev(c);
300  ++idx;
301  }
302  break;
303  }
304 
305  // Categorical vector
306  case v2::ml_column_mode::CATEGORICAL_VECTOR: {
307  for (size_t c = 0; c < ml_metadata->index_size(i); c++){
308  if(c >= use_reference){
309  scale(idx) = stats->mean(c) * stats->mean(c) +
310  stats->stdev(c) * stats->stdev(c);
311  idx++;
312  }
313  }
314  break;
315  }
316 
317  // Dictionary
318  case v2::ml_column_mode::DICTIONARY: {
319  for(size_t k = 0; k < ml_metadata->index_size(i); ++k) {
320  scale(idx) = stats->mean(k) * stats->mean(k) +
321  stats->stdev(k) * stats->stdev(k);
322  idx++;
323  }
324  break;
325  }
326 
327  case v2::ml_column_mode::UNTRANSLATED: {
328  break;
329  }
330 
331  default: {
332  std::cerr << "Unsupported ml_column_mode for L2 rescaling" << std::endl;
333  ASSERT_UNREACHABLE();
334  break;
335  }
336  } // End of column-switch-case
337  } // End-of metadata for
338 
339  scale = scale.cwiseMax(optimization::OPTIMIZATION_ZERO);
340  scale = scale.array().pow(0.5);
341  scale(total_size-1) = 1;
342 
343  } // End of constructor
344 
345  // Dense Vectors
346  // --------------------------------------------------------------------------
347 
348  /**
349  * Transform a point from the original space to the standardized space.
350  *
351  * \param[in,out] point(DenseVector) Point to be transformed.
352  *
353  */
354  void transform(DenseVector &point) const override {
355  // Sizes are compared as size_t to avoid a signed/unsigned mismatch.
356  DASSERT_EQ(static_cast<size_t>(point.size()), total_size);
357  point.array() /= scale.array();  // in place: point(i) = point(i) / scale(i)
358  }
359 
360  /**
361  * Transform every row of a matrix of points from the original space to the standardized space.
362  *
363  * \param[in,out] points(DenseMatrix) Matrix whose rows are the points to be transformed.
364  *
365  */
366  void transform(DenseMatrix &points) const {
367  DASSERT_EQ(points.cols(), total_size);
368  // Broadcast the per-column scale across all rows and divide in place,
369  // i.e. points(r, c) = points(r, c) / scale(c) for every row r.
370  points.array().rowwise() /= scale.transpose().array();
371  }
372 
373  /**
374  * Inverse transform a point from the standardized space to the original space.
375  *
376  * \param[in,out] point(DenseVector) Point to be transformed.
377  *
378  */
379  void inverse_transform(DenseVector &point) const override {
380  DASSERT_EQ(static_cast<size_t>(point.size()), total_size);  // compare as size_t
381  point.array() *= scale.array();  // in place: point(i) = point(i) * scale(i)
382  }
383 
384  // Sparse Vectors
385  // --------------------------------------------------------------------------
386 
387  /**
388  * Inverse transform a point from the standardized space to the original space.
389  *
390  * \param[in,out] point(SparseVector) Point to be transformed.
391  *
392  */
393  void inverse_transform(SparseVector &point) const override {
394  DASSERT_EQ(static_cast<size_t>(point.size()), total_size);  // compare as size_t
395  // Walk the entries exposed by the inner iterator and multiply each one
396  // by the scale of its index; no new entries are inserted.
397  for (SparseVector::InnerIterator it(point); it; ++it) {
398  it.valueRef() = it.value() * scale(it.index()); }
399  }
400 
401  /**
402  * Transform a point from the original space to the standardized space.
403  *
404  * \param[in,out] point(SparseVector) Point to be transformed.
405  *
406  */
407  void transform(SparseVector &point) const override {
408  DASSERT_EQ(static_cast<size_t>(point.size()), total_size);  // compare as size_t
409  // Divide each entry exposed by the inner iterator by the scale of its index.
410  for (SparseVector::InnerIterator it(point); it; ++it) {
411  it.valueRef() = it.value() / scale(it.index()); }
412  }
413 
414  /**
415  * Serialization -- Save object
416  *
417  * Save this class to a Turi oarc object.
418  * \param[in] oarc Turi oarc object
419  */
420  void save(turi::oarchive& oarc) const override {
421  oarc << total_size;     // write order must match the read order in load()
422  oarc << scale;
423  oarc << use_reference;
424  }
425 
426  /**
427  * Serialization -- Load object
428  *
429  * Load this class from a Turi iarc object.
430  * \param[in] iarc Turi iarc object
431  */
432  void load(turi::iarchive& iarc) override {
433  iarc >> total_size;     // read order mirrors the write order in save()
434  iarc >> scale;
435  iarc >> use_reference;
436  }
437 
438 
439 };
440 
441 
442 } // namespace turi
443 #endif
void load(turi::iarchive &iarc)
virtual void load(turi::iarchive &iarc)=0
The serialization input archive object which, provided with a reference to an istream, will read from the istream, providing deserialization capabilities.
Definition: iarchive.hpp:60
const double OPTIMIZATION_ZERO
Optimization method zero.
virtual void transform(DenseVector &point) const =0
virtual void inverse_transform(DenseVector &point) const =0
void transform(DenseMatrix &points) const
void transform(DenseVector &point) const
void save(turi::oarchive &oarc) const
virtual void save(turi::oarchive &oarc) const =0
l2_rescaling(const std::shared_ptr< v2::ml_metadata > &ml_metadata, bool _use_reference=true)
void transform(SparseVector &point) const
The serialization output archive object which, provided with a reference to an ostream, will write to the ostream, providing serialization capabilities.
Definition: oarchive.hpp:80
void inverse_transform(DenseVector &point) const
void inverse_transform(SparseVector &point) const