Turi Create  4.0
row_reference.hpp
1 /* Copyright © 2017 Apple Inc. All rights reserved.
2  *
3  * Use of this source code is governed by a BSD-3-clause license that can
4  * be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
5  */
6 #ifndef TURI_ML_DATA_ROW_REFERENCE_H_
7 #define TURI_ML_DATA_ROW_REFERENCE_H_
8 
9 #include <core/logging/assertions.hpp>
10 #include <toolkits/ml_data_2/data_storage/ml_data_row_translation.hpp>
11 #include <toolkits/ml_data_2/data_storage/ml_data_block_manager.hpp>
12 #include <toolkits/ml_data_2/ml_data.hpp>
13 #include <toolkits/ml_data_2/side_features.hpp>
14 #include <core/util/code_optimization.hpp>
15 
16 #include <Eigen/SparseCore>
17 #include <Eigen/Core>
18 
19 #include <array>
20 
21 namespace turi { namespace v2 {
22 
23 typedef Eigen::Matrix<double, Eigen::Dynamic,1> DenseVector;
24 typedef Eigen::SparseVector<double> SparseVector;
25 
26 /**
27  * A class containing a reference to the row of an ml_data instance.
28  * The row can then be used to fill any sort of data row that an
29  * iterator can be used to fill.
30  *
31  * In other words,
32  *
33  * it.fill_observation(x);
34  *
35  * Can be replaced with
36  *
37  * auto row_ref = it.get_reference();
38  *
39  * // do stuff ...
40  * row_ref.fill(x);
41  *
42  * The data block pointed to by this reference is kept alive as long
43  * as this reference class exists.
44  *
45  *
46  * Another example of how it is used is below:
47  *
48  * sframe X = make_integer_testing_sframe( {"C1", "C2"}, { {0, 0}, {1, 1}, {2, 2}, {3, 3}, {4, 4} } );
49  *
50  * v2::ml_data data;
51  *
52  * data.fill(X);
53  *
54  * // Get row references
55  *
56  * std::vector<v2::ml_data_row_reference> rows(data.num_rows());
57  *
58  * for(auto it = data.get_iterator(); !it.done(); ++it) {
59  * rows[it.row_index()] = it.get_reference();
60  * }
61  *
62  * // Now go through and make sure that each of these hold the
63  * // correct answers.
64  *
65  * std::vector<v2::ml_data_entry> x;
66  *
67  * for(size_t i = 0; i < rows.size(); ++i) {
68  *
69  * // The metadata for the row is the same as that in the data.
70  * ASSERT_TRUE(rows[i].metadata().get() == data.metadata().get());
71  *
72  * rows[i].fill(x);
73  *
74  * ASSERT_EQ(x.size(), 2);
75  *
76  * ASSERT_EQ(x[0].column_index, 0);
77  * ASSERT_EQ(x[0].index, 0);
78  * ASSERT_EQ(x[0].value, i);
79  *
80  * ASSERT_EQ(x[1].column_index, 1);
81  * ASSERT_EQ(x[1].index, 0);
82  * ASSERT_EQ(x[1].value, i);
83  * }
84  * }
85  *
86  */
88  public:
89 
90  /**
91  * Fill an observation vector, represented as an ml_data_entry
92  * struct. (column_index, index, value) pairs, from this row
93  * reference. For each column:
94  *
95  * Categorical: Returns (col_id, v, 1)
96  * Numeric : Returns (col_id, 0, v)
97  * Vector : Returns (col_id, i, v) for each (i,v) in vector.
98  *
99  * Example use is given by the following code:
100  *
101  * std::vector<ml_data_entry> x;
102  *
103  * row_ref.fill(x);
104  * double y = row_ref.target_value();
105  * ...
106  */
107  template <typename Entry>
109  inline void fill(std::vector<Entry>& x) const {
110 
111  x.clear();
112 
113  if(!data_block->metadata->has_translated_columns())
114  return;
115 
116  ml_data_internal::copy_raw_into_ml_data_entry_row(
117  x, data_block->rm, current_data_iter(),
118  side_features);
119  }
120 
121  /**
122  * Fill an observation vector with the untranslated columns, if any
123  * have been specified at setup time. These columns are simply
124  * mapped back to their sarray counterparts.
125  */
126  inline void fill_untranslated_values(std::vector<flexible_type>& x) const GL_HOT_INLINE_FLATTEN {
127 
128  if(!data_block->metadata->has_untranslated_columns()) {
129  x.clear();
130  return;
131  }
132 
133  x.resize(data_block->untranslated_columns.size());
134 
135  for(size_t i = 0; i < data_block->untranslated_columns.size(); ++i) {
136  x[i] = data_block->untranslated_columns[i][current_in_block_index];
137  }
138 
139  DASSERT_TRUE(x.size() >= 1);
140  }
141 
142 
143  /**
144  * Fill an observation vector, represented as an Eigen Sparse Vector, from
145  * the current location in the iteration.
146  *
147  * \note A reference category is used in this version of the function.
148  * \note For performance reasons, this function does not check for new
149  * categories during predict time. That must be checked externally.
150  *
151  * This function returns a flattened version of the vector provided by the
152  * std::pair version of fill.
153  *
154  * Example
155  * ---------------------------------------------
156  *
157  * \warning This only works when the SFrame is "mapped" to integer keys.
158  *
159  * For a dataset with a 3 column SFrame
160  *
161  * Row 1: 1.0 0(categorical) <9.1, 2.4>
162  * Row 2: 2.0 1(categorical) <1.0, 4.5>
163  *
164  * with index = {1,2,2}
165  *
166  * the SparseVector format would return
167  *
168  * Row 1: < (0, 1.0), (1, 1) ,(3, 9.1) ,(4, 2.4)>
169  * Row 2: < (0, 2.0), (2, 1) ,(3, 1.0) ,(4, 4.5)>
170  *
171  * \note The '0'th category is used as reference.
172  *
173  * \param[in,out] x Data containing everything!
174  *
175  */
176  inline void fill(SparseVector& x) const GL_HOT_INLINE_FLATTEN {
177 
178  x.setZero();
179 
180  if(!data_block->metadata->has_translated_columns())
181  return;
182 
183  ml_data_internal::copy_raw_into_eigen_array(
184  x,
185  data_block->rm, current_data_iter(),
186  side_features,
187  use_reference_encoding);
188  }
189 
190  /**
191  * Fill an observation vector, represented as an Eigen Dense Vector, from
192  * the current location in the iteration.
193  *
194  * \note The 0th category is used as a reference category.
195  *
196  * \note For performance reasons, this function does not check for new
197  * categories during predict time. That must be checked externally.
198  *
199  * This function returns a flattened version of the vector provided by the
200  * std::pair version of fill.
201  *
202  * Example
203  * ---------------------------------------------
204  *
205  * \warning This only works when the SFrame is "mapped" to integer keys.
206  *
207  * For a dataset with a 3 column SFrame
208  *
209  * Row 1: 1.0 0(categorical) <9.1, 2.4>
210  * Row 2: 2.0 1(categorical) <1.0, 4.5>
211  *
212  * with index = {1,2,2}
213  *
214  * the DenseVector format would return
215  *
216  * Row 1: <1.0, 0, 1, 9.1, 2.4>
217  * Row 2: <2.0, 1, 0, 1.0, 4.5>
218  *
219  * \param[in,out] x Data containing everything!
220  *
221  */
222  inline void fill(DenseVector& x) const GL_HOT_INLINE_FLATTEN {
223 
224  x.setZero();
225 
226  if(!data_block->metadata->has_translated_columns())
227  return;
228 
229  ml_data_internal::copy_raw_into_eigen_array(
230  x,
231  data_block->rm, current_data_iter(),
232  side_features,
233  use_reference_encoding);
234  }
235 
236  /**
237  * Fill a row of an Eigen Dense Vector, from
238  * the current location in the iteration.
239  *
240  * \note The 0th category is used as a reference category.
241  *
242  *
243  * Example:
244  *
245  * Eigen::MatrixXd X;
246  *
247  * ...
248  *
249  * it.fill_eigen_row(X.row(row_idx));
250  *
251  * ---------------------------------------------
252  *
253  * \param[in,out] x An eigen row expression.
254  *
255  */
256  template <typename DenseRowXpr>
258  inline void fill_eigen_row(DenseRowXpr&& x) const {
259 
260  x.setZero();
261 
262  ml_data_internal::copy_raw_into_eigen_array(
263  x,
264  data_block->rm, current_data_iter(),
265  side_features,
266  use_reference_encoding);
267  }
268 
269  /** Returns the current target value, if present, or 1 if not
270  * present. If the target column is supposed to be a categorical
271  * value, then use categorical_target_index().
272  */
274  return get_target_value(data_block->rm, current_data_iter());
275  }
276 
277  /** Returns the current categorical target index, if present, or 0
278  * if not present.
279  */
281  return get_target_index(data_block->rm, current_data_iter());
282  }
283 
284  /** Returns a pointer to the metadata class that describes the data
285  * that this row reference refers to.
286  */
287  const std::shared_ptr<ml_metadata>& metadata() const {
288  return data_block->metadata;
289  }
290 
291 
292  private:
293  friend class ml_data_iterator_base;
294 
295  std::shared_ptr<ml_data_internal::ml_data_block> data_block;
296  std::shared_ptr<ml_data_side_features> side_features;
297  size_t current_in_block_index = size_t(-1);
298  bool use_reference_encoding = false;
299 
300  /** Return a pointer to the current location in the data.
301  */
302  inline ml_data_internal::entry_value_iterator current_data_iter() const GL_HOT_INLINE_FLATTEN {
303 
304  DASSERT_LT(current_in_block_index, data_block->translated_rows.entry_data.size());
305 
306  return &(data_block->translated_rows.entry_data[current_in_block_index]);
307  }
308 
309 };
310 
311 
312 }}
313 
314 #endif /* TURI_ML_DATA_ROW_REFERENCE_H_ */
const std::shared_ptr< ml_metadata > & metadata() const
void fill(SparseVector &x) const GL_HOT_INLINE_FLATTEN
double target_value() const GL_HOT_INLINE_FLATTEN
GL_HOT_INLINE_FLATTEN void fill_eigen_row(DenseRowXpr &&x) const
size_t target_index() const GL_HOT_INLINE_FLATTEN
#define GL_HOT_INLINE
GL_HOT_INLINE void fill(std::vector< Entry > &x) const
#define GL_HOT_INLINE_FLATTEN
void fill_untranslated_values(std::vector< flexible_type > &x) const GL_HOT_INLINE_FLATTEN
#define DASSERT_TRUE(cond)
Definition: assertions.hpp:364
void fill(DenseVector &x) const GL_HOT_INLINE_FLATTEN