Turi Create  4.0
ml_data_iterator_base.hpp
1 /* Copyright © 2017 Apple Inc. All rights reserved.
2  *
3  * Use of this source code is governed by a BSD-3-clause license that can
4  * be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
5  */
6 #ifndef TURI_ML2_DATA_ITERATOR_BASE_H_
7 #define TURI_ML2_DATA_ITERATOR_BASE_H_
8 
9 #include <core/logging/assertions.hpp>
10 #include <toolkits/ml_data_2/data_storage/ml_data_row_translation.hpp>
11 #include <toolkits/ml_data_2/data_storage/ml_data_block_manager.hpp>
12 #include <toolkits/ml_data_2/ml_data.hpp>
13 #include <toolkits/ml_data_2/side_features.hpp>
14 #include <toolkits/ml_data_2/iterators/composite_row_type.hpp>
15 #include <toolkits/ml_data_2/iterators/row_reference.hpp>
16 #include <core/util/code_optimization.hpp>
17 
18 // SArray and Flex type
19 #include <core/storage/sframe_data/sarray.hpp>
20 
21 #include <Eigen/SparseCore>
22 #include <Eigen/Core>
23 
24 #include <array>
25 
26 namespace turi { namespace v2 {
27 
28 class ml_data;
29 
/// Dense column vector of doubles whose number of rows is set at run time.
30 typedef Eigen::Matrix<double, Eigen::Dynamic,1> DenseVector;
/// Sparse vector of doubles.
31 typedef Eigen::SparseVector<double> SparseVector;
32 
33 /**
34  * Just a simple iterator on the ml_data class. It's just a
35  * convenience structure that keeps track of everything relevant for
36  * the toolkits.
37  */
39  private:
40 
41  // To be initialized only from the get_iterator() method of ml_data.
42  friend class ml_data;
43 
44  /**
45  * Default method of constructing the data.
46  *
47  * \param[in] ml_init ML Data iterator initializer.
48  */
49  void setup(const ml_data& _data,
51  size_t thread_idx, size_t num_threads,
52  const std::map<std::string, flexible_type>& options);
53 
54  protected:
55 
/// Virtual hook with an empty default body; it receives the options map.
/// NOTE(review): presumably called during setup() so that derived iterators
/// can do their own initialization -- confirm against the implementation.
56  virtual void internal_setup(const std::map<std::string, flexible_type>& options) {}
57 
58  public:
59 
60  /** Yup, need this.
61  */
63 
64  public:
65 
69 
/// Copy assignment is deleted; move assignment is compiler-generated.
70  ml_data_iterator_base& operator=(const ml_data_iterator_base&) = delete;
71  ml_data_iterator_base& operator=(ml_data_iterator_base&&) = default;
72 
73  /// Resets the iterator to the start of the sframes in ml_data.
/// Virtual and only declared here; no definition is visible in this listing.
74  virtual void reset();
75 
76  /// Returns true if the iteration is done, false otherwise.
/// The iteration is done once current_row_index equals iter_row_index_end.
77  virtual inline bool done() const { return current_row_index == iter_row_index_end; }
78 
79  /// Returns the current index of the sframe row, respecting all
80  /// slicing operations on the original ml_data.
/// Computed as current_row_index - global_row_start.
81  inline size_t row_index() const { return current_row_index - global_row_start; }
82 
83  /// Returns the absolute row index
/// (current_row_index as stored, without subtracting global_row_start).
84  inline size_t unsliced_row_index() const { return current_row_index; }
85 
86  /**
87  * Fill an observation vector, represented as an ml_data_entry struct.
88  * (column_index, index, value) pairs, from the current location in the
89  * iteration. For each column:
90  *
91  * Categorical: Returns (col_id, v, 1)
92  * Numeric : Returns (col_id, 0, v)
93  * Vector : Returns (col_id, i, v) for each (i,v) in vector.
94  *
95  * Example use is given by the following code:
96  *
97  * std::vector<ml_data_entry> x;
98  *
99  * for(ml_data_iterator it(data); !it.done(); ++it) {
100  * it.fill_observation(x);
101  * double y = it.target_value();
102  * ...
103  * }
104  */
105  template <typename Entry>
107  inline void fill_observation(std::vector<Entry>& x) const {
108 
109 
// Make sure x can hold max_row_size entries; reserve() is only called
// when the current capacity is smaller than that.
110  if(UNLIKELY(x.capacity() < max_row_size)) {
111  x.reserve(max_row_size);
112  }
113 
// Any previous contents of x are discarded before filling.
114  x.clear();
115 
// Without translated columns there is nothing to copy; x is left empty.
116  if(!has_translated_columns)
117  return;
118 
// Fill x from the current position in the block (current_data_iter()),
// passing along the row metadata `rm` and the side_features pointer.
119  ml_data_internal::copy_raw_into_ml_data_entry_row(
120  x, rm, current_data_iter(),
121  side_features);
122 
// Assertion: the filled row must not be larger than data->max_row_size().
123  DASSERT_LE(x.size(), data->max_row_size());
124  }
125 
126  /**
127  * Fill an observation vector with the untranslated columns, if any
128  * have been specified at setup time. These columns are simply
129  * mapped back to their sarray counterparts.
130  *
131  * The metadata surrounding the original column indices are
132  */
133  inline void fill_untranslated_values(std::vector<flexible_type>& x) const GL_HOT_INLINE_FLATTEN {
134 
// No untranslated columns were set up: hand back an empty vector.
135  if(!has_untranslated_columns) {
136  x.clear();
137  return;
138  }
139 
// One output slot per untranslated column in the current block.
140  x.resize(data_block->untranslated_columns.size());
141 
// NOTE(review): this listing jumps from line 140 to line 143, so the statement
// that defines the `row_index` used below is not visible here. Presumably it is
// a local holding the in-block row index (current_block_row_index()) -- confirm
// against the real header.
143 
// Copy the value at `row_index` out of each untranslated column.
144  for(size_t i = 0; i < data_block->untranslated_columns.size(); ++i) {
145  x[i] = data_block->untranslated_columns[i][row_index];
146  }
147 
// Assertion: reaching this point implies at least one untranslated column.
148  DASSERT_TRUE(x.size() >= 1);
149  }
150 
151  /**
152  * Fill an observation vector, represented as an Eigen Sparse Vector, from
153  * the current location in the iteration.
154  *
155  * \note A reference category is used in this version of the function.
156  * \note For performance reasons, this function does not check for new
157  * categories during predict time. That must be checked externally.
158  *
159  * This function returns a flattened version of the vector provided by the
160  * std::vector<Entry> version of fill_observation.
161  *
162  * Example
163  * ---------------------------------------------
164  *
165  * \warning This only works when the SFrame is "mapped" to integer keys.
166  *
167  * For a dataset with a 3 column SFrame
168  *
169  * Row 1: 1.0 0(categorical) <9.1, 2.4>
170  * Row 2: 2.0 1(categorical) <1.0, 4.5>
171  *
172  * with index = {1,2,2}
173  *
174  * the SparseVector format would return
175  *
176  * Row 1: < (0, 1.0), (1, 1) ,(3, 9.1) ,(4, 2.4)>
177  * Row 2: < (0, 2.0), (2, 1) ,(3, 1.0) ,(4, 4.5)>
178  *
179  * \note The '0'th category is used as reference.
180  *
181  * \param[in,out] x Data containing everything!
182  *
183  */
184  inline void fill_observation(SparseVector& x) const GL_HOT_INLINE_FLATTEN {
185 
186 
// Reset x to all zeros before filling; the caller's previous values are dropped.
187  x.setZero();
188 
// Without translated columns there is nothing to copy; x stays all zero.
189  if(!has_translated_columns)
190  return;
191 
// Fill x from the current position in the block (current_data_iter()),
// passing the row metadata `rm`, side_features and the
// use_reference_encoding flag through to the copy routine.
192  ml_data_internal::copy_raw_into_eigen_array(
193  x,
194  rm, current_data_iter(),
195  side_features,
196  use_reference_encoding);
197  }
198 
199 
200  /**
201  * Fill an observation vector, represented as an Eigen Dense Vector, from
202  * the current location in the iteration.
203  *
204  * \note The 0th category is used as a reference category.
205  *
206  * \note For performance reasons, this function does not check for new
207  * categories during predict time. That must be checked externally.
208  *
209  * This function returns a flattened version of the vector provided by the
210  * std::vector<Entry> version of fill_observation.
211  *
212  * Example
213  * ---------------------------------------------
214  *
215  * \warning This only works when the SFrame is "mapped" to integer keys.
216  *
217  * For a dataset with a 3 column SFrame
218  *
219  * Row 1: 1.0 0(categorical) <9.1, 2.4>
220  * Row 2: 2.0 1(categorical) <1.0, 4.5>
221  *
222  * with index = {1,2,2}
223  *
224  * the DenseVector format would return
225  *
226  * Row 1: <1.0, 0, 1, 9.1, 2.4>
227  * Row 2: <2.0, 1, 0, 1.0, 4.5>
228  *
229  * \param[in,out] x Data containing everything!
230  *
231  */
232  inline void fill_observation(DenseVector& x) const GL_HOT_INLINE_FLATTEN {
233 
// Zero every coefficient of x; its size is not changed here, so the caller
// must have sized it beforehand.
234  x.setZero();
235 
// Without translated columns there is nothing to copy; x stays all zero.
236  if(!has_translated_columns)
237  return;
238 
// Fill x from the current position in the block (current_data_iter()),
// passing the row metadata `rm`, side_features and the
// use_reference_encoding flag through to the copy routine.
239  ml_data_internal::copy_raw_into_eigen_array(
240  x,
241  rm, current_data_iter(),
242  side_features,
243  use_reference_encoding);
244  }
245 
246 
247  /**
248  * Fill a row of an Eigen dense matrix (or another dense row expression), from
249  * the current location in the iteration.
250  *
251  * \note The 0th category is used as a reference category.
252  *
253  *
254  * Example:
255  *
256  * Eigen::MatrixXd X;
257  *
258  * ...
259  *
260  * it.fill_eigen_row(X.row(row_idx));
261  *
262  * ---------------------------------------------
263  *
264  * \param[in,out] x An eigen row expression.
265  *
266  */
267  template <typename DenseRowXpr>
269  inline void fill_eigen_row(DenseRowXpr&& x) const {
270 
// Zero the target row expression before filling it.
271  x.setZero();
272 
// NOTE(review): the SparseVector and DenseVector fill_observation overloads
// return early when has_translated_columns is false, but this function has no
// such check and always calls current_data_iter(). Confirm that callers only
// use it when translated columns exist, or add the same guard.
273  ml_data_internal::copy_raw_into_eigen_array(
274  x,
275  rm, current_data_iter(),
276  side_features,
277  use_reference_encoding);
278  }
279 
280 
281  /** Fill a composite row container. The composite row container
282  * must have its specification set; this specification is used to
283  * then fill the observation.
284  */
286 
287  DASSERT_TRUE(crc.subrow_spec != nullptr);
288 
289  fill_untranslated_values(crc.flextype_buffer);
290  crc.subrow_spec->fill(crc, rm, current_data_iter(), crc.flextype_buffer);
291  }
292 
293 
294  /** Returns the current target value, if present, or 1 if not
295  * present. If the target column is supposed to be a categorical
296  * value, then use categorical_target_index().
297  */
299 
300  DASSERT_FALSE(done());
301  DASSERT_FALSE(current_in_block_index == data_block->translated_rows.entry_data.size());
302 
303  return get_target_value(rm, current_data_iter());
304  }
305 
306  /** Returns the current categorical target index, if present, or 0
307  * if not present.
308  */
310 
311  DASSERT_FALSE(done());
312  DASSERT_FALSE(current_in_block_index == data_block->translated_rows.entry_data.size());
313 
314  return get_target_index(rm, current_data_iter());
315  }
316 
317  ////////////////////////////////////////////////////////////////////////////////
318 
319  /** Return a row reference instead of the actual observation. The
320  * row reference can be used to fill the observation vectors just
321  * like the iterator can, and can easily be passed around by value.
322  */
325 
326  ref.data_block = data_block;
327  ref.side_features = side_features;
328  ref.current_in_block_index = current_in_block_index;
329  ref.use_reference_encoding = use_reference_encoding;
330 
331  return ref;
332  }
333 
334 
335  ////////////////////////////////////////////////////////////////////////////////
336 
337  /** Return the data this iterator is working with.
 *  The reference is obtained by dereferencing the `data` shared pointer member.
338  */
339  inline const ml_data& ml_data_source() const {
340  return *data;
341  }
342 
343  /** Return the raw value of the internal row storage. Used by some
344  * of the internal ml_data processing routines.
345  */
347 
348  if(!rm.data_size_is_constant)
349  ++raw_index;
350 
351  return *(current_data_iter() + raw_index);
352  }
353 
354  protected:
355 
356  // Internally, ml_data is just a bunch of shared pointers, so it's
357  // not expensive to store a copy.
358  std::shared_ptr<ml_data> data;
359 
361 
362  std::shared_ptr<ml_data_side_features> side_features;
363 
364  /** The options used for this iterator.
365  */
366  bool add_side_information = false;
367  bool use_reference_encoding = false;
368  bool has_untranslated_columns = false;
369  bool has_translated_columns = false;
370 
// Each of these size_t members is initialized with -1, which converts to the
// largest representable size_t value.
// NOTE(review): presumably an "unset" marker until setup fills them in -- confirm.
// row_block_size is multiplied by the block index elsewhere in this class to
// convert between block indices and row indices.
371  size_t row_block_size = -1;
372  size_t iter_row_index_start = -1; /**< Starting row index for this iterator. */
373  size_t iter_row_index_end = -1; /**< Ending row index for this iterator. */
374  size_t current_row_index = -1; /**< Current row index for this iterator. */
375  size_t current_block_index = -1; /**< Index of the currently loaded block. */
376 
377  /** The current index pointed to inside the block.
378  */
380 
381  /** The absolute values of the global row starting locations.
382  */
383  size_t global_row_start, global_row_end;
384 
385  /** The maximum row size across all rows in the given ml_data object.
386  * Each row's size is defined to be the number of unpacked features in that
387  * row. For example, this is useful when one needs to preallocate a vector
388  * to be the largest size needed for any row that will be given by this
389  * iterator.
390  */
391  size_t max_row_size;
392 
393  /** The total sum of column sizes.
394  */
396 
397  private:
398 
399  /** A pointer to the current block.
400  */
401  std::shared_ptr<ml_data_internal::ml_data_block> data_block;
402 
403  protected:
404 
405  /** Return a pointer to the current location in the data.
406  */
408 
409  DASSERT_FALSE(done());
410  DASSERT_LT(current_in_block_index, data_block->translated_rows.entry_data.size());
411 
412  return &(data_block->translated_rows.entry_data[current_in_block_index]);
413  }
414 
415  /** Return the index of the current row within the currently loaded block.
416  */
418 
419  size_t index = current_row_index - (current_block_index * row_block_size);
420 
421  DASSERT_FALSE(done());
422  DASSERT_LT(index, row_block_size);
423 
424  return index;
425  }
426 
427 
428  /** Advance to the next row.
429  */
431 
432  if(has_translated_columns)
433  current_in_block_index += get_row_data_size(rm, current_data_iter());
434 
436 
437  if(current_row_index == (current_block_index + 1) * row_block_size && !done())
438  load_next_block();
439  }
440 
441  ////////////////////////////////////////////////////////////////////////////////
442  // Internal reader functions
443 
444  /// Loads the block containing the current row index.
445  void setup_block_containing_current_row_index() GL_HOT_NOINLINE;
446 
447  /// Loads the next block, resetting all the values so iteration will
448  /// be supported over the next row.
449  void load_next_block() GL_HOT_NOINLINE;
450 
451 };
452 
453 }}
454 
455 #endif /* TURI_ML2_DATA_ITERATOR_BASE_H_ */
void load_next_block() GL_HOT_NOINLINE
void advance_row() GL_HOT_INLINE_FLATTEN
ml_data_row_reference get_reference() const
GL_HOT_INLINE void fill_observation(std::vector< Entry > &x) const
ml_data_internal::entry_value_iterator current_data_iter() const GL_HOT_INLINE_FLATTEN
GL_HOT_INLINE_FLATTEN void fill_eigen_row(DenseRowXpr &&x) const
void fill_observation(DenseVector &x) const GL_HOT_INLINE_FLATTEN
virtual bool done() const
Returns true if the iteration is done, false otherwise.
virtual void reset()
Resets the iterator to the start of the sframes in ml_data.
void fill_observation(SparseVector &x) const GL_HOT_INLINE_FLATTEN
size_t target_index() const GL_HOT_INLINE_FLATTEN
void fill_observation(composite_row_container &crc) GL_HOT_INLINE_FLATTEN
void fill_untranslated_values(std::vector< flexible_type > &x) const GL_HOT_INLINE_FLATTEN
double target_value() const GL_HOT_INLINE_FLATTEN
ml_data_internal::entry_value _raw_row_entry(size_t raw_index) const GL_HOT_INLINE_FLATTEN
#define GL_HOT_INLINE
#define DASSERT_FALSE(cond)
Definition: assertions.hpp:365
#define GL_HOT_INLINE_FLATTEN
void setup_block_containing_current_row_index() GL_HOT_NOINLINE
Loads the block containing the row index row_index.
#define DASSERT_TRUE(cond)
Definition: assertions.hpp:364
size_t current_block_row_index() const GL_HOT_INLINE_FLATTEN
size_t unsliced_row_index() const
Returns the absolute row index.