Turi Create  4.0
ac_data_iterator.hpp
1 /* Copyright © 2019 Apple Inc. All rights reserved.
2  *
3  * Use of this source code is governed by a BSD-3-clause license that can
4  * be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
5  */
6 
7 #ifndef TURI_ACTIVITY_CLASSIFICATION_AC_DATA_ITERATOR_HPP_
8 #define TURI_ACTIVITY_CLASSIFICATION_AC_DATA_ITERATOR_HPP_
9 
10 #include <random>
11 #include <string>
12 #include <vector>
13 
14 #include <core/data/sframe/gl_sframe.hpp>
15 #include <ml/neural_net/float_array.hpp>
16 
17 namespace turi {
18 namespace activity_classification {
19 
20 /**
21  * Pure virtual interface for classes that produce batches of activity
22  * classification data from a raw SFrame.
23  */
25 public:
26 
27  /** Defines the inputs to a data_iterator factory function. */
28  struct parameters {
29 
30  /** The SFrame to traverse. */
32  // NOTE(review): listing line 31 is absent from this extract, so the member documented above is not visible here.
33  /**
34  * The name of the column containing the target variable.
35  *
36  * If empty, then the output will not contain labels or weights.
37  */
38  std::string target_column_name;
39 
40  /** The name of the column containing the session ID. */
42  // NOTE(review): listing line 41 is absent from this extract, so the member documented above is not visible here.
43  /** The names of the feature columns. */
44  std::vector<std::string> feature_column_names;
45 
46  /**
47  * Each group of this many consecutive samples from the same session is
48  * assumed to have the same class label.
49  */
50  size_t prediction_window = 100;
51 
52  /**
53  * Each session is segmented into chunks of this many prediction windows.
54  */
55  size_t predictions_in_chunk = 20;
56 
57  /**
58  * The expected class labels, indexed by identifier.
59  *
60  * If empty, then the labels will be inferred from the data. If non-empty,
61  * an exception will be thrown upon encountering an unexpected label.
62  */
64  // NOTE(review): listing line 63 is absent from this extract, so the member documented above is not visible here.
65  /** Set to true when the data is used for training. */
66  bool is_train = false;
67 
68  /** Augments training data when set to true. */
69  bool use_data_augmentation = false;
70 
71  /** Seed that determines the results of data augmentation, if enabled. */
72  int random_seed = 0;
73  };
74 
75  /** Defines the output of a data_iterator. */
76  struct batch {
77 
78  /** Defines the metadata associated with each chunk. */
79  struct chunk_info {
80 
81  /** The session ID from which the chunk was segmented. */
83  // NOTE(review): listing line 82 is absent from this extract, so the member documented above is not visible here.
84  /** The position of this chunk within the session. */
85  size_t chunk_index;
86 
87  /** Number of samples (rows from the raw SFrame) comprising the chunk. */
88  size_t num_samples;
89  };
90 
91  /**
92  * An array with shape: (requested_batch_size,
93  * 1, prediction_window * predictions_in_chunk, num_feature_columns)
94  *
95  * Each row is a chunk of feature values from one session.
96  */
97  neural_net::shared_float_array features;
98 
99  /**
100  * An array with shape: (requested_batch_size, 1, predictions_in_chunk, 1)
101  *
102  * Each row is the sequence of class labels (indices) from one chunk
103  * (labels picked after majority voting).
104  *
105  * If no target was specified, then this value is default constructed.
106  */
107  neural_net::shared_float_array labels;
108 
109  /**
110  * An array with shape: (requested_batch_size, 1, predictions_in_chunk, 1)
111  *
112  * Each row is a sequence of 0 or 1 values indicating whether the
113  * corresponding label is padding (0) or refers to actual data (1).
114  *
115  * If no target was specified, then this value is default constructed.
116  */
117  neural_net::shared_float_array weights;
118 
119  /**
120  * An array with shape: (requested_batch_size,
121  * 1, prediction_window * predictions_in_chunk, 1)
122  *
123  * Each row is the sequence of raw class labels (indices), one for each
124  * individual sample.
125  */
126  neural_net::shared_float_array labels_per_row;
127 
128  /**
129  * Metadata for each valid (non-padded) row in the batch.
130  *
131  * The size of this vector is at most requested_batch_size. The info at
132  * index `i` describes the array at `features[i]`.
133  */
134  std::vector<chunk_info> batch_info;
135  };
136 
137  virtual ~data_iterator();  // Virtual so that derived iterators can be destroyed through this interface; only declared here.
138 
139  virtual const flex_list& feature_names() const = 0;  // NOTE(review): presumably the feature column names -- confirm in the implementation.
140  virtual const flex_list& class_labels() const = 0;  // NOTE(review): presumably the class labels, indexed by identifier -- confirm in the implementation.
141  virtual const size_t num_sessions() const = 0;  // NOTE(review): the top-level const on this by-value return type has no effect for callers.
142  virtual flex_type_enum session_id_type() const = 0;  // NOTE(review): presumably the flex type of the session ID column -- confirm in the implementation.
143 
144  /**
145  * Returns true if and only if the next call to `next_batch` will return a
146  * batch with size greater than 0.
147  */
148  virtual bool has_next_batch() const = 0;
149 
150  /**
151  * Returns a batch containing float arrays with the indicated batch size.
152  *
153  * Eventually returns a batch with size smaller than the requested size,
154  * indicating that the entire dataset has been traversed. All subsequent
155  * calls will return an empty (all padding) batch, until `reset` is called.
156  */
157  virtual batch next_batch(size_t batch_size) = 0;
158 
159  /** Begins a fresh traversal of the dataset. */
160  virtual void reset() = 0;
161 };
162 
163 /**
164  * Concrete data_iterator implementation that doesn't attempt any
165  * parallelization or background I/O.
166  */
168 public:
169 
170  simple_data_iterator(const parameters& params);  // Constructs the iterator from a parameters value; only declared here.
171 
172  // Not copyable or movable. NOTE(review): listing line 173 is absent from this extract; only the deleted copy assignment is visible.
174  simple_data_iterator& operator=(const simple_data_iterator&) = delete;
175  // The declarations below are marked `override`, i.e. they implement virtual functions of the base class.
176  const flex_list& feature_names() const override;
177  const flex_list& class_labels() const override;
178  const size_t num_sessions() const override;
179  flex_type_enum session_id_type() const override;
180  bool has_next_batch() const override;
181  batch next_batch(size_t batch_size) override;
182  void reset() override;
183 
184 private:
185 
186  struct preprocessed_data {  // Return type of preprocess_data(); NOTE(review): listing line 192 is absent from this extract.
187  gl_sframe chunks;  // NOTE(review): presumably the SFrame after segmentation into chunks -- confirm in the implementation.
188  size_t num_sessions = 0;
189  flex_type_enum session_id_type = flex_type_enum::UNDEFINED;  // UNDEFINED until assigned.
190  bool has_target = false;  // NOTE(review): presumably true when a target column name was given -- confirm.
191  flex_list feature_names;
193  };
194 
195  static preprocessed_data preprocess_data(const parameters& params);  // Produces a preprocessed_data from the parameters; only declared here.
196 
197  const preprocessed_data data_;  // Immutable once constructed (const member).
198  const size_t num_samples_per_prediction_;  // NOTE(review): presumably parameters::prediction_window -- confirm in the constructor.
199  const size_t num_predictions_per_chunk_;  // NOTE(review): presumably parameters::predictions_in_chunk -- confirm in the constructor.
200 
201  gl_sframe_range range_iterator_;
202  gl_sframe_range::iterator next_row_;  // NOTE(review): presumably the traversal cursor into range_iterator_ -- confirm.
203  gl_sframe_range::iterator end_of_rows_;  // NOTE(review): presumably the end sentinel for next_row_ -- confirm.
204  size_t sample_in_row_ = 0;
205  size_t num_chunks_in_row_ = 0;
206  bool is_train_ = false;
207  bool use_data_augmentation_ = false;
208  std::default_random_engine random_engine_;  // NOTE(review): presumably seeded from parameters::random_seed -- confirm.
209 };
210 
211 /**
212  * Convert SFrame to batch form, where each row contains a sequence of length
213  * predictions_in_chunk * prediction_window, and there is a single label per
214  * prediction window.
215  *
216  * \param[in] data Original data. SFrame containing one row per time sample.
217  * \param[in] features List of names of the columns containing the input features.
218  * \param[in] session_id Name of the column containing ids for each session in the dataset.
219  * A session is a single user time-series sequence.
220  * \param[in] prediction_window Number of time samples in every prediction window. A label is expected
221  * (for training), or predicted (in inference) every time a sequence of
222  * prediction_window samples has been collected.
223  * \param[in] predictions_in_chunk Each session is chunked into shorter sequences. This is the number of
224  * prediction windows desired in each chunk.
225  * \param[in] target Name of the column containing the output labels. Empty string if None.
226  *
227  * \return The data converted to batch form (an SFrame), returned inside a variant_map_type; the map keys are not visible here -- confirm in the implementation.
228  */
229 EXPORT variant_map_type _activity_classifier_prepare_data(const gl_sframe &data,
230  const std::vector<std::string> &features,
231  const std::string &session_id,
232  const int &prediction_window,
233  const int &predictions_in_chunk,
234  const std::string &target);
235 
236 
237 // Same parameters and return type as _activity_classifier_prepare_data, with verbose=True.
238 EXPORT variant_map_type _activity_classifier_prepare_data_verbose(const gl_sframe &data,
239  const std::vector<std::string> &features,
240  const std::string &session_id,
241  const int &prediction_window,
242  const int &predictions_in_chunk,
243  const std::string &target);
244 
245 } // namespace activity_classification
246 } // namespace turi
247 
248 #endif // TURI_ACTIVITY_CLASSIFICATION_AC_DATA_ITERATOR_HPP_
virtual batch next_batch(size_t batch_size)=0
std::vector< flexible_type > flex_list