// Turi Create 4.0 — od_model_trainer.hpp
// (Doxygen documentation-page header; not part of the original source file.)
1 /* Copyright © 2020 Apple Inc. All rights reserved.
2  *
3  * Use of this source code is governed by a BSD-3-clause license that can
4  * be found in the LICENSE.txt file or at
5  * https://opensource.org/licenses/BSD-3-Clause
6  */
7 
8 #ifndef TOOLKITS_OBJECT_DETECTION_OD_MODEL_TRAINER_HPP_
9 #define TOOLKITS_OBJECT_DETECTION_OD_MODEL_TRAINER_HPP_
10 
11 /**
12  * \file od_model_trainer.hpp
13  *
14  * Defines the value types representing each stage of an object-detection
15  * training pipeline, and the virtual interface for arbitrary object-detection
16  * models.
17  */
18 
#include <cstddef>
#include <memory>
#include <string>
#include <utility>
#include <vector>

#include <ml/neural_net/combine.hpp>
#include <ml/neural_net/compute_context.hpp>
#include <ml/neural_net/model_spec.hpp>
#include <toolkits/object_detection/od_data_iterator.hpp>
23 
24 namespace turi {
25 namespace object_detection {
26 
27 class ModelTrainer;
28 
/** Represents one batch of raw data: (possibly) annotated images. */
struct DataBatch {
  /** The serial number for this batch, starting with 1. */
  int iteration_id = 0;

  /** The images in this batch, each optionally paired with its annotations. */
  std::vector<neural_net::labeled_image> examples;
};
36 
/** Represents one batch of model-agnostic data, post-augmentation/resizing. */
struct InputBatch {
  /** The serial number of the originating DataBatch, starting with 1. */
  int iteration_id = 0;

  // TODO: Adopt NCHW.
  /** The (RGB) images from a DataBatch encoded as NHWC. */
  neural_net::shared_float_array images;

  /** The raw annotations from the DataBatch, one list per image. */
  std::vector<std::vector<neural_net::image_annotation>> annotations;

  /**
   * The original height and width of each image, used to scale bounding-box
   * predictions.
   */
  std::vector<std::pair<size_t, size_t>> image_sizes;
};
54 
55 /** Represents one batch of data, in a possibly model-specific format. */
57  int iteration_id = 0;
58 
59  // TODO: Migrate to neural_net::float_array_map
60  neural_net::shared_float_array images;
61  neural_net::shared_float_array labels;
62 
63  // The raw annotations are preserved to support evaluation, comparing raw
64  // annotations against model predictions.
65  std::vector<std::vector<neural_net::image_annotation>> annotations;
66 
67  // The original image sizes are preserved to support prediction.
68  std::vector<std::pair<size_t, size_t>> image_sizes;
69 };
70 
71 /** Represents the raw output of an object-detection model. */
72 // TODO: Adopt EncodedBatch instead.
74  int iteration_id = 0;
75  neural_net::shared_float_array loss;
76 };
77 
/** Represents the output conveyed to the user. */
struct TrainingProgress {
  /** The serial number of the originating batch, starting with 1. */
  int iteration_id = 0;

  /** Smoothed (rolling) training loss, suitable for progress display. */
  float smoothed_loss = 0.f;
};
83 
/**
 * Represents the immediate (model-specific) input or output of a model backend,
 * using the float_array_map representation.
 */
struct EncodedBatch {
  /** The serial number of the originating batch, starting with 1. */
  int iteration_id = 0;

  /** Named float arrays, in the backend's own encoding. */
  neural_net::float_array_map encoded_data;

  /** The raw annotations, carried through to support evaluation. */
  std::vector<std::vector<neural_net::image_annotation>> annotations;
  /** Original (height, width) of each image, carried through for prediction. */
  std::vector<std::pair<size_t, size_t>> image_sizes;
};
96 
97 /** Represents one batch of inference results, in a generic format. */
99  int iteration_id = 0;
100 
101  std::vector<std::vector<neural_net::image_annotation>> predictions;
102 
103  std::vector<std::vector<neural_net::image_annotation>> annotations;
104  std::vector<std::pair<size_t, size_t>> image_sizes;
105 };
106 
/**
 * Ostensibly model-agnostic parameters for object detection.
 *
 * A value of -1 acts as an "unset" sentinel that asks the trainer to choose a
 * suitable value itself.
 */
struct Config {
  /**
   * The target number of training iterations to perform.
   *
   * If -1, then this target should be computed heuristically.
   */
  int max_iterations = -1;

  /**
   * The number of images to process per training batch.
   *
   * If -1, then this size should be computed automatically.
   */
  int batch_size = -1;

  /** For darknet-yolo, the height of the final feature map. */
  int output_height = 13;

  /** For darknet-yolo, the width of the final feature map. */
  int output_width = 13;

  /** Determines the number of feature channels in the final feature map. */
  int num_classes = -1;
};
132 
/** Stores additional data for a specific model backend for a checkpoint. */
struct CheckpointMetadata {
  /** The number of predictions for the loaded model. */
  size_t num_predictions = 0;

  /** The model type name for use in exported models. */
  std::string model_type = "";

  /** The confidence threshold for evaluation. */
  float evaluate_confidence = 0.f;

  /** The confidence threshold for prediction. */
  float predict_confidence = 0.f;

  /** The Non Maximal Suppression threshold for evaluation. */
  float nms_threshold = 0.f;

  /**
   * When true, use NMS only on the most confident class; otherwise across all
   * classes.
   */
  bool use_most_confident_class = false;
};
153 
/**
 * A representation of all the parameters needed to reconstruct a model.
 *
 * \todo Include optimizer state to allow training to resume seamlessly.
 */
class Checkpoint {
 public:
  virtual ~Checkpoint() = default;

  /** Returns the configuration this checkpoint was trained with. */
  virtual const Config& config() const = 0;

  /** Returns the model weights as a named collection of float arrays. */
  virtual const neural_net::float_array_map& weights() const = 0;

  /** Loads the checkpoint into an active ModelTrainer instance. */
  virtual std::unique_ptr<ModelTrainer> CreateModelTrainer(
      neural_net::compute_context* context) const = 0;

  /**
   * Returns the CoreML spec corresponding to the current model.
   *
   * The result must be a pipeline that accepts an image input and yields at
   * least two outputs, all with the given names. The outputs must be suitable
   * for passing directly into a NonMaximumSuppression model.
   */
  virtual neural_net::pipeline_spec ExportToCoreML(const std::string& input_name,
                                                   const std::string& coordinates_name,
                                                   const std::string& confidence_name,
                                                   bool use_nms_layer, float iou_threshold,
                                                   float confidence_threshold) const = 0;

  /** Returns backend-specific metadata describing this checkpoint. */
  virtual CheckpointMetadata GetCheckpointMetadata() const = 0;
};
/**
 * Wrapper adapting object_detection::data_iterator to the Iterator interface.
 */
class DataIterator : public neural_net::Iterator<DataBatch> {
 public:
  /**
   * \param impl The object_detection::data_iterator to wrap
   * \param batch_size The number of images to request from impl for each batch.
   * \param offset The number of batches to skip. The first batch produced will
   *     have an iteration_id one more than the offset.
   *
   * \todo object_detection::data_iterator needs to support specifying the
   *     offset (and doing the right thing with random seeding)
   */
  DataIterator(std::unique_ptr<data_iterator> impl, size_t batch_size,
               int offset = 0)
      : impl_(std::move(impl)),
        batch_size_(batch_size),
        last_iteration_id_(offset) {}

  /** True while the wrapped iterator can still produce another batch. */
  bool HasNext() const override { return impl_->has_next_batch(); }

  /** Produces the next DataBatch. (Defined out of line.) */
  DataBatch Next() override;

 private:
  std::unique_ptr<data_iterator> impl_;
  size_t batch_size_ = 32;
  int last_iteration_id_ = 0;  // Next ID starts at 1, not 0, by default.
};
214 
/** Wrapper adapting image_augmenter to the Transform interface. */
class DataAugmenter : public neural_net::Transform<DataBatch, InputBatch> {
 public:
  // NOTE(review): single-argument constructor allows implicit conversion from
  // std::unique_ptr<image_augmenter>; consider marking it `explicit` if no
  // caller relies on the conversion.
  DataAugmenter(std::unique_ptr<neural_net::image_augmenter> impl)
      : impl_(std::move(impl)) {}

  /** Applies augmentation/resizing to one DataBatch. (Defined out of line.) */
  InputBatch Invoke(DataBatch data_batch) override;

 private:
  std::unique_ptr<neural_net::image_augmenter> impl_;
};
226 
227 /**
228  * Converts raw training output to user-visible progress updates.
229  *
230  * \todo Adopt this operator once model_backend supports an async API that would
231  * avoid performance regressions due to premature waiting on the futures that
232  * the model_backend implementations currently output.
233  */
235  : public neural_net::Transform<TrainingOutputBatch, TrainingProgress> {
236  public:
237  ProgressUpdater(std::unique_ptr<float> smoothed_loss)
238  : smoothed_loss_(std::move(smoothed_loss)) {}
239 
240  TrainingProgress Invoke(TrainingOutputBatch output_batch) override;
241 
242  private:
243  std::unique_ptr<float> smoothed_loss_;
244 };
245 
246 /**
247  * Abstract base class for object-detection model trainers.
248  *
249  * Responsible for constructing the model-agnostic portions of the overall
250  * training pipeline.
251  */
253  public:
254  ModelTrainer() : ModelTrainer(nullptr) {}
255 
256  // TODO: This class should be responsible for producing the augmenter itself.
257  ModelTrainer(std::unique_ptr<neural_net::image_augmenter> augmenter);
258 
259  virtual ~ModelTrainer() = default;
260 
261  /**
262  * Given a data iterator, return a publisher of model outputs.
263  *
264  * \todo Eventually this should return a TrainingProgress publisher.
265  */
266  virtual std::shared_ptr<neural_net::Publisher<TrainingOutputBatch>>
267  AsTrainingBatchPublisher(std::unique_ptr<data_iterator> training_data,
268  size_t batch_size, int offset);
269 
270  /**
271  * Given a data iterator, return a publisher of inference model outputs.
272  *
273  * \todo Publish InferenceOutputBatch instead of EncodedBatch.
274  */
275  virtual std::shared_ptr<neural_net::Publisher<EncodedBatch>>
276  AsInferenceBatchPublisher(std::unique_ptr<data_iterator> test_data,
277  size_t batch_size, float confidence_threshold,
278  float iou_threshold) = 0;
279 
280  /**
281  * Convert the raw output of the inference batch publisher into structured
282  * predictions.
283  *
284  * \todo This conversion should be incorporated into the inference pipeline
285  * once the backends support proper asynchronous complete handlers.
286  */
287  virtual InferenceOutputBatch DecodeOutputBatch(EncodedBatch batch,
288  float confidence_threshold,
289  float iou_threshold) = 0;
290 
291  /** Returns a publisher that can be used to request checkpoints. */
292  virtual std::shared_ptr<neural_net::Publisher<std::unique_ptr<Checkpoint>>>
293  AsCheckpointPublisher() = 0;
294 
295  protected:
296  // Used by subclasses to produce the model-specific portions of the overall
297  // training pipeline.
298  // TODO: Remove this method. Just let subclasses define the entire training
299  // pipeline.
300  virtual std::shared_ptr<neural_net::Publisher<TrainingOutputBatch>>
301  AsTrainingBatchPublisher(
302  std::shared_ptr<neural_net::Publisher<InputBatch>> augmented_data) = 0;
303 
304  private:
305  std::shared_ptr<DataAugmenter> augmenter_;
306 };
307 
308 } // namespace object_detection
309 } // namespace turi
310 
311 #endif // TOOLKITS_OBJECT_DETECTION_OD_MODEL_TRAINER_HPP_
// Doxygen hover-tooltip residue from the documentation page (not part of the
// original source file):
//   STL namespace.
//   neural_net::shared_float_array images
//   std::vector< std::pair< size_t, size_t > > image_sizes
//   std::vector< std::vector< neural_net::image_annotation > > annotations
//   DataIterator(std::unique_ptr< data_iterator > impl, size_t batch_size, int offset=0)