// Turi Create 4.0 -- image_augmentation.hpp
1 /* Copyright © 2018 Apple Inc. All rights reserved.
2  *
3  * Use of this source code is governed by a BSD-3-clause license that can
4  * be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
5  */
6 
7 #ifndef TURI_NEURAL_NET_IMAGE_AUGMENTATION_HPP_
8 #define TURI_NEURAL_NET_IMAGE_AUGMENTATION_HPP_
9 
10 #include <memory>
11 #include <ostream>
12 #include <vector>
13 
14 #include <ml/neural_net/Image.hpp>
15 #include <ml/neural_net/float_array.hpp>
16 
17 namespace turi {
18 namespace neural_net {
19 
/**
 * Represents a rectangular area within an image.
 *
 * The coordinate system is defined by the user. Any rect without a positive
 * width and a positive height is an empty or null rect.
 */
struct image_box {
  // Constructs a box with all coordinates and lengths set to 0 (an empty box).
  image_box() = default;

  // Constructs a box from its origin (x, y) and its extent (width, height).
  image_box(float x, float y, float width, float height)
    : x(x), y(y), width(width), height(height)
  {}

  // Returns true if either the width or the height is zero or negative.
  bool empty() const { return width <= 0.f || height <= 0.f; }

  // Computes the area if the width and height are positive, otherwise returns 0
  float area() const {
    return empty() ? 0.f : (width * height);
  }

  // Divides each coordinate and length by the appropriate normalizer.
  void normalize(float image_width, float image_height);

  // Sets this instance to the intersection with the given image_box. If no
  // intersection exists, then the result will have area() of 0.f (and may have
  // negative width or height).
  void clip(image_box clip_box = image_box(0.f, 0.f, 1.f, 1.f));

  // Grows this instance (minimally) so that its area contains the (non-empty)
  // area of the other image_box.
  void extend(const image_box& other);

  // Origin and extent of the box, in the user-defined coordinate system.
  float x = 0.f;
  float y = 0.f;
  float width = 0.f;
  float height = 0.f;
};
56 
57 bool operator==(const image_box& a, const image_box& b);
58 std::ostream& operator<<(std::ostream& out, const image_box& box);
59 
60 /**
61  * Represents a labelled or predicted entity inside an image.
62  */
64  int identifier = 0;
65  image_box bounding_box;
66  float confidence = 0.f; // Typically 1 for training data
67 };
68 
69 bool operator==(const image_annotation& a, const image_annotation& b);
70 
71 /**
72  * Contains one image and its associated annotations.
73  */
74 struct labeled_image {
75  std::shared_ptr<Image> image;
76  std::vector<image_annotation> annotations;
77 
78  // Used when parsing saved predictions for evaluation.
79  std::vector<image_annotation> predictions;
80 };
81 
82 /**
83  * Pure virtual interface for objects that process/augment/mutate images and
84  * their associated annotations.
85  */
87 public:
88 
89  /** Parameters governing random crops. */
90  struct crop_options {
91 
92  /** Lower bound for the uniformly sampled aspect ratio (width/height) */
93  float min_aspect_ratio = 0.8f;
94 
95  /** Upper bound for the uniformly sampled aspect ratio (width/height) */
96  float max_aspect_ratio = 1.25f;
97 
98  /**
99  * Given a sampled aspect ratio, determines the lower bound of the uniformly
100  * sampled height.
101  */
102  float min_area_fraction = 0.15f;
103 
104  /**
105  * Given a sampled aspect ratio, determines the upper bound of the uniformly
106  * sampled height.
107  */
108  float max_area_fraction = 1.f;
109 
110  /**
111  * Given a sampled crop (aspect ratio, height, and location), specifies the
112  * minimum fraction of each bounding box's area that must be included to
113  * accept the crop. If 0.f, then the crop need not touch any object.
114  */
115  float min_object_covered = 0.f;
116 
117  /**
118  * The maximum number of random crops to sample in an attempt to generate
119  * one that satisfies the min_object_covered constraint.
120  */
121  size_t max_attempts = 50;
122 
123  /**
124  * Given an accepted crop, the minimum fraction of each bounding box's area
125  * that must be included to keep the (potentially cropped) bounding box in
126  * the annotations (instead of discarding it).
127  */
128  float min_eject_coverage = 0.5f;
129  };
130 
131  /** Parameters governing random padding. */
132  struct pad_options {
133 
134  /** Lower bound for the uniformly sampled aspect ratio (width/height) */
135  float min_aspect_ratio = 0.8f;
136 
137  /** Upper bound for the uniformly sampled aspect ratio (width/height) */
138  float max_aspect_ratio = 1.25f;
139 
140  /**
141  * Given a sampled aspect ratio, determines the lower bound of the uniformly
142  * sampled height.
143  */
144  float min_area_fraction = 1.f;
145 
146  /**
147  * Given a sampled aspect ratio, determines the upper bound of the uniformly
148  * sampled height.
149  */
150  float max_area_fraction = 2.f;
151 
152  /**
153  * The maximum number of random aspect ratios to sample, looking for one
154  * that satisfies the constraints on area.
155  */
156  size_t max_attempts = 50;
157  };
158 
159  /**
160  * Parameters for constructing new image_augmenter instances.
161  *
162  * Default constructed values perform no augmentation, outside of resizing to
163  * the output width and height (which must be specified).
164  */
165  struct options {
166 
167  /** The N dimension of the resulting float array. */
168  size_t batch_size = 0;
169 
170  /** The W dimension of the resulting float array. */
171  size_t output_width = 0;
172 
173  /** The H dimension of the resulting float array. */
174  size_t output_height = 0;
175 
176  /** Seed for all pseudo-random number generation used by augmentation. */
177  int random_seed = 0;
178 
179  /** The probability of applying (attempting) a random crop. */
180  float crop_prob = 0.f;
181  crop_options crop_opts;
182 
183  /** The probability of applying (attempting) a random pad. */
184  float pad_prob = 0.f;
185  pad_options pad_opts;
186 
187  /** The probability of flipping the image horizontally. */
188  float horizontal_flip_prob = 0.f;
189 
190  // TODO: The semantics below are adopted from Core Image.
191  // What should a shared interface specify?
192  // See also https://developer.apple.com/library/archive/documentation/GraphicsImaging/Reference/CoreImageFilterReference/index.html#//apple_ref/doc/filter/ci/CIColorControls
193 
194  /**
195  * Maximum pixel value to add or subtract to each channel.
196  *
197  * For example, a value of 0.05 adds a random value between -0.05 and 0.05
198  * to each channel of each pixel (represented as a value from 0 to 1).
199  */
200  float brightness_max_jitter = 0.f;
201 
202  /**
203  * Maximum proportion to increase or decrease contrast.
204  *
205  * For example, a value of 0.05 multiplies the contrast by a random value
206  * between 0.95 and 1.05.
207  */
208  float contrast_max_jitter = 0.f;
209 
210  /**
211  * Maximum proportion to increase or decrease saturation.
212  *
213  * For example, a value of 0.05 multiplies the saturation by a random value
214  * between 0.95 and 1.05.
215  */
216  float saturation_max_jitter = 0.f;
217 
218  /**
219  * Maximum proportion to rotate the hues.
220  *
221  * For example, a value of 0.05 applies a random rotation between
222  * -0.05 * pi and 0.05 * pi.
223  */
224  float hue_max_jitter = 0.f;
225  };
226 
227  /** The output of an image_augmenter. */
228  struct result {
229 
230  /** The augmented images, represented as a single NHWC array (RGB). */
231  shared_float_array image_batch;
232 
233  /**
234  * The transformed annotations for each augmented image. This vector's size
235  * should equal the size of the source batch that generated the result, and
236  * each inner vector should have the same length as the corresponding input
237  * image's annotations vector. */
238  std::vector<std::vector<image_annotation>> annotations_batch;
239  };
240 
241  virtual ~image_augmenter() = default;
242 
243  /** Returns the options parameterizing this instance. */
244  virtual const options& get_options() const = 0;
245 
246  /**
247  * Performs augmentation on a batch of images (and their annotations).
248  *
249  * If the source batch is smaller than the batch size specified in the
250  * options, then the result is padded with zeroes as needed.
251  */
252  virtual result prepare_images(std::vector<labeled_image> source_batch) = 0;
253 };
254 
255 /**
256  * An abstract class that inherits from image_augmenter used to convert
257  * input images, annotations and predictions to shared_float_arrays for
258  * tf_image_augmenter.
259  * Subclass must be written for it if needed. The subclass must implement the
260  * pure virtual method prepare_augmented_images.
261  */
263  public:
264  float_array_image_augmenter(const options& opts) : opts_(opts) {}
265 
266  const options& get_options() const override { return opts_; }
267 
268  result prepare_images(std::vector<labeled_image> source_batch) override;
269 
270  protected:
271  /** The output sent from TensorFlow after augmenting the images. */
273  /** The images after augmenting sent from Tensorflow */
274  shared_float_array images;
275 
276  /** The annotations associated with augmented images sent from Tensorflow */
277  std::vector<shared_float_array> annotations;
278  };
279 
280  /** The output sent to TensorFlow to augment the images. */
282  /** The images to be augmented are raw images decoded
283  * and send to tf_image_augmneter as vector of shared_float_array
284  */
285  std::vector<shared_float_array> images;
286 
287  /** The annotations of the images to be augmented are raw images decoded
288  * and send to tf_image_augmneter as vector of shared_float_array
289  */
290  std::vector<shared_float_array> annotations;
291  };
292 
293  virtual float_array_result prepare_augmented_images(
294  labeled_float_image data_to_augment) = 0;
295 
296  private:
297  options opts_;
298 };
299 
300 } // neural_net
301 } // turi
302 
303 #endif // TURI_NEURAL_NET_IMAGE_AUGMENTATION_HPP_
// Doxygen cross-reference residue (not part of the header):
//   std::vector< std::vector< image_annotation > > annotations_batch
//   const options & get_options() const override