// Turi Create 4.0 — od_yolo.hpp
/* Copyright © 2018 Apple Inc. All rights reserved.
 *
 * Use of this source code is governed by a BSD-3-clause license that can
 * be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
 */
6 
#ifndef TURI_OBJECT_DETECTION_OD_YOLO_H_
#define TURI_OBJECT_DETECTION_OD_YOLO_H_

#include <ml/neural_net/image_augmentation.hpp>
#include <ml/neural_net/model_spec.hpp>

namespace turi {
namespace object_detection {

16 /**
17  * Writes a list of image_annotation values into an output float buffer.
18  *
19  * \param annotations The list of annotations (for one image) to write
20  * \param output_height The height of the YOLO output grid
21  * \param output_width The width of the YOLO output grid
22  * \param num_anchors The number of YOLO anchors
23  * \param num_classes The number of classes in the output one-hot encoding
24  * \param out Address to a float buffer of size output_height * output_width *
25  * num_anchors * (5 + num_classes)
26  * \todo Add a mutable_float_array or shared_float_buffer type for functions
27  * like this one to write into.
28  * \todo This strictly speaking doesn't belong in this data iterator type but
29  * probably doesn't warrant its own file yet (and would be nice not to
30  * bury in object_detector.cpp).
31  */
32 void convert_annotations_to_yolo(
33  const std::vector<neural_net::image_annotation>& annotations,
34  size_t output_height, size_t output_width, size_t num_anchors,
35  size_t num_classes, float* out);
36 
37 /**
38  * Parses the raw YOLO output map into annotations.
39  *
40  * \param yolo_map A float array with shape (H, W, B*(5+C)), where B is the
41  * number of anchors, C is the number of classes, and H and W are the
42  * height and width of the output grid.
43  * \param anchor_boxes The B anchor boxes used to train the YOLO model, as a
44  * vector of (width, height) pairs (in the output grid coordinates).
45  * \param min_confidence The smallest confidence score to allow in the returned
46  * results.
47  * \return Annotations in the coordinate space of the output grid.
48  */
49 std::vector<neural_net::image_annotation> convert_yolo_to_annotations(
50  const neural_net::float_array& yolo_map,
51  const std::vector<std::pair<float, float>>& anchor_boxes,
52  float min_confidence);
53 
54 /**
55  * Appends layers to an existing neural net spec, implementing the conversion
56  * from a trained YOLO model to predicted bounding boxes and class labels.
57  *
58  * \param nn_spec Model spec for the trained YOLO model
59  * \param coordinates_name The name to give to the CoreML layer which will
60  * output the predicted bounding boxes (B*H*W, 4, 1) for each of the
61  * B anchor boxes and each of the H*W output grid cells, in
62  * (x,y,width,height) order, normalized to the interval [0,1].
63  * \param confidence_name The name to give to the CoreML layer which will output
64  * the predicted class label confidences (B*H*W, C, 1) for each of
65  * the B anchor boxes, each of the H*W output grid cells, and each of
66  * the C class labels.
67  * \param input The name of the existing CoreML layer that outputs the raw
68  * (B*(5+C), H, W) predictions of the trained model: for each of B
69  * anchor boxes, the (x, y, width, height) bounding box, object
70  * confidence, and C class label confidences, for each of the H*W
71  * output grid cells.
72  * \param anchor_boxes The B anchor boxes used to train the YOLO model, as a
73  * vector of (width, height) pairs (in the output grid coordinates).
74  * \param num_classes The number of class labels C used to train the YOLO model.
75  * \param output_grid_height The height H of the output grid used to train the
76  * YOLO model.
77  * \param output_grid_width The width W of the output grid used to train the
78  * YOLO model.
79  */
80 void add_yolo(neural_net::model_spec* nn_spec, const std::string& coordinates_name,
81  const std::string& confidence_name, const std::string& input,
82  const std::vector<std::pair<float, float>>& anchor_boxes, size_t num_classes,
83  bool use_nms_layer, float iou_threshold, float confidence_threshold,
84  size_t output_grid_height, size_t output_grid_width);
85 
86 /**
87  * Appends layers to add non maximum suppression layer and
88  *
89  * \param nn_spec Model spec for the trained model
90  * \param coordinates_name The name to give to the CoreML layer which will
91  * output the predicted bounding boxes (B*H*W, 4, 1) for each of the
92  * B anchor boxes and each of the H*W output grid cells, in
93  * (x,y,width,height) order, normalized to the interval [0,1].
94  * \param confidence_name The name to give to the CoreML layer which will output
95  * the predicted class label confidences (B*H*W, C, 1) for each of
96  * the B anchor boxes, each of the H*W output grid cells, and each of
97  * the C class labels.
98  * \param num_bounding_boxes The number of anchor boxes.
99  * \param num_classes The number of class labels C used to train the YOLO model.
100  * \param confidence_threshold The confidence threshold to be applied in the NMS layer.
101  * \param iou_threshold The IoU threshold to be applied in the NMS layer.
102  * \param prefix Prefix string attached to layer names.
103  * \param nms_boxes The maximum number of boxes we want after NMS.
104  * \param use_most_confident_class Suppression can be done only across the most
105  * confident class.
106  */
107 void apply_nms_layer(neural_net::model_spec* nn_spec, const std::string& coordinates_name,
108  const std::string& confidence_name, const std::string& prefix,
109  size_t num_bounding_boxes, size_t num_classes, float confidence_threshold,
110  float iou_threshold, size_t nms_boxes, bool use_most_confident_class);
111 
}  // namespace object_detection
}  // namespace turi

#endif  // TURI_OBJECT_DETECTION_OD_YOLO_H_