Turi Create 4.0
evaluation_interface-inl.hpp
1 /* Copyright © 2017 Apple Inc. All rights reserved.
2  *
3  * Use of this source code is governed by a BSD-3-clause license that can
4  * be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
5  */
6 #ifndef TURI_EVAL_INTERFACE_H_
7 #define TURI_EVAL_INTERFACE_H_
8 // Types
9 #include <core/storage/sframe_interface/unity_sframe.hpp>
10 #include <model_server/lib/variant.hpp>
11 #include <unordered_map>
12 
13 #ifdef __clang__
14  #pragma clang diagnostic push
15  #pragma clang diagnostic ignored "-Woverloaded-virtual" // TODO: fix these issues below
16 #endif
17 
18 const double EVAL_ZERO = 1.0e-9;
19 
20 namespace turi {
21 namespace evaluation {
22 
23 
24 /**
25  * An enumeration over the possible types of multi-class averaging
26  * that we support.
27  *
28  * \see average_type_from_name
29  */
30 enum class average_type_enum: char {
31  NONE = 0, /**< No averaging, return all. */
32  MICRO = 1, /**< Use global counts.*/
33  MACRO = 2, /**< Average per-class stats. */
34  DEFAULT = 3, /**< The default behavior. */
35 };
36 
37 /**
38  * Given the printable name of an averaging scheme, returns the corresponding
39  * average_type_enum value.
40  * \param[in] name Name of the averaging scheme (e.g. "micro", "macro").
41  * \returns average_type_enum
42  */
43 inline average_type_enum average_type_enum_from_name(const flexible_type& name) {
44  static std::unordered_map<flexible_type, average_type_enum> type_map {
45  {FLEX_UNDEFINED, average_type_enum::NONE},
46  {flexible_type("micro"), average_type_enum::MICRO},
47  {flexible_type("default"), average_type_enum::DEFAULT},
48  {flexible_type("macro"), average_type_enum::MACRO}
49  };
50  auto it = type_map.find(name);
51  if (it == type_map.end()) {
52  log_and_throw(
53  std::string("Invalid average type name " + name.to<std::string>() + ".")
54  );
55  }
56  return it->second;
57 }
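/*
 * Usage sketch (illustrative only); the strings below mirror the keys of the
 * map inside average_type_enum_from_name, and FLEX_UNDEFINED (None) maps to
 * average_type_enum::NONE.
 *
 * \code
 * average_type_enum macro_avg = average_type_enum_from_name(flexible_type("macro"));
 * average_type_enum no_avg    = average_type_enum_from_name(FLEX_UNDEFINED);
 * // Any unrecognized name, e.g. "weighted", throws via log_and_throw.
 * \endcode
 */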
58 
59 /**
60  * Hash for a pair of flexible types (needed for insertion into an unordered_map)
61  */
62 class flex_pair_hash {
63 public:
64  std::size_t operator()(const std::pair<flexible_type, flexible_type> &x) const {
65  return hash64_combine(x.first.hash(), x.second.hash());
66  }
67 };
68 
69 
70 /**
71  * Get the "highest" label as the reference label.
72  */
73 inline flexible_type get_reference_label(
74  const std::unordered_set<flexible_type>& labels) {
75 
76  // First find at least one label that isn't a None
77  flexible_type ret;
78  for (const auto& l : labels) {
79  if (l != FLEX_UNDEFINED) {
80  ret = l;
81  break;
82  }
83  }
84  // Now find the "max" label.
85  for (const auto& l : labels) {
86  if (l != FLEX_UNDEFINED) {
87  if (ret < l) {
88  ret = l;
89  }
90  }
91  }
92  return ret;
93 }
94 
95 
96 /**
97  * Perform a None-safe average: skip undefined values when averaging.
98 */
99 inline flexible_type average_with_none_skip(
100  std::unordered_map<flexible_type, flexible_type> scores) {
101 
102  double average = 0.0;
103  size_t tot_classes = 0;
104  for (const auto& sc: scores) {
105  if (sc.second != FLEX_UNDEFINED) {
106  average += sc.second.get<double>();
107  tot_classes += 1;
108  }
109  }
110 
111  // If every value is a None, then return None.
112  if (tot_classes == 0) {
113  return FLEX_UNDEFINED;
114  } else {
115  return flex_float(average / tot_classes);
116  }
117 }
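/*
 * Illustrative example (values chosen arbitrarily): averaging {0.5, None, 1.0}
 * skips the None and yields 0.75; if every value is None the result is None.
 *
 * \code
 * std::unordered_map<flexible_type, flexible_type> scores {
 *   {flexible_type("a"), flexible_type(0.5)},
 *   {flexible_type("b"), FLEX_UNDEFINED},
 *   {flexible_type("c"), flexible_type(1.0)}
 * };
 * flexible_type avg = average_with_none_skip(scores);  // 0.75
 * \endcode
 */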
118 
119 /**
120  * Check that probabilities are in the range [0, 1].
121  *
122 */
123 inline void check_probability_range(const double& pred) {
124  if ((pred < 0 - EVAL_ZERO) || (pred > (1 + EVAL_ZERO))) {
125  log_and_throw("Prediction scores/probabilities are expected to be "
126  "in the range [0, 1]. If they aren't, try normalizing them.");
127  }
128 }
129 
130 /**
131  * Check undefined.
132 */
133 inline void check_undefined(const flexible_type& pred) {
134  if (pred.get_type() == flex_type_enum::UNDEFINED) {
135  log_and_throw("Prediction scores/probabilities cannot contain missing "
136  "values (i.e None values). Try removing them with 'dropna'.");
137  }
138 }
139 
140 /**
141  * Compute precision (returns None when not defined)
142  *
143  * \param[in] tp True positives.
144  * \param[in] fp False positives.
145 */
146 inline flexible_type compute_precision_score(size_t tp, size_t fp) {
147  if (tp + fp > 0) {
148  return double(tp)/(tp + fp);
149  } else {
150  return FLEX_UNDEFINED;
151  }
152 }
153 
154 /**
155  * Compute recall (returns None when not defined)
156  *
157  * \param[in] tp True positives.
158  * \param[in] fn False negatives.
159 */
160 inline flexible_type compute_recall_score(size_t tp, size_t fn) {
161  if (tp + fn > 0) {
162  return double(tp)/(tp + fn);
163  } else {
164  return FLEX_UNDEFINED;
165  }
166 }
167 
168 /**
169  * Compute fbeta_score (returns None when not defined)
170  *
171  * \param[in] tp True positives.
172  * \param[in] fp False positives.
173  * \param[in] fn False negatives.
174 */
175 inline flexible_type compute_fbeta_score(
176  size_t tp, size_t fp, size_t fn, double beta) {
177 
178  flexible_type pr = compute_precision_score(tp, fp);
179  flexible_type rec = compute_recall_score(tp, fn);
180 
181  if (pr == FLEX_UNDEFINED) {
182  return rec;
183  }
184  if (rec == FLEX_UNDEFINED) {
185  return pr;
186  }
187 
188  double pr_d = pr.get<double>();
189  double rec_d = rec.get<double>();
190  double denom = std::max(1e-20, beta * beta * pr_d + rec_d);
191  return (1.0 + beta * beta) * (pr_d * rec_d) / denom;
192 }
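/*
 * Worked example (values chosen for illustration): with tp = 8, fp = 2, fn = 4
 * and beta = 1, precision = 8/10 = 0.8 and recall = 8/12 ~= 0.667, so
 * F1 = (1 + 1) * (0.8 * 0.667) / (1 * 0.8 + 0.667) ~= 0.727.
 *
 * \code
 * flexible_type f1 = compute_fbeta_score(8, 2, 4, 1.0);  // ~= 0.727
 * flexible_type f2 = compute_fbeta_score(8, 2, 4, 2.0);  // beta > 1 weights recall higher
 * flexible_type nd = compute_fbeta_score(0, 0, 0, 1.0);  // FLEX_UNDEFINED
 * \endcode
 */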
193 
194 /**
195  *
196  * Interface for performing evaluation in a streaming manner for supervised
197  * learning.
198  *
199  * Background: Evaluation
200  * --------------------------------------------------------------------------
201  *
202  * An evaluation that can be computed in a streaming manner. All it needs is
203  * an aggregation over a sequence of individual statistics computed
204  * from individual evaluations.
205  *
206  *
207  * What we need for a supervised evaluation scheme.
208  * ---------------------------------------------------------------
209  *
210  * The interface makes sure that you can implement various types of streaming
211  * evaluations.
212  *
213  * Each evaluation scheme requires the following methods:
214  *
215  * *) init: Initialize the state
216  *
217  * *) register_example: Register a label and a prediction
218  *
219  *) get_metric: Apply the final transformation, e.g. the square root for RMSE.
220  *
221  *
222 */
223 class supervised_evaluation_interface {
224 
225  public:
226 
227  /**
228  * Default destructor.
229  */
230  virtual ~supervised_evaluation_interface() = default;
231 
232  /**
233  * Default constructor.
234  */
235  supervised_evaluation_interface() = default;
236 
237  /**
238  * Name of the evaluator.
239  */
240  virtual std::string name() const = 0;
241 
242  /**
243  * Init the state with n_threads.
244  *
245  * \param[in] n_threads Number of threads.
246  */
247  virtual void init(size_t _n_threads = 1) = 0;
248 
249  /**
250  * Returns true if this evaluator works on probabilities/scores (vs.
251  * classes).
252  */
253  virtual bool is_prob_evaluator() const {
254  return false;
255  }
256 
257  /**
258  * Returns true if this evaluator can be displayed as a single float value.
259  */
260  virtual bool is_table_printer_compatible() const {
261  return true;
262  }
263 
264 
265  /**
266  * Register a (target, prediction) pair
267  *
268  * \param[in] target Target of a single example.
269  * \param[in] prediction Prediction of a single example.
270  * \param[in] thread_id Thread id registering this example.
271  *
272  */
273  virtual void register_example(const flexible_type& target,
274  const flexible_type& prediction,
275  size_t thread_id = 0) = 0;
276 
277  /**
278  * Register an unmapped (target, prediction) pair. Use this for performance
279  * only. Here the target and prediction are assumed to be integers to avoid
280  * flexible_type comparisons and flexible_type hashing.
281  *
282  * \param[in] target Target of a single example.
283  * \param[in] prediction Prediction of a single example.
284  * \param[in] thread_id Thread id registering this example.
285  *
286  */
287  virtual void register_unmapped_example(
288  const size_t& target,
289  const size_t& prediction,
290  size_t thread_id = 0) {
291  register_example(target, prediction, thread_id);
292  }
293 
294  /**
295  * Return the final metric.
296  *
297  * \returns The final metric as a variant_type.
298  *
299  */
300  virtual variant_type get_metric() = 0;
301 
302 };
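/*
 * A minimal sketch of a custom streaming metric built on this interface.
 * mean_absolute_error is a hypothetical class used only for illustration; it
 * is not defined in this header. It shows which members a new evaluator must
 * override.
 *
 * \code
 * class mean_absolute_error: public supervised_evaluation_interface {
 *  private:
 *   size_t n_threads = 0;
 *   std::vector<double> abs_err;
 *   std::vector<size_t> num_examples;
 *  public:
 *   std::string name() const override { return "mean_absolute_error"; }
 *   void init(size_t _n_threads = 1) override {
 *     n_threads = _n_threads;
 *     abs_err.assign(n_threads, 0.0);
 *     num_examples.assign(n_threads, 0);
 *   }
 *   void register_example(const flexible_type& target,
 *                         const flexible_type& prediction,
 *                         size_t thread_id = 0) override {
 *     abs_err[thread_id] += std::abs((double)prediction - (double)target);
 *     num_examples[thread_id]++;
 *   }
 *   variant_type get_metric() override {
 *     double total = 0.0;
 *     size_t n = 0;
 *     for (size_t i = 0; i < n_threads; ++i) {
 *       total += abs_err[i];
 *       n += num_examples[i];
 *     }
 *     return to_variant(total / std::max<size_t>(1, n));
 *   }
 * };
 * \endcode
 */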
303 
304 /**
305  * Computes the RMSE between two SArrays.
306  *
307  * \sqrt((1/N) \sum_{i=1}^N (targets[i] - predictions[i])^2)
308  *
309  */
310 class rmse: public supervised_evaluation_interface {
311 
312  private:
313 
314  size_t n_threads;
315  std::vector<double> mse;
316  std::vector<size_t> num_examples;
317 
318  public:
319 
320  /**
321  * Name of the evaluator.
322  */
323  std::string name() const {
324  return (std::string)("rmse");
325  }
326 
327  /**
328  * Init the state with n_threads.
329  */
330  void init(size_t _n_threads = 1){
331  n_threads = _n_threads;
332  mse.resize(n_threads);
333  num_examples.resize(n_threads);
334  for(size_t i = 0; i < n_threads; i++){
335  mse[i] = 0;
336  num_examples[i] = 0;
337  }
338  }
339 
340  /**
341  * Register a (target, prediction) pair
342  *
343  * \param[in] target Target of a single example.
344  * \param[in] prediction Prediction of a single example.
345  * \param[in] thread_id Thread id
346  *
347  */
348  void register_example(const flexible_type& target,
349  const flexible_type& prediction,
350  size_t thread_id = 0){
351  DASSERT_TRUE(thread_id < n_threads);
352 
353  // See http://www.johndcook.com/standard_deviation.html
354  // M_k = M_{k-1} + (x_k - M_{k-1}) / k
355  double a = (double)prediction - (double)target;
356  num_examples[thread_id]++;
357  mse[thread_id] += (a * a - mse[thread_id]) / num_examples[thread_id];
358  }
359 
360  /**
361  * Return the final metric.
362  */
363  variant_type get_metric() {
364  double rmse = 0;
365  size_t total_examples = 0;
366  for(size_t i = 0; i < n_threads; i++){
367  rmse += num_examples[i] * mse[i];
368  total_examples += num_examples[i];
369  }
370  DASSERT_TRUE(total_examples > 0);
371  DASSERT_TRUE(rmse >= 0);
372  return to_variant(sqrt(rmse/total_examples));
373  }
374 
375 };
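/*
 * Usage sketch (illustrative only): streaming RMSE over three examples with
 * targets {1, 2, 3} and predictions {1, 2, 5}; the mean squared error is 4/3
 * and the RMSE is sqrt(4/3) ~= 1.15.
 *
 * \code
 * rmse metric;
 * metric.init(1);
 * metric.register_example(1.0, 1.0);
 * metric.register_example(2.0, 2.0);
 * metric.register_example(3.0, 5.0);
 * double value = variant_get_value<double>(metric.get_metric());  // ~= 1.15
 * \endcode
 */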
376 
377 
378 /**
379  * Computes the worst case errors between two SArrays.
380  */
381 class max_error: public supervised_evaluation_interface {
382 
383  private:
384 
385  size_t n_threads;
386  std::vector<double> max_error;
387 
388  public:
389 
390  /**
391  * Name of the evaluator.
392  */
393  std::string name() const {
394  return (std::string)("max_error");
395  }
396 
397  /**
398  * Init the state with n_threads.
399  */
400  void init(size_t _n_threads = 1){
401  n_threads = _n_threads;
402  max_error.resize(n_threads);
403  for (size_t i = 0; i < n_threads; i++) {
404  max_error[i] = 0.0;
405  }
406  }
407 
408  /**
409  * Register a (target, prediction) pair
410  *
411  * \param[in] target Target of a single example.
412  * \param[in] prediction Prediction of a single example.
413  * \param[in] thread_id Thread id
414  *
415  */
416  void register_example(const flexible_type& target,
417  const flexible_type& prediction,
418  size_t thread_id = 0){
419  DASSERT_TRUE(thread_id < n_threads);
420  double err = (double)prediction - (double)target;
421  max_error[thread_id] = std::max(std::abs(err), max_error[thread_id]);
422  }
423 
424  /**
425  * Return the final metric.
426  */
427  variant_type get_metric() {
428  double max_max_error = 0;
429  for(size_t i = 0; i < n_threads; i++){
430  max_max_error = std::max(max_max_error, max_error[i]);
431  }
432  return to_variant(max_max_error);
433  }
434 
435 };
436 
437 
438 class multiclass_logloss: public supervised_evaluation_interface {
439 
440  private:
441 
442  size_t n_threads;
443  std::vector<double> logloss;
444  std::vector<size_t> num_examples;
445  std::unordered_map<flexible_type, size_t> m_index_map;
446  size_t num_classes = size_t(-1);
447 
448  public:
449 
450  /**
451  * Constructor.
452  */
453  multiclass_logloss(
454  const std::unordered_map<flexible_type, size_t>& index_map,
455  size_t num_classes = size_t(-1)) {
456  m_index_map = index_map;
457  if (num_classes == size_t(-1)) {
458  this->num_classes = index_map.size();
459  } else {
460  this->num_classes = num_classes;
461  }
462  }
463 
464  /**
465  * Returns true if this evaluator works on probabilities/scores (vs.
466  * classes).
467  */
468  bool is_prob_evaluator() const {
469  return true;
470  }
471 
472  /**
473  * Name of the evaluator.
474  */
475  std::string name() const {
476  return (std::string)("multiclass_logloss");
477  }
478 
479 
480  /**
481  * Init the state with n_threads.
482  */
483  void init(size_t _n_threads = 1){
484  n_threads = _n_threads;
485  logloss.resize(n_threads);
486  num_examples.resize(n_threads);
487  for(size_t i = 0; i < n_threads; i++){
488  logloss[i] = 0;
489  num_examples[i] = 0;
490  }
491  }
492 
493  /**
494  * Register an unmapped (target, prediction) pair.
495  *
496  * \param[in] target target of a single example.
497  * \param[in] prediction prediction of a single example.
498  * \param[in] thread_id thread id
499  *
500  * \note Use this for performance because it does not perform a
501  * flexible_type comparison.
502  */
503  void register_unmapped_example(const size_t& target,
504  const std::vector<double>& prediction,
505  size_t thread_id = 0){
506  DASSERT_TRUE(thread_id < n_threads);
507 
508  // If the class provided is a "new" class then treat the probability as 0.0;
509  double pred = 0.0;
510  if (target < prediction.size()) {
511  pred = prediction[target];
512  }
513  num_examples[thread_id]++;
514  check_probability_range(pred);
515  logloss[thread_id] += log(
516  std::max(std::min(1.0 - EVAL_ZERO, pred), EVAL_ZERO));
517  }
518 
519  /**
520  * Register a (target, prediction) pair.
521  *
522  * \param[in] target target of a single example.
523  * \param[in] prediction prediction of a single example.
524  * \param[in] thread_id thread id
525  *
526  */
527  void register_example(const flexible_type& target,
528  const flexible_type& prediction,
529  size_t thread_id = 0){
530  DASSERT_TRUE(thread_id < n_threads);
531  num_examples[thread_id]++;
532 
533 
534  // Error out!
535  if(prediction.size() != this->num_classes) {
536  std::stringstream ss;
537  ss << "Size of prediction probability vector"
538  << "(" << prediction.size() << ") != number of classes"
539  << "(" << m_index_map.size() << ")." << std::endl;
540  log_and_throw(ss.str());
541  }
542 
543  // If the class provided is a "new" class then treat the probability as 0.0;
544  auto it = m_index_map.find(target);
545  size_t label = 0;
546  double pred = 0.0;
547  if (it != m_index_map.end()) {
548  label = size_t(it->second);
549  const flex_vec& preds = prediction.get<flex_vec>();
550  // Check that the new class was a class obtained in training.
551  if (label < preds.size()) {
552  pred = preds[label];
553  }
554  }
555 
556  check_probability_range(pred);
557  logloss[thread_id] += log(
558  std::max(std::min(1.0 - 1e-15, pred), 1e-15));
559  }
560 
561  /**
562  * Return the final metric.
563  */
564  variant_type get_metric() {
565  double total_logloss = 0;
566  size_t total_examples = 0;
567  for(size_t i = 0; i < n_threads; i++){
568  total_logloss += logloss[i];
569  total_examples += num_examples[i];
570  }
571  DASSERT_TRUE(total_examples > 0);
572 
573  total_examples = std::max<size_t>(1, total_examples);
574  return to_variant(-total_logloss / total_examples);
575  }
576 
577 };
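/*
 * Usage sketch (illustrative only): two classes mapped as {"a" -> 0, "b" -> 1}
 * and one probability vector per example. The metric is
 * -(1/N) * sum_i log(p_i[true class of example i]), with each probability
 * clipped away from 0 and 1 before taking the log.
 *
 * \code
 * std::unordered_map<flexible_type, size_t> index_map {
 *   {flexible_type("a"), 0}, {flexible_type("b"), 1}
 * };
 * multiclass_logloss metric(index_map);
 * metric.init(1);
 * metric.register_example(flexible_type("a"), flexible_type(flex_vec{0.9, 0.1}));
 * metric.register_example(flexible_type("b"), flexible_type(flex_vec{0.2, 0.8}));
 * // ~= -(log(0.9) + log(0.8)) / 2 ~= 0.164
 * double value = variant_get_value<double>(metric.get_metric());
 * \endcode
 */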
578 
579 class binary_logloss: public supervised_evaluation_interface {
580 
581  private:
582 
583  size_t n_threads;
584  std::vector<double> logloss;
585  std::vector<size_t> num_examples;
586  std::unordered_map<flexible_type, size_t> index_map;
587 
588  public:
589 
590  /**
591  * Constructor.
592  *
593  * \param[in] index_map Dictionary from flexible_type -> size_t for classes.
594  */
595  binary_logloss(
596  std::unordered_map<flexible_type, size_t> index_map =
597  std::unordered_map<flexible_type, size_t>()) {
598  this->index_map = index_map;
599  }
600 
601  /**
602  * Name of the evaluator.
603  */
604  std::string name() const override {
605  return (std::string)("binary_logloss");
606  }
607 
608 
609  /**
610  * Returns true if this evaluator works on probabilities/scores (vs.
611  * classes).
612  */
613  bool is_prob_evaluator() const override {
614  return true;
615  }
616 
617  /**
618  * Init the state with n_threads.
619  */
620  void init(size_t _n_threads = 1) override {
621  n_threads = _n_threads;
622  logloss.resize(n_threads);
623  num_examples.resize(n_threads);
624  for(size_t i = 0; i < n_threads; i++){
625  logloss[i] = 0;
626  num_examples[i] = 0;
627  }
628  }
629 
630  /**
631  * Register an unmapped (target, prediction) pair.
632  *
633  * \param[in] target target of a single example.
634  * \param[in] prediction prediction of a single example.
635  * \param[in] thread_id thread id
636  *
637  * \note Use this for performance because it does not perform a
638  * flexible_type comparison.
639  */
640  void register_unmapped_example(const size_t& target,
641  const double& prediction,
642  size_t thread_id = 0) {
643  DASSERT_TRUE(target == 0 || target == 1);
644  DASSERT_TRUE(thread_id < n_threads);
645  num_examples[thread_id]++;
646  check_probability_range(prediction);
647  logloss[thread_id] +=
648  log(target !=0 ? std::max(prediction, EVAL_ZERO) :
649  std::max(1.0 - prediction, EVAL_ZERO));
650  }
651 
652  /**
653  * Register a (target, prediction) pair.
654  *
655  * \param[in] target target of a single example.
656  * \param[in] prediction prediction of a single example.
657  * \param[in] thread_id thread id
658  *
659  */
660  void register_example(const flexible_type& target,
661  const flexible_type& prediction,
662  size_t thread_id = 0) override {
663  DASSERT_TRUE(thread_id < n_threads);
664  check_undefined(prediction);
665  DASSERT_TRUE((prediction.get_type() == flex_type_enum::FLOAT) ||
666  (prediction.get_type() == flex_type_enum::INTEGER));
667  DASSERT_EQ(index_map.size(), 2);
668  DASSERT_TRUE(index_map.count(target) > 0);
669 
670  num_examples[thread_id]++;
671  size_t label = index_map.at(target);
672  double pred = prediction.to<double>();
673  check_probability_range(pred);
674  logloss[thread_id] +=
675  log(label != 0 ? std::max(pred, EVAL_ZERO) : std::max(1.0 - pred, EVAL_ZERO));
676  }
677 
678  /**
679  * Return the final metric.
680  */
681  variant_type get_metric() override {
682  double total_logloss = 0;
683  size_t total_examples = 0;
684  for(size_t i = 0; i < n_threads; i++){
685  total_logloss += logloss[i];
686  total_examples += num_examples[i];
687  }
688  DASSERT_TRUE(total_examples > 0);
689  total_examples = std::max<size_t>(1, total_examples);
690  return to_variant(-total_logloss/total_examples);
691  }
692 
693 };
694 
695 /**
696  * Computes the classifier accuracy for a set of predictions:
697  *
698  * accuracy = num_right / num_examples
699  *
700  * where num_right is the number of examples for which the predicted class
701  * matches the target class.
702  *
703  */
704 class classifier_accuracy: public supervised_evaluation_interface {
705 
706  private:
707 
708  size_t n_threads;
709  std::vector<double> accuracy;
710  std::vector<size_t> num_examples;
711 
712  public:
713 
714  /**
715  * Name of the evaluator.
716  */
717  std::string name() const {
718  return (std::string)("classifier_accuracy");
719  }
720 
721 
722  /**
723  * Init the state with n_threads.
724  */
725  void init(size_t _n_threads = 1){
726  n_threads = _n_threads;
727  accuracy.resize(n_threads);
728  num_examples.resize(n_threads);
729  for(size_t i = 0; i < n_threads; i++){
730  accuracy[i] = 0;
731  num_examples[i] = 0;
732  }
733  }
734 
735 
736  /**
737  * Register an unmapped (target, prediction) pair.
738  *
739  * \param[in] target target of a single example.
740  * \param[in] prediction prediction of a single example.
741  * \param[in] thread_id thread id
742  *
743  * \note Use this for performance because it does not perform a
744  * flexible_type comparison.
745  */
746  void register_unmapped_example(
747  const size_t& target,
748  const size_t& prediction,
749  size_t thread_id = 0){
750  DASSERT_TRUE(thread_id < n_threads);
751  num_examples[thread_id]++;
752  accuracy[thread_id] += (target == prediction);
753  }
754 
755  /**
756  * Register a (target, prediction) pair.
757  *
758  * \param[in] target target of a single example.
759  * \param[in] prediction prediction of a single example.
760  * \param[in] thread_id thread id
761  *
762  */
763  void register_example(const flexible_type& target,
764  const flexible_type& prediction,
765  size_t thread_id = 0){
766  DASSERT_TRUE(thread_id < n_threads);
767  num_examples[thread_id]++;
768  accuracy[thread_id] += (target == prediction);
769  }
770 
771  /**
772  * Return the final metric.
773  */
774  variant_type get_metric() {
775  double total_accuracy = 0;
776  size_t total_examples = 0;
777  for(size_t i = 0; i < n_threads; i++){
778  total_accuracy += accuracy[i];
779  total_examples += num_examples[i];
780  }
781  DASSERT_TRUE(total_examples > 0);
782  DASSERT_TRUE(total_accuracy >= 0);
783  return to_variant(total_accuracy * 1.0 / total_examples);
784  }
785 
786 };
787 
788 /**
789  * Computes the confusion matrix for a set of predictions, i.e. the count of
790  * examples for each observed (true_label, predicted_label) pair. The result
791  * is an SFrame with the following layout:
792  *
793  * -----------------------------------
794  * true_label predicted_label count
795  * -----------------------------------
796  *
797  * -----------------------------------
798  *
799  */
800 class confusion_matrix: public supervised_evaluation_interface {
801 
802  private:
803 
804  // Accumulators
805  std::vector<std::unordered_map<std::pair<flexible_type, flexible_type>, size_t,
806  flex_pair_hash>> counts;
807  protected:
808 
809  // Useful variables
810  size_t n_threads = 0;
811  std::unordered_set<flexible_type> labels;
812  std::map<size_t, flexible_type> index_map;
813  std::unordered_map<std::pair<flexible_type, flexible_type>, size_t,
814  flex_pair_hash> final_counts_thread, final_counts;
815 
816 
817  public:
818 
819  /**
820  * Constructor.
821  */
822  confusion_matrix(std::map<size_t, flexible_type> index_map =
823  std::map<size_t, flexible_type>()) {
824  this->index_map = index_map;
825  }
826 
827  /**
828  * Name of the evaluator.
829  */
830  std::string name() const {
831  return (std::string)("confusion_matrix");
832  }
833 
834  /**
835  * Init the state with n_threads.
836  */
837  void init(size_t _n_threads = 1){
838  n_threads = _n_threads;
839  counts.resize(n_threads);
840  }
841 
842  /**
843  * Returns true if this evaluator can be displayed as a single float value.
844  */
845  virtual bool is_table_printer_compatible() const {
846  return false;
847  }
848 
849  /**
850  * Register a (target, prediction) pair
851  *
852  * \param[in] target Target of a single example.
853  * \param[in] prediction Prediction of a single example.
854  * \param[in] thread_id Thread id
855  *
856  */
857  void register_example(const flexible_type& target,
858  const flexible_type& prediction,
859  size_t thread_id = 0){
860  DASSERT_TRUE(thread_id < n_threads);
861  std::pair<flexible_type, flexible_type> pair =
862  std::make_pair(target, prediction);
863 
864  if(counts[thread_id].count(pair) > 0){
865  counts[thread_id][pair]++;
866  } else {
867  counts[thread_id][pair] = 1;
868  }
869  }
870 
871  /**
872  * Gather all final counts.
873  */
874  void gather_counts_and_labels() {
875 
876  // Merge by thread.
877  std::unordered_map<std::pair<flexible_type, flexible_type>, size_t,
878  flex_pair_hash> final_counts_thread;
879  for(size_t i = 0; i < n_threads; i++){
880  for (const auto& kvp: counts[i]){
881  if(final_counts_thread.count(kvp.first) > 0){
882  final_counts_thread[kvp.first] += kvp.second;
883  } else {
884  final_counts_thread[kvp.first] = kvp.second;
885  }
886  }
887  }
888  final_counts = final_counts_thread;
889 
890  // Gather labels.
891  DASSERT_TRUE(final_counts_thread.size() >= 0);
892  for (const auto& kvp: final_counts) {
893  if (labels.count(kvp.first.first) == 0) {
894  labels.insert(kvp.first.first);
895  }
896  if (labels.count(kvp.first.second) == 0) {
897  labels.insert(kvp.first.second);
898  }
899  }
900  }
901 
902  /**
903  * Return the final metric.
904  */
905  variant_type get_metric() {
906 
907  // Accumulate counts & labels for each class.
908  this->gather_counts_and_labels();
909 
910  // If an index map was provided, remap the integer indices back to class labels.
911  if (!index_map.empty()) {
912  std::unordered_map<std::pair<flexible_type, flexible_type>, size_t,
913  flex_pair_hash> final_counts_copy;
914  for (const auto& kvp: final_counts) {
915  size_t first_index = kvp.first.first.get<flex_int>();
916  size_t second_index = kvp.first.second.get<flex_int>();
917  const flexible_type& first_key = index_map.at(first_index);
918  const flexible_type& second_key = index_map.at(second_index);
919  DASSERT_TRUE(index_map.count(first_index) > 0);
920  DASSERT_TRUE(index_map.count(second_index) > 0);
921  final_counts_copy[std::make_pair(first_key,second_key)] = kvp.second;
922  }
923  final_counts = final_counts_copy;
924  }
925 
926  // Write to an SFrame.
927  sframe confusion_matrix_sf;
928  std::vector<std::string> names;
929  names.push_back("target_label");
930  names.push_back("predicted_label");
931  names.push_back("count");
932 
933  // Inspect types: If things are the same type, then use the type that they
934  // share, otherwise use string.
935  flex_type_enum target_type = flex_type_enum::UNDEFINED;
936  flex_type_enum predicted_type = flex_type_enum::UNDEFINED;
937  for (const auto &cf_entry: final_counts){
938  auto t_type = cf_entry.first.first.get_type();
939  auto p_type = cf_entry.first.second.get_type();
940 
941  if(target_type == flex_type_enum::UNDEFINED) {
942  target_type = t_type;
943  } else {
944  if (t_type != flex_type_enum::UNDEFINED && t_type != target_type) {
945  target_type = flex_type_enum::STRING;
946  break;
947  }
948  }
949 
950  if(predicted_type == flex_type_enum::UNDEFINED) {
951  predicted_type = p_type;
952  } else {
953  if (p_type != flex_type_enum::UNDEFINED && p_type != predicted_type) {
954  predicted_type = flex_type_enum::STRING;
955  break;
956  }
957  }
958  }
959 
960  if (target_type == flex_type_enum::UNDEFINED) {
961  target_type = flex_type_enum::FLOAT;
962  }
963 
964  if (predicted_type == flex_type_enum::UNDEFINED) {
965  predicted_type = flex_type_enum::FLOAT;
966  }
967 
968  std::vector<flex_type_enum> types;
969  types.push_back(target_type);
970  types.push_back(predicted_type);
971  types.push_back(flex_type_enum::INTEGER);
972  confusion_matrix_sf.open_for_write(names, types, "", 1); // write to temp file
973  auto it = confusion_matrix_sf.get_output_iterator(0);
974 
975  std::vector<flexible_type> x(3);
976  for (const auto &cf_entry: final_counts){
977  x[0] = cf_entry.first.first;
978  x[1] = cf_entry.first.second;
979  x[2] = cf_entry.second;
980  *it= x;
981  ++it;
982  }
983 
984  confusion_matrix_sf.close();
985  std::shared_ptr<unity_sframe> unity_confusion_matrix =
986  std::make_shared<unity_sframe>();
987  unity_confusion_matrix->construct_from_sframe(confusion_matrix_sf);
988  return to_variant(unity_confusion_matrix);
989  }
990 
991 };
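/*
 * Usage sketch (illustrative only): the result is an SFrame with columns
 * target_label, predicted_label and count, with one row per observed
 * (target, prediction) pair.
 *
 * \code
 * confusion_matrix cm;
 * cm.init(1);
 * cm.register_example(flexible_type("cat"), flexible_type("cat"));
 * cm.register_example(flexible_type("cat"), flexible_type("dog"));
 * cm.register_example(flexible_type("dog"), flexible_type("dog"));
 * variant_type table = cm.get_metric();  // unity_sframe with 3 rows
 * \endcode
 */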
992 
993 
994 /**
995  * Base class for precision/recall-style metrics computed from the confusion matrix.
996  */
997 class precision_recall_base: public confusion_matrix {
998 
999  protected:
1000 
1001  average_type_enum average;
1002  std::unordered_map<flexible_type, size_t> tp; // True positives
1003  std::unordered_map<flexible_type, size_t> tn; // True negatives
1004  std::unordered_map<flexible_type, size_t> fp; // False positives
1005  std::unordered_map<flexible_type, size_t> fn; // False negatives
1006 
1007  public:
1008 
1009  /**
1010  * Get the "highest" label as the reference label.
1011  */
1012  flexible_type get_reference_label() {
1013 
1014  // First find at least one label that isn't a None
1015  flexible_type ret;
1016  for (const auto& l : labels) {
1017  if (l != FLEX_UNDEFINED) {
1018  ret = l;
1019  break;
1020  }
1021  }
1022  // Now find the "max" label.
1023  for (const auto& l : labels) {
1024  if (l != FLEX_UNDEFINED) {
1025  if (ret < l) {
1026  ret = l;
1027  }
1028  }
1029  }
1030  return ret;
1031  }
1032 
1033  /**
1034  * Name of the evaluator.
1035  */
1036  std::string name() const = 0;
1037 
1038  /**
1039  * Returns true if this evaluator can be displayed as a single float value.
1040  */
1041  virtual bool is_table_printer_compatible() const {
1042  return average != average_type_enum::NONE;
1043  }
1044 
1045  /**
1046  * Gather global metrics for true_positives and false negatives
1047  */
1048  void gather_global_metrics() {
1049 
1050  // Accumulate counts & labels for each class.
1051  this->gather_counts_and_labels();
1052  for (const auto& l: labels) {
1053  tp[l] = 0;
1054  fp[l] = 0;
1055  tn[l] = 0;
1056  fn[l] = 0;
1057  }
1058 
1059  // Compute the global metrics for tp, fp, tn, fn for each label.
1060  for (const auto& kvp: final_counts) {
1061  flexible_type t = kvp.first.first;
1062  flexible_type p = kvp.first.second;
1063  size_t count = kvp.second;
1064 
1065  // If predicted is the same as the target.
1066  for (const auto& l: labels) {
1067 
1068  // Correctly predicted with respect to "l"
1069  if ( (p == l) == (t == l)) {
1070  // True positive with respect to label l == p
1071  if (l == p) {
1072  tp[l] += count;
1073  // True negative with respect to label l != p
1074  } else {
1075  tn[l] += count;
1076  }
1077 
1078  // Incorrectly predicted with respect to "l"
1079  } else {
1080  // False positive with respect to label l == p
1081  if (l == p) {
1082  fp[l] += count;
1083  // False negative with respect to label l != p
1084  } else {
1085  fn[l] += count;
1086  }
1087  }
1088  }
1089  }
1090  }
1091 
1092  /**
1093  * Get the metric!
1094  */
1095  variant_type get_metric() = 0;
1096 
1097 };
1098 
1099 /**
1100  * Compute the F-Beta score.
1101  */
1102 class fbeta_score: public precision_recall_base {
1103 
1104  private:
1105 
1106  double beta;
1107 
1108  public:
1109 
1110  /**
1111  * Constructor to set the value of beta.
1112  */
1113  fbeta_score(double beta = 1.0, flexible_type average = "macro") {
1114  if (beta <= 0) {
1115  log_and_throw("The beta value in the F-beta score must be > 0.0");
1116  }
1117  this->beta = beta;
1118  this->average = average_type_enum_from_name(average);
1119  }
1120 
1121  /**
1122  * Name of the evaluator.
1123  */
1124  std::string name() const {
1125  return (std::string)("fbeta_score");
1126  }
1127 
1128 
1129  /**
1130  * Get the metric!
1131  */
1132  variant_type get_metric() {
1133 
1134  // Gather all the global metrics.
1135  this->gather_global_metrics();
1136  DASSERT_TRUE(labels.size() >= 0);
1137  DASSERT_TRUE(beta > 0);
1138  DASSERT_EQ(fp.size(), labels.size());
1139  DASSERT_EQ(tp.size(), labels.size());
1140 
1141 
1142  // Multi-class vs binary classification.
1143  std::unordered_map<flexible_type, flexible_type> fbeta_scores;
1144  for (const auto& l: labels) {
1145  fbeta_scores[l] = compute_fbeta_score(tp[l], fp[l], fn[l], beta);
1146  }
1147 
1148  // For binary classification, return the scores for the final label.
1149  if (labels.size() == 2) {
1150  return to_variant(fbeta_scores[get_reference_label()]);
1151  }
1152 
1153  // Multi-class scores: Average based on user request.
1154  switch (average) {
1155  // Global scores.
1156  case average_type_enum::MICRO:
1157  {
1158  size_t total_tp = 0;
1159  size_t total_fp = 0;
1160  size_t total_fn = 0;
1161  for (const auto& l: labels) {
1162  total_tp += tp[l];
1163  total_fp += fp[l];
1164  total_fn += fn[l];
1165  }
1166  return to_variant(compute_fbeta_score(total_tp, total_fp, total_fn, beta));
1167 
1168  // Average scores.
1169  }
1170  case average_type_enum::DEFAULT:
1171  case average_type_enum::MACRO:
1172  {
1173  return to_variant(average_with_none_skip(fbeta_scores));
1174 
1175  // All scores.
1176  }
1177  case average_type_enum::NONE:
1178  {
1179  return to_variant(fbeta_scores);
1180  }
1181 
1182  default: {
1183  log_and_throw(std::string("Unsupported average_type_enum case"));
1184  ASSERT_UNREACHABLE();
1185  }
1186  }
1187  }
1188 
1189 };
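/*
 * Usage sketch (illustrative only): with average = "macro" the F-beta score is
 * computed per class and the per-class scores are then averaged, whereas
 * "micro" pools the global tp/fp/fn counts before applying the formula.
 *
 * \code
 * fbeta_score macro_f2(2.0, flexible_type("macro"));
 * macro_f2.init(1);
 * macro_f2.register_example(flexible_type("a"), flexible_type("a"));
 * macro_f2.register_example(flexible_type("b"), flexible_type("a"));
 * macro_f2.register_example(flexible_type("c"), flexible_type("c"));
 * variant_type score = macro_f2.get_metric();
 * \endcode
 */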
1190 
1191 
1192 /**
1193  * Compute the precision score.
1194  */
1195 class precision: public precision_recall_base {
1196 
1197  public:
1198 
1199  /**
1200  * Constructor.
1201  */
1202  precision(flexible_type average = "macro") {
1203  this->average = average_type_enum_from_name(average);
1204  }
1205 
1206  /**
1207  * Name of the evaluator.
1208  */
1209  std::string name() const {
1210  return (std::string)("precision");
1211  }
1212 
1213  /**
1214  * Get the metric!
1215  */
1216  variant_type get_metric() {
1217 
1218  // Gather all the global metrics.
1219  this->gather_global_metrics();
1220  DASSERT_TRUE(labels.size() >= 0);
1221  DASSERT_EQ(fp.size(), labels.size());
1222  DASSERT_EQ(tp.size(), labels.size());
1223 
1224 
1225  // Multi-class vs binary classification.
1226  std::unordered_map<flexible_type, flexible_type> precision_scores;
1227  for (const auto& l: labels) {
1228  precision_scores[l] = compute_precision_score(tp[l], fp[l]);
1229  }
1230 
1231  // For binary classification, return the scores for the final label.
1232  if (labels.size() == 2) {
1233  return to_variant(precision_scores[get_reference_label()]);
1234  }
1235 
1236  // Multi-class scores: Average based on user request.
1237  switch (average) {
1238  // Global scores.
1239  case average_type_enum::MICRO:
1240  {
1241  size_t total_tp = 0;
1242  size_t total_fp = 0;
1243  for (const auto& l: labels) {
1244  total_tp += tp[l];
1245  total_fp += fp[l];
1246  }
1247  return to_variant(compute_precision_score(total_tp, total_fp));
1248  }
1249  // Average scores.
1250  case average_type_enum::DEFAULT:
1251  case average_type_enum::MACRO:
1252  {
1253  return to_variant(average_with_none_skip(precision_scores));
1254  }
1255 
1256  // All scores.
1257  case average_type_enum::NONE:
1258  {
1259  return to_variant(precision_scores);
1260  }
1261 
1262  default: {
1263  log_and_throw(std::string("Unsupported average_type_enum case"));
1264  ASSERT_UNREACHABLE();
1265  }
1266  }
1267  }
1268 
1269 };
1270 
1271 
1272 /**
1273  * Compute the recall score.
1274  */
1275 class recall: public precision_recall_base {
1276 
1277 
1278  public:
1279 
1280  /**
1281  * Constructor.
1282  */
1283  recall(flexible_type average = "macro") {
1284  this->average = average_type_enum_from_name(average);
1285  }
1286 
1287  /**
1288  * Name of the evaluator.
1289  */
1290  std::string name() const {
1291  return (std::string)("recall");
1292  }
1293 
1294 
1295  /**
1296  * Get the metric!
1297  */
1298  variant_type get_metric() {
1299 
1300  // Gather all the global metrics.
1301  this->gather_global_metrics();
1302  DASSERT_TRUE(labels.size() >= 0);
1303  DASSERT_EQ(fp.size(), labels.size());
1304  DASSERT_EQ(tp.size(), labels.size());
1305 
1306  // Multi-class vs binary classification.
1307  std::unordered_map<flexible_type, flexible_type> recall_scores;
1308  for (const auto& l: labels) {
1309  recall_scores[l] = compute_recall_score(tp[l], fn[l]);
1310  }
1311 
1312  // For binary classification, return the scores for the final label.
1313  if (labels.size() == 2) {
1314  return to_variant(recall_scores[get_reference_label()]);
1315  }
1316 
1317  // Multi-class scores: Average based on user request.
1318  switch (average) {
1319  // Global scores.
1320  case average_type_enum::MICRO:
1321  {
1322  size_t total_tp = 0;
1323  size_t total_fn = 0;
1324  for (const auto& l: labels) {
1325  total_tp += tp[l];
1326  total_fn += fn[l];
1327  }
1328  return to_variant(compute_recall_score(total_tp, total_fn));
1329  }
1330 
1331  // Average scores.
1332  case average_type_enum::DEFAULT:
1333  case average_type_enum::MACRO:
1334  {
1335  return to_variant(average_with_none_skip(recall_scores));
1336  }
1337 
1338  // All scores.
1339  case average_type_enum::NONE:
1340  {
1341  return to_variant(recall_scores);
1342  }
1343 
1344  default: {
1345  log_and_throw(std::string("Unsupported average_type_enum case"));
1346  ASSERT_UNREACHABLE();
1347  }
1348  }
1349  }
1350 
1351 };
1352 
1353 /**
1354  * Compute the accuracy score. This is a slower, but more flexible version
1355  * of the accuracy.
1356  *
1357  */
1358 class flexible_accuracy: public precision_recall_base {
1359 
1360 
1361  public:
1362 
1363  /**
1364  * Constructor.
1365  */
1366  flexible_accuracy(flexible_type average = "micro") {
1367  this->average = average_type_enum_from_name(average);
1368  }
1369 
1370  /**
1371  * Name of the evaluator.
1372  */
1373  std::string name() const {
1374  return (std::string)("accuracy");
1375  }
1376 
1377 
1378  /**
1379  * Get the metric!
1380  */
1381  variant_type get_metric() {
1382 
1383  // Gather all the global metrics.
1384  this->gather_global_metrics();
1385  DASSERT_TRUE(labels.size() >= 0);
1386  DASSERT_EQ(fp.size(), labels.size());
1387  DASSERT_EQ(tp.size(), labels.size());
1388 
1389  // Multi-class vs binary classification.
1390  std::unordered_map<flexible_type, double> accuracy_scores;
1391  std::unordered_map<flexible_type, flexible_type> precision_scores;
1392  for (const auto& l: labels) {
1393  accuracy_scores[l] = double(tp[l] + tn[l])/(tp[l] + fp[l] + tn[l] + fn[l]);
1394  precision_scores[l] = compute_precision_score(tp[l], fp[l]);
1395  }
1396 
1397  // For binary classification, return the scores for the final label.
1398  if (labels.size() == 2) {
1399  return to_variant(accuracy_scores[get_reference_label()]);
1400  }
1401 
1402  // Multi-class scores: Average based on user request.
1403  switch (average) {
1404  // Global scores.
1405  case average_type_enum::MICRO:
1406  case average_type_enum::DEFAULT:
1407  {
1408  size_t tot_tp = 0;
1409  size_t tot_tn = 0;
1410  size_t tot_fp = 0;
1411  size_t tot_fn = 0;
1412  for (const auto& l: labels) {
1413  tot_tp += tp[l];
1414  tot_tn += tn[l];
1415  tot_fp += fp[l];
1416  tot_fn += fn[l];
1417  }
1418  double accuracy = double(tot_tp + tot_tn)/(tot_tp + tot_fp + tot_tn + tot_fn);
1419  return to_variant(accuracy);
1420 
1421  }
1422  // Average scores.
1423  case average_type_enum::MACRO:
1424  {
1425  double average_accuracy = 0.0;
1426  for (const auto& rec: accuracy_scores) {
1427  average_accuracy += rec.second;
1428  }
1429  average_accuracy /= labels.size();
1430  return to_variant(average_accuracy);
1431  }
1432  // All scores.
1433  case average_type_enum::NONE:
1434  {
1435  return to_variant(precision_scores);
1436  }
1437 
1438  default: {
1439  log_and_throw(std::string("Unsupported average_type_enum case"));
1440  ASSERT_UNREACHABLE();
1441  }
1442  }
1443  }
1444 };
1445 
1446 /**
1447  * Computes the ROC curve. An aggregated version is computed, where we
1448  * compute the true positive rate and false positive rate for a set of
1449  * 1000 predefined thresholds equally spaced from 0 to 1.
1450  * For each prediction, we find which bin it belongs to and we increment
1451  * the count of true positives (where y=1 and yhat is greater than the lower
1452  * bound for that bin) and the number of false positives.
1453  * When complete, these counts are used to compute false positive rate and
1454  * true positive rate for each bin.
1455  *
1456  * In order to use this class, there are two modes:
1457  * - binary mode: In this mode, the inputs are (target_class, prediction_prob)
1458  * where prediction_prob is the probability of the "positive" class. Here
1459  * the "positive" class is defined as the largest class as sorted by flexible_type
1460  * semantics.
1461  *
1462  * - multiclass mode: In this mode, the inputs are (target_class, prob_vec)
1463  * where prob_vec is the vector of per-class probabilities. In this case, the
1464  * target_class must be an integer index into that vector.
1465  */
1466 class roc_curve: public supervised_evaluation_interface {
1467 
1468 
1469  private:
1470 
1471  // Accumulators
1472  std::unordered_map<std::pair<flexible_type, flexible_type>, size_t,
1473  flex_pair_hash> final_counts_thread, final_counts;
1474  std::vector<std::vector<std::vector<size_t>>> tpr;
1475  std::vector<std::vector<std::vector<size_t>>> fpr;
1476  std::vector<std::vector<size_t>> num_examples;
1477 
1478  protected:
1479 
1480  // Options
1481  average_type_enum average = average_type_enum::NONE;
1482  bool binary = false;
1483  const size_t NUM_BINS=1000;
1484  size_t n_threads = 0;
1485  size_t num_classes = 0;
1486 
1487  // Input map.
1488  std::unordered_map<flexible_type, size_t> index_map;
1489 
1490  // Total counts
1491  std::vector<std::vector<size_t>> total_fp;
1492  std::vector<std::vector<size_t>> total_tp;
1493  std::vector<size_t> total_examples;
1494 
1495  public:
1496 
1497  /**
1498  * Constructor.
1499  *
1500  * \param[in] index_map Dictionary from flexible_type -> size_t for classes.
1501  * \param[in] average Averaging mode
1502  * \param[in] binary Is the input mode expected to be binary?
1503  */
1504  roc_curve(
1505  std::unordered_map<flexible_type, size_t> index_map =
1506  std::unordered_map<flexible_type, size_t>(),
1507  flexible_type average = FLEX_UNDEFINED,
1508  bool binary = true,
1509  size_t num_classes = size_t(-1)) {
1510  this->average = average_type_enum_from_name(average);
1511  this->binary = binary;
1512  this->index_map = index_map;
1513  if (num_classes == size_t(-1)) {
1514  this->num_classes = index_map.size();
1515  } else {
1516  this->num_classes = num_classes;
1517  }
1518  }
1519 
1520  /**
1521  * Name of the evaluator.
1522  */
1523  std::string name() const {
1524  return (std::string)("roc_curve");
1525  }
1526 
1527  /**
1528  * Returns true if this evaluator works on probabilities/scores (vs.
1529  * classes).
1530  */
1531  bool is_prob_evaluator() const {
1532  return true;
1533  }
1534 
1535  /**
1536  * Returns true if this evaluator can be displayed as a single float value.
1537  */
1538  virtual bool is_table_printer_compatible() const {
1539  return false;
1540  }
1541 
1542  /**
1543  * Init the state with n_threads.
1544  */
1545  void init(size_t _n_threads = 1) {
1546  DASSERT_TRUE(num_classes > 0);
1547  DASSERT_LE(binary, num_classes == 2);
1548 
1549  // Init the options.
1550  n_threads = _n_threads;
1551 
1552  // Initialize the accumulators
1553  tpr.resize(n_threads);
1554  fpr.resize(n_threads);
1555  num_examples.resize(n_threads);
1556  for (size_t i = 0; i < n_threads; i++) {
1557 
1558  tpr[i].resize(num_classes);
1559  fpr[i].resize(num_classes);
1560  num_examples[i].resize(num_classes);
1561  for (size_t c = 0; c < num_classes; c++) {
1562 
1563  tpr[i][c].resize(NUM_BINS);
1564  fpr[i][c].resize(NUM_BINS);
1565  num_examples[i][c] = 0;
1566  for (size_t j = 0; j < NUM_BINS; j++) {
1567  tpr[i][c][j] = 0;
1568  fpr[i][c][j] = 0;
1569  }
1570  }
1571  }
1572 
1573  // Initialize the aggregators.
1574  total_fp.resize(num_classes);
1575  total_tp.resize(num_classes);
1576  total_examples.resize(num_classes);
1577  for (size_t c = 0; c < num_classes; c++) {
1578  total_examples[c] = 0;
1579  total_fp[c].resize(NUM_BINS);
1580  total_tp[c].resize(NUM_BINS);
1581  for (size_t j = 0; j < NUM_BINS; j++) {
1582  total_fp[c][j] = 0;
1583  total_tp[c][j] = 0;
1584  }
1585  }
1586 
1587  };
1588 
1589  const float get_bin(double prediction) const {
1590  // Assign this prediction to an integer that indicates a "bin" id.
1591  size_t bin = std::floor((double) std::max(0.0, prediction * NUM_BINS));
1592 
1593  // This effectively makes the upper bin [0.999, 1] instead of [0.999, 1).
1594  // If a prediction is exactly 1.0, then it would get assigned to
1595  // a bin with lower bound 1.0, but since we want 1000 bins, we move
1596  // these into the bin with lower bound 0.999.
1597  if (bin >= NUM_BINS) bin = NUM_BINS - 1;
1598  return bin;
1599  }
1600 
1601  const float get_lower_bound(size_t bin) const {
1602  // Get the lower threshold of predictions that fall into this bin.
1603  return bin/((double)NUM_BINS);
1604  }
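/*
 * Illustrative mapping between predictions and bins, assuming NUM_BINS = 1000
 * as above: a prediction of 0.4237 falls in bin 423 (lower bound 0.423), and a
 * prediction of exactly 1.0 is clamped into the top bin 999 (lower bound 0.999).
 *
 * \code
 * roc_curve rc;
 * size_t bin = rc.get_bin(0.4237);        // 423
 * double low = rc.get_lower_bound(bin);   // 0.423
 * size_t top = rc.get_bin(1.0);           // 999
 * \endcode
 */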
1605 
1606  /**
1607  * Register a (target, prediction) pair
1608  *
1609  * \param[in] target Target of a single example.
1610  * \param[in] prediction Prediction of a single example.
1611  * \param[in] thread_id Thread id
1612  *
1613  */
1614  void register_example(const flexible_type& target,
1615  const flexible_type& prediction,
1616  size_t thread_id = 0){
1617  DASSERT_LT(thread_id, n_threads);
1618  DASSERT_LT(thread_id, fpr.size());
1619  DASSERT_LT(thread_id, tpr.size());
1620  check_undefined(prediction);
1621  DASSERT_EQ(binary, (prediction.get_type() == flex_type_enum::FLOAT) ||
1622  (prediction.get_type() == flex_type_enum::INTEGER));
1623  DASSERT_EQ(!binary, prediction.get_type() == flex_type_enum::VECTOR);
1624 
1625  // The index for this target. Skip the example if it doesn't exist!
1626  size_t idx = 0;
1627  auto it = index_map.find(target);
1628  if (it == index_map.end()) {
1629  return;
1630  } else {
1631  idx = size_t(it->second);
1632  }
1633  DASSERT_LT(idx, index_map.size());
1634 
1635  // Binary mode.
1636  if (binary) {
1637  DASSERT_EQ(num_classes, 2);
1638 
1639  // Add this example to the prediction counter.
1640  double pred = prediction.to<double>();
1641  check_probability_range(pred);
1642  size_t bin = get_bin(pred);
1643 
1644  if (idx == 1) {
1645  DASSERT_LT(bin, tpr[thread_id][1].size());
1646  DASSERT_LT(bin, fpr[thread_id][0].size());
1647  tpr[thread_id][1][bin]++;
1648  fpr[thread_id][0][bin]++;
1649  num_examples[thread_id][1]++;
1650  } else {
1651  DASSERT_LT(bin, tpr[thread_id][0].size());
1652  DASSERT_LT(bin, fpr[thread_id][1].size());
1653  fpr[thread_id][1][bin]++;
1654  tpr[thread_id][0][bin]++;
1655  num_examples[thread_id][0]++;
1656  }
1657 
1658  // Multi-class mode.
1659  } else {
1660 
1661  // Error out!
1662  if(prediction.size() != num_classes) {
1663  std::stringstream ss;
1664  ss << "Size of prediction probability vector"
1665  << "(" << prediction.size() << ") != number of classes"
1666  << "(" << num_classes << ")." << std::endl;
1667  log_and_throw(ss.str());
1668  }
1669 
1670  // Data point in the test set but not in the training set. Skip.
1671  if (idx >= prediction.size()) {
1672  return;
1673  }
1674 
1675  // Get the prediction for the true class.
1676  for (size_t i = 0; i < prediction.size(); i++) {
1677  check_probability_range(prediction[i]);
1678  size_t bin = get_bin(prediction[i]);
1679 
1680  // Update the tpr and fpr rates!
1681  if (i == idx) {
1682  DASSERT_LT(bin, tpr[thread_id][idx].size());
1683  tpr[thread_id][i][bin]++;
1684  } else {
1685  DASSERT_LT(bin, fpr[thread_id][idx].size());
1686  fpr[thread_id][i][bin]++;
1687  }
1688  }
1689  num_examples[thread_id][idx]++;
1690  }
1691 
1692  }
1693 
1694  /**
1695  * Gather global metrics for true_positives and false negatives
1696  */
1697  void gather_global_metrics() {
1698 
1699  // Total fp, tp, examples
1700  for (size_t i = 0; i < n_threads; ++i) {
1701  for (size_t c = 0; c < num_classes; c++) {
1702  total_examples[c] += num_examples[i][c];
1703  for (size_t j = 0; j < NUM_BINS; ++j) {
1704  total_fp[c][j] += fpr[i][c][j];
1705  total_tp[c][j] += tpr[i][c][j];
1706  }
1707  }
1708  }
1709 
1710  // Get the number of false positives and true positives for all
1711  // bins above the current bin.
1712  for (size_t c = 0; c < num_classes; c++) {
1713  for (ssize_t j = NUM_BINS-2; j >= 0; --j) {
1714  total_fp[c][j] += total_fp[c][j+1];
1715  total_tp[c][j] += total_tp[c][j+1];
1716  }
1717  }
1718 
1719  }
1720 
1721  /**
1722  * Return the final metric.
1723  */
1724  variant_type get_metric() {
1725 
1726  this->gather_global_metrics();
1727 
1728  // Helper function for computing the roc curve from the statistics
1729  size_t total_bins = NUM_BINS;
1730  size_t _num_classes = this->num_classes;
1731  auto compute_roc_curve = [total_bins, _num_classes](
1732  const std::vector<std::vector<size_t>>& total_fp,
1733  const std::vector<std::vector<size_t>>& total_tp,
1734  const std::vector<size_t>& total_examples,
1735  const size_t& c,
1736  bool binary = true, // Is this in binary mode?
1737  const std::map<size_t, flexible_type>& inv_map =
1738  std::map<size_t, flexible_type>()) -> variant_type {
1739 
1740  size_t all_examples = 0;
1741  for (const auto& cex: total_examples) {
1742  all_examples += cex;
1743  }
1744 
1745  // Columns in the SFrame.
1746  sframe ret;
1747  std::vector<std::string> col_names {"threshold", "fpr", "tpr", "p", "n"};
1748  std::vector<flex_type_enum> col_types {flex_type_enum::FLOAT,
1749  flex_type_enum::FLOAT,
1750  flex_type_enum::FLOAT,
1751  flex_type_enum::INTEGER,
1752  flex_type_enum::INTEGER};
1753 
1754  // Not binary, add class to it!
1755  if (not binary) {
1756  col_names.push_back("class");
1757  DASSERT_TRUE(inv_map.size() > 0);
1758  col_types.push_back(inv_map.at(c).get_type());
1759  }
1760 
1761  ret.open_for_write(col_names, col_types, "", 1);
1762  std::vector<flexible_type> out_v;
1763  auto it_out = ret.get_output_iterator(0);
1764 
1765  // Write to the SFrame.
1766  size_t cl = 0;
1767  do {
1768  if (binary) cl = c;
1769 
1770  // Add all rows.
1771  for (size_t j=0; j < total_bins; ++j) {
1772  DASSERT_LE(total_tp[cl][j], total_examples[cl]);
1773  DASSERT_LE(total_fp[cl][j], all_examples - total_examples[cl]);
1774  out_v = {j / double(total_bins),
1775  (1.0 * total_fp[cl][j]) / (all_examples - total_examples[cl]),
1776  (1.0 * total_tp[cl][j]) / total_examples[cl],
1777  total_examples[cl], (all_examples - total_examples[cl])};
1778  if (not binary) {
1779  out_v.push_back(inv_map.at(cl));
1780  }
1781  *it_out = out_v;
1782  }
1783 
1784  // Manually add final row.
1785  out_v = {1.0, 0.0, 0.0,
1786  total_examples[c], (all_examples - total_examples[c])};
1787  if (not binary) {
1788  out_v.push_back(inv_map.at(cl));
1789  }
1790 
1791  // Write the row.
1792  *it_out = out_v;
1793  cl++;
1794 
1795  if (binary) break;
1796  if (cl == _num_classes) break;
1797 
1798  } while (true);
1799 
1800  ret.close();
1801  DASSERT_EQ(ret.size(),
1802  (total_bins + 1) * (binary + (1 - binary) * _num_classes));
1803 
1804  // Convert to variant type.
1805  std::shared_ptr<unity_sframe> tmp = std::make_shared<unity_sframe>();
1806  tmp->construct_from_sframe(ret);
1807  variant_type roc_curve = to_variant<std::shared_ptr<unity_sframe>>(tmp);
1808  return roc_curve;
1809 
1810  }; // end-of-helper function.
1811 
1812  // Compute the integral with respect to ROC-1
1813  if (num_classes == 2) {
1814  return to_variant(compute_roc_curve(total_fp, total_tp, total_examples, 1));
1815  }
1816 
1817  switch(average) {
1818 
1819  // Score for each class.
1820  case average_type_enum::NONE:
1821  case average_type_enum::DEFAULT:
1822  {
1823 
1824  // Create an inverse map.
1825  std::map<size_t, flexible_type> inv_map;
1826  for (const auto& kvp: index_map) {
1827  inv_map[kvp.second] = kvp.first;
1828  }
1829  return compute_roc_curve(total_fp, total_tp, total_examples, 0, false, inv_map);
1830  }
1831 
1832  default: {
1833  log_and_throw(std::string("Unsupported average_type_enum case"));
1834  ASSERT_UNREACHABLE();
1835  }
1836  }
1837  }
1838 };
1839 
1840 
1841 /*
1842  * Compute the Area Under the Curve (AUC) using the trapezoidal rule
1843  */
1844 class auc: public roc_curve {
1845 
1846  public:
1847 
1848  /**
1849  * Constructor.
1850  *
1851  * \param[in] index_map Dictionary from flexible_type -> size_t for classes.
1852  * \param[in] average Averaging mode
1853  * \param[in] binary Is the input mode expected to be binary?
1854  */
1855  auc(
1856  std::unordered_map<flexible_type, size_t> index_map =
1857  std::unordered_map<flexible_type, size_t>(),
1858  flexible_type average = "micro",
1859  bool binary = true,
1860  size_t num_classes = size_t(-1)) {
1861  this->average = average_type_enum_from_name(average);
1862  this->binary = binary;
1863  this->index_map = index_map;
1864  if (num_classes == size_t(-1)) {
1865  this->num_classes = index_map.size();
1866  } else {
1867  this->num_classes = num_classes;
1868  }
1869  }
1870 
1871  /*
1872  * Name of the evaluator.
1873  */
1874  std::string name() const {
1875  return (std::string)("auc");
1876  }
1877 
1878  /**
1879  * Returns true if this evaluator can be displayed as a single float value.
1880  */
1881  virtual bool is_table_printer_compatible() const {
1882  return average != average_type_enum::NONE;
1883  }
1884 
1885 
1886  /**
1887  * Return the final metric.
1888  */
1889  variant_type get_metric() {
1890 
1891  this->gather_global_metrics();
1892 
1893  // Compute the auc-score.
1894  size_t total_bins = NUM_BINS;
1895  auto compute_auc = [total_bins](
1896  const std::vector<std::vector<size_t>>& total_fp,
1897  const std::vector<std::vector<size_t>>& total_tp,
1898  const std::vector<size_t>& total_examples,
1899  const size_t& c) -> double {
1900 
1901  size_t all_examples = 0;
1902  for (const auto& cex: total_examples) {
1903  all_examples += cex;
1904  }
1905 
1906  double auc_score = 0;
1907  for(size_t i = 0; i < total_bins- 1; i++) {
1908  double delta = total_fp[c][i] - total_fp[c][i+1];
1909  delta /= (all_examples - total_examples[c]);
1910  if (delta > 1e-10) {
1911  auc_score += 0.5 * (total_tp[c][i] + total_tp[c][i+1]) * delta
1912  / total_examples[c];
1913  }
1914  }
1915  return auc_score;
1916  };
1917 
1918  // Compute the integral with respect to ROC-1
1919  if (num_classes == 2) {
1920  return to_variant(compute_auc(total_fp, total_tp, total_examples, 1));
1921  }
1922 
1923  switch(average) {
1924 
1925  // Score for each class.
1926  case average_type_enum::NONE:
1927  {
1928 
1929  // Create an inverse map.
1930  std::map<size_t, flexible_type> inv_map;
1931  for (const auto& kvp: index_map) {
1932  inv_map[kvp.second] = kvp.first;
1933  }
1934 
1935  // Compute AUC score.
1936  std::unordered_map<flexible_type, double> auc_score;
1937  for (size_t c = 0; c < num_classes; c++) {
1938  flexible_type k = inv_map[c];
1939  auc_score[k] = compute_auc(total_fp, total_tp, total_examples, c);
1940  }
1941  return to_variant(auc_score);
1942  }
1943 
1944  case average_type_enum::DEFAULT:
1945  case average_type_enum::MACRO:
1946  {
1947  double auc_score = 0;
1948  for (size_t c = 0; c < num_classes; c++) {
1949  auc_score += compute_auc(total_fp, total_tp, total_examples, c);
1950  }
1951  return to_variant(auc_score / num_classes);
1952  }
1953 
1954  default: {
1955  log_and_throw(std::string("Unsupported average_type_enum case"));
1956  ASSERT_UNREACHABLE();
1957  }
1958  }
1959  }
1960 };
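/*
 * Illustrative trapezoidal-rule step matching compute_auc above (numbers made
 * up for the example): two adjacent thresholds with (fpr, tpr) points
 * (0.5, 0.9) and (0.3, 0.7) contribute 0.5 * (0.9 + 0.7) * (0.5 - 0.3) = 0.16
 * to the area; summing the strips over all bins gives the AUC. A perfectly
 * separating binary classifier therefore scores 1.0:
 *
 * \code
 * std::unordered_map<flexible_type, size_t> index_map {
 *   {flexible_type(0), 0}, {flexible_type(1), 1}
 * };
 * auc metric(index_map, flexible_type("macro"), true, 2);
 * metric.init(1);
 * metric.register_example(flexible_type(1), flexible_type(0.9));
 * metric.register_example(flexible_type(0), flexible_type(0.2));
 * metric.register_example(flexible_type(1), flexible_type(0.6));
 * metric.register_example(flexible_type(0), flexible_type(0.4));
 * double value = variant_get_value<double>(metric.get_metric());  // 1.0
 * \endcode
 */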
1961 
1962 /*
1963  * Factory method to construct an evaluation metric by name.
1964  * \param[in] metric Name of the metric
1965  * \param[in] kwargs Arguments for the metric
1966  *
1967  *
1968  * \example
1969  *
1970  * For a constructor of the following format:
1971  *
1972  * flexible_accuracy(flexible_type average = "micro") {
1973  *   this->average = average_type_enum_from_name(average);
1974  * }
1975  *
1976  * this factory function can be called as follows:
1977  *
1978  * get_evaluator_metric("flexible_accuracy",
1979  *   {{"average", to_variant(std::string("micro"))}})
1980  *
1981  * This is intended to work just like the python side.
1982  *
1983  */
1984 inline std::shared_ptr<supervised_evaluation_interface> get_evaluator_metric(
1985  const std::string& metric,
1986  const std::map<std::string, variant_type>& kwargs = std::map<std::string, variant_type>()) {
1987 
1988  std::shared_ptr<supervised_evaluation_interface> evaluator;
1989  if(metric == "rmse"){
1990  evaluator = std::make_shared<rmse>(rmse());
1991  } else if(metric == "max_error"){
1992  evaluator = std::make_shared<max_error>(max_error());
1993 
1994  } else if(metric == "confusion_matrix_no_map"){
1995  evaluator = std::make_shared<confusion_matrix>(confusion_matrix());
1996 
1997  } else if(metric == "confusion_matrix"){
1998  DASSERT_TRUE(kwargs.count("inv_index_map") > 0);
1999  std::map<size_t, flexible_type> inv_map = variant_get_value<
2000  std::map<size_t, flexible_type>>(kwargs.at("inv_index_map"));
2001  evaluator = std::make_shared<confusion_matrix>(confusion_matrix(inv_map));
2002 
2003  } else if(metric == "accuracy"){
2004  evaluator = std::make_shared<classifier_accuracy>(classifier_accuracy());
2005 
2006  } else if(metric == "binary_logloss") {
2007  DASSERT_TRUE(kwargs.count("index_map") > 0);
2008  auto index_map = variant_get_value<
2009  std::unordered_map<flexible_type, size_t>>(kwargs.at("index_map"));
2010  evaluator = std::make_shared<binary_logloss>(
2011  binary_logloss(index_map));
2012 
2013  } else if((metric == "multiclass_logloss") || (metric == "log_loss")){
2014  DASSERT_TRUE(kwargs.count("index_map") > 0);
2015  auto index_map = variant_get_value<
2016  std::unordered_map<flexible_type, size_t>>(kwargs.at("index_map"));
2017  size_t num_classes = size_t(-1);
2018  if (kwargs.count("num_classes") > 0) {
2019  num_classes = variant_get_value<size_t>(kwargs.at("num_classes"));
2020  }
2021  evaluator = std::make_shared<multiclass_logloss>(
2022  multiclass_logloss(index_map, num_classes));
2023 
2024  } else if(metric == "roc_curve"){
2025  DASSERT_TRUE(kwargs.count("average") > 0);
2026  DASSERT_TRUE(kwargs.count("binary") > 0);
2027  DASSERT_TRUE(kwargs.count("index_map") > 0);
2028  auto average = variant_get_value<flexible_type>(kwargs.at("average"));
2029  auto binary = variant_get_value<bool>(kwargs.at("binary"));
2030  auto index_map = variant_get_value<
2031  std::unordered_map<flexible_type, size_t>>(kwargs.at("index_map"));
2032  size_t num_classes = size_t(-1);
2033  if (kwargs.count("num_classes") > 0) {
2034  num_classes = variant_get_value<size_t>(kwargs.at("num_classes"));
2035  }
2036  evaluator = std::make_shared<roc_curve>(
2037  roc_curve(index_map, average, binary, num_classes));
2038 
2039  } else if(metric == "auc"){
2040  DASSERT_TRUE(kwargs.count("average") > 0);
2041  DASSERT_TRUE(kwargs.count("binary") > 0);
2042  DASSERT_TRUE(kwargs.count("index_map") > 0);
2043  auto average = variant_get_value<flexible_type>(kwargs.at("average"));
2044  auto binary = variant_get_value<bool>(kwargs.at("binary"));
2045  auto index_map = variant_get_value<
2046  std::unordered_map<flexible_type, size_t>>(kwargs.at("index_map"));
2047  size_t num_classes = size_t(-1);
2048  if (kwargs.count("num_classes") > 0) {
2049  num_classes = variant_get_value<size_t>(kwargs.at("num_classes"));
2050  }
2051  evaluator = std::make_shared<auc>(auc(index_map, average, binary, num_classes));
2052 
2053  } else if(metric == "flexible_accuracy"){
2054  DASSERT_TRUE(kwargs.count("average") > 0);
2055  auto average = variant_get_value<flexible_type>(kwargs.at("average"));
2056  evaluator = std::make_shared<flexible_accuracy>(
2057  flexible_accuracy(average));
2058 
2059  } else if(metric == "precision"){
2060  DASSERT_TRUE(kwargs.count("average") > 0);
2061  auto average = variant_get_value<flexible_type>(kwargs.at("average"));
2062  evaluator = std::make_shared<precision>(precision(average));
2063 
2064  } else if(metric == "recall"){
2065  DASSERT_TRUE(kwargs.count("average") > 0);
2066  auto average = variant_get_value<flexible_type>(kwargs.at("average"));
2067  evaluator = std::make_shared<recall>(recall(average));
2068 
2069  } else if(metric == "fbeta_score"){
2070  DASSERT_TRUE(kwargs.count("beta") > 0);
2071  DASSERT_TRUE(kwargs.count("average") > 0);
2072  auto beta = variant_get_value<double>(kwargs.at("beta"));
2073  auto average = variant_get_value<flexible_type>(kwargs.at("average"));
2074  evaluator = std::make_shared<fbeta_score>(fbeta_score(beta, average));
2075 
2076  } else if(metric == "f1_score"){
2077  DASSERT_TRUE(kwargs.count("average") > 0);
2078  auto average = variant_get_value<flexible_type>(kwargs.at("average"));
2079  evaluator = std::make_shared<fbeta_score>(fbeta_score(1.0, average));
2080 
2081  } else {
2082  log_and_throw("\'" + metric + "\' is not a supported evaluation metric.");
2083  }
2084 
2085  // Initialize with number of threads.
2086  size_t n_threads = turi::thread::cpu_count();
2087  evaluator->init(n_threads);
2088  return evaluator;
2089 }
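/*
 * Usage sketch (illustrative only), mirroring the example in the comment on
 * get_evaluator_metric above:
 *
 * \code
 * std::map<std::string, variant_type> kwargs {
 *   {"average", to_variant(flexible_type("macro"))}
 * };
 * std::shared_ptr<supervised_evaluation_interface> metric =
 *     get_evaluator_metric("precision", kwargs);
 * metric->register_example(flexible_type("cat"), flexible_type("cat"));
 * metric->register_example(flexible_type("dog"), flexible_type("dog"));
 * metric->register_example(flexible_type("dog"), flexible_type("cat"));
 * variant_type result = metric->get_metric();  // already init()-ed by the factory
 * \endcode
 */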
2090 
2091 } // namespace evaluation
2092 } // namespace turi
2093 
2094 #ifdef __clang__
2095  #pragma clang diagnostic pop
2096 #endif
2097 
2098 #endif