Turi Create  4.0
testing_utils.hpp
1 /* Copyright © 2017 Apple Inc. All rights reserved.
2  *
3  * Use of this source code is governed by a BSD-3-clause license that can
4  * be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
5  */
6 #ifndef TURI_DML_DATA_TESTING_UTILS_H_
7 #define TURI_DML_DATA_TESTING_UTILS_H_
8 
9 #include <core/storage/sframe_data/sframe.hpp>
10 #include <ml/ml_data/ml_data.hpp>
11 #include <vector>
12 #include <string>
13 #include <map>
14 #include <iostream>
15 #include <unordered_set>
16 
17 namespace turi {
18 
19 /**
20  * \ingroup mldata
21  * Creates a random SFrame for testing purposes. The
22  * column_types string gives the type of each column.
23  *
24  * \param[in] n_rows The number of rows (observations) in the generated SFrame.
25  * \param[in] column_types A string with each character denoting
26  * the type of one column. The legend is as follows:
27  *
28  * n: numeric column.
29  * b: categorical column with 2 categories.
30  * z: categorical column with 5 categories.
31  * Z: categorical column with 10 categories.
32  * c: categorical column with 100 categories.
33  * C: categorical column with 1000000 categories.
34  * s: categorical column with short string keys and 1000 categories.
35  * S: categorical column with short string keys and 100000 categories.
36  * v: numeric vector with 10 elements.
37  * V: numeric vector with 1000 elements.
38  * u: categorical set with up to 10 elements.
39  * U: categorical set with up to 1000 elements.
40  * d: dictionary with 10 entries.
41  * D: dictionary with 100 entries.
42  *
43  * \param[in] create_target_column If true, then create a random
44  * target column as well.
45  *
46  * \return A pair consisting of the sframe holding the raw data and an
47  * ml_data object made from that sframe.
48  *
49  */
50 std::pair<sframe, ml_data> make_random_sframe_and_ml_data(
51  size_t n_rows, std::string column_types, bool create_target_column = false,
52  bool target_column_categorical = false);
53 
54 /** Better equality testing stuff. Handles out-of-order on the
55  * categorical_vector, which is assumed by ml_data.
56  */
57 static inline bool ml_testing_equals(const flexible_type& v1, const flexible_type& v2) {
58 
59  if(v1.get_type() != v2.get_type())
60  return false;
61 
62  // Have to hijack a few of these here, since the eigen stuff doesn't
63  // deal with duplicates well
64  switch(v1.get_type()) {
65  case flex_type_enum::LIST: {
66  return (std::unordered_set<flexible_type>(
67  v1.get<flex_list>().begin(), v1.get<flex_list>().end())
68  ==
69  std::unordered_set<flexible_type>(
70  v2.get<flex_list>().begin(), v2.get<flex_list>().end()));
71  }
72 
74  if(v1.size() == v2.size()) {
75  return v1 == v2;
76  } else if(v1.size() == 0) {
77  for(size_t i = 0; i < v2.size(); ++i) {
78  if(v2[i] != 0) return false;
79  }
80  return true;
81  } else if(v2.size() == 0) {
82  for(size_t i = 0; i < v1.size(); ++i) {
83  if(v1[i] != 0) return false;
84  }
85  return true;
86  }
87  return false;
88  }
89 
90  default:
91  return v2 == v1;
92 
93  }
94 }
95 
96 }
97 
98 
99 /** Stream-insertion operator that prints out a row, given as a
100  * vector of ml_data_entry values.
101  */
102 std::ostream& operator<<(std::ostream& os, const std::vector<turi::ml_data_entry>& v);
103 
104 
105 
106 #endif /* TURI_DML_DATA_TESTING_UTILS_H_ */
std::pair< sframe, ml_data > make_random_sframe_and_ml_data(size_t n_rows, std::string column_types, bool create_target_column=false, bool target_column_categorical=false)
const T & get() const
flex_type_enum get_type() const
static bool ml_testing_equals(const flexible_type &v1, const flexible_type &v2)
std::vector< flexible_type > flex_list