Turi Create  4.0
testing_utils.hpp
1 /* Copyright © 2017 Apple Inc. All rights reserved.
2  *
3  * Use of this source code is governed by a BSD-3-clause license that can
4  * be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
5  */
6 #ifndef TURI_UNITY_ML_DATA_2_TESTING_UTILS_H_
7 #define TURI_UNITY_ML_DATA_2_TESTING_UTILS_H_
8 
9 #include <core/storage/sframe_data/sframe.hpp>
10 #include <toolkits/ml_data_2/ml_data.hpp>
11 #include <toolkits/ml_data_2/ml_data_iterators.hpp>
12 #include <vector>
13 #include <string>
14 #include <map>
15 #include <iostream>
16 #include <unordered_set>
17 #include <core/storage/sframe_data/testing_utils.hpp>
18 
19 namespace turi { namespace v2 {
20 
21 /** Creates a random SFrame for testing purposes. The
22  * column_type_info gives the types of the column.
23  *
24  * \param[in] n_rows The number of observations to run the timing on.
25  * \param[in] column_type_info A string with each character denoting
26  * one type of column. The legend is as follows:
27  *
28  * n: numeric column.
29  * b: categorical column with 2 categories.
30  * z: categorical column with 5 categories.
31  * Z: categorical column with 10 categories.
32  * c: categorical column with 100 categories.
33  * C: categorical column with 1000000 categories.
34  * s: categorical column with short string keys and 1000 categories.
35  * S: categorical column with short string keys and 100000 categories.
36  * v: numeric vector with 10 elements.
37  * V: numeric vector with 1000 elements.
38  * u: categorical set with up to 10 elements.
39  * U: categorical set with up to 1000 elements.
40  * d: dictionary with 10 entries.
41  * D: dictionary with 100 entries.
42  *
43  * \param[in] create_target_column If true, then create a random
44  * target column as well.
45  *
46  * \param[in] options Additional ml_data option flags passed
47  * to ml_data::fill.
48  *
49  * \return A pair of sframe, with the raw data, and an ml_data object
50  * made from that sframe.
51  *
52  */
53 std::pair<sframe, ml_data> make_random_sframe_and_ml_data(
54  size_t n_rows, std::string column_types, bool create_target_column = false,
55  const std::map<std::string, flexible_type>& options = std::map<std::string, flexible_type>());
56 
57 
58 /** The information returned by the
59  *
60  */
62  sframe main_sframe;
63  std::vector<sframe> side_sframes;
64  std::vector<std::vector<flexible_type> > joined_data;
65 
66  ml_data data;
67 };
68 
69 
70 /** Creates an ml_data structure with side information. side gives a
71  * vector of (nrows, creation_string) pairs.
72  *
73  */
74 sframe_and_side_info make_ml_data_with_side_data(
75  size_t n_main_rows, const std::string& main,
76  const std::vector<std::pair<size_t, std::string> >& side,
77  bool create_target_column,
78  const std::map<std::string, flexible_type>& options = std::map<std::string, flexible_type>());
79 
80 
81 /** Better equality testing stuff. Handles out-of-order on the
82  * categorical_vector, which is assumed by ml_data.
83  */
84 static inline bool ml_testing_equals(const flexible_type& v1, const flexible_type& v2) {
85 
86  if(v1.get_type() != v2.get_type())
87  return false;
88 
89  // Have to hijack a few of these here, since the eigen stuff doesn't
90  // deal with duplicates well
91  switch(v1.get_type()) {
92  case flex_type_enum::LIST: {
93  return (std::unordered_set<flexible_type>(
94  v1.get<flex_list>().begin(), v1.get<flex_list>().end())
95  ==
96  std::unordered_set<flexible_type>(
97  v2.get<flex_list>().begin(), v2.get<flex_list>().end()));
98  }
99 
100  case flex_type_enum::VECTOR: {
101  if(v1.size() == v2.size()) {
102  return v1 == v2;
103  } else if(v1.size() == 0) {
104  for(size_t i = 0; i < v2.size(); ++i) {
105  if(v2[i] != 0) return false;
106  }
107  return true;
108  } else if(v2.size() == 0) {
109  for(size_t i = 0; i < v1.size(); ++i) {
110  if(v1[i] != 0) return false;
111  }
112  return true;
113  }
114  return false;
115  }
116 
117  default:
118  return v2 == v1;
119 
120  }
121 }
122 
123 }}
124 
125 
126 /** Printing out a row.
127  *
128  */
129 std::ostream& operator<<(std::ostream& os, const std::vector<turi::v2::ml_data_entry>& v);
130 
131 
132 
133 #endif /* TURI_UNITY_ML_DATA_2_TESTING_UTILS_H_ */
std::pair< sframe, ml_data > make_random_sframe_and_ml_data(size_t n_rows, std::string column_types, bool create_target_column=false, bool target_column_categorical=false)
const T & get() const
flex_type_enum get_type() const
static bool ml_testing_equals(const flexible_type &v1, const flexible_type &v2)
std::vector< flexible_type > flex_list