Turi Create  4.0
data_generators.hpp
1 /* Copyright © 2017 Apple Inc. All rights reserved.
2  *
3  * Use of this source code is governed by a BSD-3-clause license that can
4  * be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
5  */
6 #ifndef TURI_TESTING_DATA_GENERATORS_H_
7 #define TURI_TESTING_DATA_GENERATORS_H_
8 
9 #include <core/storage/sframe_data/sframe.hpp>
10 #include <Eigen/Core>
11 #include <map>
12 #include <string>
13 
14 namespace turi { namespace recsys {
15 
16 /**
17  * \ingroup toolkit_util
18  * A simple class for generating fake linear model data for testing
19  * purposes. This uses the factorization machine model to generate
20  * the data.
21  *
22  * The options going into this generator are as follows. These are
23  * not necessarily used by each function:
24  *
25  * - random_seed: Random seed for sampling the data.
26  *
27  * - n_factors: Number of latent factors to use in the generation.
28  *
29  * - noise_sd: Standard deviation of the noise associated with each response.
30  *
31  * - w0_sd: The standard deviation used in generating the intercept term.
32  *
33  * - w_sd: The standard deviation used in generating the linear terms.
34  *
35  * - V_sd: The standard deviation used in generating the latent factors.
36  *
37  * - y_mode: The sampling model. Can be "squared_error" or "logistic".
38  *
39  * The defaults for these are given in data_generators.cpp.
40  *
41  */
42 
44  public:
45 
47  const std::vector<std::string>& column_names,
48  const std::vector<size_t>& n_categorical_values,
49  const std::map<std::string, flexible_type>& base_options);
50 
51  /** Generate observations from the linear model and return them,
52  * together with their responses, as an sframe.
53  */
54  sframe generate(size_t n_observations,  // presumably the number of rows to generate -- TODO confirm
55  const std::string& target_column_name,  // presumably the name of the response column -- TODO confirm
56  size_t random_seed,  // random seed for sampling the data
57  double noise_sd) const;  // standard deviation of the noise associated with each response
58 
59  /** Build two datasets: one for ranking and one for testing the
60  * ranking. This works by building a linear model and assuming that
61  * the observations with the highest responses are the ones present
62  * in the data set. A portion of these is split off into the test set.
63  */
64  std::pair<sframe, sframe> generate_for_ranking(  // presumably returned as {train, test} -- TODO confirm order
65  size_t n_train_samples_per_user,  // presumably observations per user kept for training -- TODO confirm
66  size_t n_test_samples_per_user,  // presumably observations per user split off for testing -- TODO confirm
67  size_t random_seed,  // random seed for sampling the data
68  double noise_sd) const;  // standard deviation of the noise associated with each response
69 
70  private:
71 
72  double evaluate(const std::vector<flexible_type>& x, double noise_sd) const;  // presumably the model response for observation x, with noise of std. dev. noise_sd added -- TODO confirm in data_generators.cpp
73 
74  double w0;  // presumably the intercept term (cf. option w0_sd) -- TODO confirm
75  Eigen::VectorXd w;  // presumably the linear-term weights (cf. option w_sd) -- TODO confirm
76  Eigen::MatrixXd V;  // presumably the latent factors (cf. option V_sd) -- TODO confirm
77 
78  std::vector<std::string> column_names;  // presumably a copy of the constructor's column_names argument -- TODO confirm
79  std::vector<size_t> n_categorical_values;  // presumably a copy of the constructor's n_categorical_values argument -- TODO confirm
80 
81  size_t n_factors, dim;  // n_factors: presumably the n_factors option (number of latent factors); dim: meaning not visible in this header -- TODO confirm
82  bool logistic_mode;  // presumably true when the y_mode option is "logistic" -- TODO confirm
83 
84  std::map<std::string, flexible_type> _options;  // presumably the generator options (base_options plus the defaults from data_generators.cpp) -- TODO confirm
85 };
86 
87 }}
88 
89 #endif /* TURI_TESTING_DATA_GENERATORS_H_ */
sframe generate(size_t n_observations, const std::string &target_column_name, size_t random_seed, double noise_sd) const
std::pair< sframe, sframe > generate_for_ranking(size_t n_train_samples_per_user, size_t n_test_samples_per_user, size_t random_seed, double noise_sd) const