Turi Create  4.0
xgboost.hpp
1 /* Copyright © 2017 Apple Inc. All rights reserved.
2  *
3  * Use of this source code is governed by a BSD-3-clause license that can
4  * be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
5  */
6 #ifndef TURI_XGBOOST_H_
7 #define TURI_XGBOOST_H_
8 // SFrame
9 #include <core/storage/sframe_data/sarray.hpp>
10 #include <core/storage/sframe_data/sframe.hpp>
11 #include <core/data/sframe/gl_sarray.hpp>
12 #include <core/data/sframe/gl_sframe.hpp>
13 
14 // ML-Data
15 #include <ml/ml_data/ml_data.hpp>
16 
17 // Utils
18 #include <timer/timer.hpp>
19 #include <core/logging/table_printer/table_printer.hpp>
20 #include <core/export.hpp>
21 
22 // Toolkits
23 #include <toolkits/supervised_learning/supervised_learning.hpp>
24 #include <toolkits/coreml_export/mlmodel_wrapper.hpp>
25 
26 // Forward declare
27 namespace xgboost {
28 namespace learner {
29 class BoostLearner;
30 struct DMatrix;
31 }
32 }
33 
34 namespace turi {
35 namespace supervised {
36 namespace xgboost {
37 
38 // forward declare
39 class DMatrixMLData;
40 
41 enum class storage_mode_enum : int { IN_MEMORY = 0, EXT_MEMORY = 1, AUTO = 2 };  // Mode accepted by _set_storage_mode(); the storage_mode_ member is initialised to AUTO.
42 
43 /**
44  * Regression model base class.
45  */
47 
48  /** version number */
49  static constexpr size_t XGBOOST_MODEL_VERSION = 9;  // Value returned by get_version().
50 
51  public:
52 
53  xgboost_model();
54 
55 
56  /**
57  * Configure booster from options
58  */
59  virtual void configure(void) = 0;
60 
61  /**
62  * Methods of base implementation.
63  * -------------------------------------------------------------------------
64  */
65  /**
66  * Initialize things that are specific to your model.
67  *
68  * \param[in] data ML-Data object created by the init function.
69  *
70  */
71  void model_specific_init(const ml_data& data,
72  const ml_data& valid_data) override;
73 
74  /**
75  * Set one of the options in the algorithm.
76  *
77  * This value is checked against the requirements given by the option
78  * instance. Options that are not present use default options.
79  *
80  * \param[in] opts Options to set
81  */
82  virtual void init_options(const std::map<std::string,flexible_type>& _opts) override;
83 
84  /**
85  * Methods already implemented.
86  * -------------------------------------------------------------------------
87  */
88  bool support_missing_value() const override { return true; }  // Unconditionally reports true.
89 
90  /**
91  * Train a regression model.
92  */
93  void train(void) override;
94 
95  /**
96  * Make predictions using a trained regression model.
97  *
98  * \param[in] test_data Test data (only independent variables)
99  * \param[in] output_type Type of prediction
100  *
101  * \returns ret Shared pointer to an SArray containing predictions.
102  *
103  * \note Already assumes that data is of the right shape.
104  */
105  std::shared_ptr<sarray<flexible_type>> predict(
106  const ml_data& test_data,
107  const std::string& output_type="") override;
108 
109  /**
110  * Get the predict from the base class and put it here :)
111  */
112  using supervised_learning_model_base::predict;
113 
114  /**
115  * Fast path predictions given a row of flexible_types.
116  *
117  * \param[in] test_data List of rows (each row is a flex_dict)
118  * \param[in] missing_value_action Missing value action string
119  * \param[in] output_type Output type.
120  */
121  gl_sarray fast_predict(
122  const std::vector<flexible_type>& test_data,
123  const std::string& missing_value_action = "error",
124  const std::string& output_type="") override;
125 
126  std::shared_ptr<sarray<flexible_type>> predict_impl(  // Prediction entry point that takes an xgboost DMatrix rather than ml_data.
127  const ::xgboost::learner::DMatrix& dmat,  // NOTE(review): presumably the already-converted input shared by predict()/fast_predict() — confirm in the .cpp
128  const std::string& output_type="");  // Defaults to the empty string, like predict().
129 
130  void xgboost_predict(const ::xgboost::learner::DMatrix& dmat,  // Returns void; results come back through out_preds.
131  bool output_margin,  // NOTE(review): presumably requests raw margin scores instead of transformed predictions — confirm
132  std::vector<float>& out_preds,  // Non-const reference, i.e. the output parameter.
133  double rf_running_rescale_constant=0.0);  // Defaults to 0.0; NOTE(review): meaning not visible in this header — presumably random-forest rescaling, confirm
134 
135  /**
136  * Fast path predictions given a row of flexible_types.
137  *
138  * \param[in] rows List of rows (each row is a flex_dict)
139  * \param[in] missing_value_action Missing value action string
140  * \param[in] output_type Output type.
141  * \param[in] topk Number of classes to return
142  */
143  gl_sframe fast_predict_topk(
144  const std::vector<flexible_type>& rows,
145  const std::string& missing_value_action ="error",
146  const std::string& output_type="",
147  const size_t topk = 5) override;
148 
149  sframe predict_topk_impl(  // Top-k prediction entry point that takes an xgboost DMatrix rather than ml_data.
150  const ::xgboost::learner::DMatrix& dmat,  // NOTE(review): presumably the already-converted input shared by predict_topk()/fast_predict_topk() — confirm
151  const std::string& output_type="",  // Defaults to the empty string.
152  const size_t topk = 5);  // Defaults to 5, matching fast_predict_topk(); note predict_topk() defaults to 2.
153 
154  /**
155  * Top-k predictions for multi-class classification.
156  *
157  * \param[in] test_data Test data (only independent variables)
158  * \param[in] output_type Type of prediction
159  * \param[in] topk Number of classes to return
160  * \returns SFrame containing the top-k predictions.
161  *
162  * \note Already assumes that data is of the right shape.
163  */
164  sframe predict_topk(const ml_data& test_data,
165  const std::string& output_type="",
166  const size_t topk = 2) override;
167  /**
168  * First make predictions and then evaluate.
169  *
170  * \param[in] test_data Test data.
171  * \param[in] evaluation_type Type of evaluation
172  *
173  * \note Already assumes that data is of the right shape. Test data
174  * must contain target column also.
175  *
176  */
177  std::map<std::string, variant_type> evaluate(
178  const ml_data& test_data,
179  const std::string& evaluation_type="",
180  bool with_prediction=false) override;
181 
182  std::map<std::string, variant_type> evaluate_impl(  // Evaluation entry point that takes a DMatrixMLData rather than ml_data; same return type as evaluate().
183  const DMatrixMLData& dmat,  // DMatrixMLData is only forward-declared in this header.
184  const std::string& evaluation_type="");  // Defaults to the empty string, like evaluate().
185 
186 
187  /**
188  * Extract "tree features" for each test data instance.
189  * The tree feature is an integer vector f of size
190  * equal to number of trees, and f[i] is the leaf index of the tree.
191  *
192  * \param[in] test_data Test data.
193  * \param[in] missing_value_action How missing values are handled.
194  */
195  std::shared_ptr<sarray<flexible_type>> extract_features(
196  const sframe& test_data,
197  ml_missing_value_action missing_value_action) override;
198 
199 
200  /**
201  * Returns an SFrame with two columns: feature names, and feature
202  * occurrence in all trees.
203  */
204  gl_sframe get_feature_importance();
205 
206  /**
207  * Get all the decision trees from XGboost.
208  */
209  flexible_type get_trees();
210 
211  /**
212  * Get the decision tree associated with a particular tree_id.
213  */
214  flexible_type get_tree(size_t tree_id);
215 
216 
217  /**
218  * Returns a list of string representation of trees.
219  */
220  std::vector<std::string> dump(bool with_stats);
221  std::vector<std::string> dump_json(bool with_stats);  // NOTE(review): presumably the JSON-formatted counterpart of dump() — confirm in the .cpp
222 
223  /**
224  * Gets the model version number
225  */
226  virtual size_t get_version() const override {
227  // Version translator
228  // -----------------------
229  // 0 - Version 1.0
230  // 1 - Version 1.1
231  // 2 - Version 1.2
232  // 3 - Version 1.4
233  // 4 - Version 1.6
234  // 5 - Version 1.7
235  // 6 - Version 1.8
236  // 7 - Version 1.8.3
237  // 8 - Version 1.9
238  return XGBOOST_MODEL_VERSION;  // Currently 9, which has no entry in the table above — NOTE(review): add the matching release.
239  }
240 
241  /**
242  * Serialize the object.
243  */
244  void save_impl(turi::oarchive& oarc) const override;
245 
246  /**
247  * Load the object
248  */
249  void load_version(turi::iarchive& iarc, size_t version) override;
250 
251  /**
252  * Return true if the model is random forest classifier or regression model.
253  */
254  bool is_random_forest();
255 
256  /**
257  * Return the number of classes of the model or 0 if the model is not a classifier.
258  */
259  size_t num_classes();
260 
261  /**
262  * \internal
263  * Set the model to use external memory for training. Test only, do NOT
264  * call directly.
265  */
266  void _set_storage_mode(storage_mode_enum mode);
267 
268  /**
269  * \internal
270  * Set the number of batches the input data is split into. Test only, do NOT call directly.
271  */
272  void _set_num_batches(size_t num_batches);
273 
274  /**
275  * \internal
276  */
277  std::pair<std::shared_ptr<DMatrixMLData>, std::shared_ptr<DMatrixMLData>> _init_data();  // Returns a pair of DMatrixMLData handles; NOTE(review): presumably (training, validation), matching _init_learner's (ptrain, pvalid) — confirm
278 
279  void _init_learner(std::shared_ptr<DMatrixMLData> ptrain, std::shared_ptr<DMatrixMLData> pvalid,  // Same handle types as the pair returned by _init_data().
280  bool restore_from_checkpoint, std::string checkpoint_restore_path);  // NOTE(review): the path is presumably only read when restore_from_checkpoint is true — confirm
281 
282  table_printer _init_progress_printer(bool has_validation_data);  // Returns a table_printer by value; NOTE(review): presumably adds validation columns when the flag is set — confirm
283 
284  size_t _get_early_stopping_rounds(bool has_validation_data);  // NOTE(review): presumably resolves the early-stopping option against whether validation data exists — confirm
285 
286  void _save_training_state(size_t iteration,  // NOTE(review): presumably records per-iteration training results on the model — confirm in the .cpp
287  const std::vector<float>& training_metrics,
288  const std::vector<float>& validation_metrics,  // NOTE(review): presumably empty when there is no validation data — confirm
289  std::shared_ptr<unity_sframe> progress_table,
290  double training_time);  // NOTE(review): unit not visible in this header — confirm
291 
292  void _checkpoint(const std::string& path);  // NOTE(review): presumably writes a training checkpoint under path — confirm
293 
294  void _restore_from_checkpoint(const std::string& path);  // NOTE(review): presumably the inverse of _checkpoint(path) — confirm
295 
296  void _save(oarchive& oarc, bool save_booster_prediction_buffer) const;  // Const helper taking an output archive; NOTE(review): the flag presumably controls whether the booster's prediction buffer is written — confirm
297 
298 protected:
299 
300  /*! \brief internal ml data structure used for training*/
301  ml_data ml_data_, validation_ml_data_;
302 
303  /*! \brief this is the xgboost object supporting things */
304  std::shared_ptr<::xgboost::learner::BoostLearner> booster_;
305 
306  storage_mode_enum storage_mode_ = storage_mode_enum::AUTO;  // Initialised to AUTO; NOTE(review): presumably overwritten by _set_storage_mode() — confirm
307 
308  size_t num_batches_ = 0;  // Initialised to 0; NOTE(review): presumably overwritten by _set_num_batches(), with 0 meaning "not set" — confirm
309 
310  std::shared_ptr<coreml::MLModelWrapper> _export_xgboost_model(bool is_classifier,  // Protected helper returning a shared coreml::MLModelWrapper.
311  bool is_random_forest,  // Passed in by the caller rather than read from is_random_forest().
312  const std::map<std::string, flexible_type>& context);  // NOTE(review): presumably export metadata attached to the Core ML model — confirm
313 
314 };
315 
316 } // namespace xgboost
317 } // namespace supervised
318 } // namespace turi
319 #endif
The serialization input archive object which, provided with a reference to an istream, will read from the istream, providing deserialization capabilities.
Definition: iarchive.hpp:60
bool support_missing_value() const override
Definition: xgboost.hpp:88
ml_data ml_data_
internal ml data structure used for training
Definition: xgboost.hpp:301
std::shared_ptr<::xgboost::learner::BoostLearner > booster_
this is the xgboost object supporting things
Definition: xgboost.hpp:304
The serialization output archive object which, provided with a reference to an ostream, will write to the ostream, providing serialization capabilities.
Definition: oarchive.hpp:80
virtual size_t get_version() const override
Definition: xgboost.hpp:226