#ifndef TURI_SUPERVISED_LEARNING_UTILS_H_
#define TURI_SUPERVISED_LEARNING_UTILS_H_

// SFrame
#include <core/storage/sframe_data/sarray.hpp>
#include <core/storage/sframe_data/sframe.hpp>

// ML data
#include <ml/ml_data/ml_data.hpp>
#include <ml/ml_data/metadata.hpp>
#include <core/util/testing_utils.hpp>

// Supervised learning
#include <toolkits/supervised_learning/supervised_learning.hpp>

// Types
#include <model_server/lib/variant.hpp>
#include <model_server/lib/unity_base_types.hpp>
#include <model_server/lib/variant_deep_serialize.hpp>
#include <model_server/lib/flex_dict_view.hpp>

#include <model_server/lib/toolkit_function_macros.hpp>
#include <core/storage/serialization/serialization_includes.hpp>

namespace turi {
namespace supervised {
/**
 * Compute coefficient standard errors from the Hessian of the loss: the
 * square roots of the diagonal of the inverse Hessian.
 */
inline Eigen::Matrix<double, Eigen::Dynamic, 1> get_stderr_from_hessian(
    const Eigen::Matrix<double, Eigen::Dynamic, Eigen::Dynamic>& hessian) {
  DASSERT_EQ(hessian.rows(), hessian.cols());
  return hessian.inverse().diagonal().cwiseSqrt();
}
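
// Example sketch: for a diagonal Hessian the standard errors are simply the
// reciprocal square roots of the diagonal entries. The values are
// illustrative only.
//
//   Eigen::MatrixXd hessian(2, 2);
//   hessian << 4.0,  0.0,
//              0.0, 16.0;
//   Eigen::VectorXd se = get_stderr_from_hessian(hessian);
//   // se = (0.5, 0.25): sqrt of the diagonal of the inverse Hessian.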
/**
 * Is this model a classifier? By naming convention, classifier model names
 * contain the substring "classifier".
 */
inline bool is_classifier(std::string model_name){
  if (model_name.find("classifier") != std::string::npos) {
    return true;
  }
  return false;
}
/**
 * Setup the ml_data for prediction using the model's current metadata.
 */
inline ml_data setup_ml_data_for_prediction(
    const sframe& X,
    const std::shared_ptr<supervised_learning_model_base>& model,
    ml_missing_value_action missing_value_action) {

  ml_data data;
  data = model->construct_ml_data_using_current_metadata(X, missing_value_action);
  return data;
}
/**
 * Setup the ml_data for evaluation using the model's current metadata.
 */
inline ml_data setup_ml_data_for_evaluation(
    const sframe& X,
    const sframe& y,
    const std::shared_ptr<supervised_learning_model_base>& model,
    ml_missing_value_action missing_value_action) {

  ml_data data;
  data = model->construct_ml_data_using_current_metadata(X, y, missing_value_action);
  return data;
}
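
// Usage sketch (caller-side names such as test_X, test_y, and model are
// assumed): the model's existing metadata is reused so that train-time and
// predict-time feature indexing stay consistent.
//
//   ml_data pred_data = setup_ml_data_for_prediction(
//       test_X, model, ml_missing_value_action::IMPUTE);
//   ml_data eval_data = setup_ml_data_for_evaluation(
//       test_X, test_y, model, ml_missing_value_action::ERROR);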
/**
 * Check that the input data is not empty.
 */
inline void check_empty_data(sframe X){
  if (X.num_rows() == 0){
    log_and_throw("Input data does not contain any rows.");
  }

  if (X.num_columns() == 0){
    log_and_throw("Input data does not contain any features.");
  }
}
/**
 * Check that the target column has an appropriate type for the given model.
 */
inline void check_target_column_type(std::string model_name, sframe y){
  DASSERT_TRUE(y.num_columns() == 1);

  std::stringstream ss;
  std::string model_name_for_display = "";
  if (model_name == "classifier_svm"){
    model_name_for_display = "SVM";
  } else if (model_name == "classifier_logistic_regression"){
    model_name_for_display = "Logistic Regression";
  }

  // Classifiers require an integer or string target column.
  if (model_name == "classifier_svm" ||
      model_name == "classifier_logistic_regression" ||
      model_name == "random_forest_classifier" ||
      model_name == "decision_tree_classifier" ||
      model_name == "boosted_trees_classifier"){

    if (y.column_type(0) != flex_type_enum::INTEGER &&
        y.column_type(0) != flex_type_enum::STRING){
      ss << "Column type of target '" << y.column_name(0)
         << "' must be int or str." << std::endl;
      log_and_throw(ss.str());
    }

  } else {

    // Regression-style models require an integer or float target column.
    if (y.column_type(0) != flex_type_enum::INTEGER &&
        y.column_type(0) != flex_type_enum::FLOAT){
      ss << "Column type of target '" << y.column_name(0)
         << "' must be int or float." << std::endl;
      log_and_throw(ss.str());
    }
  }
}
/**
 * Setup an SFrame as test input to predict, predict_topk, or classify:
 * apply the requested missing-value policy and restrict the columns to the
 * features the model was trained on.
 */
inline sframe setup_test_data_sframe(
    const sframe& sf,
    std::shared_ptr<supervised_learning_model_base> model,
    ml_missing_value_action missing_value_action) {

  sframe ret;
  check_empty_data(sf);

  auto expected_columns = model->get_feature_names();
  switch (missing_value_action) {
    case ml_missing_value_action::IMPUTE:
      ret = model->impute_missing_columns_using_current_metadata(sf);
      break;
    case ml_missing_value_action::USE_NAN:
      if (model->support_missing_value()) {
        ret = model->impute_missing_columns_using_current_metadata(sf);
      } else {
        log_and_throw("Model doesn't support missing value, please set missing_value_action to \"impute\"");
      }
      break;
    case ml_missing_value_action::ERROR:
      ret = sf;
      break;
    default:
      log_and_throw("Invalid missing value action");
  }

  ret = ret.select_columns(expected_columns);
  return ret;
}
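
// Usage sketch (raw_sf and model are assumed caller-side names): align a raw
// test SFrame with the model's expected feature columns before building the
// ml_data used by predict/classify.
//
//   sframe test_sf = setup_test_data_sframe(raw_sf, model,
//                                           ml_missing_value_action::IMPUTE);
//   ml_data data = setup_ml_data_for_prediction(
//       test_sf, model, ml_missing_value_action::IMPUTE);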
/**
 * Fill an Eigen expression `x` with a row of ml_data using reference encoding
 * for categorical features: the first (reference) category of each categorical
 * column is dropped and the remaining categories shift down by one.
 */
template <typename EigenExpr>
GL_HOT_INLINE_FLATTEN
inline void fill_reference_encoding(
    const ml_data_row_reference& row_ref, EigenExpr&& x) {

  x.setZero();

  row_ref.unpack(
      // Write each (feature, value) pair of the row into x.
      [&](ml_column_mode mode, size_t column_index,
          size_t feature_index, double value,
          size_t index_size, size_t index_offset) {

        if (UNLIKELY(feature_index >= index_size))
          return;

        // Offset of this column's block in the reference-encoded output
        // (assumes one dropped reference category per preceding column).
        size_t offset = index_offset - column_index;
        size_t idx = offset + feature_index;

        if (mode_is_categorical(mode)) {
          if (feature_index != 0) {
            // Shift past the dropped reference category.
            idx -= 1;
          } else {
            // The reference category itself is not encoded.
            return;
          }
        }

        DASSERT_LT(idx, size_t(x.size()));
        x.coeffRef(idx) = value;
      },

      // Nothing to do at the end of each column.
      [&](ml_column_mode, size_t, size_t) {});
}
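
// Usage sketch (sizes and iterator API assumed): encode each row into a dense
// Eigen vector whose length matches the reference-encoded feature dimensions
// (taken here as the coefficient count minus the intercept, for a single
// coefficient block).
//
//   Eigen::VectorXd x(get_number_of_coefficients(data.metadata()) - 1);
//   for (auto it = data.get_iterator(); !it.done(); ++it) {
//     fill_reference_encoding(*it, x);
//     // ... use x, e.g. accumulate a gradient contribution ...
//   }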
/**
 * Warn about features whose variance is (nearly) zero, and fail hard if any
 * feature statistics contain inf/nan values.
 */
inline void check_feature_means_and_variances(
    const std::shared_ptr<ml_metadata> metadata,
    bool display_warnings = true) {

  std::stringstream ss;
  std::vector<std::string> error_columns;

  // Columns whose entries are all (nearly) identical.
  for (size_t cid = 0; cid < metadata->num_columns(); cid++){
    const auto stats = metadata->statistics(cid);
    size_t index_size = metadata->index_size(cid);
    std::string col = metadata->column_name(cid);
    for (size_t i = 0; i < index_size; i++) {
      if (std::abs(stats->stdev(i)) < 1e-20) {
        error_columns.push_back(col);
        break;
      }
    }
  }

  if (error_columns.size() && display_warnings) {
    ss << "WARNING: Detected extremely low variance for feature(s) ";
    for (size_t i = 0; i < error_columns.size()-1; i++){
      ss << "'" << error_columns[i] << "', ";
    }
    ss << "'" << error_columns[error_columns.size()-1] << "'"
       << " because all entries are nearly the same.\n"
       << "Proceeding with model training using all features. "
       << "If the model does not provide results of adequate quality, "
       << "exclude the above mentioned feature(s) from the input dataset.";
    logprogress_stream << ss.str() << std::endl;
    ss.str("");
  }

  // Columns whose statistics contain inf/nan values.
  error_columns.clear();
  bool column_with_nan = false;
  for (size_t cid = 0; cid < metadata->num_columns(); cid++){
    const auto stats = metadata->statistics(cid);
    size_t index_size = metadata->index_size(cid);
    std::string col = metadata->column_name(cid);
    for (size_t i = 0; i < index_size; i++) {
      if (!std::isfinite(stats->mean(i))) {
        error_columns.push_back(col);
        column_with_nan = true;
        break;
      }
    }
  }

  if (column_with_nan == true) {
    ss << "Detected inf/nan values in feature(s) ";
    for (size_t i = 0; i < error_columns.size()-1; i++){
      ss << "'" << error_columns[i] << "', ";
    }
    ss << "'" << error_columns[error_columns.size()-1] << "'. "
       << "Cannot proceed with model training.";
    log_and_throw(ss.str());
  }
}
/**
 * Pull the given metrics out of an evaluation map as display strings, in key
 * order.
 */
inline std::vector<std::string> make_evaluation_progress(
    const std::map<std::string, float>& eval_map,
    const std::vector<std::string> keys) {
  std::vector<std::string> ret;
  if (!eval_map.empty()) {
    for (const auto& k : keys) {
      ret.push_back(std::to_string(eval_map.at(k)));
    }
  }
  return ret;
}
/**
 * Assemble one row of progress output: iteration stats, then interleaved
 * training/validation metric values, then the processing speed.
 */
inline std::vector<std::string> make_progress_string(
    size_t iter, size_t examples, double time,
    const std::vector<std::string>& train_eval,
    const std::vector<std::string>& valid_eval,
    float speed, bool padding_valid_eval) {

  std::vector<std::string> ret;
  ret.push_back(std::to_string(iter));
  ret.push_back(std::to_string(examples));
  ret.push_back(std::to_string(time));
  for (size_t i = 0; i < train_eval.size(); ++i) {
    ret.push_back(train_eval[i]);
    if (!valid_eval.empty()) {
      ret.push_back(valid_eval[i]);
    } else if (padding_valid_eval) {
      // Keep the validation column aligned when no validation metrics exist
      // (placeholder value assumed).
      ret.push_back("");
    }
  }
  ret.push_back(std::to_string(speed));
  return ret;
}
/**
 * Build the progress table header: the stat columns followed by one column per
 * tracked training metric, plus a validation column for each metric when
 * validation data is present. The second element of each pair is the column
 * width.
 */
inline std::vector<std::pair<std::string, size_t>> make_progress_header(
    supervised_learning_model_base& smodel,
    const std::vector<std::string>& stat_headers,
    bool has_validation_data) {

  auto header = std::vector<std::pair<std::string, size_t>>();
  for (const auto& s : stat_headers) {
    header.push_back({s, 8});
  }

  auto metrics = std::vector<std::string>();
  for (const auto& metric : smodel.get_tracking_metrics()) {
    metrics.push_back(metric);
  }

  for (const auto& m : metrics) {
    std::string dm = smodel.get_metric_display_name(m);
    header.push_back({std::string("Training ") + dm, 6});
    if (has_validation_data)
      header.push_back({std::string("Validation ") + dm, 6});
  }

  return header;
}
/**
 * Build one row of the progress table from the current training (and optional
 * validation) metrics of the model.
 */
inline std::vector<std::string> make_progress_row_string(
    supervised_learning_model_base& smodel,
    const ml_data& data,
    const ml_data& valid_data,
    const std::vector<std::string>& stats) {

  auto train_eval = std::vector<std::string>();
  for (auto& kv : smodel.evaluate(data, "train")) {
    train_eval.push_back(std::to_string(variant_get_value<double>(kv.second)));
  }

  auto valid_eval = std::vector<std::string>();
  bool has_validation_data = valid_data.num_rows() > 0;
  if (has_validation_data) {
    for (auto& kv : smodel.evaluate(valid_data, "train")) {
      valid_eval.push_back(std::to_string(variant_get_value<double>(kv.second)));
    }
  }

  auto ret = std::vector<std::string>();
  for (const auto& s : stats) {
    ret.push_back(s);
  }

  for (size_t i = 0; i < train_eval.size(); ++i) {
    ret.push_back(train_eval[i]);
    if (!valid_eval.empty()) {
      ret.push_back(valid_eval[i]);
    } else if (has_validation_data) {
      // Placeholder to keep the columns aligned (value assumed).
      ret.push_back("");
    }
  }

  return ret;
}
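
// Usage sketch (stat names and variables are illustrative): build the table
// header once, then emit one row per iteration.
//
//   auto header = make_progress_header(smodel, {"Iteration", "Elapsed Time"},
//                                      /* has_validation_data */ true);
//   auto row = make_progress_row_string(
//       smodel, train_data, valid_data,
//       {std::to_string(iter), std::to_string(elapsed_seconds)});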
/**
 * Get the class weights from the model options.
 *
 * None gives every class a weight of 1, "auto" weights each class inversely
 * proportionally to its frequency, and a dictionary of per-class weights is
 * validated against the observed classes.
 */
inline flexible_type get_class_weights_from_options(
    const option_manager& options,
    const std::shared_ptr<ml_metadata>& metadata){

  size_t num_classes = 2;
  num_classes = metadata->target_index_size();
  auto indexer = metadata->target_indexer();
  auto stats = metadata->target_statistics();

  flex_dict class_weights(num_classes);
  flexible_type class_weights_option = options.value("class_weights");

  if (class_weights_option.get_type() == flex_type_enum::UNDEFINED) {

    // None: all classes get weight 1.
    for (size_t i = 0; i < num_classes; i++){
      class_weights[i] = {indexer->map_index_to_value(i), 1.0};
    }

  } else if (class_weights_option == "auto") {

    // "auto": weight each class inversely proportionally to its frequency,
    // normalized so that the weights sum to 1.
    float total = 0.0;
    for (size_t i = 0; i < num_classes; i++){
      total += 1.0 / stats->count(i);
    }
    for (size_t i = 0; i < num_classes; i++){
      class_weights[i] = {indexer->map_index_to_value(i),
                          1.0 / (total * stats->count(i))};
    }

  } else if (class_weights_option.get_type() == flex_type_enum::DICT) {

    // Dictionary: every observed class must have a weight.
    flex_dict_view class_weights_view(class_weights_option);
    for (size_t i = 0; i < num_classes; i++){
      if (!class_weights_view.has_key(indexer->map_index_to_value(i))){
        std::stringstream ss;
        ss << "The parameter class_weight does not contain a weight for the "
           << "class " << indexer->map_index_to_value(i) << "."
           << " Make sure that the types of the keys in the class_weight "
           << "dictionary are the same as the type of the target column."
           << std::endl;
        log_and_throw(ss.str());
      }
    }

    // Every weight must be a positive numeric value.
    size_t i = 0;
    for (const auto& kvp : class_weights_option.get<flex_dict>()){
      bool is_numeric = (kvp.second.get_type() == flex_type_enum::INTEGER ||
                         kvp.second.get_type() == flex_type_enum::FLOAT);
      if (is_numeric && (float)kvp.second > 0) {
        float weight = (float)kvp.second;
        class_weights[i++] = {kvp.first, weight};
      } else {
        std::stringstream ss;
        ss << "The class_weight parameter for the class " << kvp.first
           << " must be a positive numeric value." << std::endl;
        log_and_throw(ss.str());
      }
    }

  } else {

    std::stringstream ss;
    ss << "The class_weights parameter cannot be of type "
       << flex_type_enum_to_name(class_weights_option.get_type()) << "."
       << " Class weights must be a dictionary, None or 'auto'" << std::endl;
    log_and_throw(ss.str());
  }

  return class_weights;
}
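
// Worked example for the "auto" setting (illustrative counts): with class
// counts {A: 100, B: 25},
//   total     = 1/100 + 1/25        = 0.05
//   weight(A) = 1 / (0.05 * 100)    = 0.2
//   weight(B) = 1 / (0.05 *  25)    = 0.8
// so the weights sum to 1 and rarer classes receive larger weights.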
/**
 * Map each class to the number of training examples observed for it.
 */
inline std::map<flexible_type, size_t> get_num_examples_per_class(
    std::shared_ptr<ml_metadata> metadata){

  std::map<flexible_type, size_t> examples_per_class;
  for (size_t k = 0; k < metadata->target_index_size(); k++){
    examples_per_class[metadata->target_indexer()->map_index_to_value(k)] =
        metadata->target_statistics()->count(k);
  }
  return examples_per_class;
}
/**
 * Get the list of class names from the target metadata.
 */
inline std::vector<flexible_type> get_class_names(
    std::shared_ptr<ml_metadata> metadata){

  std::vector<flexible_type> classes;
  classes.resize(metadata->target_index_size());
  for (size_t k = 0; k < classes.size(); k++){
    classes[k] = metadata->target_indexer()->map_index_to_value(k);
  }
  return classes;
}
/**
 * Number of coefficients in a linear model: one per numeric dimension, one per
 * non-reference category of each categorical column, plus the intercept.
 */
inline size_t get_number_of_coefficients(std::shared_ptr<ml_metadata> metadata){

  size_t num_coefficients = 1;  // intercept
  for (size_t i = 0; i < metadata->num_columns(); i++) {
    if (metadata->is_categorical(i)) {
      num_coefficients += metadata->index_size(i) - 1;
    } else {
      num_coefficients += metadata->index_size(i);
    }
  }
  return num_coefficients;
}
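
// Worked example (illustrative schema): one numeric column (1 dimension) and
// one categorical column with 4 categories gives
//   1 (intercept) + 1 (numeric) + (4 - 1) (categorical) = 5 coefficients.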
/**
 * Add a column of missing (None) standard errors to a coefficient SFrame.
 */
inline sframe add_na_std_err_to_coef(const sframe& sf_coef) {
  // A column of missing values, one per coefficient row (constructor
  // arguments assumed).
  auto sa = std::make_shared<sarray<flexible_type>>(
      flexible_type(FLEX_UNDEFINED), sf_coef.num_rows());
  return sf_coef.add_column(sa, std::string("stderr"));
}
/**
 * Expand reference-encoded coefficients into a one-hot layout, inserting a 0.0
 * coefficient for the dropped reference category of each categorical column.
 */
inline void get_one_hot_encoded_coefs(
    const Eigen::Matrix<double, Eigen::Dynamic, 1>& coefs,
    std::shared_ptr<ml_metadata> metadata,
    std::vector<double>& one_hot_coefs) {

  size_t num_classes = metadata->target_index_size();
  bool is_classifier = metadata->target_is_categorical();
  // Classifiers store one coefficient block per non-reference class;
  // regression models store a single block (adjustment assumed).
  if (is_classifier) {
    num_classes -= 1;
  } else {
    num_classes = 1;
  }

  size_t idx = 0;
  for (size_t c = 0; c < num_classes; c++) {
    for (size_t i = 0; i < metadata->num_columns(); ++i) {
      size_t start_idx = 0;
      if (metadata->is_categorical(i)) {
        // The dropped reference category gets a zero coefficient.
        one_hot_coefs.push_back(0.0);
        start_idx = 1;
      }
      for (size_t j = start_idx; j < metadata->index_size(i); ++j) {
        one_hot_coefs.push_back(coefs[idx]);
        idx++;
      }
    }
    // Intercept for this block.
    one_hot_coefs.push_back(coefs[idx++]);
  }
}
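
// Worked example (illustrative values): for a single categorical column with
// 3 categories, the reference-encoded coefficients (b1, b2) of the two
// non-reference categories expand to the one-hot layout (0.0, b1, b2), where
// 0.0 stands in for the dropped reference category.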
/**
 * Convert a coefficient vector (and optional standard errors) into an SFrame
 * with one row per (feature, index[, class]) combination plus intercepts.
 */
inline sframe get_coefficients_as_sframe(
    const Eigen::Matrix<double, Eigen::Dynamic, 1>& coefs,
    std::shared_ptr<ml_metadata> metadata,
    const Eigen::Matrix<double, Eigen::Dynamic, 1>& std_err) {

  bool is_classifier = metadata->target_is_categorical();
  bool has_stderr = std_err.size() > 0;
  DASSERT_EQ(std_err.size(), has_stderr * coefs.size());

  // Output schema.
  std::vector<std::string> coef_names;
  coef_names.push_back("name");
  coef_names.push_back("index");
  if (is_classifier) coef_names.push_back("class");
  coef_names.push_back("value");
  if (has_stderr) coef_names.push_back("stderr");

  std::vector<flex_type_enum> coef_types;
  coef_types.push_back(flex_type_enum::STRING);
  coef_types.push_back(flex_type_enum::STRING);
  if (is_classifier) coef_types.push_back(metadata->target_column_type());
  coef_types.push_back(flex_type_enum::FLOAT);
  if (has_stderr) coef_types.push_back(flex_type_enum::FLOAT);

  sframe sf_coef;
  sf_coef.open_for_write(coef_names, coef_types, "", 1);
  auto it_sf_coef = sf_coef.get_output_iterator(0);

  // Expand feature names and indices in the same order as the coefficients.
  std::vector<flexible_type> feature_names;
  std::vector<flexible_type> feature_index;
  feature_names.reserve(metadata->num_dimensions());
  feature_index.reserve(metadata->num_dimensions());

  for (size_t i = 0; i < metadata->num_columns(); ++i) {
    bool skip_zero = metadata->is_categorical(i);  // reference category
    for (size_t j = skip_zero ? 1 : 0; j < metadata->index_size(i); ++j) {
      feature_names.push_back(metadata->column_name(i));
      if (metadata->is_indexed(i)) {
        feature_index.push_back(
            metadata->indexer(i)->map_index_to_value(j).to<flex_string>());
      } else if (metadata->column_mode(i) == ml_column_mode::NUMERIC) {
        feature_index.push_back(FLEX_UNDEFINED);
      } else {
        feature_index.push_back(std::to_string(j));
      }
    }
  }

  if (is_classifier) {
    // One block of coefficients per non-reference class; the intercept is the
    // last entry of each block.
    size_t num_classes = metadata->target_index_size();
    size_t variables_per_class = coefs.size() / (num_classes - 1);
    for (size_t k = 1; k < num_classes; k++){

      // Intercept for class k.
      std::vector<flexible_type> x(4 + has_stderr);
      x[0] = "(intercept)";
      x[1] = FLEX_UNDEFINED;
      x[2] = (metadata->target_indexer())->map_index_to_value(k);
      x[3] = coefs(variables_per_class * k - 1);
      if (has_stderr) x[4] = std_err(variables_per_class * k - 1);
      *it_sf_coef = x;
      ++it_sf_coef;

      // Feature coefficients for class k.
      for (size_t i = 0; i < feature_names.size(); ++i) {
        x[0] = feature_names[i];
        x[1] = feature_index[i];
        x[2] = (metadata->target_indexer())->map_index_to_value(k);
        x[3] = coefs(variables_per_class * (k-1) + i);
        if (has_stderr) x[4] = std_err(variables_per_class * (k-1) + i);
        *it_sf_coef = x;
        ++it_sf_coef;
      }
    }
  } else {
    // Intercept.
    std::vector<flexible_type> x(3 + has_stderr);
    x[0] = "(intercept)";
    x[1] = FLEX_UNDEFINED;
    x[2] = coefs(coefs.size() - 1);
    if (has_stderr) x[3] = std_err(std_err.size() - 1);
    *it_sf_coef = x;
    ++it_sf_coef;

    // Feature coefficients.
    for (size_t i = 0; i < feature_names.size(); ++i) {
      x[0] = feature_names[i];
      x[1] = feature_index[i];
      x[2] = coefs(i);
      if (has_stderr) x[3] = std_err(i);
      *it_sf_coef = x;
      ++it_sf_coef;
    }
  }

  sf_coef.close();
  return sf_coef;
}
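
// Usage sketch (hessian, coefs, and metadata are assumed to come from the
// trained model's solver): export coefficients together with standard errors
// derived from the Hessian.
//
//   Eigen::VectorXd std_err = get_stderr_from_hessian(hessian);
//   sframe sf_coef = get_coefficients_as_sframe(coefs, metadata, std_err);
//   // Resulting columns: name, index[, class], value, stderr.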
/**
 * Convenience overload: coefficients without standard errors.
 */
inline sframe get_coefficients_as_sframe(
    const Eigen::Matrix<double, Eigen::Dynamic, 1>& coefs,
    std::shared_ptr<ml_metadata> metadata) {
  Eigen::Matrix<double, Eigen::Dynamic, 1> EMPTY;
  return get_coefficients_as_sframe(coefs, metadata, EMPTY);
}
/**
 * Count the number of examples of each class directly from an SArray of
 * target values.
 */
inline std::map<flexible_type, size_t> get_num_examples_per_class_from_sarray(
    std::shared_ptr<sarray<flexible_type>> sa){
  auto reader = sa->get_reader();
  std::map<flexible_type, size_t> unique_values;
  for (size_t seg_id = 0; seg_id < sa->num_segments(); seg_id++){
    auto iter = reader->begin(seg_id);
    auto enditer = reader->end(seg_id);
    while (iter != enditer) {
      if (unique_values.find(*iter) == unique_values.end()){
        unique_values.insert({*iter, 0});
      }
      ++unique_values[*iter];
      ++iter;
    }
  }
  return unique_values;
}

} // namespace supervised
} // namespace turi

#endif