6 #ifndef TURI_ML2_DATA_H_ 7 #define TURI_ML2_DATA_H_ 11 #include <core/storage/sframe_data/sframe.hpp> 12 #include <model_server/lib/extensions/option_manager.hpp> 13 #include <toolkits/ml_data_2/metadata.hpp> 14 #include <toolkits/ml_data_2/ml_data_entry.hpp> 15 #include <toolkits/ml_data_2/ml_data_column_modes.hpp> 16 #include <toolkits/ml_data_2/data_storage/ml_data_row_format.hpp> 17 #include <toolkits/ml_data_2/side_features.hpp> 19 #include <Eigen/SparseCore> 22 namespace turi {
namespace v2 {
24 class ml_data_iterator;
25 class ml_data_block_iterator;
26 class ml_data_iterator_base;
28 namespace ml_data_internal {
29 class ml_data_block_manager;
// Copy constructor: declared only; no definition appears in this listing.
456 ml_data(
const ml_data&);
// Copy assignment: declared only. Note that it returns `const ml_data&`
// rather than the conventional non-const `ml_data&`.
457 const ml_data& operator=(
const ml_data&);
// Move assignment uses the compiler-generated implementation.
458 ml_data& operator=(ml_data&&) =
default;
// Move construction likewise uses the compiler-generated implementation.
460 ml_data(ml_data&&) =
default;
// Lists each option name together with its default value (bool flags and
// string selectors) in a map keyed by option name.
// NOTE(review): the `return {` opener and the closing braces of this function
// are not visible in this listing; only the signature and entries are shown.
464 static std::map<std::string, flexible_type> default_options() {
// Sorting by the first two columns: off by default, with a separate
// "_on_train" flag (presumably applied only to training data — TODO confirm).
469 {
"sort_by_first_two_columns_on_train",
false},
470 {
"sort_by_first_two_columns",
false},
// Row shuffling: off by default, again with an "_on_train" variant.
472 {
"shuffle_rows_on_train",
false},
473 {
"shuffle_rows",
false},
// Indexer and statistics implementations for columns, selected by name.
475 {
"column_indexer_type",
"unique"},
476 {
"column_statistics_type",
"basic-dense"},
// Missing-value policy differs by phase: "error" on train, "impute" on predict.
478 {
"missing_value_action_on_train",
"error"},
479 {
"missing_value_action_on_predict",
"impute"},
// Integer columns are not treated as categorical unless this is switched on.
481 {
"integer_columns_categorical_by_default",
false},
// Target-column type forcing; both flags default to off.
483 {
"target_column_always_numeric",
false},
484 {
"target_column_always_categorical",
false},
// Indexer and statistics implementations for the target column.
486 {
"target_column_indexer_type",
"unique"},
488 {
"target_column_statistics_type",
"basic-dense"},
// Side-data column name handling; off by default.
490 {
"uniquify_side_column_names",
false},
// Whether columns first seen after training are ignored; off by default.
492 {
"ignore_new_columns_after_train",
false}
// Constructs an ml_data around an existing, shared metadata object.
// `immutable_metadata` defaults to false; presumably passing true keeps the
// supplied metadata from being modified — TODO confirm against the definition.
500 explicit ml_data(
const std::shared_ptr<ml_metadata>& metadata,
501 bool immutable_metadata =
false);
// Shorthand for the option-map type; used to build empty default arguments.
504 typedef std::map<std::string, flexible_type> flex_map;
// Constructs an ml_data from a map of options. When called with no argument,
// an empty map is passed.
509 explicit ml_data(
const std::map<std::string, flexible_type>& options = flex_map());
// Convenience constructor: builds an option map from a braced list of
// key/value pairs and delegates to the map-taking constructor.
// NOTE(review): the constructor body is not visible in this listing.
514 explicit ml_data(std::initializer_list<std::map<std::string, flexible_type>::value_type> l)
515 : ml_data(
std::map<
std::string, flexible_type>(l))
// Map type for mode overrides: string key -> ml_column_mode (presumably keyed
// by column name — TODO confirm).
519 typedef std::map<std::string, ml_column_mode> column_mode_map;
// set_data overload that identifies the target by column name.
//   data                    - source sframe.
//   target_column           - defaults to ""; presumably "" means "no target"
//                             — TODO confirm.
//   partial_column_ordering - defaults to an empty vector.
//   mode_overrides          - defaults to an empty map. Taken by const value,
//                             so the map is copied on every call.
525 void set_data(
const sframe& data,
526 const std::string& target_column =
"",
527 const std::vector<std::string>& partial_column_ordering = std::vector<std::string>(),
528 const column_mode_map mode_overrides = column_mode_map());
// set_data overload that receives the target as a separate sframe instead of
// a column name.
//   data                    - source sframe.
//   target                  - sframe holding the target.
//   partial_column_ordering - defaults to an empty vector.
//   mode_overrides          - defaults to an empty map. Taken by const value,
//                             so the map is copied on every call.
536 void set_data(
const sframe& data,
537 const sframe& target,
538 const std::vector<std::string>& partial_column_ordering = std::vector<std::string>(),
539 const column_mode_map mode_overrides = column_mode_map());
// Adds an sframe of side data.
//   data               - the side-data sframe.
//   forced_join_column - defaults to ""; presumably "" lets the join column be
//                        chosen automatically — TODO confirm.
//   mode_overrides     - defaults to an empty map; taken by const value (copied).
546 void add_side_data(
const sframe& data,
547 const std::string& forced_join_column =
"",
548 const column_mode_map mode_overrides = column_mode_map());
// fill overload taking the target as a column name (defaults to "").
// NOTE(review): how fill() relates to set_data() is not visible in this
// listing; check the definitions before relying on call order.
554 void fill(
const sframe& data,
555 const std::string& target_column =
"");
// fill overload taking the target as a separate sframe.
561 void fill(
const sframe& data,
562 const sframe& target);
580 inline bool creation_complete()
const {
return incoming_data ==
nullptr; }
// Accessor returning a const reference to a shared_ptr<ml_metadata>.
// NOTE(review): the body is not visible in this listing.
584 inline const std::shared_ptr<ml_metadata>& metadata()
const {
// Column count, forwarded from the attached metadata object.
// Dereferences `_metadata` without a null check.
591 inline size_t num_columns()
const {
592 return _metadata->num_columns();
// Number of rows in the current row range: `_row_end - _row_start`.
597 inline size_t num_rows()
const {
598 return _row_end - _row_start;
// Returns a size_t.
// NOTE(review): the body is not visible in this listing; presumably equivalent
// to num_rows() — TODO confirm.
603 inline size_t size()
const {
// True when the row range is empty, i.e. `_row_start` equals `_row_end`.
609 inline bool empty()
const {
610 return _row_start == _row_end;
// Declared only; returns a size_t.
// presumably the largest row size in the data (cf. the `_max_row_size`
// member) — TODO confirm against the definition.
619 size_t max_row_size()
const;
// Returns an ml_data_iterator by value. Every parameter has a default:
//   thread_idx                      - 0
//   num_threads                     - 1
//   add_side_information_if_present - true
//   use_reference_encoding          - false
// presumably thread_idx/num_threads select this caller's share of the rows
// — TODO confirm against the definition.
631 ml_data_iterator get_iterator(
632 size_t thread_idx=0,
size_t num_threads=1,
633 bool add_side_information_if_present =
true,
634 bool use_reference_encoding =
false)
const;
// Returns an ml_data_block_iterator by value. Takes the same parameters, with
// the same defaults, as get_iterator():
//   thread_idx                      - 0
//   num_threads                     - 1
//   add_side_information_if_present - true
//   use_reference_encoding          - false
641 ml_data_block_iterator get_block_iterator(
642 size_t thread_idx=0,
size_t num_threads=1,
643 bool add_side_information_if_present =
true,
644 bool use_reference_encoding =
false)
const;
// Returns a copy of the `side_features` shared pointer.
// NOTE(review): the embedded line numbers jump from 654 to 657, so statements
// between the signature and the return may be missing from this listing.
654 std::shared_ptr<ml_data_side_features> get_side_features()
const {
657 return side_features;
// Reports the `has_target` flag stored in the row metadata `rm`.
662 bool has_target()
const {
663 return rm.has_target;
// True when the `side_features` pointer is non-null.
668 bool has_side_features()
const {
669 return (side_features !=
nullptr);
// True when the `untranslated_columns` vector is non-empty.
675 bool has_untranslated_columns()
const {
676 return (!untranslated_columns.empty());
// True when the number of untranslated columns differs from
// metadata()->num_columns(false).
// NOTE(review): the meaning of the `false` argument is not visible here —
// confirm against ml_metadata::num_columns.
681 bool has_translated_columns()
const {
682 return (untranslated_columns.size() != metadata()->num_columns(
false));
// Dense column vector of doubles with run-time length.
685 typedef Eigen::Matrix<double, Eigen::Dynamic,1> DenseVector;
// Sparse vector of doubles.
686 typedef Eigen::SparseVector<double> SparseVector;
// Four overloads, one per row representation, each returning a
// std::vector<flexible_type>. Declared only.
// Overload for a row given as ml_data_entry values.
691 std::vector<flexible_type> translate_row_to_original(
const std::vector<ml_data_entry>& v)
const;
// Overload for a row given as ml_data_entry_global_index values.
696 std::vector<flexible_type> translate_row_to_original(
const std::vector<ml_data_entry_global_index>& v)
const;
// Overload for a row given as a DenseVector.
701 std::vector<flexible_type> translate_row_to_original(
const DenseVector& v)
const;
// Overload for a row given as a SparseVector.
706 std::vector<flexible_type> translate_row_to_original(
const SparseVector& v)
const;
// Shared handle to a column indexer.
713 typedef std::shared_ptr<ml_data_internal::column_indexer> indexer_type;
// Static factory returning a shared column indexer.
//   column_name  - name of the column.
//   indexer_type - string selector, defaults to "unique". Note that this
//                  parameter name is the same as the typedef above.
//   options      - defaults to an empty map.
// NOTE(review): the embedded line numbers jump from 720 to 723, so parameters
// may be missing from this listing between `column_name` and `indexer_type`.
719 static indexer_type create_indexer(
720 const std::string& column_name,
723 const std::string& indexer_type =
"unique",
724 const std::map<std::string, flexible_type>& options = flex_map());
// Declared only; const and returns nothing.
// presumably validates that the object can be iterated and reports an error
// otherwise — TODO confirm against the definition.
728 void _check_is_iterable()
const;
// Returns a new ml_data by value; const, so this object is not modified.
//   n_rows      - presumably the number of rows to keep — TODO confirm.
//   random_seed - seed for the subsampling.
747 ml_data create_subsampled_copy(
size_t n_rows,
size_t random_seed)
const;
// Returns a new ml_data by value, built from the given row indices; const.
// NOTE(review): ordering/duplicate requirements on `selection_indices` are not
// visible here — check the definition.
760 ml_data select_rows(
const std::vector<size_t>& selection_indices)
const;
// Returns a new ml_data by value for the rows between start_row and end_row.
// presumably the indices are relative to this object's current row range and
// end_row is exclusive — TODO confirm against the definition.
766 ml_data slice(
size_t start_row,
size_t end_row)
const;
// Same signature as slice().
// presumably the indices here refer to the original, unsliced data rather than
// this object's current row range — TODO confirm against the definition.
772 ml_data absolute_slice(
size_t start_row,
size_t end_row)
const;
780 size_t get_version()
const {
return 1; }
// Gives ml_data_iterator_base access to the members below.
794 friend class ml_data_iterator_base;
// Shared metadata; null until assigned. num_columns() dereferences it.
802 std::shared_ptr<ml_metadata> _metadata =
nullptr;
// Start of the row range used by num_rows() and empty().
// NOTE(review): `_row_end` is used by those methods but its declaration is not
// visible in this listing (embedded numbering jumps from 804 to 806).
804 size_t _row_start = 0;
806 size_t _original_num_rows = 0;
807 size_t _max_row_size = 0;
// Row metadata; has_target() reads rm.has_target.
814 ml_data_internal::row_metadata rm;
// Side features; null when none are attached (see has_side_features()).
820 std::shared_ptr<ml_data_side_features> side_features =
nullptr;
// Initialised to size_t(-1), the largest size_t value; presumably a
// "not yet set" sentinel — TODO confirm.
826 size_t row_block_size = size_t(-1);
// Shared sarray of row_data_block elements.
831 std::shared_ptr<sarray<ml_data_internal::row_data_block> > data_blocks;
// One shared sarray<flexible_type> per untranslated column.
836 std::vector<std::shared_ptr<sarray<flexible_type> > > untranslated_columns;
// Shared block manager; presumably set up by _create_block_manager() below.
842 std::shared_ptr<ml_data_internal::ml_data_block_manager> block_manager;
847 void _create_block_manager();
// Staging state for an ml_data that is still being set up.
// NOTE(review): the embedded line numbers skip values and the closing braces
// of both structs are not visible, so this listing may omit fields.
857 struct _data_for_filling {
// Option map keyed by option name.
860 std::map<std::string, flexible_type> options;
862 bool immutable_metadata;
865 std::string target_column_name;
871 std::vector<std::string> column_ordering;
// Same key/value types as the public column_mode_map typedef.
874 typedef std::map<std::string, ml_column_mode> mode_override_map;
876 mode_override_map mode_overrides;
// One pending side-data entry; its fields mirror add_side_data()'s
// forced_join_column and mode_overrides parameters.
878 struct incoming_side_feature {
880 std::string forced_join_column;
881 mode_override_map mode_overrides;
884 std::vector<incoming_side_feature> incoming_side_features;
// Owned staging data; creation_complete() returns true when this is null.
887 std::unique_ptr<_data_for_filling> incoming_data =
nullptr;
// Internal helpers (leading underscore); all are declared only, so the notes
// below describe signatures, not behaviour.
901 void _setup_ml_metadata();
// Takes a flag saying whether the fill happens in training mode.
919 void _fill_data_blocks(
bool in_training_mode);
// Takes the original sframe.
923 void _setup_untranslated_columns(
const sframe& original_data);
927 void _setup_untranslated_column_readers();
// Takes a vector of owned iterators and returns a single owned iterator.
936 std::unique_ptr<ml_data_iterator> _merge_sorted_ml_data_sources(
937 const std::vector<std::unique_ptr<ml_data_iterator> >& sources);
939 void _sort_user_item_data_blocks();
955 } END_OUT_OF_PLACE_SAVE()
959 arc >> is_not_nullptr;
961 m.reset(
new sarray<turi::v2::ml_data_internal::row_data_block>);
964 m = std::shared_ptr<sarray<turi::v2::ml_data_internal::row_data_block> >(
nullptr);
966 } END_OUT_OF_PLACE_LOAD()
#define BEGIN_OUT_OF_PLACE_LOAD(arc, tname, tval)
Macro to make it easy to define out-of-place loads.
The serialization input archive object: given a reference to an istream, it reads from that stream to provide deserialization.
The serialization output archive object: given a reference to an ostream, it writes to that stream to provide serialization.
#define DASSERT_TRUE(cond)
#define BEGIN_OUT_OF_PLACE_SAVE(arc, tname, tval)
Macro to make it easy to define out-of-place saves.