6 #ifndef TURI_DML_DATA_H_ 7 #define TURI_DML_DATA_H_ 11 #include <core/storage/sframe_data/sframe.hpp> 12 #include <ml/ml_data/metadata.hpp> 13 #include <ml/ml_data/ml_data_entry.hpp> 14 #include <ml/ml_data/ml_data_column_modes.hpp> 16 #include <Eigen/SparseCore> 21 class ml_data_iterator;
23 namespace ml_data_internal {
/** Forward declarations only: these three types are named here so they can be
 *  referred to by pointer/reference/friend declarations further down. Their
 *  definitions are not part of this view.
 */
24 class ml_data_block_manager;
25 struct row_data_block;
26 class ml_data_reconciler;
/** Constructor taking an existing ml_metadata by shared pointer.
 *
 *  Marked explicit, so a std::shared_ptr<ml_metadata> is never implicitly
 *  converted into an ml_data.
 *
 *  \param metadata  Shared pointer to the metadata object.
 *                   NOTE(review): presumably stored in _metadata so that the
 *                   new object reuses an already-built schema -- the
 *                   definition is not visible here; confirm.
 */
275 explicit ml_data(
const std::shared_ptr<ml_metadata>& metadata);
322 void fill(
const sframe& data,
323 const std::string& target_column =
"",
324 const column_mode_map mode_overrides = column_mode_map(),
325 bool immutable_metadata =
false,
351 void fill(
const sframe& data,
352 const std::pair<size_t, size_t>& row_bounds,
353 const std::string& target_column =
"",
354 const column_mode_map mode_overrides = column_mode_map(),
355 bool immutable_metadata =
false,
367 inline const std::shared_ptr<ml_metadata>&
metadata()
const {
374 return _metadata->num_columns();
380 return _row_end - _row_start;
392 return _row_start == _row_end;
/** Returns an ml_data_iterator by value. Const member: callable on a const
 *  ml_data.
 *
 *  \param thread_idx   Defaults to 0.
 *  \param num_threads  Defaults to 1.
 *
 *  NOTE(review): the parameter names suggest the rows are partitioned into
 *  num_threads pieces and the iterator covers piece thread_idx -- the
 *  definition is not visible here; confirm before relying on it.
 */
405 ml_data_iterator get_iterator(
size_t thread_idx=0,
size_t num_threads=1)
const;
411 return rm.has_target;
418 return (!untranslated_columns.empty());
424 return (untranslated_columns.size() != metadata()->num_columns(
false));
434 return _max_row_size;
/** Returns a new ml_data by value; const member, so this object is not
 *  modified through this call.
 *
 *  \param n_rows       NOTE(review): presumably the number of rows to keep in
 *                      the subsample -- confirm, including what happens when
 *                      it exceeds the current row count.
 *  \param random_seed  NOTE(review): presumably seeds the row selection so the
 *                      result is reproducible -- confirm.
 */
451 ml_data create_subsampled_copy(
size_t n_rows,
size_t random_seed)
const;
/** Returns a new ml_data by value; const member.
 *
 *  \param selection_indices  Vector of row indices.
 *                            NOTE(review): whether the indices must be sorted,
 *                            may repeat, or are relative to this object's
 *                            current row range is not visible here -- confirm
 *                            against the definition.
 */
464 ml_data select_rows(
const std::vector<size_t>& selection_indices)
const;
/** Returns a new ml_data by value; const member.
 *
 *  \param start_row  First bound of the requested row range.
 *  \param end_row    Second bound of the requested row range.
 *
 *  NOTE(review): presumably a half-open range [start_row, end_row) relative to
 *  this object's rows -- the definition is not visible here; confirm the
 *  inclusivity of end_row and the behaviour for out-of-range bounds.
 */
470 ml_data slice(
size_t start_row,
size_t end_row)
const;
/** Non-const member returning nothing.
 *
 *  \param reindex_maps  A vector of index vectors.
 *                       NOTE(review): presumably one old-index -> new-index
 *                       map per column, applied to the stored data blocks --
 *                       the definition is not visible here; confirm.
 */
481 void _reindex_blocks(
const std::vector<std::vector<size_t> >& reindex_maps);
/** Friend declarations: ml_data_internal::ml_data_reconciler and the free
 *  function reconcile_distributed_ml_data(ml_data&, const
 *  std::vector<std::string>&) are given access to this class's non-public
 *  members. The second parameter of the function is unnamed in this
 *  declaration.
 */
486 friend class ml_data_internal::ml_data_reconciler;
487 friend void reconcile_distributed_ml_data(
ml_data& data,
const std::vector<std::string>&);
/** Shared pointer to the metadata; null until assigned. Other members in this
 *  file dereference it (e.g. _metadata->num_columns()).
 */
495 std::shared_ptr<ml_metadata> _metadata =
nullptr;
/** Row bound, initially 0. Used elsewhere in this file together with _row_end
 *  (declaration not in this view): `_row_end - _row_start` is returned as a
 *  size, and `_row_start == _row_end` as an emptiness test.
 */
497 size_t _row_start = 0;
/** Initially 0. NOTE(review): presumably the row count of the data as first
 *  filled, before any slicing -- confirm.
 */
499 size_t _original_num_rows = 0;
/** Initially 0; returned unchanged by an accessor elsewhere in this file. */
500 size_t _max_row_size = 0;
/** Initialised to size_t(-1), i.e. the largest size_t value.
 *  NOTE(review): presumably a "not yet set" sentinel -- confirm.
 */
512 size_t row_block_size = size_t(-1);
/** Shared pointer to an sarray of ml_data_internal::row_data_block. */
517 std::shared_ptr<sarray<ml_data_internal::row_data_block> > data_blocks;
/** One shared sarray<flexible_type> per entry. Elsewhere in this file its
 *  emptiness is tested and its size is compared with
 *  metadata()->num_columns(false).
 */
522 std::vector<std::shared_ptr<sarray<flexible_type> > > untranslated_columns;
/** Shared pointer to an ml_data_internal::ml_data_block_manager (only
 *  forward-declared in this view).
 */
528 std::shared_ptr<ml_data_internal::ml_data_block_manager> block_manager;
/** Takes no arguments and returns nothing.
 *  NOTE(review): presumably (re)creates block_manager for the current
 *  data_blocks -- the definition is not visible here; confirm.
 */
533 void _reset_block_manager();
/** Non-const member returning nothing.
 *
 *  \param data                The input sframe.
 *  \param target_column_name  Name of a column in the input.
 *                             NOTE(review): presumably an empty string means
 *                             "no target", matching the "" default on the
 *                             public fill() overloads -- confirm.
 *  \param mode_overrides      A column_mode_map (a map from column name to
 *                             ml_column_mode).
 *
 *  NOTE(review): presumably builds _metadata from the schema of `data` -- the
 *  definition is not visible here; confirm.
 */
545 void _setup_ml_metadata(
const sframe& data,
546 const std::string& target_column_name,
547 const column_mode_map& mode_overrides);
/** Non-const member returning nothing.
 *
 *  \param raw_data            The input sframe.
 *  \param immutable_metadata  Same name as the flag on the public fill()
 *                             overloads. NOTE(review): presumably forwarded
 *                             from there -- confirm.
 *  \param track_statistics    NOTE(review): presumably toggles statistics
 *                             collection while filling -- confirm.
 *  \param row_bounds          A (size_t, size_t) pair.
 *                             NOTE(review): presumably (lower, upper) row
 *                             bounds into raw_data -- confirm inclusivity.
 *  \param sorted_columns      A set of column names.
 *
 *  NOTE(review): std::set is used here, but <set> is not among the includes
 *  visible in this view; it may arrive transitively -- confirm.
 */
565 void _fill_data_blocks(
const sframe& raw_data,
566 bool immutable_metadata,
567 bool track_statistics,
569 const std::pair<size_t, size_t>& row_bounds,
570 const std::set<std::string>& sorted_columns);
/** Non-const member returning nothing.
 *
 *  \param original_data  The input sframe.
 *  \param row_lb         NOTE(review): presumably the lower row bound.
 *  \param row_ub         NOTE(review): presumably the upper row bound;
 *                        inclusivity is not visible here -- confirm.
 *
 *  NOTE(review): presumably populates untranslated_columns from
 *  original_data -- the definition is not visible here; confirm.
 */
574 void _setup_untranslated_columns(
const sframe& original_data,
size_t row_lb,
size_t row_ub);
/** Takes no arguments and returns nothing.
 *  NOTE(review): presumably prepares readers over the sarrays held in
 *  untranslated_columns -- the definition is not visible here; confirm.
 */
578 void _setup_untranslated_column_readers();
/** Friend declaration using the unqualified name ml_data_reconciler.
 *  NOTE(review): an earlier friend declaration in this file names
 *  ml_data_internal::ml_data_reconciler explicitly; whether this unqualified
 *  form refers to the same type depends on the enclosing scope, which is not
 *  visible here -- confirm.
 */
583 friend class ml_data_reconciler;
601 } END_OUT_OF_PLACE_SAVE()
605 arc >> is_not_nullptr;
610 m = std::shared_ptr<sarray<turi::ml_data_internal::row_data_block> >(
nullptr);
612 } END_OUT_OF_PLACE_LOAD()
616 #include <ml/ml_data/ml_data_iterator.hpp> #define BEGIN_OUT_OF_PLACE_LOAD(arc, tname, tval)
Macro to make it easy to define out-of-place loads.
bool has_untranslated_columns() const
const std::shared_ptr< ml_metadata > & metadata() const
bool has_translated_columns() const
std::map< std::string, ml_column_mode > column_mode_map
This is here to get around 2 clang bugs!
size_t max_row_size() const
size_t get_version() const
size_t num_columns() const
#define BEGIN_OUT_OF_PLACE_SAVE(arc, tname, tval)
Macro to make it easy to define out-of-place saves.