6 #ifndef TURI_ML2_DATA_SIDE_FEATURES_H_ 7 #define TURI_ML2_DATA_SIDE_FEATURES_H_ 9 #include <toolkits/ml_data_2/data_storage/ml_data_row_format.hpp> 10 #include <toolkits/ml_data_2/data_storage/ml_data_side_feature_translation.hpp> 11 #include <toolkits/ml_data_2/data_storage/internal_metadata.hpp> 12 #include <toolkits/ml_data_2/ml_data_entry.hpp> 13 #include <core/storage/serialization/serialization_includes.hpp> 64 void add_and_index_side_data(
// Declaration only: no body for add_and_index_side_data appears in this listing.
// The side sframe is taken by value; the override and option maps by const reference.
sframe unindexed_side_sframe,
65 const std::map<std::string, ml_column_mode>& mode_override,
66 const std::map<std::string, flexible_type>& options,
// NOTE(review): the embedded numbering jumps from 66 to 68, so a parameter
// line may be missing from this listing -- confirm against the real header.
68 bool immutable_metadata,
// forced_join_column is optional and defaults to the empty string.
69 const std::string& forced_join_column =
"");
// NOTE(review): the members of side_feature_info and the signature of the
// accessor whose body follows are not visible in this listing (the embedded
// numbering jumps from 73 to 88).
73 struct side_feature_info {
// Assertion: main_column_index must address an entry of side_lookups.
88 DASSERT_LT(main_column_index, side_lookups.size());
// Per-column side information for the requested main column.
90 const column_side_info& csi = side_lookups[main_column_index];
// data_lookup_map is only indexed when main_feature_index is within its size;
// the alternative taken for an out-of-range index is not visible here.
97 ( (main_feature_index < csi.data_lookup_map.size())
98 ? csi.data_lookup_map[main_feature_index]
// The result is brace-initialized from the column's start index, csi.rm and
// block_ptr (the declaration of block_ptr is on a line not shown).
101 return {csi.column_index_start, csi.rm, block_ptr};
// Overload working on a std::vector<EntryType> row for an explicit
// (main_column_index, feature_index) pair.
// NOTE(review): the line carrying the return type and function name (108) is
// missing from this listing, as are the bodies of the two `if` guards below.
107 template <
typename EntryType>
109 std::vector<EntryType>& x,
size_t main_column_index,
size_t feature_index)
const {
// Assertion: main_column_index must address an entry of side_lookups.
111 DASSERT_LT(main_column_index, side_lookups.size());
113 const column_side_info& csi = side_lookups[main_column_index];
// Special case: feature_index lies beyond data_lookup_map (handling not shown).
118 if(feature_index >= csi.data_lookup_map.size())
// Special case: no block pointer for this feature (handling not shown).
123 if(block_ptr ==
nullptr)
// Hands x to append_raw_to_entry_row together with csi.rm, block_ptr and the
// column start index of this side block.
126 ml_data_internal::append_raw_to_entry_row(csi.rm, block_ptr, x, csi.column_index_start);
// std::array overload taking an explicit feature_index.
// The only visible statement is an unconditional ASSERT_MSG(false, ...) whose
// message says arrays are not compatible with side features; none of the
// parameters are used by the visible code.
// NOTE(review): the line carrying the function name (131) is missing here.
130 template <
typename EntryType,
size_t n>
132 std::array<EntryType, n>& x,
size_t main_column_index,
size_t feature_index)
const {
133 ASSERT_MSG(
false,
"Programming Error: arrays not compatible with side features.");
// Non-template overload for rows of ml_data_entry; a const member carrying the
// GL_HOT annotation.
// NOTE(review): both the name line and the body of this overload are missing
// from this listing, so its behavior cannot be described from here.
142 std::vector<ml_data_entry>& x,
size_t main_column_index)
const GL_HOT {
// Overload for rows whose entries expose a `global_index` member.
// NOTE(review): the name line (149) and the name of the callee (151) are
// missing from this listing; only the argument list of the call is visible.
148 template <
typename GlobalEntryType>
150 std::vector<GlobalEntryType>& x,
size_t main_column_index)
const {
// Arguments forwarded: the row, the main column index, and the entry's
// global_index reduced by that column's global_index_offset()
// (presumably yielding a column-local feature index -- confirm).
152 x, main_column_index,
153 x[main_column_index].global_index - _full_metadata[main_column_index]->global_index_offset());
// std::array overload without an explicit feature index.
// The only visible statement is an unconditional ASSERT_MSG(false, ...) whose
// message says arrays are not compatible with side features.
// NOTE(review): the line carrying the function name (158) is missing here.
157 template <
typename EntryType,
size_t n>
159 std::array<EntryType, n>& x,
size_t main_column_index)
const {
160 ASSERT_MSG(
false,
"Programming Error: arrays not compatible with side features.");
// Visible body of the column-index range query (signature line not shown).
// Assertion: main_column_index must address an entry of side_lookups.
171 DASSERT_LT(main_column_index, side_lookups.size());
172 const column_side_info& csi = side_lookups[main_column_index];
// Returns the pair (start, start + count): the side block's first column index
// and that index advanced by the number of entries in csi.rm.metadata_vect.
174 return std::make_pair(csi.column_index_start,
175 csi.column_index_start + csi.rm.metadata_vect.size());
// Visible body of the global-index range query (signature line not shown).
// Assertion: main_column_index must address an entry of side_lookups.
186 DASSERT_LT(main_column_index, side_lookups.size());
187 const column_side_info& csi = side_lookups[main_column_index];
// Column-index bounds of the side block: start, and start plus the number of
// entries in csi.rm.metadata_vect.
189 size_t start_idx = csi.column_index_start;
190 size_t end_idx = csi.column_index_start + csi.rm.metadata_vect.size();
// Empty side block is special-cased; the value returned in that case is on a
// line not visible in this listing.
192 if(start_idx == end_idx)
// First element: global index offset of the first side column.
// Second element: offset of the last side column plus its index_size().
195 return std::make_pair(_full_metadata[start_idx]->global_index_offset(),
196 (_full_metadata[end_idx - 1]->global_index_offset()
197 + _full_metadata[end_idx - 1]->index_size()));
// Visible part of the row-wide overload (signature and loop body not shown).
// Assertion: x must hold exactly as many entries as there are main columns.
205 DASSERT_EQ(x.size(), main_metadata.size());
// The size is captured once before the loop, so the loop bound stays fixed at
// the original entry count (presumably because the loop body grows x -- confirm).
207 const size_t x_size = x.size();
// Visits each of the original entries by index.
209 for(
size_t i = 0; i < x_size; ++i) {
// Template overload parameterized on GlobalEntryType.
// NOTE(review): the signature lines (218-219) and the loop body are missing
// from this listing.
217 template <
typename GlobalEntryType>
// Assertion: x must hold exactly as many entries as there are main columns.
220 DASSERT_EQ(x.size(), main_metadata.size());
// Loop bound is fixed at the entry count observed before the loop starts.
222 const size_t x_size = x.size();
224 for(
size_t i = 0; i < x_size; ++i) {
// std::array overload.
// The only visible statement is an unconditional ASSERT_MSG(false, ...) whose
// message says arrays are not compatible with side features.
// NOTE(review): the signature line (233) is missing from this listing.
232 template <
typename EntryType,
size_t n>
234 ASSERT_MSG(
false,
"Programming Error: arrays not compatible with side features.");
// Visible part of the ml_data_entry overload. The signature, the definitions
// of lb/ub and the lambda header are on lines missing from this listing.
// std::remove_if moves the entries to keep to the front of x ...
245 auto new_end = std::remove_if(x.begin(), x.end(),
// ... where an entry is dropped when its column_index lies in [lb, ub).
247 return (lb <= v.column_index) && (v.column_index < ub);
// Truncate x to the kept prefix.
250 x.resize(new_end - x.begin());
// Template overload for entries exposing a `global_index` member.
// NOTE(review): the signature and the definitions of lb/ub (lines 257-261)
// are missing from this listing.
256 template <
typename GlobalEntryType>
// std::remove_if moves the entries to keep to the front of x ...
262 auto new_end = std::remove_if(x.begin(), x.end(),
263 [&](
const GlobalEntryType& v) {
// ... where an entry is dropped when its global_index lies in [lb, ub).
264 return (lb <= v.global_index) && (v.global_index < ub);
// Truncate x to the kept prefix.
267 x.resize(new_end - x.begin());
// std::array overload.
// The only visible statement is an unconditional ASSERT_MSG(false, ...) whose
// message says arrays are not compatible with side features.
// NOTE(review): the signature line (273) is missing from this listing.
272 template <
typename EntryType,
size_t n>
274 ASSERT_MSG(
false,
"Programming Error: arrays not compatible with side features.");
// Declaration only; the definition is not part of this listing.
// Const member taking a main column index and returning a size_t
// (presumably the number of side columns attached to that column -- confirm).
280 size_t num_columns(
size_t main_column_index)
const;
285 friend class ml_metadata;
// Declaration only; the definition is not part of this listing.
// column_name_map is taken by non-const reference, join_name by const reference.
// NOTE(review): the embedded numbering jumps from 289 to 291, so the first
// parameter line appears to be missing from this listing -- confirm.
289 void uniquify_side_column_names(
291 std::map<std::string, std::string>& column_name_map,
292 const std::string& join_name)
const;
// Read-only accessor: returns a const reference to the _full_metadata member.
310 const std::vector<ml_data_internal::column_metadata_ptr>& get_full_column_metadata()
const {
311 return _full_metadata;
324 size_t main_column_index,
size_t side_column_index)
const;
// Returns the sum of max_row_size over every entry of side_lookups.
330 size_t max_additional_row_size()
const {
// Running total; stays 0 when side_lookups is empty.
331 size_t _max_additional_row_size = 0;
333 for(
const column_side_info& csi : side_lookups) {
334 _max_additional_row_size += csi.max_row_size;
337 return _max_additional_row_size;
// Declaration only; the definition is not part of this listing.
// Const member that takes a vector of column metadata pointers and returns a
// shared_ptr to an ml_data_side_features object.
344 std::shared_ptr<ml_data_side_features> copy_with_new_main_columns(
345 const std::vector<ml_data_internal::column_metadata_ptr>& new_columns)
const;
350 const std::vector<ml_data_internal::column_metadata_ptr> main_metadata;
351 std::map<std::string, size_t> main_column_name_lookup;
// Per-main-column record; side_lookups holds one of these per main column index.
// NOTE(review): this listing omits some lines of the struct -- e.g. the `rm`
// member that other code reads as csi.rm -- and its closing brace.
353 struct column_side_info {
// Column index at which this column's side block starts; defaults to 0.
354 size_t column_index_start = 0;
// Summed across all columns by max_additional_row_size(); defaults to 0.
355 size_t max_row_size = 0;
// String-to-string map of column names (direction of the mapping is not
// evident from this listing).
360 std::map<std::string, std::string> column_name_map;
// Indexed by feature index; callers bounds-check against its size() first.
364 std::vector<ml_data_internal::entry_value_iterator> data_lookup_map;
367 std::vector<column_side_info> side_lookups;
415 std::vector<std::shared_ptr<const ml_data_internal::row_data_block> > raw_row_storage;
419 size_t current_column_index;
421 std::vector<ml_data_internal::column_metadata_ptr> _full_metadata;
The serialization input archive object which, provided with a reference to an istream, will read from the istream, providing deserialization capabilities.
void add_partial_side_features_to_row(std::vector< ml_data_entry > &x, size_t main_column_index) const GL_HOT
void add_partial_side_features_to_row(std::array< EntryType, n > &x, size_t main_column_index) const
Dummy overload that exists only to work around a number of compiler issues; std::array rows are not compatible with side features.
side_feature_info get_side_feature_block(size_t main_column_index, size_t main_feature_index) const GL_HOT_INLINE_FLATTEN
void strip_side_features_from_row(size_t main_column_index, std::array< EntryType, n > &x) const
void strip_side_features_from_row(size_t main_column_index, std::vector< ml_data_entry > &x) const
void add_partial_side_features_to_row(std::vector< EntryType > &x, size_t main_column_index, size_t feature_index) const
void add_partial_side_features_to_row(std::vector< GlobalEntryType > &x, size_t main_column_index) const
#define GL_HOT_INLINE_FLATTEN
void add_side_features_to_row(std::vector< GlobalEntryType > &x) const
void add_partial_side_features_to_row(std::array< EntryType, n > &x, size_t main_column_index, size_t feature_index) const
Dummy overload that exists only to work around a number of compiler issues; std::array rows are not compatible with side features.
size_t num_columns(size_t main_column_index) const
void add_side_features_to_row(std::array< EntryType, n > &x) const
void strip_side_features_from_row(size_t main_column_index, std::vector< GlobalEntryType > &x) const
void add_side_features_to_row(std::vector< ml_data_entry > &x) const
std::pair< size_t, size_t > global_indices_of_side_information_block(size_t main_column_index) const
The serialization output archive object which, provided with a reference to an ostream, will write to the ostream, providing serialization capabilities.
std::pair< size_t, size_t > column_indices_of_side_information_block(size_t main_column_index) const