Turi Create  4.0
side_features.hpp
1 /* Copyright © 2017 Apple Inc. All rights reserved.
2  *
3  * Use of this source code is governed by a BSD-3-clause license that can
4  * be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
5  */
6 #ifndef TURI_ML2_DATA_SIDE_FEATURES_H_
7 #define TURI_ML2_DATA_SIDE_FEATURES_H_
8 
9 #include <toolkits/ml_data_2/data_storage/ml_data_row_format.hpp>
10 #include <toolkits/ml_data_2/data_storage/ml_data_side_feature_translation.hpp>
11 #include <toolkits/ml_data_2/data_storage/internal_metadata.hpp>
12 #include <toolkits/ml_data_2/ml_data_entry.hpp>
13 #include <core/storage/serialization/serialization_includes.hpp>
14 #include <vector>
15 #include <map>
16 #include <memory>
17 
18 namespace turi {
19 
20 class sframe;
21 
22 namespace v2 {
23 
24 /** A class to manage possible sources of side information.
25  */
27 
28  private:
29  friend class ml_data;
30 
31  /** Main constructor. To be constructed only from within ml_data.
32  */
33  ml_data_side_features(const std::vector<ml_data_internal::column_metadata_ptr>& main_metadata);
34 
35  /** Should not be assigning things; use the copy constructor.
36  */
37  const ml_data_side_features& operator=(const ml_data_side_features&) = delete;
38 
39  /** Add in a new source of side information. This may be called
40  * many times to include new information.
41  *
42  * Joining is done by selecting a column with name the same as the
43  * name of the column in the main_metadata provided to the
44  * constructor of this class. If no column is found, or if there
45  * are multiple columns matching, an error is thrown.
46  *
47  * The new side information is indexed using the metadata indexer,
48  * with the column on which the join is performed indexed with the
49  * same metadata as the original column.
50  *
51  * It is possible to call this function multiple times. If the join
52  * column is new, it adds a block of column indices matched to that
53  * side information. When the observation vector is filled in with
54  * side information -- i.e. the join is performed -- these column
55  * indices have their own unique block. The metadata for that block
56  * can be accessed through get_full_column_metadata().
57  *
58  * If the join column is the same as a previous one, any new entries
59  * replace the previous entries. Only one table of side information
60  * is allowed per join column, so the schemas must match up. In
61  * this case, the column_is_categorical parameter can be omitted.
62  *
63  */
64  void add_and_index_side_data(sframe unindexed_side_sframe,
65  const std::map<std::string, ml_column_mode>& mode_override,
66  const std::map<std::string, flexible_type>& options,
67  bool training_mode,
68  bool immutable_metadata,
69  const std::string& forced_join_column = "");
70 
71  public:
72 
// Value returned by get_side_feature_block().
73  struct side_feature_info {
// First aggregate member; get_side_feature_block() initializes it from
// column_side_info::column_index_start.
74  size_t column_offset;
// NOTE(review): get_side_feature_block() brace-initializes this struct
// with three values (column start, csi.rm, block pointer), but only one
// member is declared in this copy -- the other two declarations appear
// to have been lost; restore them from the upstream header.
77  };
78 
79  ////////////////////////////////////////////////////////////////////////////////
80  // Versions of this with both the main column index and the feature index given.
81 
82  /** Returns a pointer to the raw location
83  *
84  */
85  side_feature_info get_side_feature_block(
86  size_t main_column_index, size_t main_feature_index) const GL_HOT_INLINE_FLATTEN {
87 
88  DASSERT_LT(main_column_index, side_lookups.size());
89 
90  const column_side_info& csi = side_lookups[main_column_index];
91 
92  // Get the pointer to the row of entry values. If it's out of
93  // range or the lookup resolves to the null pointer, then there is
94  // no side information for this value.
95 
97  ( (main_feature_index < csi.data_lookup_map.size())
98  ? csi.data_lookup_map[main_feature_index]
100 
101  return {csi.column_index_start, csi.rm, block_ptr};
102  }
103 
104 
105  /// Overload of the above. Appends the side features associated with
106  /// exactly one of the main columns to the observation vector x.
107  template <typename EntryType>
109  std::vector<EntryType>& x, size_t main_column_index, size_t feature_index) const {
110 
111  DASSERT_LT(main_column_index, side_lookups.size());
112 
113  const column_side_info& csi = side_lookups[main_column_index];
114 
115  // Get the pointer to the row of entry values. If it's out of
116  // range or the lookup resolves to the null pointer, then there is
117  // no side information for this value.
118  if(feature_index >= csi.data_lookup_map.size())
119  return;
120 
121  const ml_data_internal::entry_value* block_ptr = csi.data_lookup_map[feature_index];
122 
123  if(block_ptr == nullptr)
124  return;
125 
126  ml_data_internal::append_raw_to_entry_row(csi.rm, block_ptr, x, csi.column_index_start);
127  }
128 
129  /// Dummy overload to make a number of compiler issues easier.
130  template <typename EntryType, size_t n>
132  std::array<EntryType, n>& x, size_t main_column_index, size_t feature_index) const {
133  ASSERT_MSG(false, "Programming Error: arrays not compatible with side features.");
134  }
135 
136  ////////////////////////////////////////////////////////////////////////////////
137  // Versions of this with only the main index given.
138 
139  /// Appends the side features associated with exactly one of the
140  /// main columns to the observation vector x.
142  std::vector<ml_data_entry>& x, size_t main_column_index) const GL_HOT {
143  add_partial_side_features_to_row(x, main_column_index, x[main_column_index].index);
144  }
145 
146  /// Appends the side features associated with exactly one of the
147  /// main columns to the observation vector x.
148  template <typename GlobalEntryType>
150  std::vector<GlobalEntryType>& x, size_t main_column_index) const {
152  x, main_column_index,
153  x[main_column_index].global_index - _full_metadata[main_column_index]->global_index_offset());
154  }
155 
156  /// Dummy overload to make a number of compiler issues easier.
157  template <typename EntryType, size_t n>
159  std::array<EntryType, n>& x, size_t main_column_index) const {
160  ASSERT_MSG(false, "Programming Error: arrays not compatible with side features.");
161  }
162 
163  /** Returns the bounds on the column indices of data
164  * associated with a particular column.
165  *
166  * In a full std::vector<ml_data_entry> observation, the entries
167  * with column indices between these two values will be from the
168  * side data associated with main_column_index.
169  */
170  std::pair<size_t, size_t> column_indices_of_side_information_block(size_t main_column_index) const {
171  DASSERT_LT(main_column_index, side_lookups.size());
172  const column_side_info& csi = side_lookups[main_column_index];
173 
174  return std::make_pair(csi.column_index_start,
175  csi.column_index_start + csi.rm.metadata_vect.size());
176  }
177 
178  /** Returns the bounds on the global indices of data associated with
179  * a particular column.
180  *
181  * In a full std::vector<ml_data_entry> observation, the entries
182  * with column indices between these two values will be from the
183  * side data associated with main_column_index.
184  */
185  std::pair<size_t, size_t> global_indices_of_side_information_block(size_t main_column_index) const {
186  DASSERT_LT(main_column_index, side_lookups.size());
187  const column_side_info& csi = side_lookups[main_column_index];
188 
189  size_t start_idx = csi.column_index_start;
190  size_t end_idx = csi.column_index_start + csi.rm.metadata_vect.size();
191 
192  if(start_idx == end_idx)
193  return {0,0};
194 
195  return std::make_pair(_full_metadata[start_idx]->global_index_offset(),
196  (_full_metadata[end_idx - 1]->global_index_offset()
197  + _full_metadata[end_idx - 1]->index_size()));
198  }
199 
200  /** Appends all available side information to the vector x based on
201  * current values in x.
202  */
203  inline void add_side_features_to_row(std::vector<ml_data_entry>& x) const {
204 
205  DASSERT_EQ(x.size(), main_metadata.size());
206 
207  const size_t x_size = x.size();
208 
209  for(size_t i = 0; i < x_size; ++i) {
211  }
212  }
213 
214  /** Appends all available side information to the vector x based on
215  * current values in x.
216  */
217  template <typename GlobalEntryType>
218  inline void add_side_features_to_row(std::vector<GlobalEntryType>& x) const {
219 
220  DASSERT_EQ(x.size(), main_metadata.size());
221 
222  const size_t x_size = x.size();
223 
224  for(size_t i = 0; i < x_size; ++i) {
225  add_partial_side_features_to_row(x, i, x[i].global_index - _full_metadata[i]->global_index_offset());
226  }
227  }
228 
229  /** Overload of above; this case shouldn't ever be called -- i.e.,
230  * if you are using side features, you should be using vectors, not arrays.
231  */
232  template <typename EntryType, size_t n>
233  inline void add_side_features_to_row(std::array<EntryType, n>& x) const {
234  ASSERT_MSG(false, "Programming Error: arrays not compatible with side features.");
235  }
236 
237 
238  /** Strips out the side features in the row associated with main_column_index.
239  */
240  inline void strip_side_features_from_row(size_t main_column_index, std::vector<ml_data_entry>& x) const {
241 
242  size_t lb, ub;
243  std::tie(lb, ub) = column_indices_of_side_information_block(main_column_index);
244 
245  auto new_end = std::remove_if(x.begin(), x.end(),
246  [&](const ml_data_entry& v) {
247  return (lb <= v.column_index) && (v.column_index < ub);
248  });
249 
250  x.resize(new_end - x.begin());
251  }
252 
253  /** Strips out the side features in the row associated with
254  * main_column_index --
255  */
256  template <typename GlobalEntryType>
257  inline void strip_side_features_from_row(size_t main_column_index, std::vector<GlobalEntryType>& x) const {
258 
259  size_t lb, ub;
260  std::tie(lb, ub) = global_indices_of_side_information_block(main_column_index);
261 
262  auto new_end = std::remove_if(x.begin(), x.end(),
263  [&](const GlobalEntryType& v) {
264  return (lb <= v.global_index) && (v.global_index < ub);
265  });
266 
267  x.resize(new_end - x.begin());
268  }
269 
270  /** Can't strip out arrays.
271  */
272  template <typename EntryType, size_t n>
273  inline void strip_side_features_from_row(size_t main_column_index, std::array<EntryType, n>& x) const {
274  ASSERT_MSG(false, "Programming Error: arrays not compatible with side features.");
275  }
276 
277  /** Returns the number of columns joined off of column
278  * main_column_index in the main data.
279  */
280  size_t num_columns(size_t main_column_index) const;
281 
282  ////////////////////////////////////////////////////////////////////////////////
283 
284  private:
285  friend class ml_metadata;
286 
287  /** Uniquify the side column names.
288  */
289  void uniquify_side_column_names(
290  sframe& side_sframe,
291  std::map<std::string, std::string>& column_name_map,
292  const std::string& join_name) const;
293 
294  /// Serialization -- save and load functions. NOTE: these assume
295  /// that the correct metadata has already been set.
296  void save_without_metadata(turi::oarchive& oarc) const;
297  void load_with_metadata_present(turi::iarchive& iarc);
298 
299 
300  // The main side column information is intended to be accessed from
301  // the metadata class, not directly...
302 
303  /** Returns the full metadata for all columns, including side
304  * information. The full metadata contains the metadata for all the
305  * columns concatenated, as opposed to just the metadata of the main
306  * observation sframe. If you have just user-items in the main data,
307  * and 2 additional columns joined on user, main_metadata will be
308  * length 2 and get_full_metadata() will be length 4.
309  */
310  const std::vector<ml_data_internal::column_metadata_ptr>& get_full_column_metadata() const {
311  return _full_metadata;
312  }
313 
314  ////////////////////////////////////////////////////////////////////////////////
315  // Additional accessor functions to make accessing parts of the
316  // metadata and side features easier.
317 
318  /** Returns the metadata in the main_column_index with
319  * side_column_index. The main_column_index determines which side
320  * table we are referring to. Within that side table, each column
321  * has metadata associated with it.
322  */
323  const ml_data_internal::column_metadata& get_column_metadata(
324  size_t main_column_index, size_t side_column_index) const;
325 
326  /** Returns the maximum row size that is added on by the side data.
327  * This is the sum of the maximum possible sizes of the rows in
328  * each column.
329  */
330  size_t max_additional_row_size() const {
331  size_t _max_additional_row_size = 0;
332 
333  for(const column_side_info& csi : side_lookups) {
334  _max_additional_row_size += csi.max_row_size;
335  }
336 
337  return _max_additional_row_size;
338  }
339 
340  /** This function is needed to remap things for the metadata
341  * select_columns function. When selecting a subset of columns,
342  * this copies over the metadata so that it matches the newly selected columns.
343  */
344  std::shared_ptr<ml_data_side_features> copy_with_new_main_columns(
345  const std::vector<ml_data_internal::column_metadata_ptr>& new_columns) const;
346 
347  private:
348 
349  /// The main metadata
350  const std::vector<ml_data_internal::column_metadata_ptr> main_metadata;
351  std::map<std::string, size_t> main_column_name_lookup;
352 
// Per-main-column bookkeeping for the side table joined on that column.
// side_lookups holds one of these per main column index.
353  struct column_side_info {
// Column index at which this side block starts in a full observation.
354  size_t column_index_start = 0;
// Summed over all main columns by max_additional_row_size().
355  size_t max_row_size = 0;
356 
// NOTE(review): other members of this class read a member `rm` from this
// struct (e.g. csi.rm.metadata_vect.size()), but no such member is
// declared in this copy -- its declaration appears to have been lost
// here; restore it from the upstream header.
358 
359  // A map to track column uniquify renamings.
360  std::map<std::string, std::string> column_name_map;
361 
362  // A map of pointers into the raw data below, indexed by the feature index
363  // of the main column.  A null pointer means no side features for that value.
364  std::vector<ml_data_internal::entry_value_iterator> data_lookup_map;
365  };
366 
367  std::vector<column_side_info> side_lookups;
368 
369  ////////////////////////////////////////////////////////////
370  // Raw storage for the side data, and how it is laid out.
371 
372  /// All the rows of side information are stored somewhere in the
373  /// vectors in raw_row_storage as a block of entry_values. Indexing
374  /// into the raw storage is provided by the data_lookup_map in the
375  /// column_side_info structure. To access the side row pointed to
376  /// by index j of column k, you would read the row starting at the
377  /// pointer in side_lookups[k].data_lookup_map[j]. If that pointer
378  /// is null, there is no side information provided for that row.
379  ///
380  /// Thus the data layout is simply some raw storage area in which
381  /// all the column information is dumped. It's completely unordered
382  /// and has no organizational structure -- it's designed entirely so
383  /// that the data_lookup_map can store pointers into it. Because
384  /// it's a shared_ptr to a vector, the memory inside it will never
385  /// be moved (unless this class is destroyed). Thus the pointers in
386  /// data_lookup_map give the raw address of the data to copy into
387  /// the observation.
388  ///
389  /// The number of columns to read is given by
390  /// side_lookups[k].rm.metadata_vect.size(), with a lookup of how to
391  /// interpret them given by the value of
392  /// side_lookups[k].column_types[c_idx]. Depending on the column
393  /// types, the data layout is different.
394  ///
395  /// - If it's numeric, then that contribution is a single entry_value
396  /// with double_value filled to that entry.
397  ///
398  /// - If it's categorical, its contribution is a single entry_value
399  /// with index_value filled to the index of its categorical
400  /// variable.
401  ///
402  /// - If it's a vector, then the first entry_value::index_value
403  /// gives the number of values after that go into that value. For
404  /// example, if it's [3, 4.0, 1.5, 2.0, x, ...], then 3 would say
405  /// it's a vector of length 3, which would be values 4.0, 1.5,
406  /// and 2.0. x would be the start of the next column.
407  ///
408  /// - If it's a dictionary, then the first entry_value::index_value
409  /// gives the number of (index, value) pairs after it that
410  /// go into that column. For example, if it's [2, 32, 2.0, 16,
411  /// 3.0, x, ...], then 2 would say there are 2 index-value pairs
412  /// following it, (32, 2.0) and (16, 3.0). x would be the start of
413  /// the next column.
414 
415  std::vector<std::shared_ptr<const ml_data_internal::row_data_block> > raw_row_storage;
416 
417  /// The column index telling us where to put new side information
418  /// blocks
419  size_t current_column_index;
420 
421  std::vector<ml_data_internal::column_metadata_ptr> _full_metadata;
422 };
423 
424 }}
425 
426 #endif /* TURI_ML2_DATA_SIDE_FEATURES_H_ */
The serialization input archive object which, provided with a reference to an istream, will read from the istream, providing deserialization capabilities.
Definition: iarchive.hpp:60
void add_partial_side_features_to_row(std::vector< ml_data_entry > &x, size_t main_column_index) const GL_HOT
void add_partial_side_features_to_row(std::array< EntryType, n > &x, size_t main_column_index) const
Dummy overload to make a number of compiler issues easier.
side_feature_info get_side_feature_block(size_t main_column_index, size_t main_feature_index) const GL_HOT_INLINE_FLATTEN
void strip_side_features_from_row(size_t main_column_index, std::array< EntryType, n > &x) const
void strip_side_features_from_row(size_t main_column_index, std::vector< ml_data_entry > &x) const
void add_partial_side_features_to_row(std::vector< EntryType > &x, size_t main_column_index, size_t feature_index) const
void add_partial_side_features_to_row(std::vector< GlobalEntryType > &x, size_t main_column_index) const
#define GL_HOT_INLINE_FLATTEN
void add_side_features_to_row(std::vector< GlobalEntryType > &x) const
void add_partial_side_features_to_row(std::array< EntryType, n > &x, size_t main_column_index, size_t feature_index) const
Dummy overload to make a number of compiler issues easier.
size_t num_columns(size_t main_column_index) const
void add_side_features_to_row(std::array< EntryType, n > &x) const
void strip_side_features_from_row(size_t main_column_index, std::vector< GlobalEntryType > &x) const
void add_side_features_to_row(std::vector< ml_data_entry > &x) const
std::pair< size_t, size_t > global_indices_of_side_information_block(size_t main_column_index) const
The serialization output archive object which, provided with a reference to an ostream, will write to the ostream, providing serialization capabilities.
Definition: oarchive.hpp:80
std::pair< size_t, size_t > column_indices_of_side_information_block(size_t main_column_index) const