Turi Create  4.0
column_indexer.hpp
1 /* Copyright © 2017 Apple Inc. All rights reserved.
2  *
3  * Use of this source code is governed by a BSD-3-clause license that can
4  * be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
5  */
6 #ifndef TURI_ML2_DATA_COLUMN_INDEXER_H_
7 #define TURI_ML2_DATA_COLUMN_INDEXER_H_
8 
9 #include <string>
10 #include <map>
11 #include <model_server/lib/variant.hpp>
12 #include <core/data/flexible_type/flexible_type.hpp>
13 #include <core/logging/assertions.hpp>
14 #include <core/storage/serialization/serialization_includes.hpp>
15 #include <toolkits/ml_data_2/ml_data_column_modes.hpp>
16 #include <model_server/lib/variant_deep_serialize.hpp>
17 #include <functional>
18 
19 namespace turi { namespace v2 { namespace ml_data_internal {
20 
21 /** Abstract interface for indexing the values of a single column.
22  *
23  * column_metadata contains "meta data" concerning indexing of a single column
24  * of an SFrame. A collection of meta_data column objects is "all" the
25  * metadata required in the ml_data container.
26  */
28  public:
29 
30  /**
31  * Default constructor.
32  */
34 
35  /** Initialize the index mapping and setup. There are certain
36  * internal parallel things that need to be set up before
37  * map_value_to_index works. Call this before looping over
38  * map_value_to_index, then call finalize() when done.
39  */
40  virtual void initialize() = 0;
41 
42  /** Returns the index associated with the "feature" value.
43  *
44  * \note Only used if is_categorical is true.
45  *
46  * If the value in the feature column was already seen, then the index
47  * already associated with that value is returned. If not, a new unique
48  * index is added and associated with this feature value.
49  *
50  * This method is completely threadsafe and is meant to be called by
51  * multiple threads in contention.
52  * \param[in] thread_idx Presumably identifies the calling thread -- TODO confirm against the implementations.
53  * \param[in] feature The value in the feature column to map to the index.
54  * \return An index (possibly new) associated with the given value.
55  */
56  virtual size_t map_value_to_index(size_t thread_idx, const flexible_type& feature ) = 0;
57 
58  /** Returns the index associated with the "feature" value.
59  *
60  * \note Only used if is_categorical is true.
61  *
62  * If the value in the feature column was already seen, then the
63  * index already associated with that value is returned. If not,
64  * size_t(-1) is returned.
65  *
66  * \param[in] feature The value in the feature column to map to the index.
67  * \return The index associated with the given value, or size_t(-1) if
68  * the value is not present in the index.
69  */
70  virtual size_t immutable_map_value_to_index(const flexible_type& feature) const = 0;
71 
72  /** Some of the ml_data tests currently depend on the order of
73  * insertion into the index, which is now done in parallel and
74  * thus not deterministic. This function allows the user to
75  * remove that randomness by inserting all indices in a specified
76  * order.
77  *
78  * NOTE: This function is not thread safe; only call it from one
79  * thread. The default implementation here is a no-op.
80  */
81  virtual void insert_values_into_index(const std::vector<flexible_type>& features) {}
82 
83  /** Call this when all calls to map_value_to_index are completed
84  * (the counterpart of initialize()). */
85  virtual void finalize() = 0;
86 
87  /** Returns the feature "value" associated with an index.
88  *
89  * \note Only used if is_categorical is true.
90  *
91  * \param[in] idx Index associated with the feature value.
92  * \return The "value" in the original data associated with the given index.
93  */
94  virtual flexible_type map_index_to_value(size_t idx) const {
95  ASSERT_MSG(false, "Indexing not reversable with this indexer."); // Default implementation: unconditionally asserts.
96  return 0; // Presumably only reached if the assertion does not abort -- confirm ASSERT_MSG semantics.
97  }
98 
99  /** Calculates the type(s) of the values held in the index. This may
100  * be different from original_column_type -- if the
101  * original_column_type is a DICT or LIST, this will return
102  * the actual type of the values. If the values are inconsistent,
103  * then an error is raised.
104  *
105  * This method is useful when a metadata built with a dictionary is
106  * also used to map simple categorical variables. The default implementation unconditionally asserts.
107  */
108  virtual std::set<flex_type_enum> extract_key_types() const {
109  ASSERT_MSG(false, "Key type extraction not supported with this indexer."); // Message names this operation, not index reversal.
110  return {flex_type_enum::UNDEFINED}; // Placeholder result after the failed assertion.
111  }
112 
113  /** Returns the size of the column -- e.g. the number of distinct
114  * categories, or the size of the hash space. Only called if the
115  * column is indeed indexed, i.e. if mode_is_indexed(mode) is true.
116  *
117  * Categorical : number of unique categories.
118  *
119  * \return The column size as described above.
120  */
121  virtual size_t indexed_column_size() const = 0;
122 
123  ////////////////////////////////////////////////////////////////////////////////
124  // Methods for creation and serialization
125 
126  /** Returns the current version used for the serialization. The
127  * out-of-place save writes this value ahead of the save_impl() output. */
128  virtual size_t get_version() const = 0;
129 
130  /** Serialize the object (save).
131  * \param[in,out] oarc Output archive to write to. */
132  virtual void save_impl(turi::oarchive& oarc) const = 0;
133 
134  /** Load the object from the input archive iarc.
135  * \param[in] version The version number that the out-of-place load read from the archive. */
136  virtual void load_version(turi::iarchive& iarc, size_t version) = 0;
137 
138  /** Returns a function object, taking and returning a flexible_type,
139  * that can be used for deindexing a column.
140  */
141  virtual std::function<flexible_type(const flexible_type&)> deindexing_lambda() const = 0;
142 
143  /** Returns a function object, taking and returning a flexible_type,
144  * that can be used for indexing a column.
145  *
146  * Does not add any new index values.
147  */
148  virtual std::function<flexible_type(const flexible_type&)> indexing_lambda() const = 0;
149 
150  /** Create a copy with the index cleared.
151  * \return A shared pointer to the newly created indexer. */
152  virtual std::shared_ptr<column_indexer> create_cleared_copy() const = 0;
153 
154  /** The factory method for loading and instantiating the proper class.
155  * \param[in] creation_options The options needed for creating the class. */
156  static std::shared_ptr<column_indexer> factory_create(
157  const std::map<std::string, variant_type>& creation_options);
158  /** Returns the stored creation options; the out-of-place save writes these (plus a "version" entry) with variant_deep_save. */
159  const std::map<std::string, variant_type>& get_serialization_parameters() const {
160  return creation_options;
161  }
162 
163  /** Set data directly.
164  *
165  * \param[in] values The values to set; passed as an rvalue reference. */
166  virtual void set_values(std::vector<flexible_type>&& values) = 0;
167  virtual std::vector<flexible_type> reset_and_return_values() = 0; // NOTE(review): name suggests it clears the index and hands back its values -- confirm.
168 
169 
170  public:
171 
172  /** The name of the column.
173  */
174  std::string column_name;
175 
176  /** The mode of the column.
177  */
178  ml_column_mode mode;
179 
180  /** Original column type
181  */
183 
184  /** A map of the options passed in to ml_data. May include options
185  * for the indexers.
186  */
187  std::map<std::string, flexible_type> options;
188 
189  private:
190 
191  /** A snapshot of the options needed for creating the class.
192  */
193  std::map<std::string, variant_type> creation_options;
194 
195 };
196 
197 }}}
198 
199 ////////////////////////////////////////////////////////////////////////////////
200 // Implement serialization for vector<std::shared_ptr<column_indexer>
201 // > and std::shared_ptr<column_indexer>
202 
203 BEGIN_OUT_OF_PLACE_SAVE(arc, std::shared_ptr<v2::ml_data_internal::column_indexer>, m) {
204  if(m != nullptr) {
205  arc << true;
206 
207  // The leading flag above marks a non-null pointer; the format
208  // version follows it directly in the stream.
209  size_t format_version = m->get_version();
210  arc << format_version;
211 
212  // Work on a copy of the creation options so an entry can be added.
213  std::map<std::string, variant_type> params(
214  m->get_serialization_parameters());
215 
216  // The version is also recorded inside the option map itself.
217  params["version"] = to_variant(m->get_version());
218 
219  variant_deep_save(params, arc);
220  m->save_impl(arc);
221  } else {
222  // Null pointer: only the flag is written.
223  arc << false;
224  }
225 } END_OUT_OF_PLACE_SAVE()
226 
227 
228 BEGIN_OUT_OF_PLACE_LOAD(arc, std::shared_ptr<v2::ml_data_internal::column_indexer>, m) {
229  bool is_not_nullptr = false; // Leading flag written by the out-of-place save.
230  arc >> is_not_nullptr;
231  if(is_not_nullptr) {
232 
233  size_t version = 0; // Format version, stored right after the flag.
234  arc >> version;
235 
236  std::map<std::string, variant_type> creation_options;
237  variant_deep_load(creation_options, arc);
238 
239  m = v2::ml_data_internal::column_indexer::factory_create(creation_options); // Instantiate the indexer before loading into it.
240 
241  m->load_version(arc, version);
242 
243  } else {
244  m = std::shared_ptr<v2::ml_data_internal::column_indexer>(nullptr);
245  }
246 } END_OUT_OF_PLACE_LOAD()
247 
248 
249 #endif
#define BEGIN_OUT_OF_PLACE_LOAD(arc, tname, tval)
Macro to make it easy to define out-of-place loads.
Definition: iarchive.hpp:314
The serialization input archive object which, provided with a reference to an istream, will read from the istream, providing deserialization capabilities.
Definition: iarchive.hpp:60
virtual size_t indexed_column_size() const =0
std::map< std::string, flexible_type > options
virtual void insert_values_into_index(const std::vector< flexible_type > &features)
void variant_deep_load(variant_type &v, iarchive &iarc)
virtual void save_impl(turi::oarchive &oarc) const =0
void variant_deep_save(const variant_type &v, oarchive &oarc)
virtual void set_values(std::vector< flexible_type > &&values)=0
virtual std::shared_ptr< column_indexer > create_cleared_copy() const =0
virtual size_t map_value_to_index(size_t thread_idx, const flexible_type &feature)=0
virtual flexible_type map_index_to_value(size_t idx) const
virtual size_t get_version() const =0
virtual std::function< flexible_type(const flexible_type &)> indexing_lambda() const =0
virtual size_t immutable_map_value_to_index(const flexible_type &feature) const =0
virtual std::set< flex_type_enum > extract_key_types() const
variant_type to_variant(const T &f)
Definition: variant.hpp:308
static std::shared_ptr< column_indexer > factory_create(const std::map< std::string, variant_type > &creation_options)
virtual void load_version(turi::iarchive &iarc, size_t version)=0
std::set< T > values(const std::map< Key, T > &map)
Definition: stl_util.hpp:386
The serialization output archive object which, provided with a reference to an ostream, will write to the ostream, providing serialization capabilities.
Definition: oarchive.hpp:80
virtual std::function< flexible_type(const flexible_type &)> deindexing_lambda() const =0
#define BEGIN_OUT_OF_PLACE_SAVE(arc, tname, tval)
Macro to make it easy to define out-of-place saves.
Definition: oarchive.hpp:346