Turi Create  4.0
column_unique_indexer.hpp
1 /* Copyright © 2017 Apple Inc. All rights reserved.
2  *
3  * Use of this source code is governed by a BSD-3-clause license that can
4  * be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
5  */
6 #ifndef TURI_ML2_DATA_UNIQUE_COLUMN_INDEXER_H_
7 #define TURI_ML2_DATA_UNIQUE_COLUMN_INDEXER_H_
8 
9 #include <core/data/flexible_type/flexible_type.hpp>
10 #include <core/util/hash_value.hpp>
11 #include <core/logging/assertions.hpp>
12 #include <core/util/bitops.hpp>
13 #include <core/storage/serialization/serialization_includes.hpp>
14 #include <core/generics/hopscotch_map.hpp>
15 #include <core/parallel/pthread_tools.hpp>
16 #include <toolkits/ml_data_2/indexing/column_indexer.hpp>
17 
18 namespace turi { namespace v2 { namespace ml_data_internal {
19 
20 /** Use a two-level hash table to store the index mappings. The
21  * first level is constant size and unlocked, determined by an n-bit
22  * hash. Each leaf in this one contains a hash table and lock. This
23  * significantly reduces lock contention. This
24  * _column_metadata_first_level_lookup_size_n_bits gives the number
25  * of bits used for this first lookup.
26  */
27 static constexpr int _column_unique_indexer_first_level_lookup_size_n_bits = 8;
28 
29 /**
30  * column_metadata contains "meta data" concerning indexing of a single column
31  * of an SFrame. A collection of meta_data column objects is "all" the
32  * metadata required in the ml_data container.
33  */
34 class column_unique_indexer final : public column_indexer {
35 
36  public:
37 
38  /**
39  * Default constructor; does nothing;
40  * Initialize from a serialization stream
41  */
43 
44  /** Initialize the index mapping and setup. There are certain
45  * internal parallel things that need to be set up before
46  * map_value_to_index works. Call this before looping over
47  * map_value_to_index, then call finalize() when done.
48  */
49  void initialize();
50 
51  /** Returns the index associated with the "feature" value.
52  *
53  * \note Only used if is_categorical is true.
54  *
55  * If the value in the feature column was already seen, then the index
56  * already associated with that value is returned. If not, a new unique
57  * index is added and associated with this feature value.
58  *
59  * This method is completely threadsafe and is meant to be called by
60  * multiple threads in contention.
61  *
62  * \param[in] feature The value in the feature column to map to the index.
63  * \return An index (possibly new) associated with the given value.
64  */
65  size_t map_value_to_index(size_t thread_idx, const flexible_type& feature) GL_HOT;
66 
67  /** Returns the index associated with the "feature" value.
68  *
69  * \note Only used if is_categorical is true.
70  *
71  * If the value in the feature column was already seen, then the
72  * index already associated with that value is returned. If not,
73  * size_t(-1) is returned.
74  *
75  * \param[in] feature The value in the feature column to map to the index.
76  * \return An index associated with the given value. If the index is not
77  * present. We return size_t(-1).
78  */
79  size_t immutable_map_value_to_index(const flexible_type& feature) const;
80 
81  /** Some of the ml_data tests currently depend on the order of
82  * insertion into the index, which is now done in parallel and
83  * thus not deterministic. This function allows the user to
84  * remove that randomness by inserting all indices in a specified
85  * order.
86  *
87  * NOTE: This function is not thread safe; only call it from one
88  * thread.
89  */
90  void insert_values_into_index(const std::vector<flexible_type>& features);
91 
92  /** When a new value is encountered in translating the data, it
93  * should be dealt with through map_value_to_index above if it is
94  * categorical, or through register_real_value below if it is
95  * numeric. This function handles things like checking the size
96  * of the numeric vectors (all must be the same) and setting the
97  * column size.
98  *
99  * Note that the statistics collection functions below are not
100  * always called; hence the error checks in register_real_value
101  * can't go there.
102  */
103  void register_real_value(const flexible_type& feature);
104 
105 
106  /** Call this when all calls to map_value_to_index are completed.
107  */
108  void finalize();
109 
110  /** Returns the feature "value" associated an index.
111  *
112  * \note Only used if is_categorical is true.
113  *
114  * \param[\in] idx Index associated with the feature value.
115  * \return The "value" in the original data associated with the given id.
116  */
117  flexible_type map_index_to_value(size_t idx) const;
118 
119  /** Calculates the type of the values held in the index. This may
120  * be different from original_column_type -- if the
121  * original_column_type is a DICT or LIST, this will return a set
122  * of the actual types present. If the values are
123  * inconsistent, then an error is raised.
124  *
125  * This method is useful when a metadata built with a dictionary is
126  * also used to map simple categorical variables.
127  */
128  std::set<flex_type_enum> extract_key_types() const;
129 
130  /** Returns the size of the column.
131  *
132  * Numeric : 1
133  * Categorical : # Unique categories
134  * Vector : Size of the vector.
135  *
136  * \return Column size.
137  */
138  inline size_t indexed_column_size() const {
139  return _column_size;
140  }
141 
142  /** Returns the current version used for the serialization.
143  */
144  size_t get_version() const;
145 
146  /**
147  * Serialize the object (save).
148  */
149  void save_impl(turi::oarchive& oarc) const;
150 
151  /**
152  * Load the object.
153  */
154  void load_version(turi::iarchive& iarc, size_t version);
155 
156  /** Set data directly.
157  *
158  */
159  void set_values(std::vector<flexible_type>&& values);
160 
161  std::vector<flexible_type> reset_and_return_values();
162 
163 
164  /** Create a copy with the index cleared.
165  */
166  std::shared_ptr<column_indexer> create_cleared_copy() const;
167 
168  /** Returns a lambda function that can be used as a lambda function for deindexing
169  * a column.
170  */
171  std::function<flexible_type(const flexible_type&)> deindexing_lambda() const;
172 
173  /** Returns a lambda function that can be used as a lambda function for indexing
174  * a column.
175  *
176  * Does not add any new index values.
177  */
178  std::function<flexible_type(const flexible_type&)> indexing_lambda() const;
179 
180  private:
181 
182  std::vector<std::pair<simple_spinlock, hopscotch_map<hash_value, size_t> > >
183  index_by_values_lookup;
184 
185  std::vector<std::vector<std::pair<size_t, flexible_type> > >
186  values_by_index_threadlocal_accumulator;
187 
188  std::vector<flexible_type> values_by_index_lookup;
189 
190  // If we have numeric
191  atomic<size_t> _column_size = 0;
192 
193  mutex index_modification_lock;
194 };
195 
196 }}}
197 
198 #endif
std::function< flexible_type(const flexible_type &)> deindexing_lambda() const
The serialization input archive object which, provided with a reference to an istream, will read from the istream, providing deserialization capabilities.
Definition: iarchive.hpp:60
std::function< flexible_type(const flexible_type &)> indexing_lambda() const
void register_real_value(const flexible_type &feature)
void load_version(turi::iarchive &iarc, size_t version)
void save_impl(turi::oarchive &oarc) const
void insert_values_into_index(const std::vector< flexible_type > &features)
std::shared_ptr< column_indexer > create_cleared_copy() const
size_t immutable_map_value_to_index(const flexible_type &feature) const
std::set< flex_type_enum > extract_key_types() const
flexible_type map_index_to_value(size_t idx) const
void set_values(std::vector< flexible_type > &&values)
std::set< T > values(const std::map< Key, T > &map)
Definition: stl_util.hpp:386
The serialization output archive object which, provided with a reference to an ostream, will write to the ostream, providing serialization capabilities.
Definition: oarchive.hpp:80
size_t map_value_to_index(size_t thread_idx, const flexible_type &feature) GL_HOT