6 #ifndef TURI_DML_DATA_COLUMN_INDEXER_H_ 7 #define TURI_DML_DATA_COLUMN_INDEXER_H_ 9 #include <core/data/flexible_type/flexible_type.hpp> 10 #include <core/util/hash_value.hpp> 11 #include <core/logging/assertions.hpp> 12 #include <core/util/bitops.hpp> 13 #include <ml/ml_data/ml_data_column_modes.hpp> 14 #include <core/storage/serialization/serialization_includes.hpp> 15 #include <core/generics/hopscotch_map.hpp> 16 #include <core/parallel/pthread_tools.hpp> 20 namespace ml_data_internal {
86 mode == ml_column_mode::CATEGORICAL
87 || mode == ml_column_mode::CATEGORICAL_VECTOR
88 || mode == ml_column_mode::DICTIONARY);
90 DASSERT_FALSE(values_by_index_threadlocal_accumulator.empty());
91 DASSERT_LT(thread_idx, values_by_index_threadlocal_accumulator.size());
98 auto throw_error = [&]() GL_GCC_ONLY(GL_COLD_NOINLINE) {
99 log_and_throw(std::string(
"Value encountered in column '")
100 + column_name +
"' is of type '" 102 "' cannot be mapped to a categorical value." +
103 " Categorical values must be integer, strings, or None.");
111 size_t first_index = wt.
n_bit_index(_column_indexer_first_level_lookup_size_n_bits);
112 DASSERT_LT(first_index, index_by_values_lookup.size());
113 auto& lock_ht_pair = index_by_values_lookup[first_index];
115 std::lock_guard<simple_spinlock> lg(lock_ht_pair.first);
116 auto it = lock_ht_pair.second.find(wt);
120 if(it == lock_ht_pair.second.end()) {
121 index = (++_column_size) - 1;
122 values_by_index_threadlocal_accumulator[thread_idx].push_back({index, feature});
123 lock_ht_pair.second[wt] = index;
146 mode == ml_column_mode::CATEGORICAL
147 || mode == ml_column_mode::CATEGORICAL_VECTOR
148 || mode == ml_column_mode::DICTIONARY);
155 auto throw_error = [&]() GL_GCC_ONLY(GL_COLD_NOINLINE) {
156 log_and_throw(std::string(
"Value encountered in column '")
157 + column_name +
"' is of type '" 159 "' cannot be mapped to a categorical value." +
160 " Categorical values must be integer, strings, or None.");
168 size_t first_index = wt.
n_bit_index(_column_indexer_first_level_lookup_size_n_bits);
169 DASSERT_LT(first_index, index_by_values_lookup.size());
170 auto& lock_ht_pair = index_by_values_lookup[first_index];
172 auto it = lock_ht_pair.second.find(wt);
174 if(it == lock_ht_pair.second.end()) {
211 || mode == ml_column_mode::CATEGORICAL_VECTOR
212 || mode == ml_column_mode::DICTIONARY);
214 DASSERT_MSG(idx !=
size_t(-1),
215 "Index not tracked in metadata table!");
217 DASSERT_MSG(idx < values_by_index_lookup.size(),
218 "Index not in metadata table; using correct metadata?");
220 return values_by_index_lookup[idx];
// NOTE(review): only the declaration is visible in this excerpt. Going by the
// name, this looks like a debug-time comparison of this indexer against
// `other` -- confirm against the definition before relying on that.
274 void debug_check_is_equal(std::shared_ptr<column_indexer> other)
const;
/** Returns a const reference to the column name held in `column_name`. */
276 const std::string& name()
const {
return column_name; }
/** Returns a const reference to the `flex_type_enum` held in `original_column_type`. */
280 const flex_type_enum& column_type()
const {
return original_column_type;}
// Name of the column; returned by name() and embedded in the error message
// raised for values that cannot be mapped to a categorical value.
285 std::string column_name;
// Value -> index lookup, split into buckets. A bucket is selected with
// hash.n_bit_index(_column_indexer_first_level_lookup_size_n_bits); each bucket
// pairs a spinlock with a hash_value -> index map. The inserting lookup path
// takes the bucket's spinlock before calling find() / inserting.
295 std::vector<std::pair<simple_spinlock, hopscotch_map<hash_value, size_t> > >
296 index_by_values_lookup;
// One vector per thread index (indexed by thread_idx). Whenever a new index is
// assigned, the (index, value) pair is appended to the calling thread's vector.
298 std::vector<std::vector<std::pair<size_t, flexible_type> > >
299 values_by_index_threadlocal_accumulator;
// Index -> value table; indexed directly by the integer index to return the
// original value.
301 std::vector<flexible_type> values_by_index_lookup;
// Atomic counter of assigned indices; a new index is taken as
// (++_column_size) - 1.
304 atomic<size_t> _column_size = 0;
// NOTE(review): the use sites of this mutex are not visible in this excerpt;
// presumably it guards modifications of the index tables -- confirm.
306 mutex index_modification_lock;
319 size_t version = m->get_version();
325 } END_OUT_OF_PLACE_SAVE()
330 arc >> is_not_nullptr;
336 m->load_version(arc, version);
339 m = std::shared_ptr<ml_data_internal::column_indexer>(
nullptr);
341 } END_OUT_OF_PLACE_LOAD()
#define BEGIN_OUT_OF_PLACE_LOAD(arc, tname, tval)
Macro to make it easy to define out-of-place loads.
size_t get_version() const
The serialization input archive object which, provided with a reference to an istream, will read from the istream, providing deserialization capabilities.
size_t n_bit_index(size_t n_bits) const
Returns the top n_bits bits of the hash.
size_t immutable_map_value_to_index(const flexible_type &feature) const
size_t indexed_column_size() const
static constexpr int _column_indexer_first_level_lookup_size_n_bits
const char * flex_type_enum_to_name(flex_type_enum en)
size_t map_value_to_index(size_t thread_idx, const flexible_type &feature) GL_HOT
void save_impl(turi::oarchive &oarc) const
std::set< flex_type_enum > extract_key_types() const
flex_type_enum get_type() const
#define DASSERT_FALSE(cond)
std::vector< flexible_type > reset_and_return_values()
void set_indices(std::vector< flexible_type > &&values)
void load_version(turi::iarchive &iarc, size_t version)
std::set< T > values(const std::map< Key, T > &map)
void debug_check_is_internally_consistent() const
const flexible_type & map_index_to_value(size_t idx) const
The serialization output archive object which, provided with a reference to an ostream, will write to the ostream, providing serialization capabilities.
#define DASSERT_TRUE(cond)
#define BEGIN_OUT_OF_PLACE_SAVE(arc, tname, tval)
Macro to make it easy to define out-of-place saves.
void insert_values_into_index(const std::vector< flexible_type > &features)