Turi Create
4.0
|
#include <toolkits/feature_engineering/topk_indexer.hpp>
Public Member Functions | |
topk_indexer (const size_t &_topk=(size_t) -1, const size_t &_threshold=1, const size_t &_max_threshold=(size_t) -1, const std::string _column_name="") | |
topk_indexer (const topk_indexer &)=delete | |
void | initialize () |
void | insert_or_update (const flexible_type &value, size_t thread_idx=0, size_t count=1) GL_HOT |
size_t | lookup (const flexible_type &value) const |
size_t | lookup_counts (const flexible_type &value) const |
void | finalize () |
flexible_type | inverse_lookup (size_t idx) const |
std::vector< flexible_type > | get_values () const |
size_t | size () const |
size_t | get_version () const |
void | save_impl (turi::oarchive &oarc) const |
void | load_version (turi::iarchive &iarc, size_t version) |
Parallel top-k indexer for categorical variables (uses one-hot-encoding)
Note: This implementation is intended to be general-purpose and will be moved to a more general location later.
// Construct the indexer with the arguments.
auto indexer = topk_indexer(10, 1, (size_t) -1, "column_name_for_error_messages");
indexer.initialize();
// Insert flexible types into the indexer.
for (const flexible_type& v: sa.range_iterator()) { indexer.insert_or_update(v); }
// Finalize mapping (drops elements by frequency/threshold).
indexer.finalize();
size_t index = indexer.lookup(v); // Returns (size_t) -1 if not present.
size_t counts = indexer.lookup_counts(v); // Returns 0 if not present.
flexible_type v = indexer.inverse_lookup(1); // Fails if index doesn't exist.
// Initialize
indexer.initialize();
// Perform the indexing.
in_parallel([&](size_t thread_idx, size_t num_threads) {
  size_t start_idx = src_size * thread_idx / num_threads;
  size_t end_idx = src_size * (thread_idx + 1) / num_threads;
  for (const flexible_type& v: sa.range_iterator(start_idx, end_idx)) {
    indexer.insert_or_update(v, thread_idx);
  }
});
// Finalize
indexer.finalize();
Definition at line 71 of file topk_indexer.hpp.
|
inline |
Default constructor
[in] | topk | Topk to retain (by counts) |
[in] | threshold | Min count threshold to retain. |
[in] | column_name | Column name for display. |
Definition at line 83 of file topk_indexer.hpp.
|
delete |
Copy constructor: Don't want to risk making copies of this.
void turi::topk_indexer::finalize | ( | ) |
Finalize by dropping indices that don't meet the top-k and count-threshold criteria.
|
inline |
Returns the values (ordered by indices)
Definition at line 149 of file topk_indexer.hpp.
size_t turi::topk_indexer::get_version | ( | ) | const |
Returns the current version used for the serialization.
void turi::topk_indexer::initialize | ( | ) |
Initialize the index mapping and setup. Should be called before starting the map.
void turi::topk_indexer::insert_or_update | ( | const flexible_type & | value, |
size_t | thread_idx = 0 , |
||
size_t | count = 1 |
||
) |
Insert a value into the indexer, or update (increment) its count if it is already present.
[in] | value | Flexible type. |
[in] | thread_idx | Thread id (For parallel insertion). |
[in] | count | Amount to increment for this value. |
flexible_type turi::topk_indexer::inverse_lookup | ( | size_t | idx | ) | const |
Returns the "value" associated with the index.
void turi::topk_indexer::load_version | ( | turi::iarchive & | iarc, |
size_t | version | ||
) |
Load the object.
size_t turi::topk_indexer::lookup | ( | const flexible_type & | value | ) | const |
Returns the index associated with the value.
[in] | value | Search for the value. |
size_t turi::topk_indexer::lookup_counts | ( | const flexible_type & | value | ) | const |
Returns the counts associated with the value.
[in] | value | Search for the value. |
void turi::topk_indexer::save_impl | ( | turi::oarchive & | oarc | ) | const |
Serialize the object (save).
|
inline |
Returns the number of categorical variables.
Definition at line 158 of file topk_indexer.hpp.