Turi Create
4.0
|
#include <toolkits/text/topic_model.hpp>
Public Member Functions | |
virtual topic_model * | topic_model_clone ()=0 |
virtual void | init_options (const std::map< std::string, flexible_type > &_opts) override=0 |
virtual size_t | get_version () const override=0 |
virtual void | save_impl (turi::oarchive &oarc) const override=0 |
virtual void | load_version (turi::iarchive &iarc, size_t version) override=0 |
virtual void | train (std::shared_ptr< sarray< flexible_type >> dataset, bool verbose)=0 |
std::vector< std::string > | list_fields () |
v2::ml_data | create_ml_data_using_metadata (std::shared_ptr< sarray< flexible_type >> dataset) |
void | set_associations (const sframe &associations) |
void | set_topics (const std::shared_ptr< sarray< flexible_type >> word_topic_prob, const std::shared_ptr< sarray< flexible_type >> vocabulary, size_t weight) |
std::pair< std::vector< flexible_type >, std::vector< double > > | get_topic (size_t topic_id, size_t num_words=5, double cdf_cutoff=1.0) |
std::shared_ptr< sarray< flexible_type > > | predict_gibbs (std::shared_ptr< sarray< flexible_type >> data, size_t num_burnin) |
count_matrix_type | predict_counts (std::shared_ptr< sarray< flexible_type > > dataset, size_t num_burnin) |
std::shared_ptr< sarray< flexible_type > > | get_topics_matrix () |
std::shared_ptr< sarray< flexible_type > > | get_vocabulary () |
double | perplexity (std::shared_ptr< sarray< flexible_type >> documents, const count_matrix_type &doc_topic_counts, const count_matrix_type &word_topic_counts) |
const variant_type & | get_value_from_state (std::string key) |
const std::map< std::string, flexible_type > & | get_current_options () const |
std::map< std::string, flexible_type > | get_default_options () const |
const flexible_type & | get_option_value (const std::string &name) const |
const std::map< std::string, variant_type > & | get_state () const |
bool | is_trained () const |
void | set_options (const std::map< std::string, flexible_type > &_options) |
void | add_or_update_state (const std::map< std::string, variant_type > &dict) |
const std::vector< option_handling::option_info > & | get_option_info () const |
virtual const char * | name ()=0 |
virtual const std::string & | uid ()=0 |
void | save_to_url (const std::string &url, const variant_map_type &side_data={}) |
void | save_model_to_data (std::ostream &out) |
const std::map< std::string, std::vector< std::string > > & | list_functions () |
const std::vector< std::string > & | list_get_properties () |
const std::vector< std::string > & | list_set_properties () |
variant_type | call_function (const std::string &function, variant_map_type argument) |
variant_type | get_property (const std::string &property) |
variant_type | set_property (const std::string &property, variant_map_type argument) |
const std::string & | get_docstring (const std::string &symbol) |
virtual void | perform_registration () |
Protected Member Functions | |
void | register_function (std::string fnname, const std::vector< std::string > &arguments, impl_fn fn) |
void | register_defaults (const std::string &fnname, const variant_map_type &arguments) |
void | register_setter (const std::string &propname, impl_fn setfn) |
void | register_getter (const std::string &propname, impl_fn getfn) |
void | register_docstring (const std::pair< std::string, std::string > &fnname_docstring) |
Protected Attributes | |
std::map< std::string, variant_type > | state |
Class for learning topic models of text corpora.
Typical use (as seen in cgs.cpp):
1) Create a topic model with a map of options:
topic_model m = new topic_model(options);
2) Create an ml_data object where words have been assigned integers to facilitate indexing.
ml_data d = m->create_ml_data_using_metadata(dataset);
3) Initialize the model so that we have the internal parameters needed for each of the words observed in the dataset.
m->init();
Note: Two other actions can be useful after initialization:
set_topics: Loads a set of topics and vocabulary. set_associations: Loads a set of word-topic assignments.
Definition at line 59 of file topic_model.hpp.
|
inherited |
Append the key value store of the model.
[in] | dict | Options (Key-Value pairs) to set |
|
inherited |
Calls a user defined function.
v2::ml_data turi::text::topic_model::create_ml_data_using_metadata | ( | std::shared_ptr< sarray< flexible_type >> | dataset | ) |
Helper function for creating the appropriate ml_data from an sarray of documents.
dataset | An SArray (of dictionary type) containing document data in bag of words format, where each element has words as keys and the corresponding counts as values. |
|
inherited |
Get current options.
Interfaces with the get_current_options function in the Python side.
|
inherited |
Get default options.
Interfaces with the get_default_options function in the Python side.
|
inherited |
Returns the toolkit documentation for a function or property.
|
inherited |
Returns the option information struct for each of the set parameters.
|
inherited |
Returns the value of an option. Throws an error if the option does not exist.
[in] | name | Name of the option to get. |
|
inherited |
Reads a property.
|
inherited |
Get model.
std::pair<std::vector<flexible_type>, std::vector<double> > turi::text::topic_model::get_topic | ( | size_t | topic_id, |
size_t | num_words = 5 , |
||
double | cdf_cutoff = 1.0 |
||
) |
Get the most probable words for a given topic.
topic_id | The integer id of the topic. Must be in [0, num_topics), where num_topics is the number of topics used to construct the topic_model object. |
num_words | The number of words to return for the given topic. |
cdf_cutoff | After ordering words by probability, this will only return words while the cumulative probability of the words is below this cutoff value. |
std::shared_ptr<sarray<flexible_type> > turi::text::topic_model::get_topics_matrix | ( | ) |
Returns the current topics matrix as an SArray.
|
inherited |
Returns the value of a particular key from the state.
From the python side, this is interfaced with the get() function or the [] operator in python.
|
overridepure virtual |
Gets the model version number
Reimplemented from turi::model_base.
Implemented in turi::text::alias_topic_model.
std::shared_ptr<sarray<flexible_type> > turi::text::topic_model::get_vocabulary | ( | ) |
Returns current vocabulary of words.
|
overridepure virtual |
Set the model options. Use the option manager to set these options. The option manager should throw errors if the options do not satisfy the option manager's conditions.
[in] | opts | Options to set |
Reimplemented from turi::ml_model_base.
Implemented in turi::text::alias_topic_model.
|
inherited |
Is this model trained.
std::vector<std::string> turi::text::topic_model::list_fields | ( | ) |
Lists all the keys accessible in the "model" map.
|
inherited |
Lists all the registered functions. Returns a map of function name to array of argument names for the function.
|
inherited |
Lists all the get-table properties of the class.
|
inherited |
Lists all the set-table properties of the class.
|
overridepure virtual |
Load the model object.
Reimplemented from turi::model_base.
Implemented in turi::text::alias_topic_model.
|
pure virtualinherited |
Returns the name of the toolkit class, as exposed to client code. For example, the Python proxy for this instance will have a type with this name.
Note: this function is typically overridden using the BEGIN_CLASS_MEMBER_REGISTRATION macro.
|
virtualinherited |
Declare the base registration function. This class has to be handled specially; the macros don't work here due to the override declarations.
Reimplemented in turi::model_proxy.
double turi::text::topic_model::perplexity | ( | std::shared_ptr< sarray< flexible_type >> | documents, |
const count_matrix_type & | doc_topic_counts, | ||
const count_matrix_type & | word_topic_counts | ||
) |
Compute perplexity. For more details see the docstrings for the version that is not a member of the topic_model class. This version is for a model's internal usage, i.e. where the two count matrices are already available. Note that the first thing this method does is normalize counts to be proper probabilities. This is done via: $\mathrm{doc\_topic\_prob}[d, k] = p(\text{topic } k \mid \text{document } d) = \dfrac{\mathrm{doc\_topic\_count}[d, k] + \alpha}{\sum_{k'} \left(\mathrm{doc\_topic\_count}[d, k'] + \alpha\right)}$ and $\mathrm{word\_topic\_prob}[w, k] = p(\text{word } w \mid \text{topic } k) = \dfrac{\mathrm{word\_topic\_count}[w, k] + \eta}{\sum_{w'} \left(\mathrm{word\_topic\_count}[w', k] + \eta\right)}$.
count_matrix_type turi::text::topic_model::predict_counts | ( | std::shared_ptr< sarray< flexible_type > > | dataset, |
size_t | num_burnin | ||
) |
Make predictions for a given data set. Return the number of assignments of each topic for each document in the dataset.
std::shared_ptr<sarray<flexible_type> > turi::text::topic_model::predict_gibbs | ( | std::shared_ptr< sarray< flexible_type >> | data, |
size_t | num_burnin | ||
) |
Make predictions on the given data set.
This method closely resembles the sampler in the collapsed Gibbs sampler solver found in cgs.hpp. Here, however, the word_topic_counts matrix is held fixed. For each document, num_burnin iterations are performed, where in each iteration we sample the topic_assignments. The returned predictions are probabilities, and are computed by smoothing the doc_topic_counts matrix that arises from sampling.
|
protectedinherited |
Registers default argument values
|
protectedinherited |
Adds a docstring for the specified function or property name.
|
protectedinherited |
Adds a function with the specified name, and argument list.
|
protectedinherited |
Adds a property getter with the specified name.
|
protectedinherited |
Adds a property setter with the specified name.
|
overridepure virtual |
Serialize the model object.
Reimplemented from turi::model_base.
Implemented in turi::text::alias_topic_model.
|
inherited |
Save a toolkit class to a data stream.
|
inherited |
Save a toolkit class to disk.
url | The destination url to store the class. |
side_data | Any additional side information |
void turi::text::topic_model::set_associations | ( | const sframe & | associations | ) |
Load a set of associations comprising a (word, topic) pair that should be considered fixed.
associations | An SFrame with two columns named 'word' and 'topic'. |
|
inherited |
Set one of the options in the algorithm.
The value is checked against the requirements given by the option instance.
[in] | name | Name of the option. |
[in] | value | Value for the option. |
|
inherited |
Sets a property. The new value of the property should appear in the argument map under the key "value".
void turi::text::topic_model::set_topics | ( | const std::shared_ptr< sarray< flexible_type >> | word_topic_prob, |
const std::shared_ptr< sarray< flexible_type >> | vocabulary, | ||
size_t | weight | ||
) |
Remove current vocabulary and topics and load these instead.
word_topic_prob | An SArray of vector type, where each element has size num_topics. The k'th element represents the probability of the corresponding word in vocabulary under topic k. |
vocabulary | An SArray of string type containing the unique words that should be loaded into the model. This must have the same length as word_topic_prob. |
weight | The weight the model should give these probabilities when learning. In other words, the provided word-topic probabilities are multiplied by this weight before being used as count matrices within the model. |
|
pure virtual |
Clone objects to a topic_model class
See model_base for details.
Implemented in turi::text::alias_topic_model.
|
pure virtual |
Create a topic model.
|
pure virtualinherited |
Returns a unique identifier for the toolkit class. It can be any unique ID. The UID is only used at runtime (to determine the concrete type of an arbitrary model_base instance) and is never stored.
Note: this function is typically overridden using the BEGIN_CLASS_MEMBER_REGISTRATION macro.
Implemented in turi::model_proxy.
|
protectedinherited |
All things python
Definition at line 206 of file ml_model.hpp.