Turi Create  4.0
turi::text::alias_topic_model Class Referenceabstract

#include <toolkits/text/alias.hpp>

Public Member Functions

 ~alias_topic_model ()
 
topic_model * topic_model_clone () override
 
void init_options (const std::map< std::string, flexible_type > &_opts) override
 
size_t get_version () const override
 
void save_impl (turi::oarchive &oarc) const override
 
void load_version (turi::iarchive &iarc, size_t version) override
 
void train (std::shared_ptr< sarray< flexible_type > > data, bool verbose) override
 
std::shared_ptr< sarray< std::vector< size_t > > > forward_sample (v2::ml_data d)
 
void cache_word_pmf_and_samples (size_t w)
 
std::map< std::string, size_t > sample_counts (v2::ml_data d, size_t num_blocks)
 
void sample_block (const v2::ml_data &d, std::vector< std::vector< size_t >> &doc_assignments)
 
size_t sample_topic (size_t d, size_t w, size_t s, std::vector< double > &pd)
 
virtual void train (std::shared_ptr< sarray< flexible_type >> dataset, bool verbose)=0
 
std::vector< std::string > list_fields ()
 
v2::ml_data create_ml_data_using_metadata (std::shared_ptr< sarray< flexible_type >> dataset)
 
void set_associations (const sframe &associations)
 
void set_topics (const std::shared_ptr< sarray< flexible_type >> word_topic_prob, const std::shared_ptr< sarray< flexible_type >> vocabulary, size_t weight)
 
std::pair< std::vector< flexible_type >, std::vector< double > > get_topic (size_t topic_id, size_t num_words=5, double cdf_cutoff=1.0)
 
std::shared_ptr< sarray< flexible_type > > predict_gibbs (std::shared_ptr< sarray< flexible_type >> data, size_t num_burnin)
 
count_matrix_type predict_counts (std::shared_ptr< sarray< flexible_type > > dataset, size_t num_burnin)
 
std::shared_ptr< sarray< flexible_type > > get_topics_matrix ()
 
std::shared_ptr< sarray< flexible_type > > get_vocabulary ()
 
double perplexity (std::shared_ptr< sarray< flexible_type >> documents, const count_matrix_type &doc_topic_counts, const count_matrix_type &word_topic_counts)
 
const variant_type & get_value_from_state (std::string key)
 
const std::map< std::string, flexible_type > & get_current_options () const
 
std::map< std::string, flexible_type > get_default_options () const
 
const flexible_type & get_option_value (const std::string &name) const
 
const std::map< std::string, variant_type > & get_state () const
 
bool is_trained () const
 
void set_options (const std::map< std::string, flexible_type > &_options)
 
void add_or_update_state (const std::map< std::string, variant_type > &dict)
 
const std::vector< option_handling::option_info > & get_option_info () const
 
virtual const char * name ()=0
 
virtual const std::string & uid ()=0
 
void save_to_url (const std::string &url, const variant_map_type &side_data={})
 
void save_model_to_data (std::ostream &out)
 
const std::map< std::string, std::vector< std::string > > & list_functions ()
 
const std::vector< std::string > & list_get_properties ()
 
const std::vector< std::string > & list_set_properties ()
 
variant_type call_function (const std::string &function, variant_map_type argument)
 
variant_type get_property (const std::string &property)
 
variant_type set_property (const std::string &property, variant_map_type argument)
 
const std::string & get_docstring (const std::string &symbol)
 
virtual void perform_registration ()
 

Protected Member Functions

void register_function (std::string fnname, const std::vector< std::string > &arguments, impl_fn fn)
 
void register_defaults (const std::string &fnname, const variant_map_type &arguments)
 
void register_setter (const std::string &propname, impl_fn setfn)
 
void register_getter (const std::string &propname, impl_fn getfn)
 
void register_docstring (const std::pair< std::string, std::string > &fnname_docstring)
 

Protected Attributes

std::map< std::string, variant_type > state
 

Detailed Description

The basic pseudocode for the AliasLDA method is as follows:

initialize n_{t,w}
for w in vocab:
  compute q_w(t) for all t
  compute Q_w = sum_t q_w(t)
  A = GenerateAlias(q_w, K)
  for k = 1:K
    S_w.push(SampleAlias(A, K))
  store q_w(t), Q_w, S_w

for d in docs:
  for i in len(d):
    w = i'th word in d
    s = current topic for w in doc d
    decrement n_{s,d} and n_{s,w} by 1
    for z where n_{z,d} != 0
      compute p_dw(z)
    compute P_dw
    t = sample from q(t) by popping from S_w
    if S_w empty:
      Recompute A and populate S_w
      Recompute q_w(t), Q_w
    compute pi
    if not rand(1) < min(1, pi)
      t = s
    increment n_{t,d} and n_{t,w} by 1

Definition at line 70 of file alias.hpp.

Constructor & Destructor Documentation

◆ ~alias_topic_model()

turi::text::alias_topic_model::~alias_topic_model ( )

Destructor. Make sure bad things don't happen

Member Function Documentation

◆ add_or_update_state()

void turi::ml_model_base::add_or_update_state ( const std::map< std::string, variant_type > &  dict)
inherited

Append to the key-value store of the model.

Parameters
[in]dictOptions (Key-Value pairs) to set

◆ cache_word_pmf_and_samples()

void turi::text::alias_topic_model::cache_word_pmf_and_samples ( size_t  w)

For the given word do the following:

  • Compute q_w(t) and Q_w for word w. Stores this in members q and Q.
  • Compute the alias datastructures for each word w.
  • Fill the cache of topic samples, S_w.

◆ call_function()

variant_type turi::model_base::call_function ( const std::string &  function,
variant_map_type  argument 
)
inherited

Calls a user defined function.

◆ create_ml_data_using_metadata()

v2::ml_data turi::text::topic_model::create_ml_data_using_metadata ( std::shared_ptr< sarray< flexible_type >>  dataset)
inherited

Methods with meaningful default implementations.

Helper function for creating the appropriate ml_data from an sarray of documents.

Parameters
datasetAn SArray (of dictionary type) containing document data in bag of words format, where each element has words as keys and the corresponding counts as values.

◆ forward_sample()

std::shared_ptr<sarray<std::vector<size_t> > > turi::text::alias_topic_model::forward_sample ( v2::ml_data  d)

Use the dataset to create an initial set of topic assignments. Each element is a vector whose length is the total number of words in the respective document. If the first word occurs M times, then the first M elements of this vector are the latent assignments for that word. While sampling new assignments, topic_counts and doc_topic_counts are incremented.

◆ get_current_options()

const std::map<std::string, flexible_type>& turi::ml_model_base::get_current_options ( ) const
inherited

Get current options.

Returns
Dictionary containing current options.

Python side interface

Interfaces with the get_current_options function in the Python side.

◆ get_default_options()

std::map<std::string, flexible_type> turi::ml_model_base::get_default_options ( ) const
inherited

Get default options.

Returns
Dictionary with default options.

Python side interface

Interfaces with the get_default_options function in the Python side.

◆ get_docstring()

const std::string& turi::model_base::get_docstring ( const std::string &  symbol)
inherited

Returns the toolkit documentation for a function or property.

◆ get_option_info()

const std::vector<option_handling::option_info>& turi::ml_model_base::get_option_info ( ) const
inherited

Returns the option information struct for each of the set parameters.

◆ get_option_value()

const flexible_type& turi::ml_model_base::get_option_value ( const std::string &  name) const
inherited

Returns the value of an option. Throws an error if the option does not exist.

Parameters
[in]nameName of the option to get.

◆ get_property()

variant_type turi::model_base::get_property ( const std::string &  property)
inherited

Reads a property.

◆ get_state()

const std::map<std::string, variant_type>& turi::ml_model_base::get_state ( ) const
inherited

Get model.

Returns
Model map.

◆ get_topic()

std::pair<std::vector<flexible_type>, std::vector<double> > turi::text::topic_model::get_topic ( size_t  topic_id,
size_t  num_words = 5,
double  cdf_cutoff = 1.0 
)
inherited

Get the most probable words for a given topic.

Parameters
topic_idThe integer id of the topic. Must be in [0, num_topics) length vocab_size used to construct the topic_model object.
num_wordsThe number of words to return for the given topic.
cdf_cutoffAfter ordering words by probability, this will only return words while the cumulative probability of the words is below this cutoff value.
Returns
Returns an SFrame with the word and its corresponding score. The SFrame is sorted by score.

◆ get_topics_matrix()

std::shared_ptr<sarray<flexible_type> > turi::text::topic_model::get_topics_matrix ( )
inherited

Returns the current topics matrix as an SFrame

◆ get_value_from_state()

const variant_type& turi::ml_model_base::get_value_from_state ( std::string  key)
inherited

Returns the value of a particular key from the state.

Returns
Value of a key. See model_base for details.

Python side interface

From the python side, this is interfaced with the get() function or the [] operator in python.

◆ get_version()

size_t turi::text::alias_topic_model::get_version ( ) const
inlineoverridevirtual

Gets the model version number

Implements turi::text::topic_model.

Definition at line 95 of file alias.hpp.

◆ get_vocabulary()

std::shared_ptr<sarray<flexible_type> > turi::text::topic_model::get_vocabulary ( )
inherited

Returns current vocabulary of words.

◆ init_options()

void turi::text::alias_topic_model::init_options ( const std::map< std::string, flexible_type > &  _opts)
overridevirtual

Set the model options. Use the option manager to set these options. The option manager should throw errors if the options do not satisfy the option manager's conditions.

Parameters
[in]optsOptions to set

Implements turi::text::topic_model.

◆ is_trained()

bool turi::ml_model_base::is_trained ( ) const
inherited

Is this model trained.

Returns
True if already trained.

◆ list_fields()

std::vector<std::string> turi::text::topic_model::list_fields ( )
inherited

Lists all the keys accessible in the "model" map.

Returns
List of keys in the model map. See model_base for details.

◆ list_functions()

const std::map<std::string, std::vector<std::string> >& turi::model_base::list_functions ( )
inherited

Lists all the registered functions. Returns a map of function name to array of argument names for the function.

◆ list_get_properties()

const std::vector<std::string>& turi::model_base::list_get_properties ( )
inherited

Lists all the get-table properties of the class.

◆ list_set_properties()

const std::vector<std::string>& turi::model_base::list_set_properties ( )
inherited

Lists all the set-table properties of the class.

◆ load_version()

void turi::text::alias_topic_model::load_version ( turi::iarchive &  iarc,
size_t  version 
)
overridevirtual

Turi serialization load

Implements turi::text::topic_model.

◆ name()

virtual const char* turi::model_base::name ( )
pure virtualinherited

Returns the name of the toolkit class, as exposed to client code. For example, the Python proxy for this instance will have a type with this name.

Note: this function is typically overridden using the BEGIN_CLASS_MEMBER_REGISTRATION macro.

◆ perform_registration()

virtual void turi::model_base::perform_registration ( )
virtualinherited

Declare the base registration function. This class has to be handled specially; the macros don't work here due to the override declarations.

Reimplemented in turi::model_proxy.

◆ perplexity()

double turi::text::topic_model::perplexity ( std::shared_ptr< sarray< flexible_type >>  documents,
const count_matrix_type &  doc_topic_counts,
const count_matrix_type &  word_topic_counts 
)
inherited

Compute perplexity. For more details see the docstrings for the version that is not a member of the topic_model class. This version is for a model's internal usage, i.e. where the two count matrices are already available. Note that the first thing this method does is normalize counts to be proper probabilities. This is done via:

doc_topic_prob[d, k] = p(topic k | document d) = (doc_topic_count[d, k] + alpha) / \sum_{k'} (doc_topic_count[d, k'] + alpha)

word_topic_prob[w, k] = p(word w | topic k) = (word_topic_count[w, k] + eta) / \sum_{w'} (word_topic_count[w', k] + eta)

◆ predict_counts()

count_matrix_type turi::text::topic_model::predict_counts ( std::shared_ptr< sarray< flexible_type > >  dataset,
size_t  num_burnin 
)
inherited

Make predictions for a given data set. Return the number of assignments of each topic for each document in the dataset.

◆ predict_gibbs()

std::shared_ptr<sarray<flexible_type> > turi::text::topic_model::predict_gibbs ( std::shared_ptr< sarray< flexible_type >>  data,
size_t  num_burnin 
)
inherited

Make predictions on the given data set.

This method closely resembles the sampler in the collapsed Gibbs sampler solver found in cgs.hpp. Here, however, the word_topic_counts matrix is held fixed. For each document, num_burnin iterations are performed where in each iteration we sample the topic_assignments. The returned predictions are probabilities, and are computed by smoothing the doc_topic_counts matrix that arises from sampling.

◆ register_defaults()

void turi::model_base::register_defaults ( const std::string &  fnname,
const variant_map_type &  arguments 
)
protectedinherited

Registers default argument values

◆ register_docstring()

void turi::model_base::register_docstring ( const std::pair< std::string, std::string > &  fnname_docstring)
protectedinherited

Adds a docstring for the specified function or property name.

◆ register_function()

void turi::model_base::register_function ( std::string  fnname,
const std::vector< std::string > &  arguments,
impl_fn  fn 
)
protectedinherited

Adds a function with the specified name, and argument list.

◆ register_getter()

void turi::model_base::register_getter ( const std::string &  propname,
impl_fn  getfn 
)
protectedinherited

Adds a property getter with the specified name.

◆ register_setter()

void turi::model_base::register_setter ( const std::string &  propname,
impl_fn  setfn 
)
protectedinherited

Adds a property setter with the specified name.

◆ sample_block()

void turi::text::alias_topic_model::sample_block ( const v2::ml_data &  d,
std::vector< std::vector< size_t >> &  doc_assignments 
)

Perform sampling given a block of data d (typically a slice of an SArray represented via an ml_data object).

◆ sample_counts()

std::map<std::string, size_t> turi::text::alias_topic_model::sample_counts ( v2::ml_data  d,
size_t  num_blocks 
)

Simultaneously iterate through a v2::ml_data object and the sarray of latent topic assignments. For each instance of a word, resample its topic.

◆ sample_topic()

size_t turi::text::alias_topic_model::sample_topic ( size_t  d,
size_t  w,
size_t  s,
std::vector< double > &  pd 
)

Sample a new topic for word w in document d.

Parameters
d  The document.
w  The word.
s  The initial topic.
pd  Vector of topic probabilities that gets used for sampling.

◆ save_impl()

void turi::text::alias_topic_model::save_impl ( turi::oarchive &  oarc) const
overridevirtual

Turi serialization save

Implements turi::text::topic_model.

◆ save_model_to_data()

void turi::model_base::save_model_to_data ( std::ostream &  out)
inherited

Save a toolkit class to a data stream.

◆ save_to_url()

void turi::model_base::save_to_url ( const std::string &  url,
const variant_map_type &  side_data = {} 
)
inherited

Save a toolkit class to disk.

Parameters
urlThe destination url to store the class.
side_dataAny additional side information

◆ set_associations()

void turi::text::topic_model::set_associations ( const sframe &  associations)
inherited

Methods available to all topic_models.

Load a set of associations comprising a (word, topic) pair that should be considered fixed.

Parameters
associationsAn SFrame with two columns named 'word' and 'topic'.

◆ set_options()

void turi::ml_model_base::set_options ( const std::map< std::string, flexible_type > &  _options)
inherited

Set one of the options in the algorithm.

The values are checked against the requirements given by the option instance.

Parameters
[in]nameName of the option.
[in]valueValue for the option.

◆ set_property()

variant_type turi::model_base::set_property ( const std::string &  property,
variant_map_type  argument 
)
inherited

Sets a property. The new value of the property should appear in the argument map under the key "value".

◆ set_topics()

void turi::text::topic_model::set_topics ( const std::shared_ptr< sarray< flexible_type >>  word_topic_prob,
const std::shared_ptr< sarray< flexible_type >>  vocabulary,
size_t  weight 
)
inherited

Remove current vocabulary and topics and load these instead.

Parameters
word_topic_probAn SArray of vector type, where each element has size num_topics. The k'th element represents the probability of the corresponding word in vocabulary under topic k.
vocabularyAn SArray of string type containing the unique words that should be loaded into the model. This must have the same length as word_topic_prob.
weightThe weight the model should give these probabilities when learning. In other words, the provided word-topic probabilities are multiplied by this weight before being used as count matrices within the model.

◆ topic_model_clone()

topic_model* turi::text::alias_topic_model::topic_model_clone ( )
overridevirtual

Clone objects to a topic_model class

Implements turi::text::topic_model.

◆ train() [1/2]

void turi::text::alias_topic_model::train ( std::shared_ptr< sarray< flexible_type > >  data,
bool  verbose 
)
override

Train the model using the method described in (Li, 2014).

◆ train() [2/2]

virtual void turi::text::topic_model::train ( std::shared_ptr< sarray< flexible_type >>  dataset,
bool  verbose 
)
pure virtualinherited

Create a topic model.

◆ uid()

virtual const std::string& turi::model_base::uid ( )
pure virtualinherited

Returns a unique identifier for the toolkit class. It can be any unique ID. The UID is only used at runtime (to determine the concrete type of an arbitrary model_base instance) and is never stored.

Note: this function is typically overridden using the BEGIN_CLASS_MEMBER_REGISTRATION macro.

Implemented in turi::model_proxy.

Member Data Documentation

◆ state

std::map<std::string, variant_type> turi::ml_model_base::state
protectedinherited

All things python

Definition at line 206 of file ml_model.hpp.


The documentation for this class was generated from the following file: