// Turi Create 4.0 — scvb.hpp
/* Copyright © 2017 Apple Inc. All rights reserved.
 *
 * Use of this source code is governed by a BSD-3-clause license that can
 * be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
 */
#ifndef TURI_TEXT_SCVB_H_
#define TURI_TEXT_SCVB_H_

// SFrame
#include <core/storage/sframe_data/sarray.hpp>
#include <core/storage/sframe_data/sframe.hpp>

// Other
#include <core/storage/fileio/temp_files.hpp>
#include <thread>
#include <iostream>

// Types
#include <model_server/lib/variant.hpp>
#include <model_server/lib/unity_base_types.hpp>
#include <core/data/flexible_type/flexible_type.hpp>
#include <core/util/hash_value.hpp>
#include <model_server/lib/flex_dict_view.hpp>

// External
#include <Eigen/Core>
#include <core/random/random.hpp>
namespace turi {

namespace text {

33 class scvb0_solver {
34 
35  public:
36 
37  scvb0_solver(topic_model* _model) {
38 
39  model = _model;
40  s = 10;
41  tau = 1000;
42  kappa = .9;
43 
44  }
45 
46  /**
47  * Train the model using the SCVB0 algorithm.
48  *
49  * See Foulds, Boyles, DuBois, Smyth, Welling.
50  * Stochastic Collapsed Variational Bayesian Inference
51  * for Latent Dirichlet Allocation. KDD 2013.
52  * See http://arxiv.org/pdf/1305.2452.pdf for a pdf.
53  *
54  * The key aspect of this algorithm is to keep a set of statistics that
55  * describe the number of times each word in the vocabulary has been assigned
56  * to each topic. We then iterate through minibatches of documents and
57  * perform updates akin to online EM: we use our current statistics (N_Z and N_phi)
58  * to make estimated "local" versions using the minibatch.
59  *
60  */
61  void train(std::shared_ptr<sarray<flexible_type>> data, bool verbose);
62 
63 
64  private:
65  topic_model* model;
66 
67  // Hyperparameters for learning rate
68  size_t s;
69  size_t tau;
70  double kappa;
71 
72  // Variables needed for SCVB0
73  Eigen::MatrixXd N_Z; /* < Estimate of N_Z for each topic. */
74  Eigen::MatrixXd N_theta_j; /* < Estimate of N_theta for document j. */
75  Eigen::MatrixXd N_phi; /* < Estimate of N_phi. */
76  Eigen::MatrixXd N_phi_hat; /* < Estimate of N_phi based on a minibatch. */
77  Eigen::MatrixXd N_Z_hat; /* < Estimate of N_Z based on a minibatch. */
78 
79  private:
80 
81  /**
82  * Initialize global estimate of N_theta_j.
83  */
84  void initialize_N_theta_j(size_t C_j) {
85  N_theta_j = Eigen::MatrixXd::Zero(model->num_topics, 1);
86  for (size_t i = 0; i < C_j; ++i) {
87  size_t ix = random::fast_uniform<size_t>(0, model->num_topics-1);
88  N_theta_j(ix) += 1;
89  }
90  }
91 
92  /**
93  * Compute the topic probabilities for a single token.
94  *
95  * \params w_ij The integer id for the word that is the i'th token
96  * of document j.
97  *
98  * \returns An Eigen vector of length num_topics containing the
99  * estimated probability the word belongs to each of the topics.
100  */
101  Eigen::MatrixXd compute_gamma(size_t w_ij) {
102  // DASSERT(j >= 0 && j < documents.size());
103  // DASSERT(i >= 0 && i < vocab_size);
104  Eigen::MatrixXd gamma_ij(model->num_topics, 1);
105  for (size_t k = 0; k < model->num_topics; ++k) {
106  gamma_ij(k, 0) = (N_phi(w_ij, k) + model->beta) *
107  (N_theta_j(k) + model->alpha) /
108  (N_Z(k) + model->beta * model->vocab_size);
109  }
110  gamma_ij.normalize();
111  return gamma_ij;
112  }
113 
114  /**
115  * Compute a local estimate of topic proportions for this document.
116  *
117  */
118  void update_N_theta_j(const Eigen::MatrixXd& gamma_ij,
119  size_t count_ij,
120  size_t C_j,
121  double rho) {
122  double alpha = std::pow(1 - rho, count_ij);
123  N_theta_j = alpha * N_theta_j + C_j * gamma_ij * (1 - alpha);
124  }
125 
126  /**
127  * Update the local estimate of N_Z using the current token probabilities.
128  */
129  void update_N_Z_hat(const Eigen::MatrixXd& gamma_ij,
130  size_t M, size_t C) {
131  N_Z_hat += gamma_ij * C / M;
132  }
133 
134  /**
135  * Update the global estimate of N_Z using the current local estimates.
136  */
137  void update_N_Z(double rho) {
138  N_Z = (1 - rho) * N_Z + rho * N_Z_hat;
139  }
140 
141  /**
142  * Update global estimate of N_phi using the current local estimates.
143  */
144  void update_N_phi(double rho) {
145  N_phi = (1 - rho) * N_phi + rho * N_phi_hat;
146  }
147 
148  /**
149  * Update local estimate of N_phi with the current token probabilities.
150  */
151  void update_N_phi_hat(const Eigen::MatrixXd& gamma_ij,
152  size_t word_ij,
153  size_t M, size_t C) {
154  for (size_t k = 0; k < model->num_topics; ++k) {
155  N_phi_hat(word_ij, k) += gamma_ij(k) * C / M;
156  }
157  }
158 
159  /**
160  * Compute the learning rate for a given iteration.
161  * The default values have been reported to experimentally provide
162  * reasonable learning rates for real data sets.
163  *
164  * \returns s/(tau + t)^kappa.
165  */
166  double compute_rho(size_t t, size_t s = 10, size_t tau=1000, double kappa = .9) {
167  return (double) s / std::pow(tau + t, kappa);
168  }
169 
170 };
}
}
#endif