Turi Create  4.0
factorization_model_sgd_interface.hpp
1 /* Copyright © 2017 Apple Inc. All rights reserved.
2  *
3  * Use of this source code is governed by a BSD-3-clause license that can
4  * be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
5  */
6 #ifndef TURI_FACTORIZATION_GLM_SGD_INTERFACE_H_
7 #define TURI_FACTORIZATION_GLM_SGD_INTERFACE_H_
8 
9 #include <Eigen/Core>
10 #include <cmath>
11 #include <vector>
12 #include <string>
13 #include <map>
14 #include <atomic>
15 
16 #include <core/util/code_optimization.hpp>
17 #include <core/util/fast_integer_power.hpp>
18 #include <toolkits/sgd/sgd_interface.hpp>
19 #include <toolkits/factorization/factorization_model_impl.hpp>
20 
21 namespace turi { namespace factorization {
22 
23 ////////////////////////////////////////////////////////////////////////////////
24 
25 /** The type of the regularization used. We currently have 3
26  * different modes. Each of these modes uses different variables,
27  * given by the parameters below.
28  */
29 enum class model_regularization_type {L2, ON_THE_FLY, NONE};
30 
31 template <typename T>
33 static inline T clip_1m1(T v) {
34  return (v < T(-1)) ? T(-1) : ( (v > T(1)) ? T(1) : v);
35 }
36 
37 template <typename T>
39 static inline T sqr(T v) { return v * v; }
40 
41 /** This class provides the interface layer for the basic sgd solver,
42  * this time for the second_order model. It provides functions to
43  * calculate the gradient and apply a gradient update. (In the
44  * original design, these were just folded into the model; they are
45  * now seperated out to make the original model simpler.)
46  *
47  * For documentation on the interface requirements of each of the
48  * solvers, see the sgd algorithms; each one requires specific
49  * interface functions to be defined.
50  */
51 template <typename GLMModel,
52  typename _LossModelProfile,
53  model_regularization_type _regularization_type>
55 
56  public:
57 
58  ////////////////////////////////////////////////////////////////////////////////
59  // Only one way to instantiate this class.
60 
61  factorization_sgd_interface(const std::shared_ptr<GLMModel>& _model)
62  : model(_model)
63  , iteration_sample_count(0)
64  {}
65 
67  const factorization_sgd_interface& operator=(const factorization_sgd_interface&) = delete;
68 
69  ////////////////////////////////////////////////////////////
70  // Import the appropriate typedefs / constants from the model
71 
72  typedef _LossModelProfile LossModelProfile;
73  LossModelProfile loss_model;
74 
75  typedef typename GLMModel::factor_type factor_type;
76  typedef typename GLMModel::factor_matrix_type factor_matrix_type;
77  typedef typename GLMModel::vector_type vector_type;
78 
79  static constexpr model_factor_mode factor_mode = GLMModel::factor_mode;
80  static constexpr flex_int num_factors_if_known = GLMModel::num_factors_if_known;
81 
82  // Enable item locking if we're in matrix factorization mode.
83  static constexpr bool enable_item_locking = true; // (factor_mode == model_factor_mode::matrix_factorization);
84 
85  /** Trial mode is used to find the sgd step size.
86  *
87  */
88  bool currently_in_trial_mode = false;
89 
90  /** The model_regularization_type.
91  */
92  static constexpr model_regularization_type regularization_type = _regularization_type;
93 
94  // If in nmf_mode, disable intercept and linear terms.
95  bool nmf_mode = false;
96 
97  private:
98 
99  /** The number of factors in the model. This can sometimes be set
100  * statically, as we do for several of the default models, yielding
101  * a significant optimization benefit.
102  */
103  inline size_t num_factors() const GL_HOT_INLINE_FLATTEN {
104  return ((factor_mode == model_factor_mode::pure_linear_model)
105  ? 0
106  : ( (num_factors_if_known == Eigen::Dynamic)
107  ? model->num_factors()
108  : num_factors_if_known));
109  }
110 
111  // The model we're optimizing. This also contains the state.
112  std::shared_ptr<GLMModel> model;
113 
114  /** Returns the total dimension of all the features, i.e. the number
115  * of features.
116  */
117  inline size_t n_total_dimensions() const GL_HOT_INLINE_FLATTEN {
118  return model->n_total_dimensions;
119  }
120 
121  /** Returns the dimension of the factor matrix. Only global feature
122  * indices less than this have factors in the V factor matrix in
123  * the model.
124  */
125  inline size_t num_factor_dimensions() const GL_HOT_INLINE_FLATTEN {
126 
127  switch(factor_mode) {
128  case model_factor_mode::pure_linear_model:
129  return 0;
130 
131  case model_factor_mode::matrix_factorization:
132  case model_factor_mode::factorization_machine:
133  return model->num_factor_dimensions;
134  default:
135  return 0;
136  }
137  }
138 
139  ////////////////////////////////////////////////////////////////////////////////
140  //
141  // Regularization
142 
143  double _lambda_w = NAN;
144  double _lambda_V = NAN;
145 
146  size_t current_iteration = size_t(-1);
147 
148  // This is for the tempering iterations on the
149  double current_iteration_step_size = 0;
150 
151 
152  size_t num_tempering_iterations = 0;
153  double tempering_regularization_start_value = 0;
154 
155  double current_lambda_w(size_t iteration) const {
156  return _interpolate_reg_value(iteration, _lambda_w);
157  }
158 
159  double current_lambda_V(size_t iteration) const {
160  return _interpolate_reg_value(iteration, _lambda_V);
161  }
162 
163  double _interpolate_reg_value(size_t iteration, double lambda) const {
164  if(iteration >= num_tempering_iterations)
165  return lambda;
166 
167  // If we're in trial mode, only run it with the tempering for one
168  // iteration; this way we can test the step size for stability
169  // with the tempered step size but still test for optimization
170  // without that.
171  if(currently_in_trial_mode && iteration != 0)
172  return lambda;
173 
174  double end_reg = std::max(1e-12, lambda);
175  double begin_reg = tempering_regularization_start_value;
176 
177  if(end_reg >= begin_reg)
178  return end_reg;
179 
180  // get step as an interpolation between the the tempering start
181  // and the lower value.
182  double s = double(iteration) / num_tempering_iterations;
183 
184  double ret = std::exp(std::log(begin_reg) * (1.0 - s) + std::log(end_reg) * s);
185 
186  // logprogress_stream
187  // << "ITER " << iteration << "; lambda = " << lambda
188  // << "; s = " << s << "; actual = " << ret << std::endl;
189 
190  return ret;
191  }
192 
193  size_t data_size;
194 
195  // Variables needed for L2 regularization. This term tracks the regularization of the
196  // terms in the state that are only touched by the L2 regularization
197  // term at each iteration. The original s_w are now equal to
198  // s_w_factor^iteration_sample_count.
199 
200  double s_w_factor = NAN, s_V_factor = NAN;
201  fast_integer_power s_w_factor_pow, s_V_factor_pow;
202  bool s_w_identically_1 = true, s_V_identically_1 = true;
203  std::atomic<size_t> iteration_sample_count;
204 
205 
206  // Variables needed for the on-the-fly (aka weighted) regularization. This vector
207  // is of length n_total_dimensions, and is equal to the number of
208  // parameters that each regularization value hits.
209  double w_shrinkage = NAN, V_shrinkage = NAN;
210  vector_type on_the_fly__regularization_scaling_factors;
211 
212  // Used if the items are locked.
213  std::vector<simple_spinlock> item_locks;
214 
215  size_t parameter_scaling_offset;
216  vector_type parameter_scaling;
217 
218  ////////////////////////////////////////////////////////////
219  //
220  // A container to hold the state updates that get applied during the
221  // (locked) gradient update step. Each thread has one of these
222  // buffers to avoid allocations during the update step.
223  //
224  size_t n_threads = 1;
225 
226  struct sgd_processing_buffer {
227  double w0;
228 
229  struct variable {
230 
231  size_t index;
232  double xv;
233  double w;
234 
235  factor_type V_row;
236  factor_type xV_row;
237 
238  // This stores the row pointer to the actual, original row.
239  float * __restrict__ V_row_ptr;
240  };
241 
242  std::vector<variable> v;
243 
244  factor_type xv_accumulator;
245  };
246 
247  mutable std::vector<sgd_processing_buffer> buffers, alt_buffers;
248 
249 
250  // For adagrad: If true, use the adagrad model.
251  bool adagrad_mode = true;
252  // vector_type w_adagrad_g;
253  // factor_matrix_type V_adagrad_g;
254  volatile double w0_adagrad_g;
255 
256  float adagrad_momentum_weighting = 1.0;
257 
258  Eigen::Matrix<float, Eigen::Dynamic, num_factors_if_known, Eigen::RowMajor> V_adagrad_g;
259  Eigen::Matrix<float, Eigen::Dynamic, 1> w_adagrad_g;
260 
261  ////////////////////////////////////////////////////////////////////////////////
262  // Set up and tear down methods for the different things.
263 
264  public:
265  /** Set up all the stuff needed for processing the data at each
266  * iteration.
267  */
268  void setup(const v2::ml_data& train_data,
269  const std::map<std::string, flexible_type>& options) {
270 
271  ////////////////////////////////////////////////////////////
272  // Set up some common constants used everywhere.
273 
274  n_threads = thread::cpu_count();
275 
276  data_size = train_data.size();
277 
278  _lambda_w = options.at("linear_regularization");
279  _lambda_V = options.at("regularization");
280  num_tempering_iterations = options.at("num_tempering_iterations");
281  num_tempering_iterations = std::min(num_tempering_iterations, size_t(options.at("max_iterations")));
282  tempering_regularization_start_value = options.at("tempering_regularization_start_value");
283 
284  nmf_mode = options.at("nmf");
285 
286  adagrad_mode = (options.at("solver") == "adagrad");
287 
288  if(adagrad_mode)
289  adagrad_momentum_weighting = options.at("adagrad_momentum_weighting");
290 
291  ////////////////////////////////////////////////////////////
292  // Set up all the buffers
293 
294  buffers.resize(n_threads);
295  alt_buffers.resize(n_threads);
296 
297  size_t max_row_size = train_data.max_row_size();
298 
299  // Init the buffer arrays.
300  for(std::vector<sgd_processing_buffer>* bv : {&buffers, &alt_buffers}) {
301  for(sgd_processing_buffer& buffer : (*bv)) {
302 
303  buffer.v.resize(max_row_size);
304 
305  for(auto& var : buffer.v) {
306  var.V_row.resize(num_factors());
307  var.xV_row.resize(num_factors());
308  }
309 
310  buffer.xv_accumulator.resize(num_factors());
311  }
312  }
313 
314  ////////////////////////////////////////////////////////////
315  // Set all the iteration-based constants to appropriate values for
316  // computing things when no iterations are happening
317 
318  s_w_factor = 1.0;
319  s_w_factor_pow.set_base(1.0);
320  s_w_identically_1 = true;
321 
322  s_V_factor = 1.0;
323  s_V_factor_pow.set_base(1.0);
324  s_V_identically_1 = true;
325 
326  iteration_sample_count = 0;
327 
328  ////////////////////////////////////////////////////////////
329  // Set up things needed for the different regularization interfaces.
330 
331  switch(regularization_type) {
332  case model_regularization_type::L2:
333  break;
334 
335  case model_regularization_type::ON_THE_FLY:
336  {
337  on_the_fly__regularization_scaling_factors.resize(n_total_dimensions());
338 
339  size_t pos = 0;
340  for(size_t c_idx = 0; c_idx < train_data.num_columns(); ++c_idx) {
341  for(size_t i = 0; i < train_data.metadata()->index_size(c_idx); ++i, ++pos) {
342  on_the_fly__regularization_scaling_factors[pos]
343  = (train_data.metadata()->statistics(c_idx)->count(i)
344  / (std::max(size_t(1), train_data.size())));
345  }
346  }
347 
348  break;
349  }
350 
351  case model_regularization_type::NONE:
352  break;
353  }
354 
355  // Set up the locking buffers if needed.
356  static constexpr size_t ITEM_COLUMN_INDEX = 1;
357 
358  if(enable_item_locking)
359  item_locks.resize(train_data.metadata()->index_size(ITEM_COLUMN_INDEX));
360 
361  // Set up the stuff for adagrad
362  if(adagrad_mode) {
363  w_adagrad_g.resize(model->w.size());
364  V_adagrad_g.resize(model->V.rows(), num_factors());
365  }
366  }
367 
368  public:
369 
370  ////////////////////////////////////////////////////////////////////////////////
371  // Functions to help with the SGD stuff.
372 
373  /** Returns the l2 regularization coefficient.
374  *
375  */
376  double l2_regularization_factor() const {
377  return (regularization_type == model_regularization_type::L2
378  ? std::max(_lambda_w, _lambda_V)
379  : 0);
380  }
381 
382  /** Returns an upper bound on the sgd step size.
383  *
384  */
385  double max_step_size() const {
386  switch(regularization_type) {
387  case model_regularization_type::L2:
388  case model_regularization_type::ON_THE_FLY:
389 
390  // This ensures that (1 - step_size * lambda) > 0, a very
391  // important requirement for numerical stability.
392 
393  return 0.9 / (1e-16 + std::max(current_lambda_w(0), current_lambda_V(0)));
394 
395  case model_regularization_type::NONE:
396  default:
397  return std::numeric_limits<double>::max();
398  }
399  }
400 
401  /** Set up the class and constants before every iteration.
402  */
403  void setup_iteration(size_t iteration, double step_size) {
404 
405  // Set the global current iteration value.
406  current_iteration = iteration;
407  current_iteration_step_size = step_size;
408 
409  // The following text is saved for debugging mode stuff.
410 
411  // if(adagrad_mode && factor_mode == model_factor_mode::factorization_machine) {
412  // size_t idx = model->index_offsets[2];
413  // logprogress_stream << "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ " << std::endl;
414  // logprogress_stream << "w = " << model->w[idx]
415  // << "; w_adagrad = " << w_adagrad_g[idx] << std::endl;
416  // logprogress_stream << "V = " << model->V.row(idx) << std::endl;
417  // logprogress_stream << "V_adagrad = " << V_adagrad_g.row(idx) << std::endl;
418 
419  // logprogress_stream << "w0 = " << model->w0 << "; w0_adagrad = " << w0_adagrad_g << std::endl;
420 
421  // logprogress_stream << "w max = " << model->w.maxCoeff()
422  // << ";\t w min = " << model->w.minCoeff()
423  // << ";\t w_ada min = " << w_adagrad_g.minCoeff()
424  // << ";\t w_ada min = " << w_adagrad_g.maxCoeff() << std::endl;
425 
426  // logprogress_stream << "V max = " << model->V.maxCoeff()
427  // << ";\t V min = " << model->V.minCoeff()
428  // << ";\t V_ada min = " << V_adagrad_g.minCoeff()
429  // << ";\t V_ada max = " << V_adagrad_g.maxCoeff() << std::endl;
430  // }
431 
432  switch(regularization_type) {
433  case model_regularization_type::L2: {
434 
435  // Set the sample count to 0.
436  iteration_sample_count = 0;
437 
438  double lambda_w = current_lambda_w(iteration);
439  double lambda_V = current_lambda_V(iteration);
440 
441  // The s scaling factor in the l2 regularization is just the
442  // power of (1 - step_size * lambda) raised to the power n,
443  // where n is the number of samples seen so far on this
444  // iteration.
445 
446  w_shrinkage = 1.0;
447  V_shrinkage = 1.0;
448 
449  double w_step_size = step_size;
450  double V_step_size = step_size;
451 
452  if(adagrad_mode) {
453  w_step_size /= std::max(1.0, std::sqrt(double(w_adagrad_g.mean())));
454  if(V_adagrad_g.rows() != 0)
455  V_step_size /= std::max(1.0, std::sqrt(double(V_adagrad_g.mean())));
456  }
457 
458  s_w_factor = 1.0 - w_step_size * lambda_w;
459  s_w_factor_pow.set_base(s_w_factor);
460  s_w_identically_1 = (s_w_factor == 1.0);
461 
462  s_V_factor = 1.0 - V_step_size * lambda_V;
463  s_V_factor_pow.set_base(s_V_factor);
464  s_V_identically_1 = (s_V_factor == 1.0);
465  break;
466  }
467  case model_regularization_type::ON_THE_FLY:
468 
469  w_shrinkage = 1.0 - step_size * _lambda_w;
470  V_shrinkage = 1.0 - step_size * _lambda_V;
471 
472  // These factors are identically 1.
473  s_w_factor = 1.0;
474  s_w_factor_pow.set_base(s_w_factor);
475  s_w_identically_1 = true;
476 
477  s_V_factor = 1.0;
478  s_V_factor_pow.set_base(s_V_factor);
479  s_V_identically_1 = true;
480  break;
481 
482  case model_regularization_type::NONE:
483 
484  w_shrinkage = 1.0;
485  V_shrinkage = 1.0;
486 
487  s_w_factor = 1.0;
488  s_w_factor_pow.set_base(s_w_factor);
489  s_w_identically_1 = true;
490 
491  s_V_factor = 1.0;
492  s_V_factor_pow.set_base(s_V_factor);
493  s_V_identically_1 = true;
494 
495  break;
496  }
497 
498  // Now, for the iteration, set this. It means that underflows are
499  // flushed to zero.
500  set_denormal_are_zero();
501 
502  current_iteration = iteration;
503  }
504 
505  ////////////////////////////////////////////////////////////////////////////////
506 
507  /** Finalizes the iteration. Called after each pass through the data.
508  *
509  */
511 
512  if(adagrad_mode && adagrad_momentum_weighting != 1.0) {
513  float rho = adagrad_momentum_weighting;
514 
515  if(w_adagrad_g.size() != 0)
516  w_adagrad_g.array() = rho * w_adagrad_g.array() + (1 - rho) * w_adagrad_g.mean();
517 
518  if(V_adagrad_g.rows() != 0)
519  V_adagrad_g.array() = rho * V_adagrad_g.array() + (1 - rho) * V_adagrad_g.mean();
520  }
521 
522  ////////////////////////////////////////////////////////////////////////////////
523  // Sync all the current stuff to the model.
524 
525  switch(regularization_type) {
526 
527  case model_regularization_type::L2:
528  {
529  double n_samples_processed = iteration_sample_count;
530  double s_w = s_w_factor_pow.pow(n_samples_processed);
531  double s_V = s_V_factor_pow.pow(n_samples_processed);
532 
533  in_parallel([&](size_t thread_idx, size_t num_threads) GL_GCC_ONLY(GL_HOT_FLATTEN) {
534 
535  if(!nmf_mode && s_w != 1.0) {
536 
537  size_t start_w_idx = (thread_idx * n_total_dimensions()) / num_threads;
538  size_t end_w_idx = ((thread_idx + 1) * n_total_dimensions()) / num_threads;
539 
540  for(size_t i = start_w_idx; i < end_w_idx; ++i) {
541 
542  model->w[i] *= s_w;
543 
544  if(std::fabs(model->w[i]) < 1e-16)
545  model->w[i] = 0;
546  }
547  }
548 
549  if(num_factor_dimensions() != 0 && s_V != 1.0) {
550 
551  size_t start_V_idx = (thread_idx * num_factor_dimensions() ) / num_threads;
552  size_t end_V_idx = ((thread_idx + 1) * num_factor_dimensions() ) / num_threads;
553 
554  for(size_t i = start_V_idx; i < end_V_idx; ++i) {
555  for(size_t j = 0; j < num_factors(); ++j) {
556 
557  model->V(i,j) *= s_V;
558 
559  if(std::fabs(model->V(i,j)) < 1e-16)
560  model->V(i,j) = 0;
561  }
562  }
563  }
564  });
565 
566  }
567 
568  case model_regularization_type::ON_THE_FLY:
569  case model_regularization_type::NONE:
570 
571  // Don't need to do anything for this regularization mode.
572  break;
573  }
574 
575  unset_denormal_are_zero();
576  }
577 
578  /// Test whether the current state is numerically stable or not; if
579  /// not, it needs to be reset.
581  if(!(std::isfinite(model->w0) && std::fabs(model->w0) <= 1e12))
582  return false;
583 
584  // This tests for the corner case of the model hitting the point
585  // where all the factors are identically 0 and then getting stuck.
586  if(nmf_mode) {
587  // Test for hitting the zero point.
588  for(size_t i = 0; i < num_factor_dimensions(); ++i) {
589  if(model->V.row(i).sum() > 1e-16)
590  return true;
591  }
592 
593  return false;
594  } else {
595  return true;
596  }
597  }
598 
599  /** Sets up the optimization run. Called at the beginning of an
600  * optimization run or to reset the state.
601  */
602  void setup_optimization(size_t random_seed = size_t(-1), bool trial_mode = false) {
603  if(random_seed == size_t(-1))
604  random_seed = model->options.at("random_seed");
605 
606  model->reset_state(random_seed, 0.001);
607  currently_in_trial_mode = trial_mode;
608 
609  if(adagrad_mode) {
610  w_adagrad_g.setConstant(1e-16);
611  V_adagrad_g.setConstant(1e-16);
612  w0_adagrad_g = 1e-16;
613  }
614 
615  }
616 
617  ////////////////////////////////////////////////////////////////////////////////
618 
619  /** Calculate the current regularization penalty.
620  *
621  */
623 
624  double lambda_w = current_lambda_w(current_iteration);
625  double lambda_V = current_lambda_V(current_iteration);
626 
627  if(regularization_type == model_regularization_type::NONE
628  || (lambda_w == 0 && lambda_V == 0) ) {
629  return 0;
630  }
631 
632  size_t n_threads = thread::cpu_count();
633 
634  std::vector<double> accumulative_regularization_penalty(n_threads, 0);
635 
636  in_parallel([&](size_t thread_idx, size_t num_threads) GL_GCC_ONLY(GL_HOT_FLATTEN) {
637 
638  size_t w_start_idx = (thread_idx * size_t(model->w.size())) / num_threads;
639  size_t w_end_idx = ((thread_idx + 1) * size_t(model->w.size())) / num_threads;
640 
641  size_t V_start_idx = (thread_idx * size_t(model->V.rows())) / num_threads;
642  size_t V_end_idx = ((thread_idx + 1) * size_t(model->V.rows())) / num_threads;
643 
644  if(regularization_type == model_regularization_type::ON_THE_FLY) {
645 
646  if(lambda_w != 0) {
647  for(size_t i = w_start_idx; i < w_end_idx; ++i) {
648 
649  accumulative_regularization_penalty[thread_idx]
650  += (lambda_w
651  * on_the_fly__regularization_scaling_factors[i]
652  * (model->w[i] * model->w[i]));
653  }
654  }
655 
656  if(lambda_V != 0) {
657  for(size_t i = V_start_idx; i < V_end_idx; ++i) {
658 
659  accumulative_regularization_penalty[thread_idx]
660  += (lambda_V
661  * on_the_fly__regularization_scaling_factors[i]
662  * model->V.row(i).squaredNorm());
663  }
664  }
665 
666  } else {
667 
668  if(lambda_w != 0) {
669  accumulative_regularization_penalty[thread_idx] +=
670  lambda_w * model->w.segment(w_start_idx, w_end_idx - w_start_idx).squaredNorm();
671  }
672 
673  if(lambda_V != 0) {
674  accumulative_regularization_penalty[thread_idx] +=
675  lambda_V * model->V.block(V_start_idx, 0, V_end_idx - V_start_idx, num_factors()).squaredNorm();
676  }
677  }
678 
679  });
680 
681  double total_reg = std::accumulate(accumulative_regularization_penalty.begin(),
682  accumulative_regularization_penalty.end(),
683  double(0.0));
684 
685  return total_reg;
686  }
687 
688 
689  /** Calculate the value of the objective function as determined by the
690  * loss function, for a full data set, minus the regularization
691  * penalty.
692  */
693  double calculate_loss(const v2::ml_data& data) const {
694 
695  std::vector<double> total_loss_accumulator(thread::cpu_count(), 0);
696 
697  volatile bool numerical_error_detected = false;
698 
699  in_parallel([&](size_t thread_idx, size_t num_threads) GL_GCC_ONLY(GL_HOT) {
700 
701  std::vector<v2::ml_data_entry> x;
702 
703  for(auto it = data.get_iterator(thread_idx, num_threads);
704  !it.done() && !numerical_error_detected;
705  ++it) {
706 
707  it.fill_observation(x);
708 
709  double y = it.target_value();
710 
711  double fx_pred = calculate_fx(thread_idx, x);
712  double point_loss = loss_model.loss(fx_pred, y);
713 
714  if(!std::isfinite(point_loss)) {
715  numerical_error_detected = true;
716  break;
717  }
718 
719  total_loss_accumulator[thread_idx] += point_loss;
720  }
721  }
722  );
723 
724  if(numerical_error_detected)
725  return NAN;
726 
727  double total_loss = std::accumulate(total_loss_accumulator.begin(),
728  total_loss_accumulator.end(), double(0));
729 
730  size_t n = data.size();
731  double loss_value = (n != 0) ? (total_loss / n) : 0;
732 
733  return loss_value;
734  }
735 
736  /** The value of the reported loss. The apply_sgd_step accumulates
737  * estimated loss values between samples. This function is called
738  * with this accumulated value to get a value
739  *
740  * For example, if squared error loss is used,
741  * reported_loss_name() could give RMSE, and then
742  * reported_loss_value(v) would be std::sqrt(v).
743  */
744  double reported_loss_value(double accumulative_loss) const {
745  return loss_model.reported_loss_value(accumulative_loss);
746  }
747 
748  /** The name of the loss to report on each iteration.
749  *
750  * For example, if squared error loss is used,
751  * reported_loss_name() could give RMSE, and then
752  * reported_loss_value(v) would be std::sqrt(v).
753  */
754  std::string reported_loss_name() const {
755  return loss_model.reported_loss_name();
756  }
757 
758  ////////////////////////////////////////////////////////////////////////////////
759 
760  private:
761 
762  /** Fill up the buffer with the current state, performing
763  * appropriate scaling along the way. Return the current function
764  * value.
765  *
766  * This version is called when the factor type is full -- i.e. the
767  * factorization machine.
768  */
769  inline double _fill_buffer_calc_value(
770  sgd_processing_buffer& buffer,
771  const std::vector<v2::ml_data_entry>& x,
772  double l2_s_w_old, double l2_s_V_old) const GL_HOT {
773 
774  const bool using_l2_regularization
775  = (regularization_type == model_regularization_type::L2);
776 
777  const double s_w = using_l2_regularization ? l2_s_w_old : 1;
778  const double s_V = using_l2_regularization ? l2_s_V_old : 1;
779 
780  // Set the row size.
781  const size_t x_size = x.size();
782 
783  ////////////////////////////////////////////////////////////////////////////////
784  // Each of the three models has a different method for computing
785  // the factor mode.
786 
787  switch(factor_mode) {
788 
789  ////////////////////////////////////////////////////////////////////////////////
790  // FACTORIZATION MODEL -- ALL FACTORS COMPUTED.
791 
792  case model_factor_mode::factorization_machine: {
793 
794  ////////////////////////////////////////////////////////////////////////////////
795  //
796  // Step 1: Pull in a snapshot of the current model values for the
797  // linear part of the model, and calculate the fx_value from the
798  // linear part as well.
799 
800  double fx_value = 0;
801 
802  buffer.xv_accumulator.setZero();
803 
804  for(size_t j = 0; j < x_size; ++j) {
805  const v2::ml_data_entry& v = x[j];
806  auto& b = buffer.v[j];
807 
808  // Get the global index
809  const size_t global_idx = model->index_offsets[v.column_index] + v.index;
810  b.index = global_idx;
811 
812  b.w = model->w[global_idx];
813 
814  // Set the x scaling on this column
815  double value_shift, value_scale;
816  std::tie(value_shift, value_scale) = model->column_shift_scales[global_idx];
817 
818  b.xv = value_scale * (v.value - value_shift);
819 
820  // Set all the rows to be correct
821  b.V_row = model->V.row(global_idx);
822  b.xV_row = (s_V * b.xv) * b.V_row;
823  buffer.xv_accumulator += b.xV_row;
824 
825  fx_value += (s_w * b.xv) * b.w;
826  }
827 
828  ////////////////////////////////////////////////////////////////////////////////
829  //
830  // Step 2: Calculate the inner product between factors by first
831  // calculating the total sum -- this is stored in xv_accumulator
832  // -- then do another pass through the data to calculate the
833  // summation over inner product. The interactions are the square
834  // of the accumulator minus the sum of the inner products.
835 
836  {
837  double fx_delta = 0;
838  for(size_t j = 0; j < x.size(); ++j) {
839  const auto& b = buffer.v[j];
840  fx_delta += buffer.xv_accumulator.dot(b.xV_row) - b.xV_row.squaredNorm();
841  }
842 
843  fx_value += 0.5*fx_delta;
844  }
845 
846 
847  ////////////////////////////////////////////////////////////////////////////////
848  //
849  // Step 3: Add in the intercept term. As this term has the
850  // most thread contention, make sure it's read as closely to
851  // the gradient update as possible.
852 
853  buffer.w0 = model->w0;
854  fx_value += buffer.w0;
855 
856  ////////////////////////////////////////////////////////////////////////////////
857  //
858  // Step 4: We're done, return!
859 
860  return fx_value;
861  }
862 
863  ////////////////////////////////////////////////////////////////////////////////
864  // MATRIX FACTORIZATION -- FACTORS ON ONLY FIRST TWO TERMS
865 
866  case model_factor_mode::matrix_factorization: {
867 
868  ////////////////////////////////////////////////////////////////////////////////
869  //
870  // Step 1: Pull in a snapshot of the current model values for the
871  // linear part of the model, and calculate the fx_value from the
872  // linear part as well.
873 
874  DASSERT_GE(buffer.v.size(), x.size());
875 
876  double fx_value = 0;
877 
878  for(size_t j : {1, 0}) {
879  const v2::ml_data_entry& v = x[j];
880  auto& b = buffer.v[j];
881 
882  DASSERT_EQ(j, v.column_index);
883 
884  // Get the global index
885  const size_t global_idx = (j == 0 ? 0 : model->index_offsets[1]) + v.index;
886  b.index = global_idx;
887  b.V_row = model->V.row(global_idx);
888 
889  // No column scaling on the first two dimensions under MF model.
890  DASSERT_EQ(v.value, 1);
891  b.xv = 1.0;
892 
893  // No need for volatile on the item terms, as they are locked.
894  fx_value += s_w * model->w[global_idx];
895  }
896 
897  ////////////////////////////////////////////////////////////////////////////////
898  // Step 2: Calculate the dimensions past the first two. These
899  // can have anything in them.
900 
901  if(x_size > 2) {
902  for(size_t j = 2; j < x_size; ++j) {
903  const v2::ml_data_entry& v = x[j];
904  auto& b = buffer.v[j];
905 
906  // Get the global index
907  const size_t global_idx = model->index_offsets[v.column_index] + v.index;
908  b.index = global_idx;
909 
910  // Set the scaling on this column
911  double value_shift, value_scale;
912  std::tie(value_shift, value_scale) = model->column_shift_scales[global_idx];
913 
914  b.xv = value_scale * (v.value - value_shift);
915 
916  fx_value += (s_w * b.xv) * model->w[global_idx];
917  }
918  }
919 
920  ////////////////////////////////////////////////////////////////////////////////
921  //
922  // Step 3: Pull in the contribution from the factors.
923  fx_value += (s_V * s_V) * (buffer.v[0].V_row.dot(buffer.v[1].V_row));
924 
925  ////////////////////////////////////////////////////////////////////////////////
926  //
927  // Finally, add in the intercept term. As this term has the
928  // most thread contention, make sure it's read as closely to
929  // the gradient update as possible.
930 
931  buffer.w0 = model->w0;
932  fx_value += buffer.w0;
933 
934  return fx_value;
935  }
936 
937  ////////////////////////////////////////////////////////////////////////////////
938  // LINEAR REGRESSION -- NO FACTORS AT ALL.
939 
940  case model_factor_mode::pure_linear_model: {
941 
942  DASSERT_GE(buffer.v.size(), x.size());
943 
944  double fx_value = 0;
945 
946  for(size_t j = 0; j < x_size; ++j) {
947  const v2::ml_data_entry& v = x[j];
948  auto& b = buffer.v[j];
949 
950  // Get the global index
951  const size_t global_idx = model->index_offsets[v.column_index] + v.index;
952  b.index = global_idx;
953 
954  b.w = model->w[global_idx];
955 
956  // Set the scaling on this column
957  double value_shift, value_scale;
958  std::tie(value_shift, value_scale) = model->column_shift_scales[global_idx];
959 
960  b.xv = value_scale * (v.value - value_shift);
961 
962  fx_value += (s_w * b.xv) * b.w;
963  }
964 
965  buffer.w0 = model->w0;
966  fx_value += buffer.w0;
967 
968  return fx_value;
969  }
970 
971  default:
972  return 0;
973  }
974  }
975 
976 
977  /** The information from the regularization updates.
978  */
979  struct _regularization_updates {
980  double s_w_old, s_w_new_inv;
981  double s_V_old, s_V_new_inv;
982  };
983 
984  /** Apply the updates to the regularization scaling parameters.
985  *
986  */
987  inline _regularization_updates _apply_regularization_update(
988  double step_size, bool apply_regularization = true)
990 
991  switch(regularization_type) {
992 
993  case model_regularization_type::L2: {
994 
995  _regularization_updates ru;
996 
997  size_t n = (apply_regularization
998  ? size_t(iteration_sample_count.fetch_add(1, std::memory_order_relaxed))
999  : size_t(iteration_sample_count));
1000 
1001  if(s_w_identically_1) {
1002  ru.s_w_old = 1;
1003  ru.s_w_new_inv = 1;
1004  } else {
1005  ru.s_w_old = s_w_factor_pow.pow(n);
1006  ru.s_w_new_inv = 1.0 / (ru.s_w_old * s_w_factor);
1007  }
1008 
1009  if(s_V_identically_1) {
1010  ru.s_V_old = 1;
1011  ru.s_V_new_inv = 1;
1012  } else {
1013  ru.s_V_old = s_V_factor_pow.pow(n);
1014  ru.s_V_new_inv = 1.0 / (ru.s_V_old * s_V_factor);
1015  }
1016 
1017  return ru;
1018  }
1019  case model_regularization_type::ON_THE_FLY:
1020  case model_regularization_type::NONE:
1021  default:
1022  return {1.0, 1.0, 1.0, 1.0};
1023 
1024  }
1025  }
1026 
1027  /** Apply w0 update.
1028  */
1029  inline void _apply_w0_gradient(
1030  sgd_processing_buffer& buffer,
1031  double l_grad,
1032  double step_size) GL_HOT_INLINE_FLATTEN {
1033 
1034  // In the case of squared error, for numerical stability, limit
1035  // the update to not go past the bottom of the quadratic curve.
1036  // Otherwise, things seem to go out of controll waaaay too often.
1037 
1038  // Only update this term if we aren't in squared error loss.
1039  if(std::is_same<LossModelProfile, loss_squared_error>::value)
1040  return;
1041 
1042  double delta = l_grad;
1043 
1044  if(adagrad_mode) {
1045  double _wg = (w0_adagrad_g += (delta * delta));
1046  delta /= std::sqrt(_wg);
1047  }
1048 
1049  model->w0 -= step_size * delta / n_threads;;
1050  }
1051 
1052  atomic<size_t> hits;
1053 
1054  /** Apply updates to the linear and quadratic terms.
1055  */
1056  inline void _apply_w_V_gradient(
1057  sgd_processing_buffer& buffer, double l_grad,
1058  double s_w_new_inv, double s_V_new_inv,
1059  size_t x_size, double step_size) GL_HOT_INLINE_FLATTEN {
1060 
1061 
1062  static constexpr bool using_on_the_fly_regularization =
1063  (regularization_type == model_regularization_type::ON_THE_FLY);
1064 
1065  typedef volatile float * __restrict__ vfloat_ptr;
1066 
1067  float ss_scaling_factor = float(adagrad_mode ? sq(step_size / current_iteration_step_size) : 1.0);
1068 
1069  // There are several modes here...
1070  switch(factor_mode) {
1071 
1072  ////////////////////////////////////////////////////////////////////////////////
1073  // FACTORIZATION MODEL -- ALL FACTORS COMPUTED.
1074 
1075  case model_factor_mode::factorization_machine: {
1076 
1077  // Prefetch all rows except the user terms.
1078  for(size_t j = 1; j < x_size; ++j)
1079  __builtin_prefetch(&(model->V(buffer.v[j].index)), 1, 1);
1080 
1081  // Now apply the gradient to w and to the factors in turn
1082  for(size_t j = 0; j < x_size; ++j) {
1083 
1084  auto& b = buffer.v[j];
1085 
1086  if(b.xv == 0) continue;
1087 
1088  // Step is the amount to move minus the regularization scaling and step size
1089 
1090  // The l_grad is needed for the adagrad computation; the scale
1091  // is how much actually gets applied. The latter depends on
1092  // step size and regularization scale.
1093 
1094  double w_grad = l_grad * b.xv;
1095  double step_w_scale = step_size;
1096  double step_V_scale = step_size;
1097 
1098  // Apply the linear terms
1099  if(!nmf_mode) {
1100 
1101  if(adagrad_mode) {
1102  w_adagrad_g[b.index] += ss_scaling_factor * w_grad * w_grad;
1103  step_w_scale /= std::sqrt(w_adagrad_g[b.index]);
1104  }
1105 
1106  // Set the new linear terms
1107  model->w[b.index] -= clip_1m1(w_grad * step_w_scale) * s_w_new_inv;;
1108 
1109  if(using_on_the_fly_regularization) {
1110  model->w[b.index] *= w_shrinkage;
1111  }
1112  }
1113 
1114  // Use xV_row as a buffer to hold the gradient step
1115 
1116  // Unclipped =
1117 
1118  // b.xV_row -= buffer.xv_accumulator;
1119 
1120  b.xV_row = (l_grad * (buffer.xv_accumulator - b.xV_row));
1121  if(adagrad_mode) {
1122 
1123  for(size_t i = 0; i < num_factors(); ++i) {
1124  V_adagrad_g(b.index, i) += ss_scaling_factor * b.xV_row[i] * b.xV_row[i];
1125  b.xV_row[i] /= std::sqrt(V_adagrad_g(b.index, i));
1126  }
1127  }
1128 
1129  if(nmf_mode) {
1130 
1131  for(size_t i = 0; i < num_factors(); ++i)
1132  b.V_row[i] -= clip_1m1(step_V_scale * b.xV_row[i]) * s_V_new_inv;
1133 
1134  if(using_on_the_fly_regularization)
1135  b.V_row *= V_shrinkage;
1136 
1137  for(size_t i = 0; i < num_factors(); ++i) {
1138  if(b.V_row[i] < 0)
1139  b.V_row[i] = 0;
1140  }
1141 
1142  model->V.row(b.index) = b.V_row;
1143 
1144  } else {
1145  if(using_on_the_fly_regularization) {
1146  for(size_t i = 0; i < num_factors(); ++i)
1147  model->V(b.index, i)
1148  = V_shrinkage * (b.V_row[i] - clip_1m1(step_V_scale * b.xV_row[i]) * s_V_new_inv);
1149 
1150  } else {
1151  b.xV_row *= step_V_scale;
1152  b.xV_row = b.xV_row.cwiseMin(float(1.0));
1153  b.xV_row = b.xV_row.cwiseMax(float(-1.0));
1154 
1155  model->V.row(b.index) -= s_V_new_inv * b.xV_row;
1156  }
1157  }
1158  }
1159  break;
1160  }
1161 
1162  ////////////////////////////////////////////////////////////////////////////////
1163  // MATRIX FACTORIZATION -- FACTORS ON ONLY FIRST TWO TERMS
1164 
1165  case model_factor_mode::matrix_factorization: {
1166 
1167  // In the case of squared error, for numerical stability, limit
1168  // the update to not go past the bottom of the quadratic curve.
1169  // Otherwise, things seem to go out of controll waaaay too often.
1170 
1171  if(!nmf_mode) {
1172  for(size_t j = 0; j < x_size; ++j) {
1173  const auto& b = buffer.v[j];
1174 
1175  double w_delta = l_grad * b.xv;
1176 
1177  if(adagrad_mode) {
1178  w_adagrad_g[b.index] += ss_scaling_factor * w_delta * w_delta;
1179  w_delta /= std::sqrt(w_adagrad_g[b.index]);
1180  }
1181 
1182  vfloat_ptr w_ptr = (vfloat_ptr)(&(model->w[b.index]));
1183 
1184  // Clip for stability.
1185 
1186  if(using_on_the_fly_regularization)
1187  *w_ptr = w_shrinkage * ((*w_ptr) - (clip_1m1(step_size * w_delta) * s_w_new_inv));
1188  else
1189  *w_ptr -= clip_1m1(step_size * w_delta) * s_w_new_inv;
1190  }
1191  }
1192 
1193  auto GL_GCC_ONLY(__restrict__)& b0 = buffer.v[0];
1194  auto GL_GCC_ONLY(__restrict__)& b1 = buffer.v[1];
1195 
1196  b0.xV_row = l_grad * b1.V_row;
1197  b1.xV_row = l_grad * b0.V_row;
1198 
1199  // Clip for stability; apply to the temporary buffer.
1200 
1201  if(adagrad_mode) {
1202  for(size_t i = 0; i < num_factors(); ++i) {
1203  V_adagrad_g(b0.index, i) += ss_scaling_factor * b0.xV_row[i] * b0.xV_row[i];
1204  b0.xV_row[i] /= std::sqrt(V_adagrad_g(b0.index, i));
1205  }
1206 
1207  for(size_t i = 0; i < num_factors(); ++i) {
1208  V_adagrad_g(b1.index, i) += ss_scaling_factor * b1.xV_row[i] * b1.xV_row[i];
1209  b1.xV_row[i] /= std::sqrt(V_adagrad_g(b1.index, i));
1210  }
1211  }
1212 
1213  b0.xV_row *= step_size;
1214  b0.xV_row = b0.xV_row.cwiseMin(float(1.0));
1215  b0.xV_row = b0.xV_row.cwiseMax(float(-1.0));
1216 
1217  b1.xV_row *= step_size;
1218  b1.xV_row = b1.xV_row.cwiseMin(float(1.0));
1219  b1.xV_row = b1.xV_row.cwiseMax(float(-1.0));
1220 
1221  // Apply the gradient to the temporary buffer
1222  b0.V_row -= s_V_new_inv * b0.xV_row;
1223  b1.V_row -= s_V_new_inv * b1.xV_row;
1224 
1225  if(using_on_the_fly_regularization) {
1226  b0.V_row *= V_shrinkage;
1227  b1.V_row *= V_shrinkage;
1228  }
1229 
1230  if(nmf_mode) {
1231  // Clip to non-negative values.
1232  model->V.row(b0.index) = b0.V_row.cwiseMax(float(0));
1233  model->V.row(b1.index) = b1.V_row.cwiseMax(float(0));
1234  } else {
1235  model->V.row(b0.index) = b0.V_row;
1236  model->V.row(b1.index) = b1.V_row;
1237  }
1238 
1239  break;
1240  }
1241 
1242  ////////////////////////////////////////////////////////////////////////////////
1243  // LINEAR REGRESSION -- NO FACTORS AT ALL.
1244 
1245  case model_factor_mode::pure_linear_model: {
1246 
1247  for(size_t j = 0; j < x_size; ++j) {
1248  const auto& b = buffer.v[j];
1249 
1250  double w_delta = l_grad * b.xv;
1251 
1252  if(adagrad_mode) {
1253  w_adagrad_g[b.index] += ss_scaling_factor * w_delta * w_delta;
1254  w_delta /= std::sqrt(w_adagrad_g[b.index]);
1255  }
1256 
1257  vfloat_ptr w_ptr = (vfloat_ptr)(&(model->w[b.index]));
1258 
1259  // Clip for stability.
1260 
1261  if(using_on_the_fly_regularization)
1262  *w_ptr = w_shrinkage * ((*w_ptr) - (clip_1m1(step_size * w_delta) * s_w_new_inv));
1263  else
1264  *w_ptr -= clip_1m1(step_size * w_delta) * s_w_new_inv;
1265  }
1266  break;
1267  }
1268  }
1269  }
1270 
1271  public:
1272 
1273  ////////////////////////////////////////////////////////////////////////////////
1274 
1275  /** Calculate the gradient with respect to a single observation,
1276  * then applies it. Used by the basic sgd solver; this one is used
1277  * for the second_order model.
1278  *
1279  * x is the observation vector, in standard ml_data_entry format,
1280  * formed by ml_data_iterator.fill_observation(...).
1281  *
1282  * struct ml_data_entry {
1283  * size_t column_index; // Column id
1284  * size_t index; // Local index within the column.
1285  * double value; // Value
1286  * };
1287  */
1288  double calculate_fx(size_t thread_idx,
1289  const std::vector<v2::ml_data_entry>& x) const GL_HOT_FLATTEN {
1290 
1291  DASSERT_LT(thread_idx, buffers.size());
1292 
1293  sgd_processing_buffer& buffer = buffers[thread_idx];
1294 
1295  switch(regularization_type) {
1296  case model_regularization_type::L2: {
1297  size_t n = iteration_sample_count;
1298 
1299  double s_w, s_V;
1300 
1301  if(n == 0) {
1302  s_w = s_V = 1;
1303  } else {
1304  s_w = s_w_factor_pow.pow(n);
1305  s_V = s_V_factor_pow.pow(n);
1306  }
1307 
1308  return _fill_buffer_calc_value(buffer, x, s_w, s_V);
1309  }
1310  case model_regularization_type::ON_THE_FLY:
1311  case model_regularization_type::NONE: {
1312  return _fill_buffer_calc_value(buffer, x, 1.0, 1.0);
1313  }
1314  default:
1315  return 0;
1316  }
1317  }
1318 
1319  ////////////////////////////////////////////////////////////////////////////////
1320 
1321  /** Calculate the gradient with respect to a single observation,
1322  * then applies it. Used by the basic sgd solver; this one is used
1323  * for the second_order model.
1324  *
1325  * x is the observation vector, in standard ml_data_entry format,
1326  * formed by ml_data_iterator.fill_observation(...).
1327  *
1328  * struct ml_data_entry {
1329  * size_t column_index; // Column id
1330  * size_t index; // Local index within the column.
1331  * double value; // Value
1332  * };
1333  */
1335  inline double apply_sgd_step(size_t thread_idx,
1336  const std::vector<v2::ml_data_entry>& x,
1337  double y,
1338  double step_size,
1339  bool apply_regularization) {
1340 
1341  sgd_processing_buffer& buffer = buffers[thread_idx];
1342 
1343  static constexpr size_t ITEM_COLUMN_INDEX = 1;
1344 
1345  ////////////////////////////////////////////////////////////////////////////////
1346  //
1347  // Step 1: Update the scaling of the regularization tracking
1348  // constants. We need to do this atomically; otherwise, the
1349  // updates may not be consistent.
1350 
1351  auto ru = _apply_regularization_update(step_size, apply_regularization);
1352 
1353  ////////////////////////////////////////////////////////////////////////////////
1354  //
1355  // Step 2: Pull in a snapshot of the current model values for the
1356  // linear part of the model, and calculate the fx_value from the
1357  // linear part as well.
1358 
1359  std::unique_lock<simple_spinlock> item_lock(item_locks[x[ITEM_COLUMN_INDEX].index], std::defer_lock);
1360 
1361  if(enable_item_locking)
1362  item_lock.lock();
1363 
1364  double fx_value = _fill_buffer_calc_value(buffer, x, ru.s_w_old, ru.s_V_old);
1365 
1366 
1367  // The following commented out code is kept in here to help with debugging.
1368 
1369  // if(!(std::fabs(fx_value) < 1000)) {
1370 
1371  // logprogress_stream << "Thread " << thread_idx
1372  // << "; user = " << x[0].index
1373  // << "; item = " << x[1].index
1374  // << "; w0 = " << model->w0
1375  // << "; buffered w0 = " << buffer.w0
1376  // << "; fx_value = " << fx_value
1377  // << "; s.s_w_new_inv = " << s.s_w_new_inv
1378  // << "; s.s_V_new_inv = " << s.s_V_new_inv
1379  // << "; step_size = " << step_size
1380  // << "; y = " << y
1381  // << "; w[user] = " << model->w[buffer.v[0].index]
1382  // << "; w[item] = " << model->w[buffer.v[1].index]
1383  // << "; item_global = " << buffer.v[1].index
1384  // << "; item_count = " << model->metadata->statistics(1)->count(x[1].index)
1385  // << std::endl;
1386 
1387  // if(num_factors() != 0) {
1388  // logprogress_stream << "; V[user] = " << model->V.row(buffer.v[0].index)
1389  // << "; V[item] = " << model->V.row(buffer.v[1].index) << std::endl;
1390  // }
1391 
1392 
1393  // for(size_t i = 0; i < size_t(model->w.size()); ++i) {
1394  // float v = model->w[i];
1395  // if(! (std::fabs(v) < 1000) ) {
1396  // logstream(LOG_WARNING) << "Thread " << thread_idx
1397  // << "; w[" << i << "] = " << v
1398  // << std::endl;
1399  // }
1400  // }
1401  // }
1402 
1403  ////////////////////////////////////////////////////////////////////////////////
1404  //
1405  // Step 3: Everything needed for the prediction fx should now be
1406  // calculated; go ahead and use that to get the gradient of the
1407  // loss function at fx_value.
1408 
1409  double l_grad = loss_model.loss_grad(fx_value, y);
1410 
1411  ////////////////////////////////////////////////////////////
1412  //
1413  // Step 4: Apply all the updates.
1414 
1415  // Apply the gradient to w0.
1416  if(!nmf_mode)
1417  _apply_w0_gradient(buffer, l_grad, step_size);
1418 
1419  const size_t x_size = x.size();
1420 
1421  // Apply the gradient to w and V
1422  _apply_w_V_gradient(buffer, l_grad, ru.s_w_new_inv, ru.s_V_new_inv, x_size, step_size);
1423 
1424  if(enable_item_locking)
1425  item_lock.unlock();
1426 
1427  // Flush all registers to memory.
1428  asm volatile("" ::: "memory");
1429 
1430  ////////////////////////////////////////////////////////////
1431  // Step 5: Return the state of the model at the old value
1432  // Make sure we have the most recent
1433  double loss_value = loss_model.loss(fx_value, y);
1434 
1435  // The following commented code is kept in here for possible future debugging.
1436 
1437  // if(!std::isfinite(loss_value)
1438  // || std::fabs(model->w0) > 1000
1439  // || !std::isfinite(_fill_buffer_calc_value(buffer, x, s.s_w_old, s.s_V_old) )) {
1440 
1441  // logstream(LOG_WARNING) << "Thread " << thread_idx
1442  // << "; w0 = " << model->w0
1443  // << "; buffered w0 = " << buffer.w0
1444  // << "; fx_value = " << fx_value
1445  // << "; l_grad = " << l_grad
1446  // << "; s.s_w_new_inv = " << s.s_w_new_inv
1447  // << "; s.s_V_new_inv = " << s.s_V_new_inv
1448  // << "; step_size = " << step_size
1449  // << "; y = " << y
1450  // << std::endl;
1451 
1452 
1453  // for(size_t i = 0; i < size_t(model->w.size()); ++i) {
1454  // float v = model->w[i];
1455  // if(! (std::fabs(v) < 1000) ) {
1456  // logstream(LOG_WARNING) << "Thread " << thread_idx
1457  // << "; w[" << i << "] = " << v
1458  // << std::endl;
1459  // }
1460  // }
1461 
1462  // }
1463 
1464  return loss_value;
1465  }
1466 
1467 
1468  /** Calculate the gradient with respect to a single observation,
1469  * then applies it. Used by the basic sgd solver; this one is used
1470  * for the second_order model.
1471  *
1472  * x is the observation vector, in standard ml_data_entry format,
1473  * formed by ml_data_iterator.fill_observation(...).
1474  *
1475  * struct ml_data_entry {
1476  * size_t column_index; // Column id
1477  * size_t index; // Local index within the column.
1478  * double value; // Value
1479  * };
1480  */
1481 
1483  inline double apply_sgd_step(size_t thread_idx,
1484  const std::vector<v2::ml_data_entry>& x,
1485  double y,
1486  double step_size) {
1487  return apply_sgd_step(thread_idx, x, y, step_size, true);
1488  }
1489 
1490  ////////////////////////////////////////////////////////////////////////////////
1491 
1492  /** Calculate the gradient with respect to a single observation,
1493  * then applies it. Used by the basic sgd solver; this one is used
1494  * for the first order model.
1495  *
1496  * x is the observation vector, in standard ml_data_entry format,
1497  * formed by ml_data_iterator.fill_observation(...).
1498  *
1499  * struct ml_data_entry {
1500  * size_t column_index; // Column id
1501  * size_t index; // Local index within the column.
1502  * double value; // Value
1503  * };
1504  */
1506  size_t thread_idx,
1507  const std::vector<v2::ml_data_entry>& x_positive,
1508  const std::vector<v2::ml_data_entry>& x_negative,
1509  double step_size) GL_HOT_FLATTEN {
1510  sgd_processing_buffer& buffer_1 = buffers[thread_idx];
1511  sgd_processing_buffer& buffer_2 = alt_buffers[thread_idx];
1512 
1513  ////////////////////////////////////////////////////////////////////////////////
1514  //
1515  // Step 1: Pull in a snapshot of the current model values.
1516  // Calculate the current linear predictor fx along the way.
1517 
1518  DASSERT_GE(buffer_1.v.size(), x_positive.size());
1519  DASSERT_GE(buffer_2.v.size(), x_negative.size());
1520 
1521  // Check that the user index is the same. For this we can ignore that.
1522  DASSERT_EQ(x_positive[0].index, x_negative[0].index);
1523  DASSERT_NE(x_positive[1].index, x_negative[1].index);
1524 
1525  auto s = _apply_regularization_update(step_size);
1526 
1527  double fx_diff_value = (_fill_buffer_calc_value(buffer_1, x_positive, s.s_w_old, s.s_V_old)
1528  - _fill_buffer_calc_value(buffer_2, x_negative, s.s_w_old, s.s_V_old));
1529 
1530  // Everything needed for the prediction fx should now be
1531  // calculated; go ahead and use that to get the gradient of the
1532  // loss function at fx_value.
1533  double l_grad = loss_model.loss_grad(fx_diff_value, 0);
1534 
1535  if(! (std::fabs(l_grad) < 1e-16) ) {
1536 
1537  // No need to apply gradient to w0; this just gets canceled out.
1538 
1539  // Apply gradient to the w and V terms.
1540  _apply_w_V_gradient(buffer_1, l_grad, s.s_w_new_inv, s.s_V_new_inv,
1541  x_positive.size(), step_size);
1542 
1543  _apply_w_V_gradient(buffer_2, -l_grad, s.s_w_new_inv, s.s_V_new_inv,
1544  x_positive.size(), step_size);
1545  }
1546 
1547  return loss_model.loss(fx_diff_value, 0);
1548  }
1549 
1550 };
1551 
1552 }}
1553 
1554 #endif
GL_HOT_INLINE_FLATTEN double apply_sgd_step(size_t thread_idx, const std::vector< v2::ml_data_entry > &x, double y, double step_size)
size_t column_index
#define GL_HOT_FLATTEN
double calculate_fx(size_t thread_idx, const std::vector< v2::ml_data_entry > &x) const GL_HOT_FLATTEN
GL_HOT_INLINE_FLATTEN double apply_sgd_step(size_t thread_idx, const std::vector< v2::ml_data_entry > &x, double y, double step_size, bool apply_regularization)
void setup(const v2::ml_data &train_data, const std::map< std::string, flexible_type > &options)
double apply_pairwise_sgd_step(size_t thread_idx, const std::vector< v2::ml_data_entry > &x_positive, const std::vector< v2::ml_data_entry > &x_negative, double step_size) GL_HOT_FLATTEN
double value
static size_t cpu_count()
double pow(size_t b) const GL_HOT_INLINE_FLATTEN
void setup_optimization(size_t random_seed=size_t(-1), bool trial_mode=false)
#define GL_HOT_INLINE_FLATTEN
void in_parallel(const std::function< void(size_t thread_id, size_t num_threads)> &fn)
Definition: lambda_omp.hpp:35
size_t index