Turi Create  4.0
newton_method-inl.hpp
/* Copyright © 2017 Apple Inc. All rights reserved.
 *
 * Use of this source code is governed by a BSD-3-clause license that can
 * be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
 */
#ifndef TURI_NEWTON_METHOD_H_
#define TURI_NEWTON_METHOD_H_

#include <ml/optimization/optimization_interface.hpp>
#include <core/data/flexible_type/flexible_type.hpp>
#include <Eigen/Core>
#include <Eigen/Cholesky>

#include <ml/optimization/utils.hpp>
#include <ml/optimization/regularizer_interface.hpp>
#include <ml/optimization/line_search-inl.hpp>
#include <core/logging/table_printer/table_printer.hpp>

// TODO: List of todo's for this file
//------------------------------------------------------------------------------
// 1. Sparse hessian newton method?

namespace turi {

namespace optimization {


/**
 * \ingroup group_optimization
 * \addtogroup Newton Newton Method
 * \{
 */

/**
 *
 * Solve a model exposing the second_order_opt_interface with a (dense) Hessian
 * Newton method. A short usage sketch follows this documentation block.
 *
 * \param[in,out] model Model with second order optimization interface.
 * \param[in] init_point Starting point for the solver.
 * \param[in,out] opts Solver options.
 * \param[in] reg Shared ptr to an interface to a smooth regularizer.
 * \return Solver return stats.
 * \tparam Vector Sparse or dense gradient representation.
 *
 * \note The Hessian is always computed as a dense matrix; only gradients may
 * be sparse. The implementation of the Newton method must change when the
 * Hessian is sparse, i.e. we can no longer use a dense LDLT decomposition to
 * solve the Newton system and would have to switch to conjugate gradient or a
 * sparse LDLT decomposition.
 *
 */
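// Example usage (a minimal sketch, not part of the original file): `model` is
// assumed to be any object implementing second_order_opt_interface, and
// `init_point` a starting vector of matching dimension. The option keys below
// are the ones this solver reads.
//
//   std::map<std::string, flexible_type> opts;
//   opts["max_iterations"] = 20;
//   opts["convergence_threshold"] = 1e-5;
//   solver_return stats = newton_method(model, init_point, opts);
//   logprogress_stream << "Residual at solution: " << stats.residual
//                      << std::endl;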
template <typename Vector = DenseVector>
solver_return newton_method(second_order_opt_interface& model,
    const DenseVector& init_point,
    std::map<std::string, flexible_type>& opts,
    const std::shared_ptr<smooth_regularizer_interface> reg=NULL){

  // Benchmarking utils.
  timer t;
  double start_time = t.current_time();
  solver_return stats;

  logprogress_stream << "Starting Newton Method " << std::endl;
  logprogress_stream << "--------------------------------------------------------" << std::endl;
  std::stringstream ss;
  ss.str("");

  // Step 1: Algorithm option init
  // ------------------------------------------------------------------------
  // Load options
  size_t iter_limit = opts["max_iterations"];
  double convergence_threshold = opts["convergence_threshold"];
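  // The step size stays fixed at 1, i.e. a full Newton step is taken on every
  // iteration; no line search is performed in this solver.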
  double step_size = 1;
  size_t iters = 0;

  // Log iteration and residual norms
  table_printer printer(model.get_status_header(
      {"Iteration", "Passes", "Elapsed Time"}));
  printer.print_header();

  // First compute the gradient. Sometimes, the starting point is already the
  // solution; in that case we don't want to waste time performing a Newton
  // step.
  DenseVector point = init_point;
  Vector gradient(point.size());
  DenseVector reg_gradient(point.size());
  DenseMatrix hessian(gradient.size(), gradient.size());
  DiagonalMatrix reg_hessian(gradient.size());
  double func_value;
  double relative_error;

  // Compute gradient (Add regularizer gradient)
  model.compute_second_order_statistics(point, hessian, gradient, func_value);
  stats.num_passes++;
  if (reg != NULL){
    reg->compute_gradient(point, reg_gradient);
    gradient += reg_gradient;
  }
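  // The residual measures the size of the (regularized) gradient and is used
  // as the convergence criterion in the loop below.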
  double residual = compute_residual(gradient);

  // Keep track of the change in the point between iterations.
  DenseVector delta_point = point;
  delta_point.setZero();


  // NaN checking!
  if (std::isnan(residual) || std::isinf(residual)){
    stats.status = OPTIMIZATION_STATUS::OPT_NUMERIC_OVERFLOW;
  }

  // Step 2: Algorithm starts here
  // ------------------------------------------------------------------------
  // While not converged
  while((residual >= convergence_threshold) && (iters < iter_limit)){

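    // The gradient and Hessian at the current point were computed before the
    // loop (first iteration) or at the end of the previous iteration.
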
    // Add regularizer hessian
    if (reg != NULL){
      reg->compute_hessian(point, reg_hessian);
      hessian += reg_hessian;
    }
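    // Newton direction: solve H * d = -g via an LDLT (robust Cholesky)
    // factorization rather than forming the inverse of the Hessian explicitly.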
    delta_point = -step_size * hessian.ldlt().solve(gradient);
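    // Check how accurately the Newton system was solved: with a unit step,
    // H * delta_point + g should be (numerically) zero, so a large relative
    // value flags a (near-)singular or badly scaled Hessian.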
    relative_error = (hessian*delta_point + gradient).norm()
                     / std::max(gradient.norm(), OPTIMIZATION_ZERO);

    // LDLT decomposition failed.
    if (relative_error > convergence_threshold){
      logprogress_stream << "WARNING: Matrix is close to being singular or"
          << " badly scaled. The solution is accurate only up to a tolerance of "
          << relative_error << ". This typically happens when regularization"
          << " is not sufficient. Consider increasing regularization."
          << std::endl;
      stats.status = OPTIMIZATION_STATUS::OPT_NUMERIC_ERROR;
      break;
    }

    // Update the new point and gradient
    point = point + delta_point;

    // Numerical overflow. (Step size was too large.)
    if (!delta_point.array().isFinite().all()) {
      stats.status = OPTIMIZATION_STATUS::OPT_NUMERIC_OVERFLOW;
      break;
    }

    model.compute_second_order_statistics(point, hessian, gradient, func_value);
    if (reg != NULL){
      reg->compute_gradient(point, reg_gradient);
      gradient += reg_gradient;
    }
    residual = compute_residual(gradient);
    stats.num_passes++;
    iters++;

    // Log info for debugging.
    logstream(LOG_INFO) << "Iters (" << iters << ") "
                        << "Passes (" << stats.num_passes << ") "
                        << "Residual (" << residual << ") "
                        << "Loss (" << func_value << ") "
                        << std::endl;

    // Check for NaNs in the function value.
    if(std::isinf(func_value) || std::isnan(func_value)) {
      stats.status = OPTIMIZATION_STATUS::OPT_NUMERIC_OVERFLOW;
      break;
    }

    // Print progress
    auto stat_info = {std::to_string(iters),
                      std::to_string(stats.num_passes),
                      std::to_string(t.current_time())};
    auto row = model.get_status(point, stat_info);
    printer.print_progress_row_strs(iters, row);

  }
  printer.print_footer();

  // Step 3: Return optimization model status.
  // ------------------------------------------------------------------------
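  // If no error status was recorded during the iterations, classify the exit:
  // the loop either converged within the iteration budget or hit the limit.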
  if (stats.status == OPTIMIZATION_STATUS::OPT_UNSET) {
    if (iters < iter_limit){
      stats.status = OPTIMIZATION_STATUS::OPT_OPTIMAL;
    } else {
      stats.status = OPTIMIZATION_STATUS::OPT_ITERATION_LIMIT;
    }
  }
  stats.iters = static_cast<int>(iters);
  stats.residual = residual;
  stats.func_value = func_value;
  stats.solve_time = t.current_time() - start_time;
  stats.solution = point;
  stats.gradient = gradient;
  stats.hessian = hessian;
  stats.progress_table = printer.get_tracked_table();

  // Display solver stats
  log_solver_summary_stats(stats);

  return stats;

}


} // optimization

/// \}
} // turi

#endif