Turi Create 4.0
line_search-inl.hpp
/* Copyright © 2017 Apple Inc. All rights reserved.
 *
 * Use of this source code is governed by a BSD-3-clause license that can
 * be found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause
 */
#ifndef TURI_LINE_SEARCH_H_
#define TURI_LINE_SEARCH_H_

// Types
#include <core/data/flexible_type/flexible_type.hpp>

// Optimization
#include <ml/optimization/optimization_interface.hpp>
#include <ml/optimization/regularizer_interface.hpp>
#include <Eigen/Core>
#include <core/logging/assertions.hpp>

// TODO: List of todos for this file
//------------------------------------------------------------------------------
// 1. Feature: Armijo cubic-interpolation line search.
// 2. Feature: Wolfe cubic-interpolation line search.
// 3. Optimization: Add the accepted function value to the ls_return structure
//    (saves one function evaluation per iteration).
//
namespace turi {

namespace optimization {


/**
 * \ingroup group_optimization
 * \addtogroup line_search Line Search
 * \{
 */

38 /** "Zoom" phase for More and Thuente line seach.
39  *
40  * \note Applicable for smooth functions only.
41  *
42  * This code is a C++ port of Jorge Nocedal's implementaiton of More and
43  * Thuente [2] line search. This code was availiable at
44  * http://www.ece.northwestern.edu/~nocedal/lbfgs.html
45  *
46  * Nocedal's Condition for Use: This software is freely available for
47  * educational or commercial purposes. We expect that all publications
48  * describing work using this software quote at least one of the references
49  * given below. This software is released under the BSD License
50  *
51  * The purpose of cstep is to compute a safeguarded step for a linesearch and
52  * to update an interval of uncertainty for a minimizer of the function.
53  *
54  * The parameter stx contains the step with the least function value. The
55  * parameter stp contains the current step. It is assumed that the derivative
56  * at stx is negative in the direction of the step. If brackt is set true then
57  * a minimizer has been bracketed in an interval of uncertainty with endpoints
58  * stx and sty.
59  *
60  *
61  * \param[in] stx Best step obtained so far.
62  * \param[in] fx Function value at the best step so far.
63  * \param[in] dx Directional derivative at the best step so far.
64  * \param[in] sty Step at the end point of the uncertainty interval.
65  * \param[in] fy Function value at end point of uncertainty interval.
66  * \param[in] dy Directional derivative at end point of uncertainty interval.
67  * \param[in] stp Current step.
68  * \param[in] fp Function value at current step.
69  * \param[in] dp Directional derivative at the current step.
70  * \param[in] brackt Has the minimizer has been bracketed?
71  * \param[in] stpmin Min step size
72  * \param[in] stpmax Max step size.
73  *
74  *
75  */
inline bool cstep(double &stx, double &fx, double &dx,
                  double &sty, double &fy, double &dy,
                  double &stp, double &fp, double &dp,
                  bool &brackt, double stpmin, double stpmax){


  const double p66 = 0.66;
  bool info = false;

  // Check the input parameters for errors.
  // This should not happen.
  if ((brackt && (stp <= std::min(stx,sty) || stp >= std::max(stx,sty)))
      || (dx*(stp-stx) >= LS_ZERO) || (stpmax < stpmin)){
    return info;
  }


  // Determine if the derivatives have opposite sign.
  double sgnd = dp*(dx/std::abs(dx));

  // Local variables
  bool bound;
  double theta, s, gamma, p, q, r, stpc, stpq, stpf;

  // First case.
  // -------------------------------------------------------------------------
  // A higher function value. The minimum is bracketed. If the cubic step is
  // closer to stx than the quadratic step, the cubic step is taken, else the
  // average of the cubic and quadratic steps is taken.
  if (fp > fx){

    info = true;
    bound = true;
    theta = 3*(fx - fp)/(stp - stx) + dx + dp;
    s = std::max(std::abs(theta), std::max(std::abs(dx),std::abs(dp)));
    gamma = s*sqrt(pow(theta/s,2) - (dx/s)*(dp/s));

    if (stp < stx)
      gamma = -gamma;

    p = (gamma - dx) + theta;
    q = ((gamma - dx) + gamma) + dp;
    r = p/q;
    stpc = stx + r*(stp - stx);
    stpq = stx + ((dx/((fx-fp)/(stp-stx)+dx))/2)*(stp - stx);

    if (std::abs(stpc-stx) < std::abs(stpq-stx)){
      stpf = stpc;
    } else{
      stpf = stpc + (stpq - stpc)/2;
    }

    brackt = true;

  }
  // Second case.
  // -------------------------------------------------------------------------
  // A lower function value and derivatives of opposite sign. The
  // minimum is bracketed. If the cubic step is closer to stx than the
  // quadratic (secant) step, the cubic step is taken, else the quadratic step
  // is taken.
  else if(sgnd <= 0){

    info = true;
    bound = false;

    theta = 3*(fx - fp)/(stp - stx) + dx + dp;
    s = std::max(std::abs(theta), std::max(std::abs(dx),std::abs(dp)));
    gamma = s*sqrt(pow(theta/s,2) - (dx/s)*(dp/s));

    if (stp > stx)
      gamma = -gamma;

    p = (gamma - dp) + theta;
    q = ((gamma - dp) + gamma) + dx;
    r = p/q;
    stpc = stp + r*(stx - stp);
    stpq = stp + (dp/(dp-dx))*(stx - stp);
    if (std::abs(stpc-stp) > std::abs(stpq-stp)){
      stpf = stpc;
    } else {
      stpf = stpq;
    }
    brackt = true;
  }
  // Third case.
  // -------------------------------------------------------------------------
  // A lower function value, derivatives of the same sign, and the
  // magnitude of the derivative decreases. The cubic step is only used if the
  // cubic tends to infinity in the direction of the step or if the minimum of
  // the cubic is beyond stp. Otherwise the cubic step is defined to be either
  // stpmin or stpmax. The quadratic (secant) step is also computed and if the
  // minimum is bracketed then the step closest to stx is taken, else the
  // step farthest away is taken.
  else if (std::abs(dp) < std::abs(dx)){

    info = true;
    bound = true;
    theta = 3*(fx - fp)/(stp - stx) + dx + dp;
    s = std::max(std::abs(theta), std::max(std::abs(dx),std::abs(dp)));

    // The case gamma = 0 only arises if the cubic does not tend to infinity
    // in the direction of the step.

    gamma = s*sqrt(std::max(0., pow((theta/s),2) - (dx/s)*(dp/s)));
    if (stp > stx){
      gamma = -gamma;
    }

    p = (gamma - dp) + theta;
    q = (gamma + (dx - dp)) + gamma;
    r = p/q;
    // The cubic step is used only if r < 0 and gamma is nonzero, i.e. the
    // cubic tends to infinity in the direction of the step.
    if((r < 0.0) && (std::abs(gamma) > OPTIMIZATION_ZERO)){
      stpc = stp + r*(stx - stp);
    }else if (stp > stx){
      stpc = stpmax;
    }else{
      stpc = stpmin;
    }

    stpq = stp + (dp/(dp-dx))*(stx - stp);
    if (brackt){
      if (std::abs(stp-stpc) < std::abs(stp-stpq)){
        stpf = stpc;
      }else{
        stpf = stpq;
      }
    }else{
      if (std::abs(stp-stpc) > std::abs(stp-stpq)){
        stpf = stpc;
      }else{
        stpf = stpq;
      }
    }

  }
  // Fourth case.
  // -------------------------------------------------------------------------
  // A lower function value, derivatives of the same sign, and the magnitude of
  // the derivative does not decrease. If the minimum is not bracketed, the
  // step is either stpmin or stpmax, else the cubic step is taken.
  else{
    info = true;
    bound = false;
    if (brackt){
      theta = 3*(fp - fy)/(sty - stp) + dy + dp;
      s = std::max(std::abs(theta), std::max(std::abs(dy),std::abs(dp)));
      gamma = s*sqrt(pow(theta/s,2) - (dy/s)*(dp/s));

      if (stp > sty){
        gamma = -gamma;
      }
      p = (gamma - dp) + theta;
      q = ((gamma - dp) + gamma) + dy;
      r = p/q;
      stpc = stp + r*(sty - stp);
      stpf = stpc;
    }else if (stp > stx){
      stpf = stpmax;
    }else {
      stpf = stpmin;
    }
  }

  // Update the interval of uncertainty.
  // This update does not depend on the new step or the case analysis above.
  if (fp > fx){
    sty = stp;
    fy = fp;
    dy = dp;
  }else{

    if (sgnd < 0.0){
      sty = stx;
      fy = fx;
      dy = dx;
    }
    stx = stp;
    fx = fp;
    dx = dp;
  }

  // Compute the new step and safeguard it.
  stpf = std::min(stpmax,stpf);
  stpf = std::max(stpmin,stpf);
  stp = stpf;
  if (brackt && bound){
    if (sty > stx){
      stp = std::min(stx+p66*(sty-stx),stp);
    }else{
      stp = std::max(stx+p66*(sty-stx),stp);
    }
  }

  // cstep failed
  if (info == false){
    logprogress_stream << "Warning:"
                       << " Unable to interpolate step size intervals."
                       << std::endl;
  }
  return info;

}
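
// The following is a minimal usage sketch of cstep, not part of the original
// source. The values are hypothetical: they bracket a minimizer of
// f(s) = (s - 1)^2 along the step axis, and cstep proposes the next
// safeguarded trial step.
//
//   bool brackt = true;
//   double stx = 0.0, fx = 1.0,  dx = -2.0;  // best step so far: f(0) = 1
//   double sty = 2.0, fy = 1.0,  dy =  2.0;  // other endpoint:   f(2) = 1
//   double stp = 0.5, fp = 0.25, dp = -1.0;  // current trial:    f(0.5) = 0.25
//   bool ok = cstep(stx, fx, dx, sty, fy, dy, stp, fp, dp,
//                   brackt, /*stpmin=*/0.0, /*stpmax=*/2.0);
//   // On success, stp holds the new trial step and [stx, sty] is the
//   // updated interval of uncertainty.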


/**
 *
 * Compute step sizes for line search methods to satisfy the strong Wolfe
 * conditions.
 *
 * \note Applicable for smooth functions only.
 *
 * This code is a C++ port of Jorge Nocedal's implementation of the More and
 * Thuente [1] line search. The original code was available at
 * http://www.ece.northwestern.edu/~nocedal/lbfgs.html
 *
 * The line search, based on More and Thuente [1], finds a step which
 * satisfies both a sufficient decrease condition and a curvature condition.
 *
 * At each stage the function updates an interval of uncertainty with
 * endpoints. The interval of uncertainty is initially chosen so that it
 * contains a minimizer of the modified function
 *
 *      f(x + stp*s) - f(x) - ftol*stp*(gradf(x)'s).
 *
 * If a step is obtained for which the modified function has a nonpositive
 * function value and nonnegative derivative, then the interval of
 * uncertainty is chosen so that it contains a minimizer of f(x + stp*s).
 *
 * The algorithm is designed to find a step which satisfies the sufficient
 * decrease condition
 *
 *      f(x + stp*s) <= f(x) + ftol*stp*(gradf(x)'s),              [W1]
 *
 * and the curvature condition
 *
 *      std::abs(gradf(x + stp*s)'s) <= gtol*std::abs(gradf(x)'s). [W2]
 *
 * References:
 *
 * (1) More, J. J. and D. J. Thuente. "Line Search Algorithms with
 * Guaranteed Sufficient Decrease." ACM Transactions on Mathematical Software
 * 20, no. 3 (1994): 286-307.
 *
 * (2) Wright, S. J. and J. Nocedal. Numerical Optimization. Vol. 2.
 * New York: Springer, 1999.
 *
 * \param[in] model Any model with a first order optimization interface.
 * \param[in] init_step Initial step size.
 * \param[in] init_func_value Function value at the starting point.
 * \param[in] point Starting point for the solver.
 * \param[in] gradient Gradient value at this point (saves computation).
 * \param[in] direction Direction of the next step.
 * \param[in] function_scaling Scaling applied to function values and gradients.
 * \param[in] reg Shared pointer to an interface to a smooth regularizer.
 * \param[in] max_function_evaluations Maximum number of function evaluations.
 * \returns stats Line search return object.
 *
 */
template <typename Vector>
ls_return more_thuente(first_order_opt_interface& model,
                       double init_step,
                       double init_func_value,
                       DenseVector point,
                       Vector gradient,
                       DenseVector direction,
                       double function_scaling = 1.0,
                       const std::shared_ptr<smooth_regularizer_interface> reg=NULL,
                       size_t max_function_evaluations = LS_MAX_ITER){


  // Initialize the return object
  ls_return stats;

  // Input checking: the initial step size can't be zero.
  if (init_step <= LS_ZERO){
    logprogress_stream << " Error:"
                       << " \nInitial step size less than " << LS_ZERO
                       << "." << std::endl;
    return stats;
  }


  // Check that the initial direction is a descent direction.
  // A non-descent direction can only occur if the gradients were computed
  // incorrectly or the problem is non-convex.
  DenseVector x0 = point;
  double Dphi0 = gradient.dot(direction);
  if ( (init_step <= 0) || (Dphi0 >= OPTIMIZATION_ZERO) ){
    logprogress_stream << " Error: Search direction is not a descent direction."
                       << " \nDetected numerical difficulties." << std::endl;
  }

  // Initializing local variables
  // stx, fx, dgx: Values of the step, function, and derivative at the best
  //               step.
  //
  // sty, fy, dgy: Values of the step, function, and derivative at the other
  //               endpoint of the interval of uncertainty.
  //
  // stp, f, dg  : Values of the step, function, and derivative at the
  //               current step.
  //
  // g           : Gradient w.r.t. x and step (vector) at the current point.
  //
  double stx = LS_ZERO;
  double fx = init_func_value;
  double dgx = Dphi0;            // Derivative of f(x + s d) w.r.t. s

  double sty = LS_ZERO;
  double fy = init_func_value;
  double dgy = Dphi0;

  double stp = init_step;
  double f = init_func_value;
  double dg = Dphi0;
  Vector g = gradient;
  DenseVector reg_gradient(gradient.size());

  // Interval [stmin, stmax] of uncertainty
  double stmin = LS_ZERO;
  double stmax = LS_MAX_STEP_SIZE;

  // Flags and local variables
  bool brackt = false;           // Bracket or zoom phase?
  bool stage1 = true;

  double wolfe_func_dec = LS_C1*Dphi0;   // Wolfe condition [W1]
  double wolfe_curvature = LS_C2*Dphi0;  // Wolfe condition [W2]
  double width = stmax - stmin;
  double width2 = 2*width;


  // Constants used in this code.
  // (Based on http://www.ece.northwestern.edu/~nocedal/lbfgs.html)
  const double p5 = 0.5;
  const double p66 = 0.66;
  const double xtrapf = 4;
  bool infoc = true;             // Zoom status

  // Start searching
  while (true){

    // Set the min and max steps based on the current interval of uncertainty.
    if (brackt == true){
      stmin = std::min(stx,sty);
      stmax = std::max(stx,sty);
    }else{
      stmin = stx;
      stmax = stp + xtrapf*(stp - stx);
    }

    // Force the step to be within the bounds.
    stp = std::max(stp,LS_ZERO);
    stp = std::min(stp,LS_MAX_STEP_SIZE);

    // If an unusual termination is to occur, then let 'stp' be the lowest
    // point obtained so far.
    if ( (infoc == false)
         || (brackt && (stmax-stmin <= LS_ZERO))){

      logprogress_stream << "Warning:"
                         << " Unusual termination criterion reached."
                         << "\nReturning the best step found so far."
                         << " This typically happens when the number of features is much"
                         << " larger than the number of training samples. Consider pruning"
                         << " features manually or increasing the regularization value."
                         << std::endl;
      stp = stx;
    }

    // Reached the function evaluation limit -- return the best step so far.
    if (size_t(stats.func_evals) >= max_function_evaluations){
      stats.step_size = stx;
      stats.status = true;
      return stats;
    }

    // Evaluate the function and gradient at stp and compute the directional
    // derivative.
    point = x0 + stp * direction;
    model.compute_first_order_statistics(point, g, f);
    stats.num_passes++;
    stats.func_evals++;
    stats.gradient_evals++;
    if (reg != NULL){
      reg->compute_gradient(point, reg_gradient);
      f += reg->compute_function_value(point);
      g += reg_gradient;
    }

    if(function_scaling != 1.0) {
      f *= function_scaling;
      g *= function_scaling;
    }

    dg = g.dot(direction);

    double ftest = init_func_value + stp*wolfe_func_dec;

    // Termination checking
    // Note: There are many good checks and balances used in Nocedal's code.
    // Some of them are overly defensive and should not happen.

    // Rounding errors
    if ( (brackt && ((stp <= stmin) || (stp >= stmax))) || (infoc == false)){
      logprogress_stream << "Warning: Rounding errors"
                         << " prevent further progress. \nThere may not be a step which"
                         << " satisfies the sufficient decrease and curvature conditions."
                         << " \nTolerances may be too small or the dataset may be poorly"
                         << " scaled."
                         << " This typically happens when the number of features is much"
                         << " larger than the number of training samples. Consider pruning"
                         << " features manually or increasing the regularization value."
                         << std::endl;
      stats.step_size = stp;
      stats.status = false;
      return stats;
    }

    // Step is larger than LS_MAX_STEP_SIZE
    if ((stp >= LS_MAX_STEP_SIZE) && (f <= ftest) && (dg <= wolfe_func_dec)){
      logprogress_stream << "Warning: Reached max step size."
                         << std::endl;
      stats.step_size = stp;
      stats.status = true;
      return stats;
    }

    // Step is smaller than LS_ZERO
    if ((stp <= LS_ZERO) && ((f > ftest) || (dg >= wolfe_func_dec))){
      logprogress_stream << "Error: Reached min step size."
                         << " Cannot proceed anymore."
                         << std::endl;
      stats.step_size = stp;
      stats.status = false;
      return stats;
    }



    // The relative width of the interval of uncertainty has shrunk below the
    // step size limit.
    if (brackt && (stmax-stmin <= LS_ZERO)){
      logprogress_stream << "Error: \nInterval of uncertainty"
                         << " lower than step size limit." << std::endl;
      stats.status = false;
      return stats;
    }

    // Wolfe conditions W1 and W2 are satisfied! Woo!
    if ((f <= ftest) && (std::abs(dg) <= -wolfe_curvature)){
      stats.step_size = stp;
      stats.status = true;
      return stats;
    }

    // Stage 1 is a search for steps for which the modified function has
    // a nonpositive value and a nonnegative derivative.
    if ( stage1 && (f <= ftest) && (dg >= wolfe_curvature)){
      stage1 = false;
    }


    // A modified function is used to predict the step only if we have not
    // obtained a step for which the modified function has a nonpositive
    // function value and a nonnegative derivative, and if a lower function
    // value has been obtained but the decrease is not sufficient.

    if (stage1 && (f <= fx) && (f > ftest)){

      // Define the modified function and derivative values.
      double fm = f - stp*wolfe_func_dec;
      double fxm = fx - stx*wolfe_func_dec;
      double fym = fy - sty*wolfe_func_dec;
      double dgm = dg - wolfe_func_dec;
      double dgxm = dgx - wolfe_func_dec;
      double dgym = dgy - wolfe_func_dec;

      // Call cstep to update the interval of uncertainty and to compute the
      // new step.
      infoc = cstep(stx, fxm, dgxm,
                    sty, fym, dgym,
                    stp, fm, dgm,
                    brackt,
                    stmin, stmax);

      // Reset the function and gradient values for f.
      fx = fxm + stx*wolfe_func_dec;
      fy = fym + sty*wolfe_func_dec;
      dgx = dgxm + wolfe_func_dec;
      dgy = dgym + wolfe_func_dec;

    }else{

      // Call cstep to update the interval of uncertainty and to compute the
      // new step.
      infoc = cstep(stx, fx, dgx,
                    sty, fy, dgy,
                    stp, f, dg,
                    brackt,
                    stmin, stmax);

    }


    // Force a sufficient decrease in the size of the interval of uncertainty.
    if (brackt){
      if (std::abs(sty-stx) >= p66*width2){
        stp = stx + p5*(sty - stx);
      }

      width2 = width;
      width = std::abs(sty-stx);
    }

  } // end-of-while

  return stats;

}
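
// Usage sketch (illustrative, not part of the original source): given any
// model implementing first_order_opt_interface, a current iterate `point`,
// and a gradient buffer, more_thuente returns a step satisfying the strong
// Wolfe conditions [W1] and [W2] above. Assuming `model`, `point`, and
// `gradient` are already set up:
//
//   double f0;
//   model.compute_first_order_statistics(point, gradient, f0);
//   DenseVector direction = -gradient;       // steepest descent direction
//   ls_return ls = more_thuente(model, /*init_step=*/1.0, f0,
//                               point, gradient, direction);
//   if (ls.status) {
//     point += ls.step_size * direction;     // accept the step
//   }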


/**
 *
 * Armijo backtracking to compute step sizes for line search methods.
 *
 * \note Applicable for smooth functions only.
 *
 * Line search based on a backtracking strategy to ensure a sufficient
 * decrease in function values at each iteration.
 *
 * References:
 * (1) Wright, S. J. and J. Nocedal. Numerical Optimization. Vol. 2.
 * New York: Springer, 1999.
 *
 * \param[in] model Any model with a first order optimization interface.
 * \param[in] init_step Initial step size.
 * \param[in] init_func_value Initial function value.
 * \param[in] point Starting point for the solver.
 * \param[in] gradient Gradient value at this point.
 * \param[in] direction Direction of the next step.
 *
 * \returns stats Line search return object.
 *
 */
template <typename Vector>
ls_return armijo_backtracking(first_order_opt_interface& model,
                              double init_step,
                              double init_func_value,
                              DenseVector point,
                              Vector gradient,
                              DenseVector direction){

  // Step 1: Initialize the function
  // ------------------------------------------------------------------------
  // Minimum function decrease according to the Armijo condition.
  // The choice of constants is based on Nocedal and Wright [1].
  double sufficient_decrease = LS_C1*(gradient.dot(direction));
  ls_return stats;
  double step_size = init_step;
  DenseVector new_point = point;

  // Step 2: Backtrack
  // -----------------------------------------------------------------------
  while (stats.func_evals <= LS_MAX_ITER && step_size >= LS_ZERO){

    // Check for sufficient decrease
    new_point = point + step_size * direction;
    if (model.compute_function_value(new_point) <= init_func_value
                                        + step_size * sufficient_decrease){

      stats.step_size = step_size;
      stats.status = true;
      return stats;

    }

    step_size *= 0.5;
    stats.func_evals += 1;

  }

  return stats;
}
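
// Usage sketch (illustrative, not part of the original source): one gradient
// descent step with Armijo backtracking. The accepted step t satisfies
//
//   f(x + t*d) <= f(x) + LS_C1 * t * (gradf(x)'d),
//
// with t halved from init_step until the condition holds. Assuming `model`,
// `point`, `gradient`, and the initial value `f0` are already set up:
//
//   DenseVector direction = -gradient;
//   ls_return ls = armijo_backtracking(model, /*init_step=*/1.0, f0,
//                                      point, gradient, direction);
//   if (ls.status) point += ls.step_size * direction;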


/**
 *
 * Backtracking to compute step sizes for line search methods.
 *
 * \note Applicable for non-smooth functions.
 *
 * Line search based on a backtracking strategy to ensure "some" decrease in
 * function values at each iteration.
 *
 * References:
 * (1) Wright, S. J. and J. Nocedal. Numerical Optimization. Vol. 2.
 * New York: Springer, 1999.
 *
 * \param[in] model Any model with a first order optimization interface.
 * \param[in] init_step Initial step size.
 * \param[in] init_func_value Initial function value.
 * \param[in] point Starting point for the solver.
 * \param[in] gradient Gradient value at this point.
 * \param[in] direction Direction of the next step.
 * \param[in] reg Regularizer interface.
 *
 * \returns stats Line search return object.
 *
 */
template <typename Vector>
ls_return backtracking(first_order_opt_interface& model,
                       double init_step,
                       double init_func_value,
                       DenseVector point,
                       Vector gradient,
                       DenseVector direction,
                       const std::shared_ptr<regularizer_interface> reg=NULL){

  // Step 1: Initialize the function
  // ------------------------------------------------------------------------
  ls_return stats;
  double step_size = init_step;
  DenseVector delta_point(point.size());
  DenseVector new_point(point.size());

  // Step 2: Backtrack
  // -----------------------------------------------------------------------
  while (stats.func_evals <= LS_MAX_ITER && step_size >= LS_ZERO){

    // Check for sufficient decrease
    new_point = point + step_size * direction;
    if(reg != NULL){
      reg->apply_proximal_operator(new_point, step_size);
    }

    delta_point = new_point - point;
    if (model.compute_function_value(new_point) <=
            init_func_value + gradient.dot(delta_point)
            + 0.5 * delta_point.squaredNorm()/step_size){

      stats.step_size = step_size;
      stats.status = true;
      return stats;

    }

    step_size *= 0.5;
    stats.func_evals += 1;

  }

  return stats;
}
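
// Usage sketch (illustrative, not part of the original source): a
// proximal-gradient style step for a composite objective f(x) + r(x). Each
// trial step applies the regularizer's proximal operator, and the step is
// accepted once
//
//   f(x+) <= f(x) + gradf(x)'(x+ - x) + ||x+ - x||^2 / (2 * step_size).
//
// Here `reg` stands for any implementation of regularizer_interface (its
// construction is omitted and depends on the penalty in use):
//
//   std::shared_ptr<regularizer_interface> reg = ...;  // e.g. an L1 penalty
//   ls_return ls = backtracking(model, /*init_step=*/1.0, f0,
//                               point, gradient, direction, reg);
//   if (ls.status) { /* retake the step of length ls.step_size */ }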

/** This function estimates a minimum point between
 * two function values with known gradients.
 *
 * The minimum value must lie between the two function values, f1 and f2, and
 * the gradients must indicate that the minimum lies between the two values
 * and that the function is convex.
 *
 * Under these conditions, a third degree polynomial (cubic spline) can be fit
 * to the two function points, and it is guaranteed to have a single minimum
 * between f1 and f2. This function returns the location of that minimum.
 *
 * \param[in] dist The distance between the points f1 and f2.
 * \param[in] f1 The value of the function at the left point.
 * \param[in] f2 The value of the function at the right point.
 * \param[in] g1 The derivative of the function at the left point.
 * \param[in] g2 The derivative of the function at the right point.
 *
 */
inline double gradient_bracketed_linesearch(double dist, double f1, double f2,
                                            double g1, double g2) {
  // Assume f1 is evaluated at 0 and f2 at 1. To make the latter true,
  // we need to multiply the gradients by the appropriate scaling factor.
  g1 *= dist;
  g2 *= dist;

  // Require that a minimum point lies between f1 and f2, which the following
  // conditions check (up to a small numerical tolerance).
  DASSERT_LE(g1, 1e-4);           // Condition 1
  DASSERT_GE(g2, -1e-4);          // Condition 2

  // Also make sure the function is convex.
  DASSERT_LE(f2 - g2, f1 + 1e-4); // Condition 3
  DASSERT_LE(f1 + g1, f2 + 1e-4); // Condition 4


  // Now, we have 4 known values, so construct a 3rd order polynomial to
  // approximate the solution between the two points, then find the minimum
  // of that polynomial.

  // With the convexity conditions above, a third order polynomial, given as
  //
  //   p(t) = a*t^3 + b*t^2 + c*t + d
  //
  // will have exactly one minimum between 0 and 1.
  //
  // The coefficients can be derived from:
  //   p(0)  = f1
  //   p(1)  = f2
  //   p'(0) = g1
  //   p'(1) = g2
  //
  // Some algebra yields:
  //
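  // Derivation: p(0) = d gives d = f1, and p'(0) = c gives c = g1. The
  // remaining equations, p(1) = a + b + c + d = f2 and
  // p'(1) = 3a + 2b + c = g2, reduce to
  //
  //   a + b   = (f2 - f1) - g1
  //   3a + 2b = (g2 - g1),
  //
  // and solving this 2x2 system gives the values of a and b below.
  //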
  const double a = -2*(f2 - f1) + (g2 + g1);
  const double b = 3*(f2 - f1) - g2 - 2*g1;
  const double c = g1;
  const double d = f1;

  // std::cout << "coeff = (" << a << ", " << b << ", " << c << std::endl;

  // Starting interval
  double left = 0;
  double right = 1;

  double best_value = std::min(f1, f2);
  double best_loc = f1 < f2 ? 0 : 1;


  // Bisect on the sign of the derivative p'(t) to locate the minimum.
  for(size_t m_iter = 0; m_iter < 32; ++m_iter) {

    double t = 0.5 * (right + left);

    // Make sure that the most recent value is indeed better.
    double v = d + c*t + b*t*t + a*t*t*t;
    // double vpp = 2*b + 6*a*t;

    if(v < best_value) {
      best_value = v;
      best_loc = t;
    }

    if(right - left < 1e-6) {
      break;
    }

    double vp = c + 2*b*t + 3*a*t*t;

    if(vp > 0) {
      right = t;
    } else {
      left = t;
    }

  }

  return dist * best_loc;
}
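
// Illustrative check (not part of the original source): for f(s) = (s - 3)^2
// sampled at s = 0 and s = 4 (so dist = 4), we have f1 = 9, f2 = 1,
// g1 = f'(0) = -6, and g2 = f'(4) = 2. The cubic fit is then exact, so the
// returned location is the true minimizer:
//
//   double s = gradient_bracketed_linesearch(/*dist=*/4.0,
//                                            /*f1=*/9.0, /*f2=*/1.0,
//                                            /*g1=*/-6.0, /*g2=*/2.0);
//   // s is approximately 3.0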



/// \}

} // namespace optimization

} // namespace turi

#endif