bfgs.hpp
#ifndef STAN_OPTIMIZATION_BFGS_HPP
#define STAN_OPTIMIZATION_BFGS_HPP

#include <stan/math/prim/mat.hpp>
#include <stan/model/log_prob_grad.hpp>
#include <stan/model/log_prob_propto.hpp>
#include <stan/optimization/bfgs_linesearch.hpp>
#include <stan/optimization/bfgs_update.hpp>
#include <stan/optimization/lbfgs_update.hpp>
#include <boost/math/special_functions/fpclassify.hpp>
#include <algorithm>
#include <cmath>
#include <cstdlib>
#include <limits>
#include <stdexcept>
#include <string>
#include <vector>
17 
namespace stan {
  namespace optimization {
    typedef enum {
      TERM_SUCCESS = 0,
      TERM_ABSX = 10,
      TERM_ABSF = 20,
      TERM_RELF = 21,
      TERM_ABSGRAD = 30,
      TERM_RELGRAD = 31,
      TERM_MAXIT = 40,
      TERM_LSFAIL = -1
    } TerminationCondition;

    template<typename Scalar = double>
    class ConvergenceOptions {
    public:
      ConvergenceOptions() {
        maxIts = 10000;
        fScale = 1.0;

        tolAbsX = 1e-8;
        tolAbsF = 1e-12;
        tolAbsGrad = 1e-8;

        tolRelF = 1e+4;
        tolRelGrad = 1e+3;
      }
      size_t maxIts;
      Scalar tolAbsX;
      Scalar tolAbsF;
      Scalar tolRelF;
      Scalar fScale;
      Scalar tolAbsGrad;
      Scalar tolRelGrad;
    };
53 
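    // Example (illustrative sketch, not part of the original source):
    // tightening the stopping rules before running a minimizer.  _conv_opts
    // is the public options member of BFGSMinimizer defined below.
    //
    //   stan::optimization::ConvergenceOptions<double> conv;
    //   conv.maxIts = 5000;           // allow at most 5000 iterations
    //   conv.tolAbsGrad = 1e-10;      // demand a smaller gradient norm
    //   minimizer._conv_opts = conv;  // 'minimizer' is a BFGSMinimizer
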
    template<typename Scalar = double>
    class LSOptions {
    public:
      LSOptions() {
        c1 = 1e-4;
        c2 = 0.9;
        minAlpha = 1e-12;
        alpha0 = 1e-3;
      }
      Scalar c1;
      Scalar c2;
      Scalar alpha0;
      Scalar minAlpha;
    };
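
    // c1 and c2 are the usual Wolfe line-search constants (0 < c1 < c2 < 1).
    // A sketch of the conditions they parameterize, for a step of length
    // alpha along direction p from x (see WolfeLineSearch in
    // bfgs_linesearch.hpp for the actual implementation):
    //
    //   f(x + alpha*p) <= f(x) + c1*alpha*grad_f(x)'p   (sufficient decrease)
    //   grad_f(x + alpha*p)'p >= c2*grad_f(x)'p         (curvature)
    //
    // alpha0 seeds the first search and minAlpha is the smallest step tried.
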
    template<typename FunctorType, typename QNUpdateType,
             typename Scalar = double, int DimAtCompile = Eigen::Dynamic>
    class BFGSMinimizer {
    public:
      typedef Eigen::Matrix<Scalar, DimAtCompile, 1> VectorT;
      typedef Eigen::Matrix<Scalar, DimAtCompile, DimAtCompile> HessianT;

    protected:
      FunctorType &_func;
      VectorT _gk, _gk_1, _xk_1, _xk, _pk, _pk_1;
      Scalar _fk, _fk_1, _alphak_1;
      Scalar _alpha, _alpha0;
      size_t _itNum;
      std::string _note;
      QNUpdateType _qn;

    public:
      LSOptions<Scalar> _ls_opts;
      ConvergenceOptions<Scalar> _conv_opts;

      QNUpdateType &get_qnupdate() { return _qn; }
      const QNUpdateType &get_qnupdate() const { return _qn; }
90 
      const Scalar &curr_f() const { return _fk; }
      const VectorT &curr_x() const { return _xk; }
      const VectorT &curr_g() const { return _gk; }
      const VectorT &curr_p() const { return _pk; }

      const Scalar &prev_f() const { return _fk_1; }
      const VectorT &prev_x() const { return _xk_1; }
      const VectorT &prev_g() const { return _gk_1; }
      const VectorT &prev_p() const { return _pk_1; }
      Scalar prev_step_size() const { return _pk_1.norm() * _alphak_1; }

      inline Scalar rel_grad_norm() const {
        return -_pk.dot(_gk) / std::max(std::fabs(_fk), _conv_opts.fScale);
      }
      inline Scalar rel_obj_decrease() const {
        return std::fabs(_fk_1 - _fk) / std::max(std::fabs(_fk_1),
                                                 std::max(std::fabs(_fk),
                                                          _conv_opts.fScale));
      }
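      // Note: step() compares these two ratios against tolRelGrad and
      // tolRelF scaled by std::numeric_limits<Scalar>::epsilon(); fScale
      // keeps the denominators bounded away from zero when |f| is tiny.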
110 
      const Scalar &alpha0() const { return _alpha0; }
      const Scalar &alpha() const { return _alpha; }
      size_t iter_num() const { return _itNum; }

      const std::string &note() const { return _note; }
116 
118  switch (retCode) {
119  case TERM_SUCCESS:
120  return std::string("Successful step completed");
121  case TERM_ABSF:
122  return std::string("Convergence detected: absolute change "
123  "in objective function was below tolerance");
124  case TERM_RELF:
125  return std::string("Convergence detected: relative change "
126  "in objective function was below tolerance");
127  case TERM_ABSGRAD:
128  return std::string("Convergence detected: "
129  "gradient norm is below tolerance");
130  case TERM_RELGRAD:
131  return std::string("Convergence detected: relative "
132  "gradient magnitude is below tolerance");
133  case TERM_ABSX:
134  return std::string("Convergence detected: "
135  "absolute parameter change was below tolerance");
136  case TERM_MAXIT:
137  return std::string("Maximum number of iterations hit, "
138  "may not be at an optima");
139  case TERM_LSFAIL:
140  return std::string("Line search failed to achieve a sufficient "
141  "decrease, no more progress can be made");
142  default:
143  return std::string("Unknown termination code");
144  }
145  }
146 
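      // Example (sketch, hypothetical driver code): logging why an
      // iteration ended.
      //
      //   int ret = minimizer.step();
      //   if (ret != TERM_SUCCESS)
      //     std::cout << minimizer.get_code_string(ret) << std::endl;
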
      explicit BFGSMinimizer(FunctorType &f) : _func(f) { }

      void initialize(const VectorT &x0) {
        int ret;
        _xk = x0;
        ret = _func(_xk, _fk, _gk);
        if (ret) {
          throw std::runtime_error("Error evaluating initial BFGS point.");
        }
        _pk = -_gk;

        _itNum = 0;
        _note = "";
      }
161 
      int step() {
        Scalar gradNorm, stepNorm;
        VectorT sk, yk;
        int retCode(0);
        int resetB(0);

        _itNum++;

        if (_itNum == 1) {
          resetB = 1;
          _note = "";
        } else {
          resetB = 0;
          _note = "";
        }

        while (true) {
          if (resetB) {
            // Reset the Hessian approximation
            _pk.noalias() = -_gk;
          }

          // Get an initial guess for the step size (alpha)
          if (_itNum > 1 && resetB != 2) {
            // use cubic interpolation based on the previous step
            _alpha0 = _alpha = std::min(1.0,
                                        1.01 * CubicInterp(_gk_1.dot(_pk_1),
                                                           _alphak_1,
                                                           _fk - _fk_1,
                                                           _gk.dot(_pk_1),
                                                           _ls_opts.minAlpha,
                                                           1.0));
          } else {
            // On the first step (or, after a reset) use the default step size
            _alpha0 = _alpha = _ls_opts.alpha0;
          }

          // Perform the line search.  If successful, the results are in the
          // variables: _xk_1, _fk_1 and _gk_1.
          retCode = WolfeLineSearch(_func, _alpha, _xk_1, _fk_1, _gk_1,
                                    _pk, _xk, _fk, _gk,
                                    _ls_opts.c1, _ls_opts.c2,
                                    _ls_opts.minAlpha);
          if (retCode) {
            // Line search failed...
            if (resetB) {
              // did a Hessian reset and it still failed,
              // and nothing left to try
              retCode = TERM_LSFAIL;
              return retCode;
            } else {
              // try resetting the Hessian approximation
              resetB = 2;
              _note += "LS failed, Hessian reset";
              continue;
            }
          } else {
            break;
          }
        }

        // Swap things so that k is the most recent iterate
        std::swap(_fk, _fk_1);
        _xk.swap(_xk_1);
        _gk.swap(_gk_1);
        _pk.swap(_pk_1);

        sk.noalias() = _xk - _xk_1;
        yk.noalias() = _gk - _gk_1;

        gradNorm = _gk.norm();
        stepNorm = sk.norm();

        // Update QN approximation
        if (resetB) {
          // If the QN approximation was reset, automatically scale it
          // and update the step-size accordingly
          Scalar B0fact = _qn.update(yk, sk, true);
          _pk_1 /= B0fact;
          _alphak_1 = _alpha * B0fact;
        } else {
          _qn.update(yk, sk);
          _alphak_1 = _alpha;
        }
        // Compute search direction for next step
        _qn.search_direction(_pk, _gk);

        // Check for convergence
        if (std::fabs(_fk_1 - _fk) < _conv_opts.tolAbsF) {
          // Objective function improvement wasn't sufficient
          retCode = TERM_ABSF;
        } else if (gradNorm < _conv_opts.tolAbsGrad) {
          retCode = TERM_ABSGRAD;  // Gradient norm was below threshold
        } else if (stepNorm < _conv_opts.tolAbsX) {
          retCode = TERM_ABSX;  // Change in x was too small
        } else if (_itNum >= _conv_opts.maxIts) {
          retCode = TERM_MAXIT;  // Max number of iterations hit
        } else if (rel_obj_decrease()
                   < _conv_opts.tolRelF
                     * std::numeric_limits<Scalar>::epsilon()) {
          // Relative improvement in objective function wasn't sufficient
          retCode = TERM_RELF;
        } else if (rel_grad_norm()
                   < _conv_opts.tolRelGrad
                     * std::numeric_limits<Scalar>::epsilon()) {
          // Relative gradient norm was below threshold
          retCode = TERM_RELGRAD;
        } else {
          // Step was successful; more progress to be made
          retCode = TERM_SUCCESS;
        }

        return retCode;
      }
276 
      int minimize(VectorT &x0) {
        int retcode;
        initialize(x0);
        while (!(retcode = step()))
          continue;
        x0 = _xk;
        return retcode;
      }
    };
286 
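    // Example (illustrative sketch, not part of the original source):
    // minimizing a plain quadratic with BFGSMinimizer.  The functor must be
    // callable as func(x, f, g), filling in the objective value and
    // gradient and returning a nonzero int on failure, which is what
    // initialize() and WolfeLineSearch() expect.  BFGSUpdate_HInv is the
    // dense quasi-Newton update from bfgs_update.hpp.
    //
    //   struct Quadratic {
    //     int operator()(const Eigen::VectorXd &x, double &f,
    //                    Eigen::VectorXd &g) {
    //       f = 0.5 * x.squaredNorm();  // f(x) = ||x||^2 / 2
    //       g = x;                      // grad f(x) = x
    //       return 0;                   // 0 signals a successful evaluation
    //     }
    //   };
    //
    //   Quadratic q;
    //   BFGSMinimizer<Quadratic, BFGSUpdate_HInv<> > min(q);
    //   Eigen::VectorXd x0 = Eigen::VectorXd::Constant(2, 1.0);
    //   int ret = min.minimize(x0);  // x0 now holds the optimum (the origin)
    //   std::cout << min.get_code_string(ret) << std::endl;
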
    template <class M>
    class ModelAdaptor {
    private:
      M& _model;
      std::vector<int> _params_i;
      std::ostream* _msgs;
      std::vector<double> _x, _g;
      size_t _fevals;

    public:
      ModelAdaptor(M& model,
                   const std::vector<int>& params_i,
                   std::ostream* msgs)
        : _model(model), _params_i(params_i), _msgs(msgs), _fevals(0) {}

      size_t fevals() const { return _fevals; }
      int operator()(const Eigen::Matrix<double, Eigen::Dynamic, 1> &x,
                     double &f) {
        using Eigen::Matrix;
        using Eigen::Dynamic;
        using stan::math::index_type;
        using stan::model::log_prob_propto;

        typedef typename index_type<Matrix<double, Dynamic, 1> >::type idx_t;

        _x.resize(x.size());
        for (idx_t i = 0; i < x.size(); i++)
          _x[i] = x[i];

        try {
          f = - log_prob_propto<false>(_model, _x, _params_i, _msgs);
        } catch (const std::exception& e) {
          if (_msgs)
            (*_msgs) << e.what() << std::endl;
          return 1;
        }

        if (boost::math::isfinite(f)) {
          return 0;
        } else {
          if (_msgs)
            *_msgs << "Error evaluating model log probability: "
                      "Non-finite function evaluation." << std::endl;
          return 2;
        }
      }
      int operator()(const Eigen::Matrix<double, Eigen::Dynamic, 1> &x,
                     double &f,
                     Eigen::Matrix<double, Eigen::Dynamic, 1> &g) {
        using Eigen::Matrix;
        using Eigen::Dynamic;
        using stan::math::index_type;
        using stan::model::log_prob_grad;

        typedef typename index_type<Matrix<double, Dynamic, 1> >::type idx_t;

        _x.resize(x.size());
        for (idx_t i = 0; i < x.size(); i++)
          _x[i] = x[i];

        _fevals++;

        try {
          f = - log_prob_grad<true, false>(_model, _x, _params_i, _g, _msgs);
        } catch (const std::exception& e) {
          if (_msgs)
            (*_msgs) << e.what() << std::endl;
          return 1;
        }

        g.resize(_g.size());
        for (size_t i = 0; i < _g.size(); i++) {
          if (!boost::math::isfinite(_g[i])) {
            if (_msgs)
              *_msgs << "Error evaluating model log probability: "
                        "Non-finite gradient." << std::endl;
            return 3;
          }
          g[i] = -_g[i];
        }

        if (boost::math::isfinite(f)) {
          return 0;
        } else {
          if (_msgs)
            *_msgs << "Error evaluating model log probability: "
                   << "Non-finite function evaluation."
                   << std::endl;
          return 2;
        }
      }
      int df(const Eigen::Matrix<double, Eigen::Dynamic, 1> &x,
             Eigen::Matrix<double, Eigen::Dynamic, 1> &g) {
        double f;
        return (*this)(x, f, g);
      }
    };
382 
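    // Return-code convention of ModelAdaptor, relied on by BFGSMinimizer:
    //   0 -- evaluation succeeded with finite values
    //   1 -- the model threw an exception (message forwarded to _msgs)
    //   2 -- the negated log probability was non-finite
    //   3 -- the gradient contained a non-finite component
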
    template<typename M, typename QNUpdateType, typename Scalar = double,
             int DimAtCompile = Eigen::Dynamic>
    class BFGSLineSearch
      : public BFGSMinimizer<ModelAdaptor<M>, QNUpdateType,
                             Scalar, DimAtCompile> {
    private:
      ModelAdaptor<M> _adaptor;

    public:
      typedef BFGSMinimizer<ModelAdaptor<M>, QNUpdateType, Scalar, DimAtCompile>
        BFGSBase;
      typedef typename BFGSBase::VectorT vector_t;
      typedef typename stan::math::index_type<vector_t>::type idx_t;

      BFGSLineSearch(M& model,
                     const std::vector<double>& params_r,
                     const std::vector<int>& params_i,
                     std::ostream* msgs = 0)
        : BFGSBase(_adaptor),
          _adaptor(model, params_i, msgs) {
        initialize(params_r);
      }
405 
      void initialize(const std::vector<double>& params_r) {
        Eigen::Matrix<double, Eigen::Dynamic, 1> x;
        x.resize(params_r.size());
        for (size_t i = 0; i < params_r.size(); i++)
          x[i] = params_r[i];
        BFGSBase::initialize(x);
      }

      size_t grad_evals() { return _adaptor.fevals(); }
      double logp() { return -(this->curr_f()); }
      double grad_norm() { return this->curr_g().norm(); }
      void grad(std::vector<double>& g) {
        const vector_t &cg(this->curr_g());
        g.resize(cg.size());
        for (idx_t i = 0; i < cg.size(); i++)
          g[i] = -cg[i];
      }
      void params_r(std::vector<double>& x) {
        const vector_t &cx(this->curr_x());
        x.resize(cx.size());
        for (idx_t i = 0; i < cx.size(); i++)
          x[i] = cx[i];
      }
    };
430 
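    // Example (illustrative sketch; MyModel stands in for a compiled Stan
    // model type): stepping BFGSLineSearch manually with the limited-memory
    // update from lbfgs_update.hpp, mirroring the loop in minimize().
    //
    //   typedef BFGSLineSearch<MyModel, LBFGSUpdate<> > Optimizer;
    //   Optimizer opt(model, cont_params, disc_params, &std::cout);
    //   int ret;
    //   while ((ret = opt.step()) == 0)
    //     continue;  // 0 == TERM_SUCCESS, keep iterating
    //   std::cout << opt.get_code_string(ret) << std::endl;
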
  }

}

#endif