// advi.hpp -- Automatic Differentiation Variational Inference (ADVI) for Stan.
1 #ifndef STAN_VARIATIONAL_ADVI_HPP
2 #define STAN_VARIATIONAL_ADVI_HPP
3 
#include <stan/math.hpp>
#include <stan/callbacks/logger.hpp>
#include <stan/callbacks/writer.hpp>
#include <stan/io/dump.hpp>
#include <stan/services/error_codes.hpp>
#include <stan/variational/print_progress.hpp>
#include <boost/circular_buffer.hpp>
#include <boost/lexical_cast.hpp>
#include <algorithm>
#include <ctime>
#include <iomanip>
#include <limits>
#include <numeric>
#include <ostream>
#include <queue>
#include <sstream>
#include <string>
#include <vector>
22 
23 namespace stan {
24 
25  namespace variational {
26 
27  /**
28  * Automatic Differentiation Variational Inference
29  *
30  * Implements "black box" variational inference using stochastic gradient
31  * ascent to maximize the Evidence Lower Bound for a given model
32  * and variational family.
33  *
34  * @tparam Model class of model
35  * @tparam Q class of variational distribution
36  * @tparam BaseRNG class of random number generator
37  */
38  template <class Model, class Q, class BaseRNG>
39  class advi {
40  public:
41  /**
42  * Constructor
43  *
44  * @param[in] m stan model
45  * @param[in] cont_params initialization of continuous parameters
46  * @param[in,out] rng random number generator
47  * @param[in] n_monte_carlo_grad number of samples for gradient computation
48  * @param[in] n_monte_carlo_elbo number of samples for ELBO computation
49  * @param[in] eval_elbo evaluate ELBO at every "eval_elbo" iters
50  * @param[in] n_posterior_samples number of samples to draw from posterior
51  * @throw std::runtime_error if n_monte_carlo_grad is not positive
52  * @throw std::runtime_error if n_monte_carlo_elbo is not positive
53  * @throw std::runtime_error if eval_elbo is not positive
54  * @throw std::runtime_error if n_posterior_samples is not positive
55  */
57  Eigen::VectorXd& cont_params,
58  BaseRNG& rng,
59  int n_monte_carlo_grad,
60  int n_monte_carlo_elbo,
61  int eval_elbo,
62  int n_posterior_samples)
63  : model_(m),
64  cont_params_(cont_params),
65  rng_(rng),
66  n_monte_carlo_grad_(n_monte_carlo_grad),
67  n_monte_carlo_elbo_(n_monte_carlo_elbo),
68  eval_elbo_(eval_elbo),
69  n_posterior_samples_(n_posterior_samples) {
70  static const char* function = "stan::variational::advi";
71  math::check_positive(function,
72  "Number of Monte Carlo samples for gradients",
74  math::check_positive(function,
75  "Number of Monte Carlo samples for ELBO",
77  math::check_positive(function,
78  "Evaluate ELBO at every eval_elbo iteration",
79  eval_elbo_);
80  math::check_positive(function,
81  "Number of posterior samples for output",
83  }
84 
85  /**
86  * Calculates the Evidence Lower BOund (ELBO) by sampling from
87  * the variational distribution and then evaluating the log joint,
88  * adjusted by the entropy term of the variational distribution.
89  *
90  * @param[in] variational variational approximation at which to evaluate
91  * the ELBO.
92  * @param logger logger for messages
93  * @return the evidence lower bound.
94  * @throw std::domain_error If, after n_monte_carlo_elbo_ number of draws
95  * from the variational distribution all give non-finite log joint
96  * evaluations. This means that the model is severly ill conditioned or
97  * that the variational distribution has somehow collapsed.
98  */
99  double calc_ELBO(const Q& variational,
100  callbacks::logger& logger)
101  const {
102  static const char* function =
103  "stan::variational::advi::calc_ELBO";
104 
105  double elbo = 0.0;
106  int dim = variational.dimension();
107  Eigen::VectorXd zeta(dim);
108 
109  int n_dropped_evaluations = 0;
110  for (int i = 0; i < n_monte_carlo_elbo_;) {
111  variational.sample(rng_, zeta);
112  try {
113  std::stringstream ss;
114  double log_prob = model_.template log_prob<false, true>(zeta, &ss);
115  if (ss.str().length() > 0)
116  logger.info(ss);
117  stan::math::check_finite(function, "log_prob", log_prob);
118  elbo += log_prob;
119  ++i;
120  } catch (const std::domain_error& e) {
121  ++n_dropped_evaluations;
122  if (n_dropped_evaluations >= n_monte_carlo_elbo_) {
123  const char* name = "The number of dropped evaluations";
124  const char* msg1 = "has reached its maximum amount (";
125  const char* msg2 = "). Your model may be either severely "
126  "ill-conditioned or misspecified.";
127  stan::math::domain_error(function, name, n_monte_carlo_elbo_,
128  msg1, msg2);
129  }
130  }
131  }
132  elbo /= n_monte_carlo_elbo_;
133  elbo += variational.entropy();
134  return elbo;
135  }
136 
137  /**
138  * Calculates the "black box" gradient of the ELBO.
139  *
140  * @param[in] variational variational approximation at which to evaluate
141  * the ELBO.
142  * @param[out] elbo_grad gradient of ELBO with respect to variational
143  * approximation.
144  * @param logger logger for messages
145  */
146  void calc_ELBO_grad(const Q& variational, Q& elbo_grad,
147  callbacks::logger& logger) const {
148  static const char* function =
149  "stan::variational::advi::calc_ELBO_grad";
150 
152  "Dimension of elbo_grad",
153  elbo_grad.dimension(),
154  "Dimension of variational q",
155  variational.dimension());
157  "Dimension of variational q",
158  variational.dimension(),
159  "Dimension of variables in model",
160  cont_params_.size());
161 
162  variational.calc_grad(elbo_grad,
164  logger);
165  }
166 
167  /**
168  * Heuristic grid search to adapt eta to the scale of the problem.
169  *
170  * @param[in] variational initial variational distribution.
171  * @param[in] adapt_iterations number of iterations to spend doing stochastic
172  * gradient ascent at each proposed eta value.
173  * @param[in,out] logger logger for messages
174  * @return adapted (tuned) value of eta via heuristic grid search
175  * @throw std::domain_error If either (a) the initial ELBO cannot be
176  * computed at the initial variational distribution, (b) all step-size
177  * proposals in eta_sequence fail.
178  */
179  double adapt_eta(Q& variational,
180  int adapt_iterations,
181  callbacks::logger& logger)
182  const {
183  static const char* function = "stan::variational::advi::adapt_eta";
184 
186  "Number of adaptation iterations",
187  adapt_iterations);
188 
189  logger.info("Begin eta adaptation.");
190 
191  // Sequence of eta values to try during adaptation
192  const int eta_sequence_size = 5;
193  double eta_sequence[eta_sequence_size] = {100, 10, 1, 0.1, 0.01};
194 
195  // Initialize ELBO tracking variables
196  double elbo = -std::numeric_limits<double>::max();
197  double elbo_best = -std::numeric_limits<double>::max();
198  double elbo_init;
199  try {
200  elbo_init = calc_ELBO(variational, logger);
201  } catch (const std::domain_error& e) {
202  const char* name = "Cannot compute ELBO using the initial "
203  "variational distribution.";
204  const char* msg1 = "Your model may be either "
205  "severely ill-conditioned or misspecified.";
206  stan::math::domain_error(function, name, "", msg1);
207  }
208 
209  // Variational family to store gradients
210  Q elbo_grad = Q(model_.num_params_r());
211 
212  // Adaptive step-size sequence
213  Q history_grad_squared = Q(model_.num_params_r());
214  double tau = 1.0;
215  double pre_factor = 0.9;
216  double post_factor = 0.1;
217 
218  double eta_best = 0.0;
219  double eta;
220  double eta_scaled;
221 
222  bool do_more_tuning = true;
223  int eta_sequence_index = 0;
224  while (do_more_tuning) {
225  // Try next eta
226  eta = eta_sequence[eta_sequence_index];
227 
228  int print_progress_m;
229  for (int iter_tune = 1; iter_tune <= adapt_iterations; ++iter_tune) {
230  print_progress_m = eta_sequence_index
231  * adapt_iterations + iter_tune;
233  ::print_progress(print_progress_m, 0,
234  adapt_iterations * eta_sequence_size,
235  adapt_iterations, true, "", "", logger);
236 
237  // (ROBUST) Compute gradient of ELBO. It's OK if it diverges.
238  // We'll try a smaller eta.
239  try {
240  calc_ELBO_grad(variational, elbo_grad, logger);
241  } catch (const std::domain_error& e) {
242  elbo_grad.set_to_zero();
243  }
244 
245  // Update step-size
246  if (iter_tune == 1) {
247  history_grad_squared += elbo_grad.square();
248  } else {
249  history_grad_squared = pre_factor * history_grad_squared
250  + post_factor * elbo_grad.square();
251  }
252  eta_scaled = eta / sqrt(static_cast<double>(iter_tune));
253  // Stochastic gradient update
254  variational += eta_scaled * elbo_grad
255  / (tau + history_grad_squared.sqrt());
256  }
257 
258  // (ROBUST) Compute ELBO. It's OK if it has diverged.
259  try {
260  elbo = calc_ELBO(variational, logger);
261  } catch (const std::domain_error& e) {
263  }
264 
265  // Check if:
266  // (1) ELBO at current eta is worse than the best ELBO
267  // (2) the best ELBO hasn't gotten worse than the initial ELBO
268  if (elbo < elbo_best && elbo_best > elbo_init) {
269  std::stringstream ss;
270  ss << "Success!"
271  << " Found best value [eta = " << eta_best
272  << "]";
273  if (eta_sequence_index < eta_sequence_size - 1)
274  ss << (" earlier than expected.");
275  else
276  ss << ".";
277  logger.info(ss);
278  logger.info("");
279  do_more_tuning = false;
280  } else {
281  if (eta_sequence_index < eta_sequence_size - 1) {
282  // Reset
283  elbo_best = elbo;
284  eta_best = eta;
285  } else {
286  // No more eta values to try, so use current eta if it
287  // didn't diverge or fail if it did diverge
288  if (elbo > elbo_init) {
289  std::stringstream ss;
290  ss << "Success!"
291  << " Found best value [eta = " << eta_best
292  << "].";
293  logger.info(ss);
294  logger.info("");
295  eta_best = eta;
296  do_more_tuning = false;
297  } else {
298  const char* name = "All proposed step-sizes";
299  const char* msg1 = "failed. Your model may be either "
300  "severely ill-conditioned or misspecified.";
301  stan::math::domain_error(function, name, "", msg1);
302  }
303  }
304  // Reset
305  history_grad_squared.set_to_zero();
306  }
307  ++eta_sequence_index;
308  variational = Q(cont_params_);
309  }
310  return eta_best;
311  }
312 
313  /**
314  * Runs stochastic gradient ascent with an adaptive stepsize sequence.
315  *
316  * @param[in,out] variational initia variational distribution
317  * @param[in] eta stepsize scaling parameter
318  * @param[in] tol_rel_obj relative tolerance parameter for convergence
319  * @param[in] max_iterations max number of iterations to run algorithm
320  * @param[in,out] logger logger for messages
321  * @param[in,out] diagnostic_writer writer for diagnostic information
322  * @throw std::domain_error If the ELBO or its gradient is ever
323  * non-finite, at any iteration
324  */
325  void stochastic_gradient_ascent(Q& variational,
326  double eta,
327  double tol_rel_obj,
328  int max_iterations,
329  callbacks::logger& logger,
330  callbacks::writer& diagnostic_writer)
331  const {
332  static const char* function =
333  "stan::variational::advi::stochastic_gradient_ascent";
334 
335  stan::math::check_positive(function, "Eta stepsize", eta);
337  "Relative objective function tolerance",
338  tol_rel_obj);
340  "Maximum iterations",
341  max_iterations);
342 
343  // Gradient parameters
344  Q elbo_grad = Q(model_.num_params_r());
345 
346  // Stepsize sequence parameters
347  Q history_grad_squared = Q(model_.num_params_r());
348  double tau = 1.0;
349  double pre_factor = 0.9;
350  double post_factor = 0.1;
351  double eta_scaled;
352 
353  // Initialize ELBO and convergence tracking variables
354  double elbo(0.0);
355  double elbo_best = -std::numeric_limits<double>::max();
356  double elbo_prev = -std::numeric_limits<double>::max();
357  double delta_elbo = std::numeric_limits<double>::max();
358  double delta_elbo_ave = std::numeric_limits<double>::max();
359  double delta_elbo_med = std::numeric_limits<double>::max();
360 
361  // Heuristic to estimate how far to look back in rolling window
362  int cb_size
363  = static_cast<int>(std::max(0.1 * max_iterations / eval_elbo_,
364  2.0));
365  boost::circular_buffer<double> elbo_diff(cb_size);
366 
367  logger.info("Begin stochastic gradient ascent.");
368  logger.info(" iter"
369  " ELBO"
370  " delta_ELBO_mean"
371  " delta_ELBO_med"
372  " notes ");
373 
374  // Timing variables
375  clock_t start = clock();
376  clock_t end;
377  double delta_t;
378 
379  // Main loop
380  bool do_more_iterations = true;
381  for (int iter_counter = 1; do_more_iterations; ++iter_counter) {
382  // Compute gradient using Monte Carlo integration
383  calc_ELBO_grad(variational, elbo_grad, logger);
384 
385  // Update step-size
386  if (iter_counter == 1) {
387  history_grad_squared += elbo_grad.square();
388  } else {
389  history_grad_squared = pre_factor * history_grad_squared
390  + post_factor * elbo_grad.square();
391  }
392  eta_scaled = eta / sqrt(static_cast<double>(iter_counter));
393 
394  // Stochastic gradient update
395  variational += eta_scaled * elbo_grad
396  / (tau + history_grad_squared.sqrt());
397 
398  // Check for convergence every "eval_elbo_"th iteration
399  if (iter_counter % eval_elbo_ == 0) {
400  elbo_prev = elbo;
401  elbo = calc_ELBO(variational, logger);
402  if (elbo > elbo_best)
403  elbo_best = elbo;
404  delta_elbo = rel_difference(elbo, elbo_prev);
405  elbo_diff.push_back(delta_elbo);
406  delta_elbo_ave = std::accumulate(elbo_diff.begin(),
407  elbo_diff.end(), 0.0)
408  / static_cast<double>(elbo_diff.size());
409  delta_elbo_med = circ_buff_median(elbo_diff);
410  std::stringstream ss;
411  ss << " "
412  << std::setw(4) << iter_counter
413  << " "
414  << std::setw(15) << std::fixed << std::setprecision(3)
415  << elbo
416  << " "
417  << std::setw(16) << std::fixed << std::setprecision(3)
418  << delta_elbo_ave
419  << " "
420  << std::setw(15) << std::fixed << std::setprecision(3)
421  << delta_elbo_med;
422 
423  end = clock();
424  delta_t = static_cast<double>(end - start) / CLOCKS_PER_SEC;
425 
426  std::vector<double> print_vector;
427  print_vector.clear();
428  print_vector.push_back(iter_counter);
429  print_vector.push_back(delta_t);
430  print_vector.push_back(elbo);
431  diagnostic_writer(print_vector);
432 
433  if (delta_elbo_ave < tol_rel_obj) {
434  ss << " MEAN ELBO CONVERGED";
435  do_more_iterations = false;
436  }
437 
438  if (delta_elbo_med < tol_rel_obj) {
439  ss << " MEDIAN ELBO CONVERGED";
440  do_more_iterations = false;
441  }
442 
443  if (iter_counter > 10 * eval_elbo_) {
444  if (delta_elbo_med > 0.5 || delta_elbo_ave > 0.5) {
445  ss << " MAY BE DIVERGING... INSPECT ELBO";
446  }
447  }
448 
449  logger.info(ss);
450 
451  if (do_more_iterations == false &&
452  rel_difference(elbo, elbo_best) > 0.05) {
453  logger.info("Informational Message: The ELBO at a previous "
454  "iteration is larger than the ELBO upon "
455  "convergence!");
456  logger.info("This variational approximation may not "
457  "have converged to a good optimum.");
458  }
459  }
460 
461  if (iter_counter == max_iterations) {
462  logger.info("Informational Message: The maximum number of "
463  "iterations is reached! The algorithm may not have "
464  "converged.");
465  logger.info("This variational approximation is not "
466  "guaranteed to be meaningful.");
467  do_more_iterations = false;
468  }
469  }
470  }
471 
472  /**
473  * Runs ADVI and writes to output.
474  *
475  * @param[in] eta eta parameter of stepsize sequence
476  * @param[in] adapt_engaged boolean flag for eta adaptation
477  * @param[in] adapt_iterations number of iterations for eta adaptation
478  * @param[in] tol_rel_obj relative tolerance parameter for convergence
479  * @param[in] max_iterations max number of iterations to run algorithm
480  * @param[in,out] logger logger for messages
481  * @param[in,out] parameter_writer writer for parameters
482  * (typically to file)
483  * @param[in,out] diagnostic_writer writer for diagnostic information
484  */
485  int run(double eta, bool adapt_engaged, int adapt_iterations,
486  double tol_rel_obj, int max_iterations,
487  callbacks::logger& logger,
488  callbacks::writer& parameter_writer,
489  callbacks::writer& diagnostic_writer)
490  const {
491  diagnostic_writer("iter,time_in_seconds,ELBO");
492 
493  // Initialize variational approximation
494  Q variational = Q(cont_params_);
495 
496  if (adapt_engaged) {
497  eta = adapt_eta(variational, adapt_iterations, logger);
498  parameter_writer("Stepsize adaptation complete.");
499  std::stringstream ss;
500  ss << "eta = " << eta;
501  parameter_writer(ss.str());
502  }
503 
504  stochastic_gradient_ascent(variational, eta,
505  tol_rel_obj, max_iterations,
506  logger, diagnostic_writer);
507 
508  // Write mean of posterior approximation on first output line
509  cont_params_ = variational.mean();
510  std::vector<double> cont_vector(cont_params_.size());
511  for (int i = 0; i < cont_params_.size(); ++i)
512  cont_vector.at(i) = cont_params_(i);
513  std::vector<int> disc_vector;
514  std::vector<double> values;
515 
516  std::stringstream msg;
517  model_.write_array(rng_, cont_vector, disc_vector, values,
518  true, true, &msg);
519  if (msg.str().length() > 0)
520  logger.info(msg);
521  values.insert(values.begin(), 0);
522  parameter_writer(values);
523 
524  // Draw more samples from posterior and write on subsequent lines
525  logger.info("");
526  std::stringstream ss;
527  ss << "Drawing a sample of size "
529  << " from the approximate posterior... ";
530  logger.info(ss);
531 
532  for (int n = 0; n < n_posterior_samples_; ++n) {
533  variational.sample(rng_, cont_params_);
534  for (int i = 0; i < cont_params_.size(); ++i) {
535  cont_vector.at(i) = cont_params_(i);
536  }
537  std::stringstream msg2;
538  model_.write_array(rng_, cont_vector, disc_vector, values,
539  true, true, &msg2);
540  if (msg2.str().length() > 0)
541  logger.info(msg2);
542  values.insert(values.begin(), 0);
543  parameter_writer(values);
544  }
545  logger.info("COMPLETED.");
546 
548  }
549 
550  // TODO(akucukelbir): move these things to stan math and test there
551 
552  /**
553  * Compute the median of a circular buffer.
554  *
555  * @param[in] cb circular buffer with some number of values in it.
556  * @return median of values in circular buffer.
557  */
558  double circ_buff_median(const boost::circular_buffer<double>& cb) const {
559  // FIXME: naive implementation; creates a copy as a vector
560  std::vector<double> v;
561  for (boost::circular_buffer<double>::const_iterator i = cb.begin();
562  i != cb.end(); ++i) {
563  v.push_back(*i);
564  }
565 
566  size_t n = v.size() / 2;
567  std::nth_element(v.begin(), v.begin()+n, v.end());
568  return v[n];
569  }
570 
571  /**
572  * Compute the relative difference between two double values.
573  *
574  * @param[in] prev previous value
575  * @param[in] curr current value
576  * @return absolutely value of relative difference
577  */
578  double rel_difference(double prev, double curr) const {
579  return std::fabs((curr - prev) / prev);
580  }
581 
582  protected:
584  Eigen::VectorXd& cont_params_;
585  BaseRNG& rng_;
590  };
591  } // variational
592 } // stan
593 #endif
void stochastic_gradient_ascent(Q &variational, double eta, double tol_rel_obj, int max_iterations, callbacks::logger &logger, callbacks::writer &diagnostic_writer) const
Definition: advi.hpp:325
T max(const caf::Proxy< T > &a, T b)
const XML_Char * name
Definition: expat.h:151
double adapt_eta(Q &variational, int adapt_iterations, callbacks::logger &logger) const
Definition: advi.hpp:179
void check_finite(const char *function, const char *name, const T_y &y)
fvar< T > fabs(const fvar< T > &x)
Definition: fabs.hpp:15
int run(double eta, bool adapt_engaged, int adapt_iterations, double tol_rel_obj, int max_iterations, callbacks::logger &logger, callbacks::writer &parameter_writer, callbacks::writer &diagnostic_writer) const
Definition: advi.hpp:485
T sqrt(T number)
Definition: d0nt_math.hpp:156
Float_t ss
Definition: plot.C:24
void check_size_match(const char *function, const char *name_i, T_size1 i, const char *name_j, T_size2 j)
rosenbrock_model_namespace::rosenbrock_model Model
void print_progress(int m, int start, int finish, int refresh, bool tune, const std::string &prefix, const std::string &suffix, callbacks::logger &logger)
void prev()
Definition: show_event.C:91
advi(Model &m, Eigen::VectorXd &cont_params, BaseRNG &rng, int n_monte_carlo_grad, int n_monte_carlo_elbo, int eval_elbo, int n_posterior_samples)
Definition: advi.hpp:56
double calc_ELBO(const Q &variational, callbacks::logger &logger) const
Definition: advi.hpp:99
void domain_error(const char *function, const char *name, const T &y, const char *msg1, const char *msg2)
virtual void info(const std::string &message)
Definition: logger.hpp:47
double rel_difference(double prev, double curr) const
Definition: advi.hpp:578
void check_positive(const char *function, const char *name, const T_y &y)
double circ_buff_median(const boost::circular_buffer< double > &cb) const
Definition: advi.hpp:558
Float_t e
Definition: plot.C:35
T max(sqlite3 *const db, std::string const &table_name, std::string const &column_name)
Definition: statistics.h:68
Eigen::VectorXd & cont_params_
Definition: advi.hpp:584
void calc_ELBO_grad(const Q &variational, Q &elbo_grad, callbacks::logger &logger) const
Definition: advi.hpp:146