Stan  2.14.0
probability, sampling & optimization
advi.hpp
Go to the documentation of this file.
1 #ifndef STAN_VARIATIONAL_ADVI_HPP
2 #define STAN_VARIATIONAL_ADVI_HPP
3 
4 #include <stan/math.hpp>
7 #include <stan/io/dump.hpp>
13 #include <boost/circular_buffer.hpp>
14 #include <boost/lexical_cast.hpp>
15 #include <algorithm>
16 #include <limits>
17 #include <numeric>
18 #include <ostream>
19 #include <vector>
20 #include <queue>
21 #include <string>
22 
23 namespace stan {
24 
25  namespace variational {
26 
38  template <class Model, class Q, class BaseRNG>
39  class advi {
40  public:
56  advi(Model& m,
57  Eigen::VectorXd& cont_params,
58  BaseRNG& rng,
59  int n_monte_carlo_grad,
60  int n_monte_carlo_elbo,
61  int eval_elbo,
62  int n_posterior_samples)
63  : model_(m),
64  cont_params_(cont_params),
65  rng_(rng),
66  n_monte_carlo_grad_(n_monte_carlo_grad),
67  n_monte_carlo_elbo_(n_monte_carlo_elbo),
68  eval_elbo_(eval_elbo),
69  n_posterior_samples_(n_posterior_samples) {
70  static const char* function = "stan::variational::advi";
71  math::check_positive(function,
72  "Number of Monte Carlo samples for gradients",
74  math::check_positive(function,
75  "Number of Monte Carlo samples for ELBO",
77  math::check_positive(function,
78  "Evaluate ELBO at every eval_elbo iteration",
79  eval_elbo_);
80  math::check_positive(function,
81  "Number of posterior samples for output",
83  }
84 
99  double calc_ELBO(const Q& variational,
101  const {
102  static const char* function =
103  "stan::variational::advi::calc_ELBO";
104 
105  double elbo = 0.0;
106  int dim = variational.dimension();
107  Eigen::VectorXd zeta(dim);
108 
109  int n_dropped_evaluations = 0;
110  for (int i = 0; i < n_monte_carlo_elbo_;) {
111  variational.sample(rng_, zeta);
112  try {
113  std::stringstream ss;
114  double log_prob = model_.template log_prob<false, true>(zeta, &ss);
115  if (ss.str().length() > 0)
116  message_writer(ss.str());
117  stan::math::check_finite(function, "log_prob", log_prob);
118  elbo += log_prob;
119  ++i;
120  } catch (const std::domain_error& e) {
121  ++n_dropped_evaluations;
122  if (n_dropped_evaluations >= n_monte_carlo_elbo_) {
123  const char* name = "The number of dropped evaluations";
124  const char* msg1 = "has reached its maximum amount (";
125  const char* msg2 = "). Your model may be either severely "
126  "ill-conditioned or misspecified.";
127  stan::math::domain_error(function, name, n_monte_carlo_elbo_,
128  msg1, msg2);
129  }
130  }
131  }
132  elbo /= n_monte_carlo_elbo_;
133  elbo += variational.entropy();
134  return elbo;
135  }
136 
146  void calc_ELBO_grad(const Q& variational, Q& elbo_grad,
148  message_writer)
149  const {
150  static const char* function =
151  "stan::variational::advi::calc_ELBO_grad";
152 
153  stan::math::check_size_match(function,
154  "Dimension of elbo_grad",
155  elbo_grad.dimension(),
156  "Dimension of variational q",
157  variational.dimension());
158  stan::math::check_size_match(function,
159  "Dimension of variational q",
160  variational.dimension(),
161  "Dimension of variables in model",
162  cont_params_.size());
163 
164  variational.calc_grad(elbo_grad,
166  message_writer);
167  }
168 
181  double adapt_eta(Q& variational,
182  int adapt_iterations,
184  const {
185  static const char* function = "stan::variational::advi::adapt_eta";
186 
187  stan::math::check_positive(function,
188  "Number of adaptation iterations",
189  adapt_iterations);
190 
191  message_writer("Begin eta adaptation.");
192 
193  // Sequence of eta values to try during adaptation
194  const int eta_sequence_size = 5;
195  double eta_sequence[eta_sequence_size] = {100, 10, 1, 0.1, 0.01};
196 
197  // Initialize ELBO tracking variables
198  double elbo = -std::numeric_limits<double>::max();
199  double elbo_best = -std::numeric_limits<double>::max();
200  double elbo_init;
201  try {
202  elbo_init = calc_ELBO(variational, message_writer);
203  } catch (const std::domain_error& e) {
204  const char* name = "Cannot compute ELBO using the initial "
205  "variational distribution.";
206  const char* msg1 = "Your model may be either "
207  "severely ill-conditioned or misspecified.";
208  stan::math::domain_error(function, name, "", msg1);
209  }
210 
211  // Variational family to store gradients
212  Q elbo_grad = Q(model_.num_params_r());
213 
214  // Adaptive step-size sequence
215  Q history_grad_squared = Q(model_.num_params_r());
216  double tau = 1.0;
217  double pre_factor = 0.9;
218  double post_factor = 0.1;
219 
220  double eta_best = 0.0;
221  double eta;
222  double eta_scaled;
223 
224  bool do_more_tuning = true;
225  int eta_sequence_index = 0;
226  while (do_more_tuning) {
227  // Try next eta
228  eta = eta_sequence[eta_sequence_index];
229 
230  int print_progress_m;
231  for (int iter_tune = 1; iter_tune <= adapt_iterations; ++iter_tune) {
232  print_progress_m = eta_sequence_index
233  * adapt_iterations + iter_tune;
235  ::print_progress(print_progress_m, 0,
236  adapt_iterations * eta_sequence_size,
237  adapt_iterations, true, "", "", message_writer);
238 
239  // (ROBUST) Compute gradient of ELBO. It's OK if it diverges.
240  // We'll try a smaller eta.
241  try {
242  calc_ELBO_grad(variational, elbo_grad, message_writer);
243  } catch (const std::domain_error& e) {
244  elbo_grad.set_to_zero();
245  }
246 
247  // Update step-size
248  if (iter_tune == 1) {
249  history_grad_squared += elbo_grad.square();
250  } else {
251  history_grad_squared = pre_factor * history_grad_squared
252  + post_factor * elbo_grad.square();
253  }
254  eta_scaled = eta / sqrt(static_cast<double>(iter_tune));
255  // Stochastic gradient update
256  variational += eta_scaled * elbo_grad
257  / (tau + history_grad_squared.sqrt());
258  }
259 
260  // (ROBUST) Compute ELBO. It's OK if it has diverged.
261  try {
262  elbo = calc_ELBO(variational, message_writer);
263  } catch (const std::domain_error& e) {
264  elbo = -std::numeric_limits<double>::max();
265  }
266 
267  // Check if:
268  // (1) ELBO at current eta is worse than the best ELBO
269  // (2) the best ELBO hasn't gotten worse than the initial ELBO
270  if (elbo < elbo_best && elbo_best > elbo_init) {
271  std::stringstream ss;
272  ss << "Success!"
273  << " Found best value [eta = " << eta_best
274  << "]";
275  if (eta_sequence_index < eta_sequence_size - 1)
276  ss << (" earlier than expected.");
277  else
278  ss << ".";
279  message_writer(ss.str());
280  message_writer();
281  do_more_tuning = false;
282  } else {
283  if (eta_sequence_index < eta_sequence_size - 1) {
284  // Reset
285  elbo_best = elbo;
286  eta_best = eta;
287  } else {
288  // No more eta values to try, so use current eta if it
289  // didn't diverge or fail if it did diverge
290  if (elbo > elbo_init) {
291  std::stringstream ss;
292  ss << "Success!"
293  << " Found best value [eta = " << eta_best
294  << "].";
295  message_writer(ss.str());
296  message_writer();
297  eta_best = eta;
298  do_more_tuning = false;
299  } else {
300  const char* name = "All proposed step-sizes";
301  const char* msg1 = "failed. Your model may be either "
302  "severely ill-conditioned or misspecified.";
303  stan::math::domain_error(function, name, "", msg1);
304  }
305  }
306  // Reset
307  history_grad_squared.set_to_zero();
308  }
309  ++eta_sequence_index;
310  variational = Q(cont_params_);
311  }
312  return eta_best;
313  }
314 
327  void stochastic_gradient_ascent(Q& variational,
328  double eta,
329  double tol_rel_obj,
330  int max_iterations,
332  interface_callbacks::writer::base_writer& diagnostic_writer)
333  const {
334  static const char* function =
335  "stan::variational::advi::stochastic_gradient_ascent";
336 
337  stan::math::check_positive(function, "Eta stepsize", eta);
338  stan::math::check_positive(function,
339  "Relative objective function tolerance",
340  tol_rel_obj);
341  stan::math::check_positive(function,
342  "Maximum iterations",
343  max_iterations);
344 
345  // Gradient parameters
346  Q elbo_grad = Q(model_.num_params_r());
347 
348  // Stepsize sequence parameters
349  Q history_grad_squared = Q(model_.num_params_r());
350  double tau = 1.0;
351  double pre_factor = 0.9;
352  double post_factor = 0.1;
353  double eta_scaled;
354 
355  // Initialize ELBO and convergence tracking variables
356  double elbo(0.0);
357  double elbo_best = -std::numeric_limits<double>::max();
358  double elbo_prev = -std::numeric_limits<double>::max();
359  double delta_elbo = std::numeric_limits<double>::max();
360  double delta_elbo_ave = std::numeric_limits<double>::max();
361  double delta_elbo_med = std::numeric_limits<double>::max();
362 
363  // Heuristic to estimate how far to look back in rolling window
364  int cb_size
365  = static_cast<int>(std::max(0.1 * max_iterations / eval_elbo_,
366  2.0));
367  boost::circular_buffer<double> elbo_diff(cb_size);
368 
369  message_writer("Begin stochastic gradient ascent.");
370  message_writer(" iter"
371  " ELBO"
372  " delta_ELBO_mean"
373  " delta_ELBO_med"
374  " notes ");
375 
376  // Timing variables
377  clock_t start = clock();
378  clock_t end;
379  double delta_t;
380 
381  // Main loop
382  bool do_more_iterations = true;
383  for (int iter_counter = 1; do_more_iterations; ++iter_counter) {
384  // Compute gradient using Monte Carlo integration
385  calc_ELBO_grad(variational, elbo_grad, message_writer);
386 
387  // Update step-size
388  if (iter_counter == 1) {
389  history_grad_squared += elbo_grad.square();
390  } else {
391  history_grad_squared = pre_factor * history_grad_squared
392  + post_factor * elbo_grad.square();
393  }
394  eta_scaled = eta / sqrt(static_cast<double>(iter_counter));
395 
396  // Stochastic gradient update
397  variational += eta_scaled * elbo_grad
398  / (tau + history_grad_squared.sqrt());
399 
400  // Check for convergence every "eval_elbo_"th iteration
401  if (iter_counter % eval_elbo_ == 0) {
402  elbo_prev = elbo;
403  elbo = calc_ELBO(variational, message_writer);
404  if (elbo > elbo_best)
405  elbo_best = elbo;
406  delta_elbo = rel_difference(elbo, elbo_prev);
407  elbo_diff.push_back(delta_elbo);
408  delta_elbo_ave = std::accumulate(elbo_diff.begin(),
409  elbo_diff.end(), 0.0)
410  / static_cast<double>(elbo_diff.size());
411  delta_elbo_med = circ_buff_median(elbo_diff);
412  std::stringstream ss;
413  ss << " "
414  << std::setw(4) << iter_counter
415  << " "
416  << std::right << std::setw(9) << std::setprecision(1)
417  << elbo
418  << " "
419  << std::setw(16) << std::fixed << std::setprecision(3)
420  << delta_elbo_ave
421  << " "
422  << std::setw(15) << std::fixed << std::setprecision(3)
423  << delta_elbo_med;
424 
425  end = clock();
426  delta_t = static_cast<double>(end - start) / CLOCKS_PER_SEC;
427 
428  std::vector<double> print_vector;
429  print_vector.clear();
430  print_vector.push_back(iter_counter);
431  print_vector.push_back(delta_t);
432  print_vector.push_back(elbo);
433  diagnostic_writer(print_vector);
434 
435  if (delta_elbo_ave < tol_rel_obj) {
436  ss << " MEAN ELBO CONVERGED";
437  do_more_iterations = false;
438  }
439 
440  if (delta_elbo_med < tol_rel_obj) {
441  ss << " MEDIAN ELBO CONVERGED";
442  do_more_iterations = false;
443  }
444 
445  if (iter_counter > 10 * eval_elbo_) {
446  if (delta_elbo_med > 0.5 || delta_elbo_ave > 0.5) {
447  ss << " MAY BE DIVERGING... INSPECT ELBO";
448  }
449  }
450 
451  message_writer(ss.str());
452 
453  if (do_more_iterations == false &&
454  rel_difference(elbo, elbo_best) > 0.05) {
455  message_writer("Informational Message: The ELBO at a previous "
456  "iteration is larger than the ELBO upon "
457  "convergence!");
458  message_writer("This variational approximation may not "
459  "have converged to a good optimum.");
460  }
461  }
462 
463  if (iter_counter == max_iterations) {
464  message_writer("Informational Message: The maximum number of "
465  "iterations is reached! The algorithm may not have "
466  "converged.");
467  message_writer("This variational approximation is not "
468  "guaranteed to be meaningful.");
469  do_more_iterations = false;
470  }
471  }
472  }
473 
486  int run(double eta, bool adapt_engaged, int adapt_iterations,
487  double tol_rel_obj, int max_iterations,
490  interface_callbacks::writer::base_writer& diagnostic_writer)
491  const {
492  diagnostic_writer("iter,time_in_seconds,ELBO");
493 
494  // Initialize variational approximation
495  Q variational = Q(cont_params_);
496 
497  if (adapt_engaged) {
498  eta = adapt_eta(variational, adapt_iterations, message_writer);
499  parameter_writer("Stepsize adaptation complete.");
500  std::stringstream ss;
501  ss << "eta = " << eta;
502  parameter_writer(ss.str());
503  }
504 
505  stochastic_gradient_ascent(variational, eta,
506  tol_rel_obj, max_iterations,
507  message_writer, diagnostic_writer);
508 
509  // Write mean of posterior approximation on first output line
510  cont_params_ = variational.mean();
511  std::vector<double> cont_vector(cont_params_.size());
512  for (int i = 0; i < cont_params_.size(); ++i)
513  cont_vector.at(i) = cont_params_(i);
514  std::vector<int> disc_vector;
515 
517  0, cont_vector, disc_vector,
518  message_writer,
519  parameter_writer);
520  // Draw more samples from posterior and write on subsequent lines
521  message_writer();
522  std::stringstream ss;
523  ss << "Drawing a sample of size "
525  << " from the approximate posterior... ";
526  message_writer(ss.str());
527 
528  for (int n = 0; n < n_posterior_samples_; ++n) {
529  variational.sample(rng_, cont_params_);
530  for (int i = 0; i < cont_params_.size(); ++i) {
531  cont_vector.at(i) = cont_params_(i);
532  }
534  0, cont_vector, disc_vector,
535  message_writer,
536  parameter_writer);
537  }
538  message_writer("COMPLETED.");
539 
541  }
542 
543  // TODO(akucukelbir): move these things to stan math and test there
544 
551  double circ_buff_median(const boost::circular_buffer<double>& cb) const {
552  // FIXME: naive implementation; creates a copy as a vector
553  std::vector<double> v;
554  for (boost::circular_buffer<double>::const_iterator i = cb.begin();
555  i != cb.end(); ++i) {
556  v.push_back(*i);
557  }
558 
559  size_t n = v.size() / 2;
560  std::nth_element(v.begin(), v.begin()+n, v.end());
561  return v[n];
562  }
563 
571  double rel_difference(double prev, double curr) const {
572  return std::fabs((curr - prev) / prev);
573  }
574 
575  protected:
576  Model& model_;
577  Eigen::VectorXd& cont_params_;
578  BaseRNG& rng_;
583  };
584  } // variational
585 } // stan
586 
587 #endif
double adapt_eta(Q &variational, int adapt_iterations, interface_callbacks::writer::base_writer &message_writer) const
Heuristic grid search to adapt eta to the scale of the problem.
Definition: advi.hpp:181
Automatic Differentiation Variational Inference.
Definition: advi.hpp:39
Probability, optimization and sampling library.
void write_iteration(Model &model, RNG &base_rng, double lp, std::vector< double > &cont_vector, std::vector< int > &disc_vector, interface_callbacks::writer::base_writer &message_writer, interface_callbacks::writer::base_writer &parameter_writer)
void calc_ELBO_grad(const Q &variational, Q &elbo_grad, interface_callbacks::writer::base_writer &message_writer) const
Calculates the "black box" gradient of the ELBO.
Definition: advi.hpp:146
base_writer is an abstract base class defining the interface for Stan writer callbacks.
Definition: base_writer.hpp:20
double rel_difference(double prev, double curr) const
Compute the relative difference between two double values.
Definition: advi.hpp:571
advi(Model &m, Eigen::VectorXd &cont_params, BaseRNG &rng, int n_monte_carlo_grad, int n_monte_carlo_elbo, int eval_elbo, int n_posterior_samples)
Constructor.
Definition: advi.hpp:56
double calc_ELBO(const Q &variational, interface_callbacks::writer::base_writer &message_writer) const
Calculates the Evidence Lower BOund (ELBO) by sampling from the variational distribution and then eva...
Definition: advi.hpp:99
void stochastic_gradient_ascent(Q &variational, double eta, double tol_rel_obj, int max_iterations, interface_callbacks::writer::base_writer &message_writer, interface_callbacks::writer::base_writer &diagnostic_writer) const
Runs stochastic gradient ascent with an adaptive stepsize sequence.
Definition: advi.hpp:327
Eigen::VectorXd & cont_params_
Definition: advi.hpp:577
void print_progress(int m, int start, int finish, int refresh, bool tune, const std::string &prefix, const std::string &suffix, interface_callbacks::writer::base_writer &writer)
Helper function for printing progress for variational inference.
int run(double eta, bool adapt_engaged, int adapt_iterations, double tol_rel_obj, int max_iterations, interface_callbacks::writer::base_writer &message_writer, interface_callbacks::writer::base_writer &parameter_writer, interface_callbacks::writer::base_writer &diagnostic_writer) const
Runs ADVI and writes to output.
Definition: advi.hpp:486
double circ_buff_median(const boost::circular_buffer< double > &cb) const
Compute the median of a circular buffer.
Definition: advi.hpp:551

     [ Stan Home Page ] © 2011–2016, Stan Development Team.