Adding Metropolis-Adjusted Langevin Algorithm #1617
base: develop
@@ -1,3 +1,4 @@
import logging
import numbers

import numpy as np

@@ -62,6 +63,7 @@ def __init__(self, options: dict = None):
self._mean_hist = None
self._cov_hist = None
self._cov_scale = None
self._cov_chol = None

@classmethod
def default_options(cls):

@@ -75,11 +77,10 @@ def default_options(cls):
# a higher value reduces the impact of early adaptation
"threshold_sample": 1,
# regularization factor for ill-conditioned cov matrices of
# the adapted proposal density. regularization might happen if the
# eigenvalues of the cov matrix strongly differ in order
# of magnitude. in this case, the algorithm adds a small
# diag matrix to the cov matrix with elements of this factor
"reg_factor": 1e-6,
# the adapted proposal density.
"reg_factor": 1e-8,
# maximum number of attempts to regularize the covariance matrix
"max_tries": 10,
# initial covariance matrix. defaults to a unit matrix
"cov0": None,
# target acceptance rate

@@ -98,14 +99,18 @@ def initialize(self, problem: Problem, x0: np.ndarray):
cov0 = float(cov0) * np.eye(len(x0))
else:
cov0 = np.eye(len(x0))
self._cov = regularize_covariance(cov0, self.options["reg_factor"])
self._cov_chol, self._cov = regularize_covariance(
cov0,
reg_factor=self.options["reg_factor"],
max_tries=self.options["max_tries"],
)
self._mean_hist = self.trace_x[-1]
self._cov_hist = self._cov
self._cov_scale = 1.0

def _propose_parameter(self, x: np.ndarray):
x_new = np.random.multivariate_normal(x, self._cov)
return x_new
def _propose_parameter(self, x: np.ndarray, beta: float):
x_new: np.ndarray = np.random.multivariate_normal(x, self._cov)
return x_new, np.nan  # no gradient needed

def _update_proposal(
self, x: np.ndarray, lpost: float, log_p_acc: float, n_sample_cur: int

@@ -114,6 +119,7 @@ def _update_proposal(
decay_constant = self.options["decay_constant"]
threshold_sample = self.options["threshold_sample"]
reg_factor = self.options["reg_factor"]
max_tries = self.options["max_tries"]
target_acceptance_rate = self.options["target_acceptance_rate"]

# compute historical mean and covariance

@@ -136,7 +142,9 @@ def _update_proposal(
self._cov = self._cov_scale * self._cov_hist

# regularize proposal covariance
self._cov = regularize_covariance(cov=self._cov, reg_factor=reg_factor)
self._cov_chol, self._cov = regularize_covariance(
cov=self._cov, reg_factor=reg_factor, max_tries=max_tries
)

def update_history_statistics(

@@ -186,7 +194,11 @@ def update_history_statistics(
return mean, cov

def regularize_covariance(cov: np.ndarray, reg_factor: float) -> np.ndarray:
def regularize_covariance(
cov: np.ndarray,
reg_factor: float,
max_tries: int,
) -> tuple[np.ndarray, np.ndarray]:
"""
Regularize the estimated covariance matrix of the sample.

@@ -199,14 +211,49 @@ def regularize_covariance(cov: np.ndarray, reg_factor: float) -> np.ndarray:
Estimate of the covariance matrix of the sample.
reg_factor:
Regularization factor. Larger values result in stronger regularization.
max_tries:
Maximum number of attempts to regularize the covariance matrix.

Returns
-------
L:
Cholesky decomposition of the regularized covariance matrix.
cov:
Regularized estimate of the covariance matrix of the sample.
Review comment on lines +219 to 222 (Member): I think returning the covariance first makes more sense, unless you think users/devs might be more interested in the Cholesky decomposition? This method used to only return `cov`.
"""
eig = np.linalg.eigvals(cov)
eig_min = min(eig)
if eig_min <= 0:
cov += (abs(eig_min) + reg_factor) * np.eye(cov.shape[0])
return cov
# we symmetrize cov first
cov = 0.5 * (cov + cov.T)
Review comment on lines +224 to +225 (Member): Why is this necessary if the input is already a covariance matrix?
d = cov.shape[0]
eye = np.eye(d)
if not np.all(np.isfinite(cov)):
s = np.nanmean(np.diag(cov))
s = 1.0 if not np.isfinite(s) or s <= 0 else s
return np.sqrt(s) * eye, s * eye
Review comment on lines +229 to +232 (Member): Could you explain this a bit more in comments? What does this part achieve? What does … Why is there …
# Try Cholesky on original matrix first
try:
L = np.linalg.cholesky(cov)
return L, cov
except np.linalg.LinAlgError:
pass  # Need regularization

# scale for the initial jitter
s = np.mean(np.diag(cov))
s = 1.0 if not np.isfinite(s) or s <= 0 else s
Review comment on lines +241 to +243 (Member): Why would …
jitter = reg_factor * s
for _ in range(max_tries):
try:
new_cov = cov + jitter * eye
Review comment (Member): Previously, the diagonal was incremented by the smallest eigenvalue (and regularization factor). How does that compare to incrementing by the scaled "mean" jitter?
L = np.linalg.cholesky(new_cov)
return L, new_cov
except np.linalg.LinAlgError:
jitter *= 2.0
logging.warning(
"Could not regularize covariance matrix. Falling back to "
"scaled identity."
)

# final safe fallback
return np.sqrt(s) * eye, s * eye
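As a side-by-side illustration of the regularization change under discussion, here is a minimal, self-contained sketch (not part of the PR; the singular test matrix and constants are made up) comparing the previous eigenvalue-shift approach with the jitter-doubling approach from this diff:

```python
import numpy as np

bad_cov = np.ones((3, 3))  # singular, so a plain Cholesky factorization fails

# roughly the previous approach: shift the diagonal by the smallest eigenvalue
# plus the regularization factor
eig_min = np.linalg.eigvalsh(bad_cov).min()
shifted = bad_cov + (abs(eig_min) + 1e-6) * np.eye(3)

# the approach in this diff: add diagonal jitter scaled by the mean variance,
# doubling it until the Cholesky factorization succeeds
jitter = 1e-8 * np.mean(np.diag(bad_cov))
for _ in range(10):
    try:
        L = np.linalg.cholesky(bad_cov + jitter * np.eye(3))
        break
    except np.linalg.LinAlgError:
        jitter *= 2.0
else:
    L = np.sqrt(np.mean(np.diag(bad_cov))) * np.eye(3)  # scaled-identity fallback

# the factor returned alongside the covariance is what the samplers reuse
# to draw correlated noise without re-factorizing
noise = L @ np.random.randn(3)
```

Both variants yield a factorizable matrix here; the jitter variant additionally avoids computing an eigendecomposition.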
@@ -0,0 +1,170 @@
import logging
Review comment (Member): Suggested change: …
import numpy as np

from ..problem import Problem
from .adaptive_metropolis import AdaptiveMetropolisSampler

logger = logging.getLogger(__name__)


class MalaSampler(AdaptiveMetropolisSampler):
"""Metropolis-Adjusted Langevin Algorithm (MALA) sampler with preconditioning.

MALA is a gradient-based MCMC method that uses the gradient of the
log-posterior to guide the proposal distribution. This allows for
more efficient exploration of the parameter space compared to
standard random-walk Metropolis-Hastings.

The proposal distribution is:
x_new = x + (step_size / 2) * M * grad_log_p(x) + sqrt(step_size) * sqrt(M) * noise

where grad_log_p(x) is the gradient of the log-posterior at x,
M is the preconditioning matrix, step_size is the discretization step size,
and noise is standard Gaussian noise.

Preconditioning can significantly improve convergence for poorly conditioned
problems by rescaling the parameter space. Here, we use an adaptive covariance
matrix as the preconditioning matrix M, which is updated during sampling.
We aim to converge to a fixed target acceptance rate of 0.574, as suggested
by theoretical results for MALA.

By default, the step size is set to sqrt((1/dim)^(1/3)), where dim is the
dimension of the target distribution, following Boisvert-Beaudry and Bedard (2022).
This scaling helps maintain efficient exploration as the dimensionality increases,
and leads to an asymptotically performant sampler, in the sense that exploring the
parameter space requires O(dim^(1/3)) steps.

For reference, see:

* Roberts et al. 1996.
Exponential convergence of Langevin distributions and their
discrete approximations
(https://doi.org/10.2307/3318418)
* Girolami & Calderhead 2011.
Riemann manifold Langevin and Hamiltonian Monte Carlo methods
(https://doi.org/10.1111/j.1467-9868.2010.00765.x)
* Boisvert-Beaudry and Bedard 2022.
MALA with annealed proposals: a generalization of locally and globally balanced proposal distributions
"""

def __init__(self, step_size: float = None, options: dict = None):
"""Initialize the sampler.

Parameters
----------
step_size:
Step size for the Langevin dynamics. If None, it will be set to
sqrt((1/n_parameter)^(1/3)) during initialization.
options:
Options for the sampler.
"""
super().__init__(options)
self.step_size = step_size
self._dimension_of_target_weight = 1.0
Review comment (Member): Describe with a comment.
@classmethod
def default_options(cls):
"""Return the default options for the sampler."""
return {
# controls adaptation degeneration velocity of the proposals
# in [0, 1], with 0 -> no adaptation, i.e. classical
# Metropolis-Hastings
"decay_constant": 0.51,
# number of samples before adaptation decreases significantly.
# a higher value reduces the impact of early adaptation
"threshold_sample": 1,
# regularization factor for ill-conditioned cov matrices of
# the adapted proposal density.
"reg_factor": 1e-8,
# maximum number of attempts to regularize the covariance matrix
"max_tries": 10,
# initial covariance matrix. defaults to a unit matrix
Review comment (Member): Suggested change: …
"cov0": None,
# target acceptance rate
"target_acceptance_rate": 0.574,
# show progress
"show_progress": None,
}

def initialize(self, problem: Problem, x0: np.ndarray):
"""Initialize the sampler."""
# Check if gradient is available
if not problem.objective.has_grad:
raise ValueError("MALA sampler requires gradient information.")

# Boisvert-Beaudry and Bedard (2022)
# Set step size based on the problem dimension if not specified
if self.step_size is None:
self.step_size = np.sqrt((1 / problem.dim) ** (1 / 3))
logger.info(f"Step size set to {self.step_size}")
self._dimension_of_target_weight = 1 + (1 / problem.dim) ** (1 / 3)
Review comment on lines +98 to +101 (Member): Suggested change: … Also, I guess this should technically be … Also probably faster to do …
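As a quick numerical illustration of the two scalings discussed here (a throwaway sketch, not part of the diff; the dimension is made up):

```python
import numpy as np

dim = 10  # hypothetical problem dimension
step_size = np.sqrt((1 / dim) ** (1 / 3))  # default step size from the docstring
weight = 1 + (1 / dim) ** (1 / 3)          # the _dimension_of_target_weight factor
# note that sqrt((1/dim)**(1/3)) == (1/dim)**(1/6), so the exponent could also be
# written directly
assert np.isclose(step_size, (1 / dim) ** (1 / 6))
```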
super().initialize(problem, x0)
self._current_x_grad = -self.neglogpost.get_grad(x0)

def _propose_parameter(self, x: np.ndarray, beta: float):
"""Propose a parameter using preconditioned MALA dynamics."""
# Get gradient of log-posterior at current position
grad = -self.neglogpost.get_grad(x)
# todo: in parallel tempering, this should be the tempered gradient, but only the likelihood is tempered
Review comment (Member): TODO -- probably easy to implement now? Otherwise emit a warning if this is used in parallel tempering.
# Apply preconditioning to gradient
precond_grad = self._cov @ (beta * grad)

# Generate standard Gaussian noise
noise = np.random.randn(len(x))

# Apply preconditioning to noise (via Cholesky decomposition)
precond_noise = self._cov_chol @ noise

# Preconditioned MALA proposal: x + (h/2) * M * grad + sqrt(h) * sqrt(M) * noise
drift = (self.step_size / 2.0) * precond_grad
diffusion = np.sqrt(self.step_size) * precond_noise
Review comment on lines +122 to +123 (Member): Why does …
x_new: np.ndarray = (
x + self._dimension_of_target_weight * drift + diffusion
Review comment (Member): Where does this formula with … come from?
)
return x_new, grad

def _compute_transition_log_prob(
self, x_from: np.ndarray, x_to: np.ndarray, x_from_grad: np.ndarray
):
"""Compute the log probability of transitioning from x_from to x_to with preconditioning."""
Review comment (Member): Which equation in which reference provides the math in this method?
# Apply preconditioning to gradient
precond_grad_from = self._cov @ x_from_grad

# Mean of the preconditioned proposal distribution
mean = (
x_from
+ self._dimension_of_target_weight
* (self.step_size / 2.0)
* precond_grad_from
)

# For preconditioned MALA, the covariance is step_size * M
# We need to compute the log probability under N(mean, step_size * M)
diff = x_to - mean

# Use Cholesky decomposition for efficient computation
# We need to solve L @ L^T @ z = diff, i.e., z = M^{-1} @ diff
# This is done by first solving L @ y = diff, then L^T @ z = y
try:
# Forward substitution: L @ y = diff
y = np.linalg.solve(self._cov_chol, diff)
# Quadratic form: diff^T @ M^{-1} @ diff = y^T @ y
log_prob = -0.5 * np.dot(y, y) / self.step_size
except np.linalg.LinAlgError:
# If matrix is singular, return -inf
return -np.inf

# Add normalization constant: -0.5 * log|2π * step_size * M|
# = -0.5 * (d * log(2π * step_size) + log|M|)
# log|M| = 2 * log|L| = 2 * sum(log(diag(L)))
d = len(x_from)
log_det_cov = 2 * np.sum(np.log(np.diag(self._cov_chol)))
log_prob -= 0.5 * (
d * np.log(2 * np.pi * self.step_size) + log_det_cov
)

return log_prob
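Regarding the review question above about which equation this method implements: the computation matches the standard preconditioned MALA transition density, the form that appears in the Roberts et al. (1996) and Girolami & Calderhead (2011) references already cited in the class docstring. A sketch of that form, with h the step size, M the preconditioning matrix, and d the dimension:

```latex
% Preconditioned MALA proposal density (sketch; notation as in the docstring):
q(x' \mid x) = \mathcal{N}\!\bigl(x' \,\big|\, \mu(x),\, h M\bigr),
\qquad \mu(x) = x + \tfrac{h}{2}\, M \nabla \log \pi(x),
\\
\log q(x' \mid x)
  = -\tfrac{1}{2h}\,(x' - \mu(x))^\top M^{-1} (x' - \mu(x))
    - \tfrac{1}{2}\bigl(d \log(2\pi h) + \log\det M\bigr),
\\
% Metropolis-Hastings acceptance probability with this asymmetric proposal:
\alpha = \min\!\left(1,\;
  \frac{\pi(x')\, q(x \mid x')}{\pi(x)\, q(x' \mid x)}\right).
```

The extra `_dimension_of_target_weight` factor in the drift used by this PR is not part of this standard form.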
Review comment: Document the return value and return type? Where does a developer see that `_propose_parameter` returns a 2-tuple where the first entry is the parameter and the second entry something else?
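To make the interplay between `_propose_parameter` and `_compute_transition_log_prob` concrete, here is a small, self-contained sketch of one preconditioned MALA step on a toy Gaussian target. Everything here is illustrative and independent of the PR's classes; the target, step size, and preconditioner are made up:

```python
import numpy as np

rng = np.random.default_rng(1)

# toy target: zero-mean Gaussian with a poorly scaled covariance
target_cov = np.diag([1.0, 100.0])
target_prec = np.linalg.inv(target_cov)

def log_pi(x):
    return -0.5 * x @ target_prec @ x

def grad_log_pi(x):
    return -target_prec @ x

h = 0.5                    # step size (illustrative)
M = np.diag([1.0, 100.0])  # preconditioner; ideally close to the target covariance
L = np.linalg.cholesky(M)
d = len(M)

def log_q(x_to, x_from):
    """Log density of the MALA proposal q(x_to | x_from) = N(mean, h * M)."""
    mean = x_from + 0.5 * h * M @ grad_log_pi(x_from)
    y = np.linalg.solve(L, x_to - mean)  # so that y @ y = diff^T M^{-1} diff
    log_det_M = 2 * np.sum(np.log(np.diag(L)))
    return -0.5 * y @ y / h - 0.5 * (d * np.log(2 * np.pi * h) + log_det_M)

x = np.array([3.0, -20.0])
# propose: drift along the preconditioned gradient plus preconditioned noise
x_new = x + 0.5 * h * M @ grad_log_pi(x) + np.sqrt(h) * L @ rng.standard_normal(d)

# Metropolis-Hastings correction with the asymmetric proposal density
log_alpha = (log_pi(x_new) + log_q(x, x_new)) - (log_pi(x) + log_q(x_new, x))
accept = np.log(rng.uniform()) < log_alpha
```

With M roughly matching the target covariance, the drift pulls proposals toward the mode at comparable rates along both the well-scaled and the poorly-scaled directions, which is the point of the preconditioning.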