Adding Metropolis-Adjusted Langevin Algorithm #1617
base: develop
@@ -1,3 +1,4 @@
import logging
import numbers

import numpy as np

@@ -62,6 +63,7 @@ def __init__(self, options: dict = None):
self._mean_hist = None
self._cov_hist = None
self._cov_scale = None
self._cov_chol = None

@classmethod
def default_options(cls):

@@ -75,11 +77,10 @@ def default_options(cls):
# a higher value reduces the impact of early adaptation
"threshold_sample": 1,
# regularization factor for ill-conditioned cov matrices of
# the adapted proposal density. regularization might happen if the
# eigenvalues of the cov matrix strongly differ in order
# of magnitude. in this case, the algorithm adds a small
# diag matrix to the cov matrix with elements of this factor
"reg_factor": 1e-6,
# the adapted proposal density.
"reg_factor": 1e-8,
# maximum number of attempts to regularize the covariance matrix
"max_tries": 10,
# initial covariance matrix. defaults to a unit matrix
"cov0": None,
# target acceptance rate

@@ -98,14 +99,18 @@ def initialize(self, problem: Problem, x0: np.ndarray):
cov0 = float(cov0) * np.eye(len(x0))
else:
cov0 = np.eye(len(x0))
self._cov = regularize_covariance(cov0, self.options["reg_factor"])
self._cov_chol, self._cov = regularize_covariance(
cov0,
reg_factor=self.options["reg_factor"],
max_tries=self.options["max_tries"],
)
self._mean_hist = self.trace_x[-1]
self._cov_hist = self._cov
self._cov_scale = 1.0

def _propose_parameter(self, x: np.ndarray):
x_new = np.random.multivariate_normal(x, self._cov)
return x_new
def _propose_parameter(self, x: np.ndarray, beta: float):
x_new: np.ndarray = np.random.multivariate_normal(x, self._cov)
return x_new, np.nan  # no gradient needed

def _update_proposal(
self, x: np.ndarray, lpost: float, log_p_acc: float, n_sample_cur: int

@@ -114,6 +119,7 @@ def _update_proposal(
decay_constant = self.options["decay_constant"]
threshold_sample = self.options["threshold_sample"]
reg_factor = self.options["reg_factor"]
max_tries = self.options["max_tries"]
target_acceptance_rate = self.options["target_acceptance_rate"]

# compute historical mean and covariance

@@ -136,7 +142,9 @@ def _update_proposal(
self._cov = self._cov_scale * self._cov_hist

# regularize proposal covariance
self._cov = regularize_covariance(cov=self._cov, reg_factor=reg_factor)
self._cov_chol, self._cov = regularize_covariance(
cov=self._cov, reg_factor=reg_factor, max_tries=max_tries
)

def update_history_statistics(

@@ -186,7 +194,11 @@ def update_history_statistics(
return mean, cov

def regularize_covariance(cov: np.ndarray, reg_factor: float) -> np.ndarray:
def regularize_covariance(
cov: np.ndarray,
reg_factor: float,
max_tries: int,
) -> tuple[np.ndarray, np.ndarray]:
"""
Regularize the estimated covariance matrix of the sample.

@@ -199,14 +211,49 @@ def regularize_covariance(cov: np.ndarray, reg_factor: float) -> np.ndarray:
Estimate of the covariance matrix of the sample.
reg_factor:
Regularization factor. Larger values result in stronger regularization.
max_tries:
Maximum number of attempts to regularize the covariance matrix.

Returns
-------
L:
Cholesky decomposition of the regularized covariance matrix.
cov:
Regularized estimate of the covariance matrix of the sample.
Review comment on lines +219 to 222 (Member): I think returning the covariance first makes more sense, unless you think users/devs might be more interested in the Cholesky decomposition? This method used to only return `cov`.
"""
eig = np.linalg.eigvals(cov)
eig_min = min(eig)
if eig_min <= 0:
cov += (abs(eig_min) + reg_factor) * np.eye(cov.shape[0])
return cov
# we symmetrize cov first
cov = 0.5 * (cov + cov.T)
Review comment on lines +224 to +225 (Member): Why is this necessary if the input is already a covariance matrix?
d = cov.shape[0]
eye = np.eye(d)
if not np.all(np.isfinite(cov)):
s = np.nanmean(np.diag(cov))
s = 1.0 if not np.isfinite(s) or s <= 0 else s
return np.sqrt(s) * eye, s * eye
Review comment on lines +229 to +232 (Member): Could you explain this a bit more in comments? What does this part achieve? What does … Why is there …
# Try Cholesky on original matrix first
try:
L = np.linalg.cholesky(cov)
return L, cov
except np.linalg.LinAlgError:
pass  # Need regularization

# scale for the initial jitter
s = np.mean(np.diag(cov))
s = 1.0 if not np.isfinite(s) or s <= 0 else s
Review comment on lines +241 to +243 (Member): Why would …
jitter = reg_factor * s
for _ in range(max_tries):
try:
new_cov = cov + jitter * eye
Review comment (Member): Previously, the diagonal was incremented by the smallest eigenvalue (and regularization factor). How does that compare to incrementing by the scaled "mean" jitter?
L = np.linalg.cholesky(new_cov)
return L, new_cov
except np.linalg.LinAlgError:
jitter *= 2.0
logging.warning(
"Could not regularize covariance matrix. Falling back to "
"scaled identity."
)

# final safe fallback
return np.sqrt(s) * eye, s * eye
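As a side-by-side illustration of the regularization change under discussion, here is a minimal, self-contained sketch (not part of the PR; the singular test matrix and constants are made up) comparing the previous eigenvalue-shift approach with the jitter-doubling approach from this diff:

```python
import numpy as np

bad_cov = np.ones((3, 3))  # singular, so a plain Cholesky factorization fails

# roughly the previous approach: shift the diagonal by the smallest eigenvalue
# plus the regularization factor
eig_min = np.linalg.eigvalsh(bad_cov).min()
shifted = bad_cov + (abs(eig_min) + 1e-6) * np.eye(3)

# the approach in this diff: add diagonal jitter scaled by the mean variance,
# doubling it until the Cholesky factorization succeeds
jitter = 1e-8 * np.mean(np.diag(bad_cov))
for _ in range(10):
    try:
        L = np.linalg.cholesky(bad_cov + jitter * np.eye(3))
        break
    except np.linalg.LinAlgError:
        jitter *= 2.0
else:
    L = np.sqrt(np.mean(np.diag(bad_cov))) * np.eye(3)  # scaled-identity fallback

# the factor returned alongside the covariance is what the samplers reuse
# to draw correlated noise without re-factorizing
noise = L @ np.random.randn(3)
```

Both variants yield a factorizable matrix here; the jitter variant additionally avoids computing an eigendecomposition.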
@@ -0,0 +1,170 @@
import logging
Review comment (Member): Suggested change: …
import numpy as np

from ..problem import Problem
from .adaptive_metropolis import AdaptiveMetropolisSampler

logger = logging.getLogger(__name__)


class MalaSampler(AdaptiveMetropolisSampler):
"""Metropolis-Adjusted Langevin Algorithm (MALA) sampler with preconditioning.

MALA is a gradient-based MCMC method that uses the gradient of the
log-posterior to guide the proposal distribution. This allows for
more efficient exploration of the parameter space compared to
standard random-walk Metropolis-Hastings.

The proposal distribution is:
x_new = x + (step_size / 2) * M * grad_log_p(x) + sqrt(step_size) * sqrt(M) * noise

where grad_log_p(x) is the gradient of the log-posterior at x,
M is the preconditioning matrix, step_size is the discretization step size,
and noise is standard Gaussian noise.

Preconditioning can significantly improve convergence for poorly conditioned
problems by rescaling the parameter space. Here, we use an adaptive covariance
matrix as the preconditioning matrix M, which is updated during sampling.
We aim to converge to a fixed target acceptance rate of 0.574, as suggested
by theoretical results for MALA.

By default, the step size is set to sqrt((1/dim)^(1/3)), where dim is the
dimension of the target distribution, following Boisvert-Beaudry and Bedard (2022).
This scaling helps maintain efficient exploration as the dimensionality increases,
and leads to an asymptotically performant sampler, in the sense that exploring the
parameter space requires O(dim^(1/3)) steps.

For reference, see:

* Roberts et al. 1996.
Exponential convergence of Langevin distributions and their
discrete approximations
(https://doi.org/10.2307/3318418)
* Girolami & Calderhead 2011.
Riemann manifold Langevin and Hamiltonian Monte Carlo methods
(https://doi.org/10.1111/j.1467-9868.2010.00765.x)
* Boisvert-Beaudry and Bedard 2022.
MALA with annealed proposals: a generalization of locally and globally balanced proposal distributions
"""

def __init__(self, step_size: float = None, options: dict = None):
"""Initialize the sampler.

Parameters
----------
step_size:
Step size for the Langevin dynamics. If None, it will be set to
sqrt((1/n_parameter)^(1/3)) during initialization.
options:
Options for the sampler.
"""
super().__init__(options)
self.step_size = step_size
self._dimension_of_target_weight = 1.0
Review comment (Member): Describe with a comment.
@classmethod
def default_options(cls):
"""Return the default options for the sampler."""
return {
# controls adaptation degeneration velocity of the proposals
# in [0, 1], with 0 -> no adaptation, i.e. classical
# Metropolis-Hastings
"decay_constant": 0.51,
# number of samples before adaptation decreases significantly.
# a higher value reduces the impact of early adaptation
"threshold_sample": 1,
# regularization factor for ill-conditioned cov matrices of
# the adapted proposal density.
"reg_factor": 1e-8,
# maximum number of attempts to regularize the covariance matrix
"max_tries": 10,
# initial covariance matrix. defaults to a unit matrix
Review comment (Member): Suggested change: …
"cov0": None,
# target acceptance rate
"target_acceptance_rate": 0.574,
# show progress
"show_progress": None,
}

def initialize(self, problem: Problem, x0: np.ndarray):
"""Initialize the sampler."""
# Check if gradient is available
if not problem.objective.has_grad:
raise ValueError("MALA sampler requires gradient information.")

# Boisvert-Beaudry and Bedard (2022)
# Set step size based on the problem dimension if not specified
if self.step_size is None:
self.step_size = np.sqrt((1 / problem.dim) ** (1 / 3))
logger.info(f"Step size set to {self.step_size}")
self._dimension_of_target_weight = 1 + (1 / problem.dim) ** (1 / 3)
Review comment on lines +98 to +101 (Member): Suggested change: … Also, I guess this should technically be … Also probably faster to do …
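As a quick numerical illustration of the two scalings discussed here (a throwaway sketch, not part of the diff; the dimension is made up):

```python
import numpy as np

dim = 10  # hypothetical problem dimension
step_size = np.sqrt((1 / dim) ** (1 / 3))  # default step size from the docstring
weight = 1 + (1 / dim) ** (1 / 3)          # the _dimension_of_target_weight factor
# note that sqrt((1/dim)**(1/3)) == (1/dim)**(1/6), so the exponent could also be
# written directly
assert np.isclose(step_size, (1 / dim) ** (1 / 6))
```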
super().initialize(problem, x0)
self._current_x_grad = -self.neglogpost.get_grad(x0)

def _propose_parameter(self, x: np.ndarray, beta: float):
"""Propose a parameter using preconditioned MALA dynamics."""
# Get gradient of log-posterior at current position
grad = -self.neglogpost.get_grad(x)
# todo: in parallel tempering, this should be the tempered gradient, but only the likelihood is tempered
Review comment (Member): TODO -- probably easy to implement now? Otherwise emit a warning if this is used in parallel tempering.
# Apply preconditioning to gradient
precond_grad = self._cov @ (beta * grad)

# Generate standard Gaussian noise
noise = np.random.randn(len(x))

# Apply preconditioning to noise (via Cholesky decomposition)
precond_noise = self._cov_chol @ noise

# Preconditioned MALA proposal: x + (h/2) * M * grad + sqrt(h) * sqrt(M) * noise
drift = (self.step_size / 2.0) * precond_grad
diffusion = np.sqrt(self.step_size) * precond_noise
Review comment on lines +122 to +123 (Member): Why does …
x_new: np.ndarray = (
x + self._dimension_of_target_weight * drift + diffusion
Review comment (Member): Where does this formula with … come from?
)
return x_new, grad

def _compute_transition_log_prob(
self, x_from: np.ndarray, x_to: np.ndarray, x_from_grad: np.ndarray
):
"""Compute the log probability of transitioning from x_from to x_to with preconditioning."""
Review comment (Member): Which equation in which reference provides the math in this method?
# Apply preconditioning to gradient
precond_grad_from = self._cov @ x_from_grad

# Mean of the preconditioned proposal distribution
mean = (
x_from
+ self._dimension_of_target_weight
* (self.step_size / 2.0)
* precond_grad_from
)

# For preconditioned MALA, the covariance is step_size * M
# We need to compute the log probability under N(mean, step_size * M)
diff = x_to - mean

# Use Cholesky decomposition for efficient computation
# We need to solve L @ L^T @ z = diff, i.e., z = M^{-1} @ diff
# This is done by first solving L @ y = diff, then L^T @ z = y
try:
# Forward substitution: L @ y = diff
y = np.linalg.solve(self._cov_chol, diff)
# Quadratic form: diff^T @ M^{-1} @ diff = y^T @ y
log_prob = -0.5 * np.dot(y, y) / self.step_size
except np.linalg.LinAlgError:
# If matrix is singular, return -inf
return -np.inf

# Add normalization constant: -0.5 * log|2π * step_size * M|
# = -0.5 * (d * log(2π * step_size) + log|M|)
# log|M| = 2 * log|L| = 2 * sum(log(diag(L)))
d = len(x_from)
log_det_cov = 2 * np.sum(np.log(np.diag(self._cov_chol)))
log_prob -= 0.5 * (
d * np.log(2 * np.pi * self.step_size) + log_det_cov
)

return log_prob
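Regarding the review question above about which equation this method implements: the computation matches the standard preconditioned MALA transition density, the form that appears in the Roberts et al. (1996) and Girolami & Calderhead (2011) references already cited in the class docstring. A sketch of that form, with h the step size, M the preconditioning matrix, and d the dimension:

```latex
% Preconditioned MALA proposal density (sketch; notation as in the docstring):
q(x' \mid x) = \mathcal{N}\!\bigl(x' \,\big|\, \mu(x),\, h M\bigr),
\qquad \mu(x) = x + \tfrac{h}{2}\, M \nabla \log \pi(x),
\\
\log q(x' \mid x)
  = -\tfrac{1}{2h}\,(x' - \mu(x))^\top M^{-1} (x' - \mu(x))
    - \tfrac{1}{2}\bigl(d \log(2\pi h) + \log\det M\bigr),
\\
% Metropolis-Hastings acceptance probability with this asymmetric proposal:
\alpha = \min\!\left(1,\;
  \frac{\pi(x')\, q(x \mid x')}{\pi(x)\, q(x' \mid x)}\right).
```

The extra `_dimension_of_target_weight` factor in the drift used by this PR is not part of this standard form.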
Review comment: Document the return value and return type? Where does a developer see that `_propose_parameter` returns a 2-tuple where the first entry is the parameter and the second entry something else?
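To make the interplay between `_propose_parameter` and `_compute_transition_log_prob` concrete, here is a small, self-contained sketch of one preconditioned MALA step on a toy Gaussian target. Everything here is illustrative and independent of the PR's classes; the target, step size, and preconditioner are made up:

```python
import numpy as np

rng = np.random.default_rng(1)

# toy target: zero-mean Gaussian with a poorly scaled covariance
target_cov = np.diag([1.0, 100.0])
target_prec = np.linalg.inv(target_cov)

def log_pi(x):
    return -0.5 * x @ target_prec @ x

def grad_log_pi(x):
    return -target_prec @ x

h = 0.5                    # step size (illustrative)
M = np.diag([1.0, 100.0])  # preconditioner; ideally close to the target covariance
L = np.linalg.cholesky(M)
d = len(M)

def log_q(x_to, x_from):
    """Log density of the MALA proposal q(x_to | x_from) = N(mean, h * M)."""
    mean = x_from + 0.5 * h * M @ grad_log_pi(x_from)
    y = np.linalg.solve(L, x_to - mean)  # so that y @ y = diff^T M^{-1} diff
    log_det_M = 2 * np.sum(np.log(np.diag(L)))
    return -0.5 * y @ y / h - 0.5 * (d * np.log(2 * np.pi * h) + log_det_M)

x = np.array([3.0, -20.0])
# propose: drift along the preconditioned gradient plus preconditioned noise
x_new = x + 0.5 * h * M @ grad_log_pi(x) + np.sqrt(h) * L @ rng.standard_normal(d)

# Metropolis-Hastings correction with the asymmetric proposal density
log_alpha = (log_pi(x_new) + log_q(x, x_new)) - (log_pi(x) + log_q(x_new, x))
accept = np.log(rng.uniform()) < log_alpha
```

With M roughly matching the target covariance, the drift pulls proposals toward the mode at comparable rates along both the well-scaled and the poorly-scaled directions, which is the point of the preconditioning.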