@@ -47,7 +47,7 @@ def __init__(
         scheduler=None,
         weighting=None,
         use_lt=False,
-        N_epochs_with_same_weights=10,
+        reset_weighting_at_epoch_start=True,
     ):
         """
         Initialization of the :class:`AutoregressiveSolver` class.
@@ -69,8 +69,11 @@ def __init__(
             If ``None``, uniform weighting is used. Default is ``None``.
         :param bool use_lt: Whether to use LabelTensors.
             Default is ``False``.
-        :param int N_epochs_with_same_weights: Number of epochs to keep the same adaptive weights
-            before recomputing them. Default is ``10``.
+        :param bool reset_weighting_at_epoch_start: If ``True``, reset the
+            running averages used for adaptive weighting at the start of
+            each epoch. Default is ``True``. This parameter targets an
+            advanced use case: setting it to ``False`` can improve
+            stability, especially when the data per epoch are very scarce.
         """

         super().__init__(
@@ -82,11 +85,9 @@ def __init__(
             weighting=weighting,
             use_lt=use_lt,
         )
-        # cache for per-condition adaptive weights and epoch-based update control
-        # this is the most generic way to implement periodic weight updates I found
-        self._cached_weights = {}
-        self._epochs_since_update = 0
-        self.N_epochs_with_same_weights = N_epochs_with_same_weights
+        self._running_avg_step_losses = {}
+        self._running_step_counts = {}
+        self.reset_weighting_at_epoch_start = reset_weighting_at_epoch_start

     @staticmethod
     def unroll(
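Taken together, the hunks above replace the epoch-counted weight cache with per-condition running averages controlled by a single flag. A hypothetical usage sketch follows; the `problem` and `model` arguments are illustrative assumptions, since the full constructor signature is not shown in this diff:

# Hypothetical instantiation sketch; `problem` and `model` are assumed
# placeholders, not part of this diff. Setting the flag to False keeps
# the running loss averages across epochs (per the new docstring, an
# advanced option for when each epoch sees very little data).
solver = AutoregressiveSolver(
    problem=problem,
    model=model,
    use_lt=False,
    reset_weighting_at_epoch_start=False,
)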
@@ -165,7 +166,9 @@ def decide_starting_indices(

         return indices

-    def loss_data(self, unroll, eps=None, aggregation_strategy=None, condition_name=None):
+    def loss_data(
+        self, unroll, eps=None, aggregation_strategy=None, condition_name=None
+    ):
         """
         Compute the autoregressive multi-step data loss.

@@ -197,32 +200,68 @@ def loss_data(self, unroll, eps=None, aggregation_strategy=None, condition_name=
             step_loss = self._loss_fn(predicted_state, target_state)
             losses.append(step_loss)

-            if logger.isEnabledFor(logging.DEBUG) and (step <= 3 or torch.isnan(step_loss)):
+            if step <= 3 or torch.isnan(step_loss):
                 logger.debug(
                     " Step %d: loss=%.4e, pred=[%.3f, %.3f]",
                     step,
                     float(step_loss.item()),
-                    float(predicted_state.min()),
-                    float(predicted_state.max()),
+                    float(predicted_state.detach().min()),
+                    float(predicted_state.detach().max()),
                 )
-
+
             current_state = predicted_state

         step_losses = torch.stack(losses)  # [unroll_length]

         with torch.no_grad():
             condition_name = condition_name or "default"
             weights = self.get_weights(condition_name, step_losses, eps)
-            if logger.isEnabledFor(logging.DEBUG):
-                logger.debug(" Losses: %s", step_losses.detach().cpu().numpy().round(4))
-                logger.debug(" Weights: %s", weights.cpu().numpy().round(4))
-                logger.debug(" Weight ratio: %.1f", float(weights.max() / weights.min()))
+
+            logger.debug(
+                " Losses: %s", step_losses.detach().cpu().numpy().round(4)
+            )
+            logger.debug(" Weights: %s", weights.cpu().numpy().round(4))
+            logger.debug(
+                " Weight ratio: %.1f", float(weights.max() / weights.min())
+            )

         if aggregation_strategy is None:
             aggregation_strategy = torch.sum

         return aggregation_strategy(step_losses * weights)

+    def get_weights(self, condition_name, step_losses, eps):
+        """
+        Update the condition's running loss average and return adaptive weights.
+        :param str condition_name: Name of the condition.
+        :param torch.Tensor step_losses: 1D tensor of per-step losses.
+        :param float eps: Weighting parameter.
+        :return: Weights tensor.
+        :rtype: torch.Tensor
+        """
+        key = condition_name or "default"
+        x = step_losses.detach()
+
+        if x.dim() != 1:
+            raise ValueError(
+                f"step_losses must be a 1D tensor, got shape {x.shape}"
+            )
+
+        if key not in self._running_avg_step_losses:
+            self._running_avg_step_losses[key] = x.clone()
+            self._running_step_counts[key] = 1
+        else:
+            self._running_step_counts[key] += 1
+            k = self._running_step_counts[key]
+            # incremental update of the running average
+            self._running_avg_step_losses[key] += (
+                x - self._running_avg_step_losses[key]
+            ) / k
+
+        return self._compute_adaptive_weights(
+            self._running_avg_step_losses[key], eps
+        )
+
     def _compute_adaptive_weights(self, step_losses, eps):
         """
         Actual computation of adaptive weights.
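The `else` branch of the new `get_weights` is the standard incremental-mean update. A self-contained sanity check, independent of the solver, showing that it reproduces the arithmetic mean of everything seen so far:

import torch

# Standalone check of the incremental update avg += (x - avg) / k:
# after k batches it equals the plain mean over all k tensors.
batches = [torch.tensor([1.0, 4.0]), torch.tensor([3.0, 0.0]), torch.tensor([5.0, 2.0])]
avg, k = batches[0].clone(), 1
for x in batches[1:]:
    k += 1
    avg += (x - avg) / k
assert torch.allclose(avg, torch.stack(batches).mean(dim=0))  # tensor([3., 2.])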
@@ -231,38 +270,25 @@ def _compute_adaptive_weights(self, step_losses, eps):
         :return: Computed weights tensor.
         :rtype: torch.Tensor
         """
-        print(f"updating weights, eps={eps}")
+        logger.debug("updating weights, eps=%s", eps)

         if eps is None:
             return torch.ones_like(step_losses) / step_losses.numel()

+        # normalize to mean 1 (avoids overly large exponents)
+        step_losses = step_losses / (step_losses.mean() + 1e-12)
+
         log_w = torch.clamp(-eps * torch.cumsum(step_losses, dim=0), -20, 20)
         return torch.softmax(log_w, dim=0)

-    def get_weights(self, condition_name, step_losses, eps):
-        """
-        Return cached weights or compute new ones.
-        :param str condition_name: Name of the condition.
-        :param torch.Tensor step_losses: 1D tensor of per-step losses.
-        :param float eps: Weighting parameter.
-        :return: Weights tensor.
-        :rtype: torch.Tensor
-        """
-        cached = self._cached_weights.get(condition_name, None)
-        if cached is None:
-            cached = self._compute_adaptive_weights(step_losses, eps).cpu()
-            self._cached_weights[condition_name] = cached
-        return cached.to(step_losses.device)
-
-    def on_train_epoch_end(self):
+    def on_train_epoch_start(self):
         """
-        Hook called by Lightning at the end of each epoch.
-        Forces periodic recalculation of weights by clearing the cache.
+        Hook called by Lightning at the beginning of each epoch.
+        Clears the dictionaries used for the weighting estimate, if enabled.
         """
-        self._epochs_since_update += 1
-        if self._epochs_since_update >= self.N_epochs_with_same_weights:
-            self._cached_weights.clear()
-            self._epochs_since_update = 0
+        if self.reset_weighting_at_epoch_start:
+            self._running_avg_step_losses.clear()
+            self._running_step_counts.clear()

     def predict(self, initial_state, num_steps):
         """