Skip to content

Commit b2a5769

Browse files
feat(tidy3d): FXC-5311-enable-cached-loading-from-batch-data
1 parent 7464a49 commit b2a5769

File tree

6 files changed

+208
-7
lines changed

6 files changed

+208
-7
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
1717
- Added `GaussianPort` and `AstigmaticGaussianPort` for S-matrix calculations using Gaussian beam sources and overlap monitors.
1818
- Added `symmetric_pseudo` option for `s_param_def` in `TerminalComponentModeler` which applies a scaling factor that ensures the S-matrix is symmetric in reciprocal systems.
1919
- Added deprecation warning for ``TemperatureMonitor`` and ``SteadyPotentialMonitor`` when ``unstructured`` parameter is not explicitly set. The default value of ``unstructured`` will change from ``False`` to ``True`` after the 2.11 release.
20+
- Added in-memory caching for downloaded batch results, configurable via ``config.batch_data_cache``.
2021

2122
### Breaking Changes
2223
- Added optional automatic extrusion of structures at the simulation boundaries into/through PML/Absorber layers via `extrude_structures` field in class `AbsorberSpec`.

docs/configuration/reference.rst

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -245,6 +245,29 @@ Controls the optional on-disk cache for simulation artifacts.
245245
- Maximum number of cached simulations retained. ``0`` means no limit and eviction falls back to size constraints.
246246

247247

248+
Batch Data Cache (``config.batch_data_cache``)
249+
----------------------------------------------
250+
251+
Controls the in-memory cache used when accessing entries in ``BatchData``.
252+
253+
.. list-table::
254+
:header-rows: 1
255+
:widths: 24 18 10 48
256+
257+
* - Option
258+
- Default
259+
- Persisted
260+
- Description
261+
* - ``enabled``
262+
- ``True``
263+
- Yes
264+
- Cache batch results in memory when all task data files are below the size threshold.
265+
* - ``max_total_size_gb``
266+
- ``1.0``
267+
- Yes
268+
- Cache batch task data only when the combined size of all task data files is at or below this threshold. ``0`` disables caching.
269+
270+
248271
Plugins (``config.plugins``)
249272
----------------------------
250273

Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
"""Tests for BatchData in-memory caching."""
2+
3+
from __future__ import annotations
4+
5+
from pathlib import Path
6+
7+
import tidy3d as td
8+
from tidy3d.web.api import container as web_container
9+
10+
11+
def _write_bytes(path: Path, size: int) -> None:
12+
path.write_bytes(b"0" * size)
13+
14+
15+
def test_batch_data_caches_small_files(monkeypatch, tmp_path):
    """A second access to the same task returns the cached object and skips the web calls."""
    task_paths = {
        "task1": str(tmp_path / "task1.hdf5"),
        "task2": str(tmp_path / "task2.hdf5"),
    }
    task_ids = {"task1": "task-1", "task2": "task-2"}
    # Two tiny files: total size (3 bytes) is far below the 1 GB threshold.
    for size, key in ((1, "task1"), (2, "task2")):
        _write_bytes(Path(task_paths[key]), size)

    monkeypatch.setattr(td.config.batch_data_cache, "enabled", True)
    monkeypatch.setattr(td.config.batch_data_cache, "max_total_size_gb", 1.0)

    calls = {"load": 0, "info": 0}
    sentinels = (object(), object())

    def fake_load(*_args, **_kwargs):
        value = sentinels[calls["load"]]
        calls["load"] = calls["load"] + 1
        return value

    def fake_get_info(*_args, **_kwargs):
        calls["info"] = calls["info"] + 1

    monkeypatch.setattr(web_container.web, "load", fake_load)
    monkeypatch.setattr(web_container.web, "get_info", fake_get_info)

    batch_data = td.web.BatchData(
        task_paths=task_paths,
        task_ids=task_ids,
        is_downloaded=True,
    )

    first = batch_data["task1"]
    second = batch_data["task1"]

    # Same object both times; load/get_info hit exactly once.
    assert first is second
    assert calls["load"] == 1
    assert calls["info"] == 1
54+
55+
56+
def test_batch_data_skips_cache_when_any_file_is_large(monkeypatch, tmp_path):
    """When the combined file size exceeds the threshold, every access reloads from disk."""
    task_paths = {
        "task1": str(tmp_path / "task1.hdf5"),
        "task2": str(tmp_path / "task2.hdf5"),
    }
    task_ids = {"task1": "task-1", "task2": "task-2"}
    for size, key in ((1, "task1"), (2, "task2")):
        _write_bytes(Path(task_paths[key]), size)

    # Threshold of exactly 2 bytes: the combined size (3 bytes) exceeds it.
    threshold_gb = 2 / (1024**3)
    monkeypatch.setattr(td.config.batch_data_cache, "enabled", True)
    monkeypatch.setattr(td.config.batch_data_cache, "max_total_size_gb", threshold_gb)

    calls = {"load": 0, "info": 0}
    sentinels = (object(), object())

    def fake_load(*_args, **_kwargs):
        value = sentinels[calls["load"]]
        calls["load"] = calls["load"] + 1
        return value

    def fake_get_info(*_args, **_kwargs):
        calls["info"] = calls["info"] + 1

    monkeypatch.setattr(web_container.web, "load", fake_load)
    monkeypatch.setattr(web_container.web, "get_info", fake_get_info)

    batch_data = td.web.BatchData(
        task_paths=task_paths,
        task_ids=task_ids,
        is_downloaded=True,
    )

    first = batch_data["task1"]
    second = batch_data["task1"]

    # Distinct objects both times; load/get_info hit on every access.
    assert first is not second
    assert calls["load"] == 2
    assert calls["info"] == 2

tidy3d/config/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ flowchart LR
5050

5151
## Module Reference
5252

53-
- `sections.py` - Pydantic models for built-in sections (logging, simulation, microwave, adjoint, web, local cache, plugin container) registered via `register_section`. The bundled models inherit from the internal `ConfigSection` helper, but external code can use plain `BaseModel` subclasses. Optional handlers perform side effects. Fields mark persistence with `json_schema_extra={"persist": True}`.
53+
- `sections.py` - Pydantic models for built-in sections (logging, simulation, microwave, adjoint, web, local cache, in-memory batch data cache, plugin container) registered via `register_section`. The bundled models inherit from the internal `ConfigSection` helper, but external code can use plain `BaseModel` subclasses. Optional handlers perform side effects. Fields mark persistence with `json_schema_extra={"persist": True}`.
5454
- `registry.py` - Stores section and handler registries and notifies the attached manager so new entries appear immediately.
5555
- `manager.py` - `ConfigManager` caches validated models, tracks runtime overrides per profile, filters persisted fields, exposes helpers such as `plugins`, `profiles`, and `format`. `SectionAccessor` routes attribute access to `update_section`.
5656
- `loader.py` - Resolves the config directory, loads `config.toml` and `profiles/<name>.toml`, parses environment overrides, and writes atomically through `serializer.build_document`.

tidy3d/config/sections.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -514,6 +514,27 @@ def _serialize_directory(self, value: Path) -> str:
514514
return str(value)
515515

516516

517+
class BatchDataCacheConfig(ConfigSection):
    """Settings controlling in-memory caching for batch data."""

    # Master switch for the in-memory BatchData cache.
    # json_schema_extra {"persist": True} marks the field as persisted to the
    # user's config file.
    enabled: bool = Field(
        True,
        title="Enable batch data cache",
        description="Cache batch results in memory when files are below the size threshold.",
        json_schema_extra={"persist": True},
    )

    # Combined-size threshold in GB for all task data files; 0 disables caching.
    max_total_size_gb: NonNegativeFloat = Field(
        1.0,
        title="Maximum total batch data size (GB)",
        description=(
            "Cache batch task data only when the combined size of all task data files is at or "
            "below this threshold. Set to 0 to disable."
        ),
        json_schema_extra={"persist": True},
    )
536+
537+
517538
@register_section("plugins")
518539
class PluginsContainer(ConfigSection):
519540
"""Container that holds plugin-specific configuration sections."""
@@ -527,10 +548,12 @@ class PluginsContainer(ConfigSection):
527548
register_section("web")(WebConfig)
528549
register_handler("web")(apply_web)
529550
register_section("local_cache")(LocalCacheConfig)
551+
register_section("batch_data_cache")(BatchDataCacheConfig)
530552

531553

532554
__all__ = [
533555
"AdjointConfig",
556+
"BatchDataCacheConfig",
534557
"LocalCacheConfig",
535558
"LoggingConfig",
536559
"MicrowaveConfig",

tidy3d/web/api/container.py

Lines changed: 65 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -27,10 +27,12 @@
2727
TimeElapsedColumn,
2828
)
2929

30+
from tidy3d._runtime import WASM_BUILD
3031
from tidy3d.components.base import Tidy3dBaseModel, cached_property
3132
from tidy3d.components.mode.mode_solver import ModeSolver
3233
from tidy3d.components.types import annotate_type
3334
from tidy3d.components.types.workflow import WorkflowDataType, WorkflowType
35+
from tidy3d.config import config
3436
from tidy3d.exceptions import DataError
3537
from tidy3d.log import get_logging_console, log
3638
from tidy3d.web.api import webapi as web
@@ -617,24 +619,80 @@ class BatchData(Tidy3dBaseModel, Mapping):
617619
description="Whether the simulation data was downloaded before.",
618620
)
619621

622+
    # In-memory cache of loaded results, keyed by task name.
    _data_cache: dict[TaskName, WorkflowDataType] = PrivateAttr(default_factory=dict)
    # Memoized caching decision: True/False once decided, None while undecided
    # (e.g. a task file has not been downloaded yet, so sizes cannot be summed).
    _cache_enabled: Optional[bool] = PrivateAttr(default=None)
624+
625+
    def _should_cache_data(self) -> bool:
        """Return True when in-memory caching should be enabled for batch data.

        The decision is memoized in ``_cache_enabled`` as a tri-state:
        ``True``/``False`` once decided, and ``None`` while undecided (a task
        file was not found on disk, so the total size could not be computed
        yet and the check will run again on the next call).
        """
        # Serve a previously memoized decision.
        if self._cache_enabled is not None:
            return self._cache_enabled

        # Pessimistically memoize False; set True at the end on success, or
        # reset to None below when a file is not yet available.
        self._cache_enabled = False
        if WASM_BUILD:
            return False

        try:
            cache_config = config.batch_data_cache
        except AttributeError:
            # ``config`` has no ``batch_data_cache`` section; treat as disabled.
            return False
        if not cache_config.enabled:
            return False

        # Convert the configured GB threshold to bytes; 0 disables caching.
        max_bytes = int(cache_config.max_total_size_gb * (1024**3))
        if max_bytes <= 0:
            return False

        # Sum on-disk sizes of all task files, bailing out as soon as the
        # running total exceeds the threshold.
        total_size = 0
        for task_path in self.task_paths.values():
            try:
                file_size = Path(task_path).stat().st_size
            except FileNotFoundError:  # not downloaded yet
                # Leave the decision open so it is re-evaluated after download.
                self._cache_enabled = None
                return False
            total_size += file_size
            if total_size > max_bytes:
                return False

        self._cache_enabled = True
        return True
658+
620659
    def load_sim_data(self, task_name: str) -> WorkflowDataType:
        """Load a simulation data object from file by task name.

        When ``config.batch_data_cache.enabled`` is ``True`` and the total size of all task
        files stays under the configured threshold, the loaded object is cached in
        memory for subsequent accesses.
        """
        cache_enabled = self._should_cache_data()
        # Serve from the in-memory cache when a cached entry exists.
        if cache_enabled and task_name in self._data_cache:
            return self._data_cache[task_name]

        task_data_path = Path(self.task_paths[task_name])
        task_id = self.task_ids[task_name]
        from_cache = self.cached_tasks[task_name] if self.cached_tasks else False
        if not from_cache:
            # Query task info first when the data does not come from the local cache.
            web.get_info(task_id)

        data = web.load(
            task_id=None if from_cache else task_id,
            path=task_data_path,
            verbose=False,
            replace_existing=not (from_cache or self.is_downloaded),
            lazy=self.lazy,
        )

        # The earlier check may have been inconclusive (``_cache_enabled`` is
        # ``None``) because a task file was missing; re-evaluate now that
        # ``web.load`` has had a chance to fetch it.
        if not cache_enabled and self._cache_enabled is None:
            cache_enabled = self._should_cache_data()
        if cache_enabled:
            self._data_cache[task_name] = data
        return data
689+
636690
    def __getitem__(self, task_name: TaskName) -> WorkflowDataType:
        """Get the simulation data object for a given ``task_name``.

        When ``config.batch_data_cache.enabled`` is ``True`` and the batch data size is within
        the configured threshold, the result is cached in memory.
        """
        return self.load_sim_data(task_name)
639697

640698
def __iter__(self) -> Iterator[TaskName]:
@@ -811,9 +869,10 @@ def run(
811869
>>> for task_name, sim_data in batch_data.items(): # doctest: +SKIP
812870
... # do something with data. # doctest: +SKIP
813871
814-
``batch_data`` does not store all of the data objects in memory,
815-
rather it iterates over the task names and loads the corresponding
816-
data from file one by one. If no file exists for that task, it downloads it.
872+
``batch_data`` iterates over task names and loads the corresponding data
873+
from file one by one. When ``config.batch_data_cache.enabled`` is ``True`` and the
874+
total size of all task files is below ``config.batch_data_cache.max_total_size_gb``,
875+
accessed results are cached in memory to avoid repeated loads.
817876
"""
818877
loaded = [job.load_if_cached for job in self.jobs.values()]
819878
self._check_path_dir(path_dir)

0 commit comments

Comments
 (0)