Commit ca26005

Authored by PythonFZ and Claude
Fix/assigned job timeout (#858)
* fix: remove _worker_id to fix race condition in job processing

  The client was storing _worker_id from server responses, but this caused a race condition during socket reconnection:

  1. Socket reconnects with NEW sid
  2. Server assigns pending job to NEW sid
  3. Socket event handler receives job:assign
  4. But _worker_id still holds OLD value (not yet updated)
  5. Worker sends wrong worker_id to server → 400 BAD REQUEST

  The fix removes _worker_id entirely and always uses socket.sio.sid directly. This is safe because:

  - The server always assigns jobs to the socket's current sid
  - The socket that receives job:assign is always the one with that sid
  - socket.sio.sid always reflects the current connection

  Changes:

  - Remove _worker_id field from ZnDraw dataclass
  - Simplify sid property to return socket.sio.sid directly
  - Remove worker_id storage in socket_manager._register_extensions_after_join
  - Update tests to use vis.sid instead of vis._worker_id

  Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

* refactor: remove unused workerId return from registration methods

  The register_extension() and register_filesystem() methods in api_manager.py were returning workerId from server responses, but callers no longer use this value after removing _worker_id storage.

  Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

* feat: add ASSIGNED job timeout with lazy cleanup

  Jobs stuck in ASSIGNED state for more than 30 seconds are now automatically failed during job listing. This handles cases where a worker disconnects before confirming the job.

  Changes:

  - Add cleanup_stale_assigned_jobs() to JobManager
  - Call cleanup lazily during list_active_jobs()
  - Add error and workerId fields to job API response
  - Convert Job class to dataclass with proper type hints
  - Add tests for timeout behavior

  Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

* fix: use assigned_at timestamp for job timeout instead of created_at

  - Add assigned_at timestamp when job transitions to ASSIGNED state
  - Update cleanup_stale_assigned_jobs() to use assigned_at (with fallback to created_at for backwards compatibility)
  - Update test to verify assigned_at is used for timeout calculation

  This fixes the issue where a job that waited in PENDING state would incorrectly time out immediately upon assignment.

  Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

* fix

* auto-refresh / fix bug

---------

Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
1 parent fe5b9c9 · commit ca26005
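The reconnection race described in the first commit above can be shown with a toy example. This is not zndraw code: FakeSocket and the sid values are invented to mirror the scenario in the message, where a sid cached at registration time goes stale after a reconnect while the live socket's sid always matches the server's view.

# Toy illustration of the reconnection race described above (not zndraw code).
class FakeSocket:
    """Stand-in for a socket.io client that receives a new sid on every connection."""

    def __init__(self) -> None:
        self.sid = "sid-OLD"

    def reconnect(self) -> None:
        self.sid = "sid-NEW"  # the server now knows this client by the new sid


socket = FakeSocket()
cached_worker_id = socket.sid  # what the old client stored during registration

socket.reconnect()             # job:assign now arrives for "sid-NEW"

print(cached_worker_id)        # sid-OLD -> sending this worker_id yields 400 BAD REQUEST
print(socket.sid)              # sid-NEW -> always reflects the current connection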

File tree

7 files changed: +250 −64 lines

app/src/hooks/useSchemas.ts (16 additions, 0 deletions)

@@ -214,6 +214,22 @@ export const useJobs = (room: string) => {
     };
   }, [room, refetch]);

+  // Poll every 5 seconds when there are jobs in "assigned" state
+  // This triggers lazy cleanup on the backend for timed-out jobs
+  useEffect(() => {
+    const hasAssignedJobs = jobs.some((job) => job.status === "assigned");
+
+    if (!hasAssignedJobs || !room) {
+      return;
+    }
+
+    const intervalId = setInterval(() => {
+      refetch(false); // Silent refetch, no loading spinner
+    }, 5000);
+
+    return () => clearInterval(intervalId);
+  }, [jobs, room, refetch]);
+
   return {
     data: jobs,
     isLoading: isLoading && !hasLoaded, // Only show loading if we haven't loaded yet

src/zndraw/api_manager.py (0 additions, 6 deletions)

@@ -374,9 +374,6 @@ def register_extension(
                 f"Extension registration failed: {data.get('error', 'Unknown error')}"
             )

-        # Return the worker_id assigned by server so caller can store it
-        return data.get("workerId")
-
     def register_filesystem(
         self,
         name: str,
@@ -431,9 +428,6 @@ def register_filesystem(
                 f"Filesystem registration failed: {data.get('error', 'Unknown error')}"
             )

-        # Return the worker_id assigned by server so caller can store it
-        return data.get("workerId")
-
     def get_frames(
         self, indices_or_slice, keys: list[str] | None = None
     ) -> list[dict[bytes, bytes]]:

src/zndraw/app/job_manager.py (106 additions, 11 deletions)

@@ -15,6 +15,10 @@

 log = logging.getLogger(__name__)

+# Jobs in ASSIGNED state should transition to PROCESSING within seconds.
+# If they stay in ASSIGNED longer than this, they're considered stale (worker died/disconnected).
+ASSIGNED_TIMEOUT_SECONDS = 30
+

 def _emit_job_state_changed(
     socketio,
@@ -249,6 +253,7 @@ def assign_job(
         update_data = {
             "status": JobStatus.ASSIGNED,
             "worker_id": worker_id,
+            "assigned_at": utc_now_iso(),
         }

         redis_client.hset(job_keys.hash_key(), mapping=update_data)
@@ -440,6 +445,73 @@ def fail_job(redis_client: Any, job_id: str, error: str, socketio=None) -> bool:

         return True

+    @staticmethod
+    def cleanup_stale_assigned_jobs(
+        redis_client: Any, room: str, socketio=None
+    ) -> list[str]:
+        """Fail jobs stuck in ASSIGNED state for too long.
+
+        Jobs in ASSIGNED state should transition to PROCESSING within seconds.
+        If they stay in ASSIGNED longer than ASSIGNED_TIMEOUT_SECONDS, the
+        worker likely died or disconnected before confirming the job.
+
+        Parameters
+        ----------
+        redis_client
+            Redis client instance
+        room : str
+            Room identifier
+        socketio
+            SocketIO instance for emitting events (optional)
+
+        Returns
+        -------
+        list[str]
+            List of job IDs that were failed due to timeout
+        """
+        room_keys = RoomKeys(room)
+        # Convert to list to avoid "Set changed size during iteration" error
+        # since fail_job() removes jobs from the active set
+        job_ids = list(redis_client.smembers(room_keys.jobs_active()))
+        failed_jobs = []
+        now = utc_now_timestamp()
+
+        for job_id in job_ids:
+            job_keys = JobKeys(job_id)
+            job_data = redis_client.hgetall(job_keys.hash_key())
+
+            if not job_data:
+                continue
+
+            # Only check ASSIGNED jobs
+            if job_data.get("status") != JobStatus.ASSIGNED:
+                continue
+
+            # Check if job has been in ASSIGNED state too long
+            # Use assigned_at if available, fallback to created_at for backwards compatibility
+            assigned_at = job_data.get("assigned_at") or job_data.get("created_at")
+            if not assigned_at:
+                continue
+
+            try:
+                assigned_timestamp = isoparse(assigned_at).timestamp()
+                age_seconds = now - assigned_timestamp
+
+                if age_seconds > ASSIGNED_TIMEOUT_SECONDS:
+                    # Fail the job due to timeout
+                    error_msg = (
+                        f"Job timed out in ASSIGNED state after {age_seconds:.1f}s "
+                        f"(threshold: {ASSIGNED_TIMEOUT_SECONDS}s). "
+                        "Worker likely disconnected before processing."
+                    )
+                    JobManager.fail_job(redis_client, job_id, error_msg, socketio)
+                    failed_jobs.append(job_id)
+                    log.warning(f"Cleaned up stale ASSIGNED job {job_id}: {error_msg}")
+            except Exception as e:
+                log.error(f"Error checking job {job_id} for timeout: {e}")
+
+        return failed_jobs
+
     @staticmethod
     def get_job(redis_client: Any, job_id: str) -> Optional[dict]:
         """Get job details.
@@ -464,16 +536,28 @@ def get_job(redis_client: Any, job_id: str) -> Optional[dict]:
         return job_data

     @staticmethod
-    def list_active_jobs(redis_client: Any, room: str) -> list[dict]:
+    def list_active_jobs(redis_client: Any, room: str, socketio=None) -> list[dict]:
         """List all active (queued or running) jobs for a room.

-        Args:
-            redis_client: Redis client instance
-            room: Room identifier
+        Performs lazy cleanup of stale ASSIGNED jobs before returning results.

-        Returns:
+        Parameters
+        ----------
+        redis_client
+            Redis client instance
+        room : str
+            Room identifier
+        socketio
+            SocketIO instance for emitting events during cleanup (optional)
+
+        Returns
+        -------
+        list[dict]
            List of job data dicts
        """
+        # Lazy cleanup: fail any jobs stuck in ASSIGNED state
+        JobManager.cleanup_stale_assigned_jobs(redis_client, room, socketio)
+
         room_keys = RoomKeys(room)
         job_ids = redis_client.smembers(room_keys.jobs_active())

@@ -536,15 +620,26 @@ def list_inactive_jobs(redis_client: Any, room: str) -> list[dict]:
         return jobs

     @staticmethod
-    def list_all_jobs(redis_client: Any, room: str) -> list[dict]:
+    def list_all_jobs(redis_client: Any, room: str, socketio=None) -> list[dict]:
         """List all jobs for a room.
-        Args:
-            redis_client: Redis client instance
-            room: Room identifier
-        Returns:
+
+        Performs lazy cleanup of stale ASSIGNED jobs via list_active_jobs.
+
+        Parameters
+        ----------
+        redis_client
+            Redis client instance
+        room : str
+            Room identifier
+        socketio
+            SocketIO instance for emitting events during cleanup (optional)
+
+        Returns
+        -------
+        list[dict]
            List of job data dicts
        """
-        active_jobs = JobManager.list_active_jobs(redis_client, room)
+        active_jobs = JobManager.list_active_jobs(redis_client, room, socketio)
         inactive_jobs = JobManager.list_inactive_jobs(redis_client, room)
         return active_jobs + inactive_jobs
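The core of the timeout decision is just an age check on the assigned_at ISO timestamp, using the same isoparse call as in the diff above. A minimal standalone sketch (the is_stale helper name and the example timestamps are illustrative, not part of the codebase):

from datetime import datetime, timedelta, timezone

from dateutil.parser import isoparse

ASSIGNED_TIMEOUT_SECONDS = 30  # same threshold as in job_manager.py


def is_stale(assigned_at_iso: str, now_ts: float, threshold: float = ASSIGNED_TIMEOUT_SECONDS) -> bool:
    """Return True if a job assigned at `assigned_at_iso` has exceeded the ASSIGNED timeout."""
    age_seconds = now_ts - isoparse(assigned_at_iso).timestamp()
    return age_seconds > threshold


now = datetime.now(timezone.utc)
print(is_stale((now - timedelta(seconds=45)).isoformat(), now.timestamp()))  # True: would be cleaned up
print(is_stale((now - timedelta(seconds=5)).isoformat(), now.timestamp()))   # False: left alone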

src/zndraw/app/job_routes.py (8 additions, 3 deletions)

@@ -120,10 +120,13 @@ def _transition_worker_to_idle(

 @jobs.route("/api/rooms/<string:room_id>/jobs", methods=["GET"])
 def list_jobs(room_id: str):
-    """List active jobs for a room."""
+    """List active jobs for a room.
+
+    Performs lazy cleanup of stale ASSIGNED jobs.
+    """
     redis_client = current_app.extensions["redis"]
-    jobs = JobManager.list_all_jobs(redis_client, room_id)
-    return jobs, 200
+    jobs_list = JobManager.list_all_jobs(redis_client, room_id, socketio)
+    return jobs_list, 200


 @jobs.route("/api/rooms/<string:room_id>/jobs/<string:job_id>", methods=["GET"])
@@ -191,6 +194,8 @@ def get_job_details(job_id: str):
         "public": job.get("public") == "true",
         "status": job["status"],
         "createdAt": job.get("created_at"),
+        "error": job.get("error") or None,
+        "workerId": job.get("worker_id") or None,
     }

     log.debug(f"Worker fetching job {job_id}: {job['category']}/{job['extension']}")

src/zndraw/job.py (41 additions, 30 deletions)

@@ -1,19 +1,19 @@
 """Job object for tracking job progress."""

 import time
+from dataclasses import dataclass, field
 from typing import Any

 from zndraw.app.job_manager import JobStatus


+@dataclass
 class Job:
     """Represents a submitted job and allows tracking its progress.

     This object is returned by vis.run() and provides methods to monitor
     job execution, wait for completion, and retrieve results.

-    In Jupyter notebooks, displays an iframe showing live progress.
-
     Parameters
     ----------
     job_id : str
@@ -22,23 +22,23 @@ class Job:
         Server URL
     room : str
         Room ID
-    api : APIManager
+    api : Any
         API manager instance for making requests
+    socket : Any
+        Socket manager instance (optional)

     Examples
     --------
     >>> job = vis.run(MyExtension(param=42))
     >>> job.wait()  # Block until completion
-    >>> result = job.get_result()
     """

-    def __init__(self, job_id: str, url: str, room: str, api: Any, socket: Any = None):
-        self.job_id = job_id
-        self.url = url
-        self.room = room
-        self.api = api
-        self.socket = socket
-        self._cached_data: dict[str, Any] | None = None
+    job_id: str
+    url: str
+    room: str
+    api: Any
+    socket: Any = None
+    _cached_data: dict[str, Any] = field(default_factory=dict, repr=False)

     def refresh(self) -> dict[str, Any]:
         """Fetch latest job status from server.
@@ -51,6 +51,12 @@ def refresh(self) -> dict[str, Any]:
         self._cached_data = self.api.get_job(self.job_id)
         return self._cached_data

+    def _ensure_cached(self) -> dict[str, Any]:
+        """Ensure cached data is loaded, fetching if necessary."""
+        if not self._cached_data:
+            self.refresh()
+        return self._cached_data
+
     @property
     def status(self) -> str:
         """Get current job status.
@@ -60,9 +66,7 @@ def status(self) -> str:
         str
             One of: pending, assigned, processing, completed, failed
         """
-        if self._cached_data is None:
-            self.refresh()
-        return self._cached_data.get("status", "unknown")
+        return self._ensure_cached().get("status", "unknown")

     def is_pending(self) -> bool:
         """Check if job is pending (waiting for worker)."""
@@ -88,6 +92,28 @@ def is_done(self) -> bool:
         """Check if job is in a terminal state (completed or failed)."""
         return self.is_completed() or self.is_failed()

+    @property
+    def error(self) -> str | None:
+        """Get error message if job failed.
+
+        Returns
+        -------
+        str | None
+            Error message if failed, None otherwise
+        """
+        return self._ensure_cached().get("error") or None
+
+    @property
+    def worker_id(self) -> str | None:
+        """Get the worker ID assigned to this job.
+
+        Returns
+        -------
+        str | None
+            Worker session ID if assigned, None otherwise
+        """
+        return self._ensure_cached().get("workerId") or None
+
     def wait(self, timeout: float | None = None, poll_interval: float = 0.5) -> None:
         """Block until job completes or fails.

@@ -120,19 +146,4 @@ def wait(self, timeout: float | None = None, poll_interval: float = 0.5) -> None

     def __repr__(self) -> str:
         """Terminal-friendly representation."""
-        status = self.status
-        return f"Job(id={self.job_id}, status={status})"
-
-    # def _repr_html_(self) -> str:
-    #     """Jupyter notebook representation using iframe.
-
-    #     Displays live progress by embedding the server's job page.
-    #     """
-    #     try:
-    #         from IPython.display import IFrame
-    #     except ImportError:
-    #         raise ImportError(
-    #             "IPython is required for viewer display. Install with: uv add / pip install ipython"
-    #         )
-    #     iframe_url = f"{self.url}/job/{self.job_id}"
-    #     return IFrame(src=iframe_url, width="100%", height=600)._repr_html_()
+        return f"Job(id={self.job_id}, status={self.status})"

src/zndraw/zndraw.py (4 additions, 5 deletions)

@@ -689,16 +689,15 @@ def load_selection_group(self, group_name: str) -> None:
     def sid(self) -> str | None:
         """Return the worker ID assigned by the server.

-        The server assigns a worker ID (its request.sid) during extension registration.
-        This ID is used consistently for both registration and disconnect cleanup.
+        The server assigns a worker ID (its request.sid) during room:join.
+        This ID is used for job assignment and worker tracking.

         Returns
         -------
         str | None
-            The worker ID assigned by server, client's socket.sio.sid if not yet registered,
-            or None if not connected.
+            The server-assigned worker ID, or None if not yet connected.
         """
-        return self._worker_id if self._worker_id else self.socket.sio.sid
+        return self._worker_id

     @property
     def is_admin(self) -> bool:
