Skip to content

Commit e21dca0

Browse files
wesmclaude
andauthored
Robust daemon lifecycle management and safety improvements (#118)
## Summary This PR comprehensively improves daemon lifecycle management, addressing multiple issues around daemon startup, shutdown, and process safety. ### Core Changes - **Fix update command creating multiple daemons** - Use `stopDaemon()` and `killAllDaemons()` instead of HTTP POST to ensure old daemon is properly killed before starting new one - **Highlander mode** - Prevent multiple daemons from running simultaneously by detecting and cleaning up existing instances at startup - **Per-PID runtime files** - Use `daemon.$PID.json` pattern instead of single `daemon.json` to properly track multiple daemon instances and enable reliable cleanup ### Safety Improvements - **PID reuse protection** - Tri-state process identity checking (`processIsRoborev`, `processNotRoborev`, `processUnknown`) prevents killing unrelated processes if a PID is reused - **Conservative unknown handling** - When process identity can't be determined (permissions, missing tools), keep runtime file to avoid orphaning running daemons - **Loopback validation** - Strict address validation prevents SSRF via malicious runtime files; only allows HTTP calls to 127.x.x.x, ::1, and localhost - **Invalid runtime cleanup** - Remove runtime files with invalid data (pid <= 0, empty addr) during listing ### Reliability Improvements - **IsDaemonAlive retry logic** - 2 attempts with 1s timeout to avoid false negatives on slow or transiently failing daemons - **Windows locale-independent detection** - Use `tasklist /FO CSV` and `wmic` for process detection instead of parsing localized strings - **PowerShell fallback** - Use `Get-CimInstance` on Windows systems without `wmic` (common on newer Win11) - **UTF-16 BOM handling** - Properly handle UTF-16LE/BE encoded output from Windows tools ### TUI Improvements - **Daemon reconnection** - TUI sessions now automatically reconnect if daemon restarts on a different port. After 3 consecutive connection failures, the TUI re-reads runtime files to find the new daemon address. ## Test plan - [x] Run `roborev update --force` and verify only one daemon process running - [x] All daemon tests pass on macOS - [x] TUI reconnection tests pass - [ ] CI passes on Linux and Windows - [ ] `pgrep -fl "roborev daemon"` shows only one process after update 🤖 Generated with [Claude Code](https://claude.com/claude-code) --------- Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
1 parent 6895ad5 commit e21dca0

File tree

12 files changed

+1828
-86
lines changed

12 files changed

+1828
-86
lines changed

cmd/roborev/main.go

Lines changed: 29 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@ import (
55
"bytes"
66
"context"
77
"encoding/json"
8-
"errors"
98
"fmt"
109
"io"
1110
"log"
@@ -82,7 +81,7 @@ func main() {
8281

8382
// getDaemonAddr returns the daemon address from runtime file or default
8483
func getDaemonAddr() string {
85-
if info, err := daemon.ReadRuntime(); err == nil {
84+
if info, err := daemon.GetAnyRunningDaemon(); err == nil {
8685
return fmt.Sprintf("http://%s", info.Addr)
8786
}
8887
return serverAddr
@@ -93,8 +92,8 @@ func getDaemonAddr() string {
9392
func ensureDaemon() error {
9493
client := &http.Client{Timeout: 500 * time.Millisecond}
9594

96-
// First check runtime file for daemon address
97-
if info, err := daemon.ReadRuntime(); err == nil {
95+
// First check runtime files for any running daemon
96+
if info, err := daemon.GetAnyRunningDaemon(); err == nil {
9897
addr := fmt.Sprintf("http://%s/api/status", info.Addr)
9998
resp, err := client.Get(addr)
10099
if err == nil {
@@ -165,7 +164,7 @@ func startDaemon() error {
165164
client := &http.Client{Timeout: 500 * time.Millisecond}
166165
for i := 0; i < 30; i++ {
167166
time.Sleep(100 * time.Millisecond)
168-
if info, err := daemon.ReadRuntime(); err == nil {
167+
if info, err := daemon.GetAnyRunningDaemon(); err == nil {
169168
addr := fmt.Sprintf("http://%s", info.Addr)
170169
resp, err := client.Get(addr + "/api/status")
171170
if err == nil {
@@ -182,59 +181,41 @@ func startDaemon() error {
182181
// ErrDaemonNotRunning indicates no daemon runtime file was found
183182
var ErrDaemonNotRunning = fmt.Errorf("daemon not running (no runtime file found)")
184183

185-
// stopDaemon stops the running daemon using PID from daemon.json.
186-
// Returns ErrDaemonNotRunning if the runtime file is missing.
187-
// For corrupted/malformed files (JSON parse errors), removes them and returns ErrDaemonNotRunning.
188-
// Propagates permission/IO errors to the caller.
184+
// stopDaemon stops any running daemons.
185+
// Returns ErrDaemonNotRunning if no daemon runtime files are found.
189186
func stopDaemon() error {
190-
info, err := daemon.ReadRuntime()
187+
runtimes, err := daemon.ListAllRuntimes()
191188
if err != nil {
189+
// Check if it's just a "not exist" type error
192190
if os.IsNotExist(err) {
193191
return ErrDaemonNotRunning
194192
}
195-
// Check if it's a JSON parse error (corrupted file)
196-
// This includes SyntaxError, UnmarshalTypeError, and UnexpectedEOF (truncated files)
197-
var syntaxErr *json.SyntaxError
198-
var typeErr *json.UnmarshalTypeError
199-
if errors.As(err, &syntaxErr) || errors.As(err, &typeErr) || errors.Is(err, io.ErrUnexpectedEOF) {
200-
// Corrupted file - remove it and treat as not running
201-
daemon.RemoveRuntime()
202-
return ErrDaemonNotRunning
203-
}
204-
// Permission/IO error - propagate to caller
205-
return fmt.Errorf("failed to read daemon runtime file: %w", err)
193+
// Propagate other errors (permission, IO, etc.)
194+
return fmt.Errorf("failed to list daemon runtimes: %w", err)
206195
}
207-
if info.PID <= 0 {
208-
daemon.RemoveRuntime() // Clean up invalid runtime file
196+
if len(runtimes) == 0 {
209197
return ErrDaemonNotRunning
210198
}
211199

212-
// Kill by specific PID (works regardless of process name)
213-
if runtime.GOOS == "windows" {
214-
exec.Command("taskkill", "/PID", fmt.Sprintf("%d", info.PID), "/F").Run()
215-
} else {
216-
// Send SIGTERM first for graceful shutdown
217-
exec.Command("kill", "-TERM", fmt.Sprintf("%d", info.PID)).Run()
218-
time.Sleep(500 * time.Millisecond)
219-
// Then SIGKILL to ensure it's dead
220-
exec.Command("kill", "-KILL", fmt.Sprintf("%d", info.PID)).Run()
221-
}
222-
// Clean up runtime file
223-
daemon.RemoveRuntime()
224-
time.Sleep(500 * time.Millisecond)
225-
return nil
200+
// Kill all found daemons, track failures
201+
var lastErr error
202+
for _, info := range runtimes {
203+
if !daemon.KillDaemon(info) {
204+
lastErr = fmt.Errorf("failed to kill daemon (pid %d)", info.PID)
205+
}
206+
}
207+
208+
return lastErr
226209
}
227210

228211
// killAllDaemons kills any roborev daemon processes that might be running
229212
// This handles orphaned processes from old binaries or crashed restarts
230213
func killAllDaemons() {
231214
if runtime.GOOS == "windows" {
232-
// On Windows, use wmic to kill roborev.exe processes except our own PID
233-
// taskkill /IM would kill the CLI itself, so we filter by PID
234-
myPID := os.Getpid()
235-
// Use wmic to find and kill daemon processes, excluding our PID
215+
// On Windows, use wmic to find daemon processes by command line
216+
// and kill only those running "daemon run"
236217
exec.Command("wmic", "process", "where",
237-
fmt.Sprintf("name='roborev.exe' and processid!=%d", myPID),
218+
"commandline like '%roborev%daemon%run%'",
238219
"call", "terminate").Run()
239220
} else {
240221
// On Unix, use pkill to kill all roborev daemon processes
@@ -2038,18 +2019,13 @@ official release over a dev build.`,
20382019
}
20392020
}
20402021

2041-
// Restart daemon if running
2042-
if daemonInfo, err := daemon.ReadRuntime(); err == nil && daemonInfo != nil {
2022+
// Restart daemon if any are running
2023+
if runtimes, err := daemon.ListAllRuntimes(); err == nil && len(runtimes) > 0 {
20432024
fmt.Print("Restarting daemon... ")
2044-
// Stop old daemon with timeout
2045-
stopURL := fmt.Sprintf("http://%s/api/shutdown", daemonInfo.Addr)
2046-
client := &http.Client{Timeout: 5 * time.Second}
2047-
if resp, err := client.Post(stopURL, "application/json", nil); err != nil {
2048-
fmt.Printf("warning: failed to stop daemon: %v\n", err)
2049-
} else {
2050-
resp.Body.Close()
2051-
}
2052-
time.Sleep(500 * time.Millisecond)
2025+
// Stop all running daemons
2026+
_ = stopDaemon()
2027+
// Kill any orphaned daemon processes
2028+
killAllDaemons()
20532029

20542030
// Start new daemon using "roborev daemon run"
20552031
newBinary := filepath.Join(binDir, "roborev")

cmd/roborev/main_test.go

Lines changed: 32 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,8 @@ import (
88
"bytes"
99
"context"
1010
"encoding/json"
11-
"errors"
1211
"fmt"
1312
"io"
14-
"io/fs"
1513
"net/http"
1614
"net/http/httptest"
1715
"os"
@@ -39,7 +37,22 @@ import (
3937
func setupMockDaemon(t *testing.T, handler http.Handler) (*httptest.Server, func()) {
4038
t.Helper()
4139

42-
ts := httptest.NewServer(handler)
40+
// Wrap handler to handle /api/status requests.
41+
// IsDaemonAlive calls /api/status to verify daemon is alive.
42+
// ensureDaemon also calls /api/status to check the daemon version.
43+
// We need to return a proper response with version info to prevent
44+
// ensureDaemon from trying to restart the daemon (which would kill the test).
45+
wrappedHandler := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
46+
if r.URL.Path == "/api/status" && r.Method == http.MethodGet {
47+
json.NewEncoder(w).Encode(map[string]interface{}{
48+
"version": version.Version,
49+
})
50+
return
51+
}
52+
handler.ServeHTTP(w, r)
53+
})
54+
55+
ts := httptest.NewServer(wrappedHandler)
4356

4457
// Use ROBOREV_DATA_DIR to override data directory (works cross-platform)
4558
// On Windows, HOME doesn't work since os.UserHomeDir() uses USERPROFILE
@@ -1531,16 +1544,20 @@ func TestDaemonStopInvalidPID(t *testing.T) {
15311544
}
15321545
}()
15331546

1534-
// Create daemon.json with PID 0
1535-
daemonInfo := daemon.RuntimeInfo{PID: 0, Addr: "127.0.0.1:7373"}
1547+
// Create daemon.json with PID 0 and an address on a port that's definitely not in use
1548+
// Port 59999 is unlikely to be in use and will get connection refused quickly
1549+
daemonInfo := daemon.RuntimeInfo{PID: 0, Addr: "127.0.0.1:59999"}
15361550
data, _ := json.Marshal(daemonInfo)
15371551
if err := os.WriteFile(filepath.Join(tmpDir, "daemon.json"), data, 0644); err != nil {
15381552
t.Fatalf("write daemon.json: %v", err)
15391553
}
15401554

1555+
// ListAllRuntimes validates files and removes invalid ones (pid <= 0),
1556+
// so it returns an empty list, and stopDaemon returns ErrDaemonNotRunning.
1557+
// The key is that the invalid file gets cleaned up.
15411558
err := stopDaemon()
15421559
if err != ErrDaemonNotRunning {
1543-
t.Errorf("expected ErrDaemonNotRunning for PID 0, got %v", err)
1560+
t.Errorf("expected ErrDaemonNotRunning (invalid file removed during listing), got %v", err)
15441561
}
15451562

15461563
// Verify daemon.json was cleaned up
@@ -1611,8 +1628,11 @@ func TestDaemonStopTruncatedFile(t *testing.T) {
16111628
}
16121629
}
16131630

1614-
// TestDaemonStopPermissionError verifies stopDaemon propagates permission errors
1615-
func TestDaemonStopPermissionError(t *testing.T) {
1631+
// TestDaemonStopUnreadableFileSkipped verifies stopDaemon skips unreadable files
1632+
// With the new per-PID runtime file pattern, ListAllRuntimes continues scanning
1633+
// even when some files are unreadable. This allows daemon discovery to work even
1634+
// if some runtime files are temporarily inaccessible.
1635+
func TestDaemonStopUnreadableFileSkipped(t *testing.T) {
16161636
if os.Geteuid() == 0 {
16171637
t.Skip("skipping permission test when running as root")
16181638
}
@@ -1652,16 +1672,10 @@ func TestDaemonStopPermissionError(t *testing.T) {
16521672
}
16531673

16541674
err := stopDaemon()
1655-
// Should propagate the permission error, not ErrDaemonNotRunning
1656-
if err == ErrDaemonNotRunning {
1657-
t.Error("expected permission error to be propagated, got ErrDaemonNotRunning")
1658-
}
1659-
if err == nil {
1660-
t.Error("expected error for permission denied, got nil")
1661-
}
1662-
// Check that the underlying error is a permission error (using errors.Is to unwrap)
1663-
if !errors.Is(err, fs.ErrPermission) {
1664-
t.Errorf("expected permission error, got: %v", err)
1675+
// With the new behavior, unreadable files are skipped during ListAllRuntimes.
1676+
// Since no readable daemon files exist, stopDaemon returns ErrDaemonNotRunning.
1677+
if err != ErrDaemonNotRunning {
1678+
t.Errorf("expected ErrDaemonNotRunning (unreadable file skipped), got: %v", err)
16651679
}
16661680
}
16671681

0 commit comments

Comments
 (0)