repos / zmx

session persistence for terminal processes
git clone https://github.com/neurosnap/zmx.git

commit
11a12cc
parent
690487b
author
Ian Tay
date
2026-03-08 12:55:02 -0400 EDT
fix: only clean up socket on ConnectionRefused, not Timeout

A 1-second probe timeout doesn't mean the daemon is dead — it may just be
busy (heavy output, terminal resize reflow, serializing state for another
client). Deleting a live daemon's socket orphans it permanently with no
way to reach it via zmx commands.

This is applied at every call site that probes a session:
get_session_entries, ensureSession (previously the worst case: it spawned a
replacement daemon and left the old one orphaned), kill, detachAll, and
history. For kill, the user now gets a message that distinguishes a busy
daemon from a dead one. The `zmx list` status label for error entries now
says `status=unreachable` (not `cleaning up`) for any error other than
ConnectionRefused, such as Timeout, so the display doesn't contradict what
we actually did.

`zmx wait` now treats is_error entries as done and failed (it prints
`task unreachable`, counts the session as done, and sets the aggregate exit
code to 1): on Timeout the socket persists, so without this the session
would sit at task_ended_at==0 forever, defeating both the completion check
and the zero-match timeout.
2 files changed,  +46, -9
M src/main.zig
+31, -7
 1@@ -360,9 +360,18 @@ const Daemon = struct {
 2                 if (self.command != null) {
 3                     std.log.warn("session already exists, ignoring command session={s}", .{self.session_name});
 4                 }
 5-            } else |_| {
 6-                socket.cleanupStaleSocket(dir, self.session_name);
 7-                should_create = true;
 8+            } else |err| switch (err) {
 9+                // Daemon is definitively gone: safe to replace.
10+                error.ConnectionRefused => {
11+                    socket.cleanupStaleSocket(dir, self.session_name);
12+                    should_create = true;
13+                },
14+                // Probe didn't respond in time -- daemon may just be busy.
15+                // The probe is only to decide create-vs-attach; the session
16+                // exists, so proceed to attach rather than fail or orphan.
17+                else => {
18+                    std.log.warn("probe slow ({s}), proceeding to attach session={s}", .{ @errorName(err), self.session_name });
19+                },
20             }
21         }
22 
23@@ -710,6 +719,17 @@ fn wait(cfg: *Cfg, session_names: std.ArrayList([]const u8)) !void {
24             }
25 
26             total += 1;
27+            if (session.is_error) {
28+                // Daemon unreachable (probe timed out). On Timeout the socket
29+                // is no longer deleted, so this session would otherwise
30+                // persist as task_ended_at==0 forever → infinite "still
31+                // waiting". Count it as done+failed so wait terminates.
32+                try stdout.print("task unreachable: {s} ({s})\n", .{ session.name, session.error_name orelse "unknown" });
33+                try stdout.flush();
34+                agg_exit_code = 1;
35+                done += 1;
36+                continue;
37+            }
38             if (session.task_ended_at == 0) {
39                 try stdout.print("still waiting task={s}\n", .{session.name});
40                 try stdout.flush();
41@@ -818,7 +838,7 @@ fn detachAll(cfg: *Cfg) !void {
42     defer alloc.free(socket_path);
43     const result = ipc.probeSession(alloc, socket_path) catch |err| {
44         std.log.err("session unresponsive: {s}", .{@errorName(err)});
45-        socket.cleanupStaleSocket(dir, session_name);
46+        if (err == error.ConnectionRefused) socket.cleanupStaleSocket(dir, session_name);
47         return;
48     };
49     defer posix.close(result.fd);
50@@ -846,10 +866,14 @@ fn kill(cfg: *Cfg, session_name: []const u8) !void {
51     defer alloc.free(socket_path);
52     const result = ipc.probeSession(alloc, socket_path) catch |err| {
53         std.log.err("session unresponsive: {s}", .{@errorName(err)});
54-        socket.cleanupStaleSocket(dir, session_name);
55         var buf: [4096]u8 = undefined;
56         var w = std.fs.File.stdout().writer(&buf);
57-        w.interface.print("cleaned up stale session {s}\n", .{session_name}) catch {};
58+        if (err == error.ConnectionRefused) {
59+            socket.cleanupStaleSocket(dir, session_name);
60+            w.interface.print("cleaned up stale session {s}\n", .{session_name}) catch {};
61+        } else {
62+            w.interface.print("session {s} is unresponsive ({s}) -- daemon may be busy, try again or kill the process directly\n", .{ session_name, @errorName(err) }) catch {};
63+        }
64         w.interface.flush() catch {};
65         return;
66     };
67@@ -883,7 +907,7 @@ fn history(cfg: *Cfg, session_name: []const u8, format: util.HistoryFormat) !voi
68     defer alloc.free(socket_path);
69     const result = ipc.probeSession(alloc, socket_path) catch |err| {
70         std.log.err("session unresponsive: {s}", .{@errorName(err)});
71-        socket.cleanupStaleSocket(dir, session_name);
72+        if (err == error.ConnectionRefused) socket.cleanupStaleSocket(dir, session_name);
73         return;
74     };
75     defer posix.close(result.fd);
M src/util.zig
+15, -2
 1@@ -54,7 +54,12 @@ pub fn get_session_entries(alloc: std.mem.Allocator, socket_dir: []const u8) !st
 2                     .task_exit_code = 1,
 3                     .task_ended_at = 0,
 4                 });
 5-                socket.cleanupStaleSocket(dir, entry.name);
 6+                // Only clean up when the daemon is definitively gone. A busy
 7+                // daemon can miss the probe timeout; deleting its socket
 8+                // orphans it permanently.
 9+                if (err == error.ConnectionRefused) {
10+                    socket.cleanupStaleSocket(dir, entry.name);
11+                }
12                 continue;
13             };
14             posix.close(result.fd);
15@@ -292,10 +297,18 @@ pub fn writeSessionLine(writer: *std.Io.Writer, session: SessionEntry, short: bo
16     }
17 
18     if (session.is_error) {
19-        try writer.print("{s}name={s}\terr={s}\tstatus=cleaning up\n", .{
20+        // "cleaning up" is only truthful when the probe was definitively
21+        // refused (socket deleted this pass). On Timeout/Unexpected the
22+        // daemon may just be busy, so don't lie about what we did.
23+        const status = if (std.mem.eql(u8, session.error_name.?, "ConnectionRefused"))
24+            "cleaning up"
25+        else
26+            "unreachable";
27+        try writer.print("{s}name={s}\terr={s}\tstatus={s}\n", .{
28             prefix,
29             session.name,
30             session.error_name.?,
31+            status,
32         });
33         return;
34     }