- commit
- 11a12cc
- parent
- 690487b
- author
- Ian Tay
- date
- 2026-03-08 12:55:02 -0400 EDT
fix: only clean up socket on ConnectionRefused, not Timeout

A 1-second probe timeout doesn't mean the daemon is dead — it may just be busy (heavy output, terminal resize reflow, serializing state for another client). Deleting a live daemon's socket orphans it permanently with no way to reach it via zmx commands.

Applied to all callsites: get_session_entries, ensureSession (worst case: spawns a replacement daemon leaving the old one orphaned), kill, detachAll, history. For kill, the user now gets a helpful message if the daemon is busy vs. actually dead.

The `zmx list` status label for error entries now says `status=unreachable` (not `cleaning up`) on Timeout, so the display doesn't contradict what we actually did.

`zmx wait` now treats is_error entries as done+failed: on Timeout the socket persists, so without this the session would sit at task_ended_at==0 forever, defeating both the completion check and the zero-match timeout.
2 files changed,
+46,
-9
+31,
-7
1@@ -360,9 +360,18 @@ const Daemon = struct {
2 if (self.command != null) {
3 std.log.warn("session already exists, ignoring command session={s}", .{self.session_name});
4 }
5- } else |_| {
6- socket.cleanupStaleSocket(dir, self.session_name);
7- should_create = true;
8+ } else |err| switch (err) {
9+ // Daemon is definitively gone: safe to replace.
10+ error.ConnectionRefused => {
11+ socket.cleanupStaleSocket(dir, self.session_name);
12+ should_create = true;
13+ },
14+ // Probe didn't respond in time -- daemon may just be busy.
15+ // The probe is only to decide create-vs-attach; the session
16+ // exists, so proceed to attach rather than fail or orphan.
17+ else => {
18+ std.log.warn("probe slow ({s}), proceeding to attach session={s}", .{ @errorName(err), self.session_name });
19+ },
20 }
21 }
22
23@@ -710,6 +719,17 @@ fn wait(cfg: *Cfg, session_names: std.ArrayList([]const u8)) !void {
24 }
25
26 total += 1;
27+ if (session.is_error) {
28+ // Daemon unreachable (probe timed out). On Timeout the socket
29+ // is no longer deleted, so this session would otherwise
30+ // persist as task_ended_at==0 forever → infinite "still
31+ // waiting". Count it as done+failed so wait terminates.
32+ try stdout.print("task unreachable: {s} ({s})\n", .{ session.name, session.error_name orelse "unknown" });
33+ try stdout.flush();
34+ agg_exit_code = 1;
35+ done += 1;
36+ continue;
37+ }
38 if (session.task_ended_at == 0) {
39 try stdout.print("still waiting task={s}\n", .{session.name});
40 try stdout.flush();
41@@ -818,7 +838,7 @@ fn detachAll(cfg: *Cfg) !void {
42 defer alloc.free(socket_path);
43 const result = ipc.probeSession(alloc, socket_path) catch |err| {
44 std.log.err("session unresponsive: {s}", .{@errorName(err)});
45- socket.cleanupStaleSocket(dir, session_name);
46+ if (err == error.ConnectionRefused) socket.cleanupStaleSocket(dir, session_name);
47 return;
48 };
49 defer posix.close(result.fd);
50@@ -846,10 +866,14 @@ fn kill(cfg: *Cfg, session_name: []const u8) !void {
51 defer alloc.free(socket_path);
52 const result = ipc.probeSession(alloc, socket_path) catch |err| {
53 std.log.err("session unresponsive: {s}", .{@errorName(err)});
54- socket.cleanupStaleSocket(dir, session_name);
55 var buf: [4096]u8 = undefined;
56 var w = std.fs.File.stdout().writer(&buf);
57- w.interface.print("cleaned up stale session {s}\n", .{session_name}) catch {};
58+ if (err == error.ConnectionRefused) {
59+ socket.cleanupStaleSocket(dir, session_name);
60+ w.interface.print("cleaned up stale session {s}\n", .{session_name}) catch {};
61+ } else {
62+ w.interface.print("session {s} is unresponsive ({s}) -- daemon may be busy, try again or kill the process directly\n", .{ session_name, @errorName(err) }) catch {};
63+ }
64 w.interface.flush() catch {};
65 return;
66 };
67@@ -883,7 +907,7 @@ fn history(cfg: *Cfg, session_name: []const u8, format: util.HistoryFormat) !voi
68 defer alloc.free(socket_path);
69 const result = ipc.probeSession(alloc, socket_path) catch |err| {
70 std.log.err("session unresponsive: {s}", .{@errorName(err)});
71- socket.cleanupStaleSocket(dir, session_name);
72+ if (err == error.ConnectionRefused) socket.cleanupStaleSocket(dir, session_name);
73 return;
74 };
75 defer posix.close(result.fd);
+15,
-2
1@@ -54,7 +54,12 @@ pub fn get_session_entries(alloc: std.mem.Allocator, socket_dir: []const u8) !st
2 .task_exit_code = 1,
3 .task_ended_at = 0,
4 });
5- socket.cleanupStaleSocket(dir, entry.name);
6+ // Only clean up when the daemon is definitively gone. A busy
7+ // daemon can miss the probe timeout; deleting its socket
8+ // orphans it permanently.
9+ if (err == error.ConnectionRefused) {
10+ socket.cleanupStaleSocket(dir, entry.name);
11+ }
12 continue;
13 };
14 posix.close(result.fd);
15@@ -292,10 +297,18 @@ pub fn writeSessionLine(writer: *std.Io.Writer, session: SessionEntry, short: bo
16 }
17
18 if (session.is_error) {
19- try writer.print("{s}name={s}\terr={s}\tstatus=cleaning up\n", .{
20+ // "cleaning up" is only truthful when the probe was definitively
21+ // refused (socket deleted this pass). On Timeout/Unexpected the
22+ // daemon may just be busy, so don't lie about what we did.
23+ const status = if (std.mem.eql(u8, session.error_name.?, "ConnectionRefused"))
24+ "cleaning up"
25+ else
26+ "unreachable";
27+ try writer.print("{s}name={s}\terr={s}\tstatus={s}\n", .{
28 prefix,
29 session.name,
30 session.error_name.?,
31+ status,
32 });
33 return;
34 }