repos / zmx

session persistence for terminal processes
git clone https://github.com/neurosnap/zmx.git

commit
12a19ee
parent
35f56e6
author
Ian Tay
date
2026-03-08 12:54:06 -0400 EDT
fix: use platform-correct O_NONBLOCK for fcntl; fix PTY write and double-close

The hardcoded value 0o4000 is Linux-specific; on macOS O_NONBLOCK is 0x4.
posix.SOCK.NONBLOCK is for socket()/accept4(), not fcntl(F_SETFL) — it only
worked on Linux by coincidence where both constants share the same value.
On macOS, non-blocking mode was never actually set on the PTY, client
socket, or stdin.

Setting O_NONBLOCK correctly exposes two latent issues, also fixed here:

- PTY writes in handleInput/handleRun did `_ = try posix.write()`,
  discarding short-write counts and propagating WouldBlock as an error
  that crashed the daemon. Added ptyWrite(), which retries short writes and,
  on WouldBlock, logs a warning and drops the remainder instead of blocking
  (waiting for POLLOUT could deadlock the single-threaded daemon).

- ensureSession's errdefer + defer both closed server_sock_fd when
  daemonLoop errored, causing EBADF (posix.close treats this as
  unreachable → panic in safe builds). Restructured so spawnPty failure
  is handled by an explicit catch; the defer is the sole owner after.

Additionally, O_NONBLOCK is set on the open file description (shared with
the parent shell), so stdin's original flags must be restored on exit to
avoid leaving the parent shell's stdin in non-blocking mode.
1 file changed, +51, -16
M src/main.zig
+51, -16
  1@@ -33,6 +33,9 @@ fn zmxLogFn(
  2 var sigwinch_received: std.atomic.Value(bool) = std.atomic.Value(bool).init(false);
  3 var sigterm_received: std.atomic.Value(bool) = std.atomic.Value(bool).init(false);
  4 
  5+// https://github.com/ziglang/zig/blob/738d2be9d6b6ef3ff3559130c05159ef53336224/lib/std/posix.zig#L3505
  6+const O_NONBLOCK: usize = 1 << @bitOffsetOf(posix.O, "NONBLOCK");
  7+
  8 pub fn main() !void {
  9     // use c_allocator to avoid "reached unreachable code" panic in DebugAllocator when forking
 10     const alloc = std.heap.c_allocator;
 11@@ -334,7 +337,7 @@ const Daemon = struct {
 12 
 13         // make pty non-blocking
 14         const flags = try posix.fcntl(master_fd, posix.F.GETFL, 0);
 15-        _ = try posix.fcntl(master_fd, posix.F.SETFL, flags | @as(u32, 0o4000));
 16+        _ = try posix.fcntl(master_fd, posix.F.SETFL, flags | O_NONBLOCK);
 17         return master_fd;
 18     }
 19 
 20@@ -372,12 +375,19 @@ const Daemon = struct {
 21                 defer self.alloc.free(session_log_path);
 22                 try log_system.init(self.alloc, session_log_path);
 23 
 24-                errdefer {
 25+                // If spawnPty fails, clean up here. Once it succeeds,
 26+                // the inner block's defer takes ownership of cleanup to
 27+                // avoid double-closing server_sock_fd on daemonLoop error.
 28+                const pty_fd = self.spawnPty() catch |err| {
 29                     posix.close(server_sock_fd);
 30                     dir.deleteFile(self.session_name) catch {};
 31-                }
 32-                const pty_fd = try self.spawnPty();
 33+                    return err;
 34+                };
 35+
 36                 defer {
 37+                    self.handleKill();
 38+                    self.deinit();
 39+                    _ = posix.waitpid(self.pid, 0);
 40                     posix.close(pty_fd);
 41                     posix.close(server_sock_fd);
 42                     std.log.info("deleting socket file session_name={s}", .{self.session_name});
 43@@ -385,10 +395,8 @@ const Daemon = struct {
 44                         std.log.warn("failed to delete socket file err={s}", .{@errorName(err)});
 45                     };
 46                 }
 47+
 48                 try daemonLoop(self, server_sock_fd, pty_fd);
 49-                self.handleKill();
 50-                _ = posix.waitpid(self.pid, 0);
 51-                self.deinit();
 52                 return .{ .created = true, .is_daemon = true };
 53             }
 54             posix.close(server_sock_fd);
 55@@ -399,10 +407,33 @@ const Daemon = struct {
 56         return .{ .created = false, .is_daemon = false };
 57     }
 58 
 59-    pub fn handleInput(self: *Daemon, pty_fd: i32, payload: []const u8) !void {
 60+    /// Best-effort write to the (non-blocking) PTY fd. Retries short writes
 61+    /// until complete, but on WouldBlock (kernel buffer full) gives up and
 62+    /// drops the remainder — the daemon is single-threaded, so blocking here
 63+    /// to wait for POLLOUT would deadlock against a shell that's itself
 64+    /// blocked writing echo to a full PTY output buffer that we're not
 65+    /// draining. Dropping is the same trade-off the old code made implicitly
 66+    /// (short writes were silently truncated), just without the crash.
 67+    fn ptyWrite(pty_fd: i32, data: []const u8) void {
 68+        var remaining = data;
 69+        while (remaining.len > 0) {
 70+            const n = posix.write(pty_fd, remaining) catch |err| {
 71+                if (err == error.WouldBlock) {
 72+                    std.log.warn("pty write dropped {d}/{d} bytes (buffer full)", .{ remaining.len, data.len });
 73+                } else {
 74+                    std.log.warn("pty write failed, {d} bytes lost: {s}", .{ remaining.len, @errorName(err) });
 75+                }
 76+                return;
 77+            };
 78+            if (n == 0) return;
 79+            remaining = remaining[n..];
 80+        }
 81+    }
 82+
 83+    pub fn handleInput(self: *Daemon, pty_fd: i32, payload: []const u8) void {
 84         _ = self;
 85         if (payload.len > 0) {
 86-            _ = try posix.write(pty_fd, payload);
 87+            ptyWrite(pty_fd, payload);
 88         }
 89     }
 90 
 91@@ -575,7 +606,7 @@ const Daemon = struct {
 92         self.is_task_mode = true;
 93 
 94         if (payload.len > 0) {
 95-            _ = try posix.write(pty_fd, payload);
 96+            ptyWrite(pty_fd, payload);
 97         }
 98         try ipc.appendMessage(self.alloc, &client.write_buf, .Ack, "");
 99         client.has_pending_output = true;
100@@ -1071,8 +1102,9 @@ fn clientLoop(client_sock_fd: i32) !void {
101     setupSigwinchHandler();
102 
103     // Make socket non-blocking to avoid blocking on writes
104-    const sock_flags = try posix.fcntl(client_sock_fd, posix.F.GETFL, 0);
105-    _ = try posix.fcntl(client_sock_fd, posix.F.SETFL, sock_flags | posix.SOCK.NONBLOCK);
106+    var sock_flags = try posix.fcntl(client_sock_fd, posix.F.GETFL, 0);
107+    sock_flags |= O_NONBLOCK;
108+    _ = try posix.fcntl(client_sock_fd, posix.F.SETFL, sock_flags);
109 
110     // Buffer for outgoing socket writes
111     var sock_write_buf = try std.ArrayList(u8).initCapacity(alloc, 4096);
112@@ -1093,9 +1125,12 @@ fn clientLoop(client_sock_fd: i32) !void {
113 
114     const stdin_fd = posix.STDIN_FILENO;
115 
116-    // Make stdin non-blocking
117-    const flags = try posix.fcntl(stdin_fd, posix.F.GETFL, 0);
118-    _ = try posix.fcntl(stdin_fd, posix.F.SETFL, flags | posix.SOCK.NONBLOCK);
119+    // Make stdin non-blocking. O_NONBLOCK is set on the open file description,
120+    // which is shared with the parent shell; restore on exit to avoid
121+    // corrupting the parent's stdin.
122+    const stdin_orig_flags = try posix.fcntl(stdin_fd, posix.F.GETFL, 0);
123+    _ = try posix.fcntl(stdin_fd, posix.F.SETFL, stdin_orig_flags | O_NONBLOCK);
124+    defer _ = posix.fcntl(stdin_fd, posix.F.SETFL, stdin_orig_flags) catch {};
125 
126     while (true) {
127         // Check for pending SIGWINCH
128@@ -1369,7 +1404,7 @@ fn daemonLoop(daemon: *Daemon, server_sock_fd: i32, pty_fd: i32) !void {
129 
130                 while (client.read_buf.next()) |msg| {
131                     switch (msg.header.tag) {
132-                        .Input => try daemon.handleInput(pty_fd, msg.payload),
133+                        .Input => daemon.handleInput(pty_fd, msg.payload),
134                         .Init => try daemon.handleInit(client, pty_fd, &term, msg.payload),
135                         .Resize => try daemon.handleResize(pty_fd, &term, msg.payload),
136                         .Detach => {