From be50c886bb8cd06b5655a017ea18ff2985e60799 Mon Sep 17 00:00:00 2001 From: Dorian Date: Wed, 1 Jul 2026 21:29:54 +0100 Subject: [PATCH] fix(mesh/reticulum): kill the whole daemon process group on drop MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The reticulum daemon is a PyInstaller one-file binary: a bootloader parent that forks the real Python process. `kill_on_drop`/`start_kill()` only SIGKILL the bootloader, orphaning the forked child — which keeps holding the RNode serial port. Across the listener's 30-min RX-stall reconnects this piled up (observed 9 concurrent instances on a live node) all clutching /dev/ttyUSB0, garbling the RNode so it stopped transmitting entirely. Spawn the daemon as its own process-group leader (`process_group(0)`) and, on drop, signal the whole group (SIGTERM for a clean RNode/socket release, then SIGKILL as a hard backstop) so the forked child can never be orphaned. Co-Authored-By: Claude Opus 4.8 (1M context) --- core/Cargo.lock | 1 + core/archipelago/Cargo.toml | 1 + core/archipelago/src/mesh/reticulum.rs | 23 ++++++++++++++++++++--- 3 files changed, 22 insertions(+), 3 deletions(-) diff --git a/core/Cargo.lock b/core/Cargo.lock index b0940218..705aa085 100644 --- a/core/Cargo.lock +++ b/core/Cargo.lock @@ -128,6 +128,7 @@ dependencies = [ "hyper-ws-listener", "iroh", "iroh-blobs", + "libc", "mainline", "mdns-sd", "nostr-sdk", diff --git a/core/archipelago/Cargo.toml b/core/archipelago/Cargo.toml index 9c87e2d1..df9fed85 100644 --- a/core/archipelago/Cargo.toml +++ b/core/archipelago/Cargo.toml @@ -22,6 +22,7 @@ iroh-swarm = ["dep:iroh", "dep:iroh-blobs"] [dependencies] # Core dependencies tokio = { version = "1", features = ["full"] } +libc = "0.2" # process-group signalling for the supervised reticulum daemon serde = { version = "1.0", features = ["derive"] } serde_json = "1.0" anyhow = "1.0" diff --git a/core/archipelago/src/mesh/reticulum.rs b/core/archipelago/src/mesh/reticulum.rs 
index a0f72a89..bdcebafd 100644 --- a/core/archipelago/src/mesh/reticulum.rs +++ b/core/archipelago/src/mesh/reticulum.rs @@ -104,6 +104,11 @@ fn daemon_command( .arg(x); } cmd.kill_on_drop(true) + // Run the daemon as its own process-group leader. The packaged binary is + // a PyInstaller one-file bootloader that forks the real Python process; + // making it a group leader lets Drop signal the WHOLE group so the + // forked child can't be orphaned and keep holding the serial port. + .process_group(0) .stdin(std::process::Stdio::null()) .stdout(std::process::Stdio::piped()) .stderr(std::process::Stdio::piped()); @@ -945,9 +950,21 @@ fn contains_detect_resp(buf: &[u8]) -> bool { impl Drop for ReticulumLink { fn drop(&mut self) { - // Best-effort: ask the daemon to shut down cleanly (frees the serial - // port promptly); `kill_on_drop` on the Command is the hard backstop - // if the daemon doesn't exit in time. + // The packaged daemon is a PyInstaller one-file bootloader that forks the + // real Python process into the same (leader) group we spawned it in. + // SIGKILLing only the bootloader (what `start_kill`/`kill_on_drop` do) + // orphans that child, which keeps holding the serial port — the root + // cause of daemons piling up across reconnects and jamming the RNode. + // Signal the whole process group instead: SIGTERM requests a clean RNode + // and socket release, but SIGKILL is sent immediately after it (no grace + // period), so only the SIGKILL backstop is guaranteed to take effect. + if let Some(pid) = self.child.id() { + let pgid = -(pid as i32); + unsafe { + libc::kill(pgid, libc::SIGTERM); + libc::kill(pgid, libc::SIGKILL); + } + } let _ = self.child.start_kill(); let _ = std::fs::remove_file(&self.socket_path); }