From ab6fcef6f3be9351de4d147ad10675d4a5539da7 Mon Sep 17 00:00:00 2001 From: archipelago Date: Tue, 16 Jun 2026 10:49:36 -0400 Subject: [PATCH] fix(containers): periodically restart crashed stack members at runtime (#16/#17) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit immich_server/redis/postgres + indeedhub-* are multi-container stack members whose sub-container app_ids are NOT in package_data, so the health monitor skips them as "orphans" and never restarts them when they exit — Immich/IndeedHub stay down until the next reboot (the boot-only start_stopped_stack_containers was the only recovery). Spawn a 120s supervisor that reuses that same recovery at runtime. It cheaply skips already-running containers and honours the user-stopped list (set on every container by package.stop), so it only revives genuinely crashed members and never fights a user stop. cargo check clean. Co-Authored-By: Claude Opus 4.8 (1M context) --- core/archipelago/src/main.rs | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/core/archipelago/src/main.rs b/core/archipelago/src/main.rs index 17c16ecc..cfea4950 100644 --- a/core/archipelago/src/main.rs +++ b/core/archipelago/src/main.rs @@ -297,6 +297,31 @@ async fn main() -> Result<()> { }); } + // Periodically restart crashed multi-container stack members (immich, + // indeedhub, …) at RUNTIME, not just at boot. The health monitor skips them + // as "orphans" because the sub-container app_ids (e.g. immich_server) aren't + // in package_data, so without this a crashed immich_server / indeedhub-api + // never comes back until the next reboot (#16/#17). Reuses the boot + // recovery, which cheaply skips already-running containers and respects the + // user-stopped list, so this only acts on genuinely-down stack members. + { + let data_dir = config.data_dir.clone(); + tokio::spawn(async move { + let mut tick = tokio::time::interval(Duration::from_secs(120)); + tick.tick().await; // consume the immediate tick; boot recovery covers t0 + loop { + tick.tick().await; + let report = crash_recovery::start_stopped_stack_containers(&data_dir).await; + if report.recovered > 0 { + info!( + "🔄 Stack supervisor: restarted {} crashed stack member(s) (failed: {:?})", + report.recovered, report.failed + ); + } + } + }); + } + // Spawn disk space monitor (warns at 85%, auto-cleans at 90%) disk_monitor::spawn_disk_monitor(config.data_dir.clone());