fix(containers): periodically restart crashed stack members at runtime (#16/#17)
immich_server/redis/postgres + indeedhub-* are multi-container stack members whose sub-container app_ids are NOT in package_data, so the health monitor skips them as "orphans" and never restarts them when they exit — Immich/IndeedHub stay down until the next reboot (the boot-only start_stopped_stack_containers was the only recovery). Spawn a 120s supervisor that reuses that same recovery at runtime. It cheaply skips already-running containers and honours the user-stopped list (set on every container by package.stop), so it only revives genuinely crashed members and never fights a user stop. cargo check clean. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
c7cd068e1a
commit
ab6fcef6f3
@ -297,6 +297,31 @@ async fn main() -> Result<()> {
|
||||
});
|
||||
}
|
||||
|
||||
// Periodically restart crashed multi-container stack members (immich,
|
||||
// indeedhub, …) at RUNTIME, not just at boot. The health monitor skips them
|
||||
// as "orphans" because the sub-container app_ids (e.g. immich_server) aren't
|
||||
// in package_data, so without this a crashed immich_server / indeedhub-api
|
||||
// never comes back until the next reboot (#16/#17). Reuses the boot
|
||||
// recovery, which cheaply skips already-running containers and respects the
|
||||
// user-stopped list, so this only acts on genuinely-down stack members.
|
||||
{
|
||||
let data_dir = config.data_dir.clone();
|
||||
tokio::spawn(async move {
|
||||
let mut tick = tokio::time::interval(Duration::from_secs(120));
|
||||
tick.tick().await; // consume the immediate tick; boot recovery covers t0
|
||||
loop {
|
||||
tick.tick().await;
|
||||
let report = crash_recovery::start_stopped_stack_containers(&data_dir).await;
|
||||
if report.recovered > 0 {
|
||||
info!(
|
||||
"🔄 Stack supervisor: restarted {} crashed stack member(s) (failed: {:?})",
|
||||
report.recovered, report.failed
|
||||
);
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
// Spawn disk space monitor (warns at 85%, auto-cleans at 90%)
|
||||
disk_monitor::spawn_disk_monitor(config.data_dir.clone());
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user