From 1843739e0c3e2fde4c33466ecb8465d23ad0628d Mon Sep 17 00:00:00 2001 From: archipelago Date: Tue, 16 Jun 2026 15:14:09 -0400 Subject: [PATCH] fix(install): restart stack containers that crash on first start (#25) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Apps could fail install when a stack member exited on its first start because a dependency (db/redis/the bitcoin node) was not ready yet — a transient crash, not a broken install. wait_for_stack_containers now restarts each exited/dead container up to 3 times before declaring the install failed; the runtime supervisor keeps it alive afterwards. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../archipelago/src/api/rpc/package/stacks.rs | 53 ++++++++++++++----- 1 file changed, 39 insertions(+), 14 deletions(-) diff --git a/core/archipelago/src/api/rpc/package/stacks.rs b/core/archipelago/src/api/rpc/package/stacks.rs index 84c69ab4..b871839a 100644 --- a/core/archipelago/src/api/rpc/package/stacks.rs +++ b/core/archipelago/src/api/rpc/package/stacks.rs @@ -434,6 +434,13 @@ async fn wait_for_stack_containers( containers: &[&str], timeout_secs: u64, ) -> Result<()> { + // A container can exit on its first start because a dependency (db, redis, + // the bitcoin node) was not quite ready — a transient crash, not a broken + // install. Restart each exited container a bounded number of times before + // declaring the install failed (#25). The runtime supervisor keeps it alive + // afterwards, but we want a healthy state by the time install returns. 
+ const MAX_RESTARTS: u32 = 3; + let mut restarts: std::collections::HashMap<String, u32> = std::collections::HashMap::new(); let deadline = std::time::Instant::now() + std::time::Duration::from_secs(timeout_secs); loop { let mut pending = Vec::new(); @@ -449,20 +456,38 @@ async fn wait_for_stack_containers( match state.as_str() { "running" => {} "exited" | "dead" => { - let logs = stack_container_logs(container, 40).await; - install_log(&format!( - "INSTALL CRASH: {} - container {} exited. Logs:\n{}", - stack_name, - container, - logs.chars().take(1000).collect::<String>() - )) - .await; - return Err(anyhow::anyhow!( - "{} container {} exited after install. Logs: {}", - stack_name, - container, - logs.chars().take(500).collect::<String>() - )); + let attempts = restarts.entry(container.to_string()).or_insert(0); + if *attempts < MAX_RESTARTS { + *attempts += 1; + install_log(&format!( + "INSTALL RESTART: {} - container {} exited, restart attempt {}/{}", + stack_name, container, *attempts, MAX_RESTARTS + )) + .await; + let _ = podman_stack_output( + &["start", container], + PODMAN_STACK_PROBE_TIMEOUT, + ) + .await; + pending.push(format!("{}=restarting({}/{})", container, *attempts, MAX_RESTARTS)); + } else { + let logs = stack_container_logs(container, 40).await; + install_log(&format!( + "INSTALL CRASH: {} - container {} exited after {} restarts. Logs:\n{}", + stack_name, + container, + MAX_RESTARTS, + logs.chars().take(1000).collect::<String>() + )) + .await; + return Err(anyhow::anyhow!( + "{} container {} exited after install ({} restarts). Logs: {}", + stack_name, + container, + MAX_RESTARTS, + logs.chars().take(500).collect::<String>() + )); + } } other => pending.push(format!("{}={}", container, other)), }