diff --git a/core/archipelago/src/api/rpc/package/stacks.rs b/core/archipelago/src/api/rpc/package/stacks.rs index 84c69ab4..b871839a 100644 --- a/core/archipelago/src/api/rpc/package/stacks.rs +++ b/core/archipelago/src/api/rpc/package/stacks.rs @@ -434,6 +434,13 @@ async fn wait_for_stack_containers( containers: &[&str], timeout_secs: u64, ) -> Result<()> { + // A container can exit on its first start because a dependency (db, redis, + // the bitcoin node) was not quite ready — a transient crash, not a broken + // install. Restart each exited container a bounded number of times before + // declaring the install failed (#25). The runtime supervisor keeps it alive + // afterwards, but we want a healthy state by the time install returns. + const MAX_RESTARTS: u32 = 3; + let mut restarts: std::collections::HashMap = std::collections::HashMap::new(); let deadline = std::time::Instant::now() + std::time::Duration::from_secs(timeout_secs); loop { let mut pending = Vec::new(); @@ -449,20 +456,38 @@ async fn wait_for_stack_containers( match state.as_str() { "running" => {} "exited" | "dead" => { - let logs = stack_container_logs(container, 40).await; - install_log(&format!( - "INSTALL CRASH: {} - container {} exited. Logs:\n{}", - stack_name, - container, - logs.chars().take(1000).collect::() - )) - .await; - return Err(anyhow::anyhow!( - "{} container {} exited after install. Logs: {}", - stack_name, - container, - logs.chars().take(500).collect::() - )); + let attempts = restarts.entry(container.to_string()).or_insert(0); + if *attempts < MAX_RESTARTS { + *attempts += 1; + install_log(&format!( + "INSTALL RESTART: {} - container {} exited, restart attempt {}/{}", + stack_name, container, *attempts, MAX_RESTARTS + )) + .await; + let _ = podman_stack_output( + &["start", container], + PODMAN_STACK_PROBE_TIMEOUT, + ) + .await; + pending.push(format!("{}=restarting({}/{})", container, *attempts, MAX_RESTARTS)); + } else { + let logs = stack_container_logs(container, 40).await; + install_log(&format!( + "INSTALL CRASH: {} - container {} exited after {} restarts. Logs:\n{}", + stack_name, + container, + MAX_RESTARTS, + logs.chars().take(1000).collect::() + )) + .await; + return Err(anyhow::anyhow!( + "{} container {} exited after install ({} restarts). Logs: {}", + stack_name, + container, + MAX_RESTARTS, + logs.chars().take(500).collect::() + )); + } } other => pending.push(format!("{}={}", container, other)), }