fix(install): restart stack containers that crash on first start (#25)

Apps could fail to install when a stack member exited on its first start
because a dependency (db/redis/the bitcoin node) was not ready yet — a
transient crash, not a broken install. wait_for_stack_containers now
restarts each exited/dead container up to 3 times before declaring the
install failed; the runtime supervisor keeps the container alive afterwards.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
archipelago 2026-06-16 15:14:09 -04:00
parent 83b77796fc
commit 1843739e0c

View File

@ -434,6 +434,13 @@ async fn wait_for_stack_containers(
containers: &[&str], containers: &[&str],
timeout_secs: u64, timeout_secs: u64,
) -> Result<()> { ) -> Result<()> {
// A container can exit on its first start because a dependency (db, redis,
// the bitcoin node) was not quite ready — a transient crash, not a broken
// install. Restart each exited container a bounded number of times before
// declaring the install failed (#25). The runtime supervisor keeps it alive
// afterwards, but we want a healthy state by the time install returns.
const MAX_RESTARTS: u32 = 3;
let mut restarts: std::collections::HashMap<String, u32> = std::collections::HashMap::new();
let deadline = std::time::Instant::now() + std::time::Duration::from_secs(timeout_secs); let deadline = std::time::Instant::now() + std::time::Duration::from_secs(timeout_secs);
loop { loop {
let mut pending = Vec::new(); let mut pending = Vec::new();
@ -449,20 +456,38 @@ async fn wait_for_stack_containers(
match state.as_str() { match state.as_str() {
"running" => {} "running" => {}
"exited" | "dead" => { "exited" | "dead" => {
let logs = stack_container_logs(container, 40).await; let attempts = restarts.entry(container.to_string()).or_insert(0);
install_log(&format!( if *attempts < MAX_RESTARTS {
"INSTALL CRASH: {} - container {} exited. Logs:\n{}", *attempts += 1;
stack_name, install_log(&format!(
container, "INSTALL RESTART: {} - container {} exited, restart attempt {}/{}",
logs.chars().take(1000).collect::<String>() stack_name, container, *attempts, MAX_RESTARTS
)) ))
.await; .await;
return Err(anyhow::anyhow!( let _ = podman_stack_output(
"{} container {} exited after install. Logs: {}", &["start", container],
stack_name, PODMAN_STACK_PROBE_TIMEOUT,
container, )
logs.chars().take(500).collect::<String>() .await;
)); pending.push(format!("{}=restarting({}/{})", container, *attempts, MAX_RESTARTS));
} else {
let logs = stack_container_logs(container, 40).await;
install_log(&format!(
"INSTALL CRASH: {} - container {} exited after {} restarts. Logs:\n{}",
stack_name,
container,
MAX_RESTARTS,
logs.chars().take(1000).collect::<String>()
))
.await;
return Err(anyhow::anyhow!(
"{} container {} exited after install ({} restarts). Logs: {}",
stack_name,
container,
MAX_RESTARTS,
logs.chars().take(500).collect::<String>()
));
}
} }
other => pending.push(format!("{}={}", container, other)), other => pending.push(format!("{}={}", container, other)),
} }