fix(install): restart stack containers that crash on first start (#25)
Apps could fail to install when a stack member exited on its first start because a dependency (db, redis, or the bitcoin node) was not ready yet — a transient crash, not a broken install. wait_for_stack_containers now restarts each exited/dead container up to 3 times before declaring the install failed; the runtime supervisor keeps the container alive afterwards. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
83b77796fc
commit
1843739e0c
@ -434,6 +434,13 @@ async fn wait_for_stack_containers(
|
||||
containers: &[&str],
|
||||
timeout_secs: u64,
|
||||
) -> Result<()> {
|
||||
// A container can exit on its first start because a dependency (db, redis,
|
||||
// the bitcoin node) was not quite ready — a transient crash, not a broken
|
||||
// install. Restart each exited container a bounded number of times before
|
||||
// declaring the install failed (#25). The runtime supervisor keeps it alive
|
||||
// afterwards, but we want a healthy state by the time install returns.
|
||||
const MAX_RESTARTS: u32 = 3;
|
||||
let mut restarts: std::collections::HashMap<String, u32> = std::collections::HashMap::new();
|
||||
let deadline = std::time::Instant::now() + std::time::Duration::from_secs(timeout_secs);
|
||||
loop {
|
||||
let mut pending = Vec::new();
|
||||
@ -449,20 +456,38 @@ async fn wait_for_stack_containers(
|
||||
match state.as_str() {
|
||||
"running" => {}
|
||||
"exited" | "dead" => {
|
||||
let logs = stack_container_logs(container, 40).await;
|
||||
install_log(&format!(
|
||||
"INSTALL CRASH: {} - container {} exited. Logs:\n{}",
|
||||
stack_name,
|
||||
container,
|
||||
logs.chars().take(1000).collect::<String>()
|
||||
))
|
||||
.await;
|
||||
return Err(anyhow::anyhow!(
|
||||
"{} container {} exited after install. Logs: {}",
|
||||
stack_name,
|
||||
container,
|
||||
logs.chars().take(500).collect::<String>()
|
||||
));
|
||||
let attempts = restarts.entry(container.to_string()).or_insert(0);
|
||||
if *attempts < MAX_RESTARTS {
|
||||
*attempts += 1;
|
||||
install_log(&format!(
|
||||
"INSTALL RESTART: {} - container {} exited, restart attempt {}/{}",
|
||||
stack_name, container, *attempts, MAX_RESTARTS
|
||||
))
|
||||
.await;
|
||||
let _ = podman_stack_output(
|
||||
&["start", container],
|
||||
PODMAN_STACK_PROBE_TIMEOUT,
|
||||
)
|
||||
.await;
|
||||
pending.push(format!("{}=restarting({}/{})", container, *attempts, MAX_RESTARTS));
|
||||
} else {
|
||||
let logs = stack_container_logs(container, 40).await;
|
||||
install_log(&format!(
|
||||
"INSTALL CRASH: {} - container {} exited after {} restarts. Logs:\n{}",
|
||||
stack_name,
|
||||
container,
|
||||
MAX_RESTARTS,
|
||||
logs.chars().take(1000).collect::<String>()
|
||||
))
|
||||
.await;
|
||||
return Err(anyhow::anyhow!(
|
||||
"{} container {} exited after install ({} restarts). Logs: {}",
|
||||
stack_name,
|
||||
container,
|
||||
MAX_RESTARTS,
|
||||
logs.chars().take(500).collect::<String>()
|
||||
));
|
||||
}
|
||||
}
|
||||
other => pending.push(format!("{}={}", container, other)),
|
||||
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user