fix(stop): honour per-app graceful-stop grace in orchestrator stop path

package.stop left slow-to-SIGTERM apps (fedimint/electrumx/bitcoin/btcpay/immich)
running: the orchestrator path hardcoded podman API ?t=10 / CLI -t 30 and the CLI
wrapper deadline (30s) equalled the -t grace, so the await fired exactly as podman
SIGKILLed -> stop reported failed -> state reverted to running. Reproduced live on
clean .198 (fedimint).

- container/runtime.rs: add ContainerRuntime::stop_container_with_grace (defaulted
  so mock/dev impls are unchanged); PodmanRuntime honours grace for API + CLI with
  deadline = grace + 15s buffer; AutoRuntime delegates. New canonical per-app table
  stop_grace_secs_for() + DEFAULT_STOP_GRACE_SECS / STOP_GRACE_DEADLINE_BUFFER_SECS.
- podman_client.rs: stop_container_with_grace uses ?t=<grace> + longer HTTP deadline.
- prod_orchestrator::stop: resolve grace = manifest stop_grace_secs (north-star) else
  the table; pass to quadlet::stop_service_with_timeout AND stop_container_with_grace.
- quadlet.rs: stop_service_with_timeout so slow apps aren't SIGKILLed at 45s.
- rpc/package/runtime.rs: doc-note its &str stop_timeout_secs mirrors the canonical table.
- tests: resolve_stop_grace_secs (manifest field wins / table fallback / default 30).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
archipelago 2026-06-22 06:59:40 -04:00
parent 470e3c649a
commit 2dad64b2ee
5 changed files with 152 additions and 8 deletions

View File

@ -22,6 +22,11 @@ const PODMAN_LOG_TIMEOUT: Duration = Duration::from_secs(15);
/// Per-container graceful shutdown timeout in seconds.
/// Bitcoin Core needs 600s to flush UTXO set, LND 330s for channel state,
/// indexers 300s for index flush, databases 120s for WAL/transaction commit.
///
/// MIRRORS `archipelago_container::runtime::stop_grace_secs_for` (which returns
/// `u64` and is the canonical table used by the orchestrator stop path). This
/// `&str` variant exists for the legacy `podman stop -t <s>` call sites here —
/// keep the two tables in sync until those are migrated to the orchestrator.
pub fn stop_timeout_secs(container_name: &str) -> &'static str {
let id = container_name
.strip_prefix("archy-")

View File

@ -171,6 +171,22 @@ pub fn compute_container_name(manifest: &AppManifest) -> String {
}
}
/// Resolve the graceful-stop grace (seconds) for an app: the manifest
/// `stop_grace_secs` extension if declared (manifest-driven, north-star), else
/// the historical per-app `stop_timeout_secs` table keyed by container name.
pub fn resolve_stop_grace_secs(manifest: &AppManifest, container_name: &str) -> u64 {
if let Some(v) = manifest.app.extensions.get("stop_grace_secs") {
// Accept either a YAML integer or a numeric string.
if let Some(n) = v.as_u64() {
return n;
}
if let Some(n) = v.as_str().and_then(|s| s.trim().parse::<u64>().ok()) {
return n;
}
}
archipelago_container::runtime::stop_grace_secs_for(container_name)
}
/// Fingerprint a local build context so a changed source tree (e.g. a rebuilt
/// `neode-ui` dist copied into `docker/<ui>/`) forces an image rebuild even
/// when the image tag already exists (#34). Walks the context directory and
@ -2896,13 +2912,25 @@ impl ContainerOrchestrator for ProdContainerOrchestrator {
let lock = self.app_lock(app_id).await;
let _guard = lock.lock().await;
let name = compute_container_name(&lm.manifest);
// Per-app graceful-stop grace: manifest `stop_grace_secs` if declared,
// else the historical per-app table. Slow-to-SIGTERM apps (bitcoin-core
// 600s, lnd 330s, electrumx 300s, fedimint 60s…) otherwise get a too-short
// `podman stop -t` and the stop is reported failed while the container
// keeps running. See PRODUCTION-MASTER-PLAN §8b.
let grace_secs = resolve_stop_grace_secs(&lm.manifest, &name);
// Quadlet-owned containers are restarted by systemd if only `podman stop`
// is used. Stop the user service first, then stop the container as a
// defensive fallback for legacy/non-Quadlet installs.
if let Err(err) = quadlet::stop_service(&format!("{name}.service")).await {
// defensive fallback for legacy/non-Quadlet installs. Give systemd the
// per-app grace before it force-kills the app-scoped unit.
let quadlet_timeout = std::time::Duration::from_secs(
grace_secs + archipelago_container::runtime::STOP_GRACE_DEADLINE_BUFFER_SECS,
);
if let Err(err) =
quadlet::stop_service_with_timeout(&format!("{name}.service"), quadlet_timeout).await
{
tracing::debug!(container = %name, error = %err, "quadlet stop skipped/failed");
}
match self.runtime.stop_container(&name).await {
match self.runtime.stop_container_with_grace(&name, grace_secs).await {
Ok(()) => Ok(()),
Err(err) => {
let stuck_stopping = self
@ -3467,6 +3495,37 @@ app:
assert_eq!(compute_container_name(&m), "legacy-bitcoin-ui");
}
fn manifest_with_stop_grace(id: &str, grace: &str) -> AppManifest {
let yaml = format!(
"app:\n id: {id}\n name: {id}\n version: 1.0.0\n stop_grace_secs: {grace}\n container:\n image: foo:1\n"
);
AppManifest::parse(&yaml).unwrap()
}
#[test]
fn stop_grace_manifest_field_wins() {
// An explicit stop_grace_secs overrides the per-app table (fedimint=60).
let m = manifest_with_stop_grace("fedimint", "180");
assert_eq!(resolve_stop_grace_secs(&m, "fedimint"), 180);
}
#[test]
fn stop_grace_falls_back_to_table() {
// No manifest field → the historical per-app table by container name.
let m = pull_manifest("fedimint", "foo:1");
assert_eq!(resolve_stop_grace_secs(&m, "fedimint"), 60);
let m = pull_manifest("bitcoin-knots", "foo:1");
assert_eq!(resolve_stop_grace_secs(&m, "bitcoin-knots"), 600);
let m = pull_manifest("electrumx", "foo:1");
assert_eq!(resolve_stop_grace_secs(&m, "electrumx"), 300);
}
#[test]
fn stop_grace_unknown_app_defaults_to_30() {
let m = pull_manifest("some-unknown-app", "foo:1");
assert_eq!(resolve_stop_grace_secs(&m, "some-unknown-app"), 30);
}
async fn orch_with(runtime: Arc<MockRuntime>) -> ProdContainerOrchestrator {
let mut orch = ProdContainerOrchestrator::with_runtime(
runtime,

View File

@ -642,7 +642,17 @@ pub async fn restart_service(service: &str) -> Result<()> {
/// Stop a generated Quadlet service without removing its unit file.
pub async fn stop_service(service: &str) -> Result<()> {
match systemctl_user_status(&["stop", service], QUADLET_STOP_TIMEOUT).await {
stop_service_with_timeout(service, QUADLET_STOP_TIMEOUT).await
}
/// Stop a user service, waiting up to `timeout` for a graceful stop before
/// force-killing the app-scoped unit. Slow-to-SIGTERM apps (bitcoin-core ~600s,
/// lnd ~330s) must not be SIGKILLed at the default 45s — that risks data
/// corruption — so the orchestrator passes the per-app grace here. Never waits
/// less than `QUADLET_STOP_TIMEOUT`.
pub async fn stop_service_with_timeout(service: &str, timeout: Duration) -> Result<()> {
let timeout = timeout.max(QUADLET_STOP_TIMEOUT);
match systemctl_user_status(&["stop", service], timeout).await {
Ok(status) if status.success() => Ok(()),
Ok(status) => Err(anyhow!("systemctl --user stop {service} exited {status}")),
Err(err) => {

View File

@ -422,11 +422,22 @@ impl PodmanClient {
}
pub async fn stop_container(&self, name: &str) -> Result<()> {
self.stop_container_with_grace(name, 10).await
}
/// Stop via libpod honouring a per-app grace (seconds). The HTTP deadline is
/// kept above the grace so the post-grace SIGKILL lands before we give up —
/// otherwise slow-to-SIGTERM apps (fedimint, bitcoin-core, electrumx…) time
/// out at exactly the grace boundary and the stop is reported as failed.
pub async fn stop_container_with_grace(&self, name: &str, grace_secs: u64) -> Result<()> {
let deadline = std::time::Duration::from_secs(
grace_secs + crate::runtime::STOP_GRACE_DEADLINE_BUFFER_SECS,
);
self.api_request(
"POST",
&format!("libpod/containers/{}/stop?t=10", name),
&format!("libpod/containers/{}/stop?t={}", name, grace_secs),
None,
DEFAULT_TIMEOUT,
deadline,
)
.await
.map(|_| ())

View File

@ -10,6 +10,35 @@ const PODMAN_CLI_DEFAULT_TIMEOUT: Duration = Duration::from_secs(30);
const PODMAN_CLI_IMAGE_CHECK_TIMEOUT: Duration = Duration::from_secs(10);
const PODMAN_CLI_BUILD_TIMEOUT: Duration = Duration::from_secs(900);
/// Default graceful-stop grace (seconds) when a caller doesn't supply a per-app
/// value. Mirrors the historical `podman stop -t 30`.
pub const DEFAULT_STOP_GRACE_SECS: u64 = 30;
/// Headroom added to a stop grace to form the await/HTTP deadline, so podman's
/// post-grace SIGKILL completes before the wrapper times out.
pub const STOP_GRACE_DEADLINE_BUFFER_SECS: u64 = 15;
/// Canonical per-app graceful-stop grace (seconds), keyed by container name.
/// Slow-to-SIGTERM apps need far longer than the 30s default: bitcoin-core
/// flushes its chainstate, lnd closes channels, electrumx finishes indexing,
/// stack DBs checkpoint. Used as the fallback when a manifest doesn't declare
/// `stop_grace_secs`. NOTE: the RPC layer's `stop_timeout_secs` mirrors this
/// (returns the same values as `&str` for legacy `podman stop -t` call sites) —
/// keep the two in sync until that path is retired.
pub fn stop_grace_secs_for(container_name: &str) -> u64 {
let id = container_name
.strip_prefix("archy-")
.unwrap_or(container_name);
match id {
"bitcoin-knots" | "bitcoin-core" | "bitcoin" => 600,
"lnd" => 330,
"electrumx" | "electrs" | "mempool-electrs" => 300,
"btcpay-db" | "mempool-db" | "penpot-postgres" | "immich_postgres" | "nextcloud-db"
| "endurain-db" => 120,
"btcpay-server" | "nbxplorer" | "fedimint" | "fedimint-gateway" => 60,
_ => DEFAULT_STOP_GRACE_SECS,
}
}
#[async_trait]
pub trait ContainerRuntime: Send + Sync {
async fn pull_image(&self, image: &str, signature: Option<&str>) -> Result<()>;
@ -21,6 +50,19 @@ pub trait ContainerRuntime: Send + Sync {
) -> Result<String>;
async fn start_container(&self, name: &str) -> Result<()>;
async fn stop_container(&self, name: &str) -> Result<()>;
/// Stop a container honouring a per-app graceful-shutdown grace (seconds).
///
/// Slow-to-SIGTERM apps (bitcoin-core, lnd, electrumx, fedimint, immich…)
/// need a longer `podman stop -t` than the default 30s, or `podman stop`
/// returns before the container exits and the orchestrator treats the stop
/// as failed (the container keeps running). The wrapping deadline is always
/// kept strictly greater than `grace_secs` so podman's post-grace SIGKILL
/// lands inside the await. The default impl ignores the grace and calls
/// `stop_container` — only the real podman runtime honours it.
async fn stop_container_with_grace(&self, name: &str, grace_secs: u64) -> Result<()> {
let _ = grace_secs;
self.stop_container(name).await
}
async fn remove_container(&self, name: &str) -> Result<()>;
async fn get_container_status(&self, name: &str) -> Result<ContainerStatus>;
async fn get_container_logs(&self, name: &str, lines: u32) -> Result<Vec<String>>;
@ -122,10 +164,23 @@ impl ContainerRuntime for PodmanRuntime {
}
async fn stop_container(&self, name: &str) -> Result<()> {
match self.client.stop_container(name).await {
self.stop_container_with_grace(name, DEFAULT_STOP_GRACE_SECS)
.await
}
async fn stop_container_with_grace(&self, name: &str, grace_secs: u64) -> Result<()> {
match self.client.stop_container_with_grace(name, grace_secs).await {
Ok(()) => Ok(()),
Err(api_err) => {
let output = self.podman_cli(&["stop", "-t", "30", name]).await?;
// CLI fallback. Keep the wrapper deadline strictly above the
// `-t` grace so podman's post-grace SIGKILL completes before the
// await gives up (otherwise a deadline == grace races the kill
// and reports a spurious timeout).
let grace = grace_secs.to_string();
let deadline = Duration::from_secs(grace_secs + STOP_GRACE_DEADLINE_BUFFER_SECS);
let output = self
.podman_cli_timeout(&["stop", "-t", &grace, name], deadline)
.await?;
if output.status.success() {
Ok(())
} else {
@ -841,6 +896,10 @@ impl ContainerRuntime for AutoRuntime {
self.runtime.stop_container(name).await
}
async fn stop_container_with_grace(&self, name: &str, grace_secs: u64) -> Result<()> {
self.runtime.stop_container_with_grace(name, grace_secs).await
}
async fn remove_container(&self, name: &str) -> Result<()> {
self.runtime.remove_container(name).await
}