fix(state): preserve transitional state across container scans

The 30s package scan loop used to blindly overwrite every package
entry from podman inspect. While a user-initiated Stop / Start /
Restart was in flight, the RPC spawn task would flip the state to
Stopping / Starting / Restarting, the next scan would see podman
still reporting "running" (for the duration of the graceful stop,
up to 600s for bitcoin-core), and clobber the transitional state
back to Running. The dashboard would then flip Running -> Stopping
-> Running -> Stopped, making it look like the stop had silently
failed until it eventually completed.

The merge loop now treats transitional variants (Stopping, Starting,
Restarting, Installing, Updating, Removing, and the three backup
variants) as owned by the RPC spawn task. For those variants,
merge_preserving_transitional keeps the existing state while still
taking live observability fields (health, exit_code, installed,
lan_address, manifest, static_files, available_update) from the
fresh scan so the UI continues to see live health readings.

Adds an escape hatch via a per-scan transitional_since side table:
if a package has been in a transitional state for more than 1200s
(2x the longest graceful stop at 600s on bitcoin-core), the scan
loop assumes the spawn task died without cleanup and overrides with
podman's live state. Prevents a crashed background task from wedging
a package in Stopping forever.

Three unit tests cover the merge rule, the observability passthrough,
and the transitional-variant classifier.
This commit is contained in:
archipelago 2026-04-23 05:15:13 -04:00
parent 39dd1d9dcc
commit cd69c3b2f6

View File

@ -16,7 +16,7 @@ use hyper::service::service_fn;
use std::collections::HashMap;
use std::net::SocketAddr;
use std::sync::Arc;
use std::time::Duration;
use std::time::{Duration, Instant};
use tokio::net::TcpListener;
use tracing::{debug, error, info, warn};
@ -322,11 +322,18 @@ impl Server {
// Tracks how many consecutive scans each container has been absent from.
// Prevents UI flapping when podman intermittently returns incomplete results.
let mut absence_tracker: HashMap<String, u32> = HashMap::new();
// Tracks when each container first entered a transitional state
// (Stopping / Starting / Restarting / ...). Used by the merge
// loop below to ignore podman's live state during a pending
// lifecycle op, and to break out if the spawned task dies
// without ever writing a final state.
let mut transitional_since: HashMap<String, Instant> = HashMap::new();
if let Err(e) = scan_and_update_packages(
&scanner,
&state,
identity_clone.as_ref(),
&mut absence_tracker,
&mut transitional_since,
)
.await
{
@ -352,6 +359,7 @@ impl Server {
&state,
identity_clone.as_ref(),
&mut absence_tracker,
&mut transitional_since,
)
.await
{
@ -809,11 +817,65 @@ async fn refresh_tor_address(state: &StateManager, identity: &NodeIdentity) -> R
/// 3 scans × 30s = 90 seconds of absence before removal.
const CONTAINER_ABSENCE_THRESHOLD: u32 = 3;
/// Maximum time a package entry may remain stuck in a transitional state
/// before the scan loop overrides it with podman's live state.
///
/// Rationale: the longest single-container stop timeout is bitcoin-core at
/// 600s. 2× that gives the spawned task ample margin before we assume it
/// died (panic, OOM, process restart mid-stop) and fall back to the
/// scanner's authoritative view. Applies to all transitional variants.
const TRANSITIONAL_STUCK_TIMEOUT: Duration = Duration::from_secs(1200);
/// Returns true if `state` is one of the transitional variants that a
/// `spawn_transitional`-style background task owns. While such a state is
/// set, the package scanner must not overwrite it with whatever podman
/// reports (see `merge_preserving_transitional`).
fn is_transitional(state: &crate::data_model::PackageState) -> bool {
use crate::data_model::PackageState::*;
matches!(
state,
Installing
| Stopping
| Starting
| Restarting
| Updating
| Removing
| CreatingBackup
| RestoringBackup
| BackingUp
)
}
/// Merge a fresh scan entry `fresh` into `existing` while preserving
/// `existing.state` (which is transitional — the RPC spawn task owns it).
/// Non-state observability fields are taken from `fresh` so the UI still
/// sees live health / exit_code / lan_address readings during a transition.
fn merge_preserving_transitional(
existing: &crate::data_model::PackageDataEntry,
fresh: &crate::data_model::PackageDataEntry,
) -> crate::data_model::PackageDataEntry {
crate::data_model::PackageDataEntry {
state: existing.state.clone(),
// install_progress and uninstall_stage are also owned by the
// initiating op (same reason as state) — keep them.
install_progress: existing.install_progress.clone(),
uninstall_stage: existing.uninstall_stage.clone(),
// Everything else comes from the fresh scan.
health: fresh.health.clone(),
exit_code: fresh.exit_code,
static_files: fresh.static_files.clone(),
manifest: fresh.manifest.clone(),
installed: fresh.installed.clone(),
available_update: fresh.available_update.clone(),
}
}
async fn scan_and_update_packages(
scanner: &DockerPackageScanner,
state: &StateManager,
identity: &NodeIdentity,
absence_tracker: &mut HashMap<String, u32>,
transitional_since: &mut HashMap<String, Instant>,
) -> Result<()> {
let packages = scanner.scan_containers().await?;
@ -847,10 +909,61 @@ async fn scan_and_update_packages(
let mut merged = current_data.package_data.clone();
let mut changed = false;
// Update/add containers found in this scan
// Update/add containers found in this scan.
//
// Transitional states (Stopping, Starting, Restarting, Installing,
// Updating, Removing, backup variants) are owned by the RPC spawn_task
// that initiated the operation — podman's live state during the op is
// meaningless ("running" during a graceful stop, "exited" during a
// restart, etc.) and must not be written back. See
// `merge_preserving_transitional` for the exact rule.
//
// Escape hatch: if a package has been in a transitional state for
// longer than TRANSITIONAL_STUCK_TIMEOUT we assume the spawned task
// died without cleanup and let the scan override it.
let now = Instant::now();
for (id, pkg) in &packages {
absence_tracker.remove(id);
if merged.get(id) != Some(pkg) {
let existing = merged.get(id);
let overwrite = match existing {
Some(existing_entry) if is_transitional(&existing_entry.state) => {
let entered = *transitional_since.entry(id.clone()).or_insert(now);
let stuck = now.duration_since(entered) > TRANSITIONAL_STUCK_TIMEOUT;
if stuck {
warn!(
"Container {} stuck in {:?} for >{}s; overriding with scan state {:?}",
id,
existing_entry.state,
TRANSITIONAL_STUCK_TIMEOUT.as_secs(),
pkg.state
);
transitional_since.remove(id);
true
} else {
// Keep existing transitional state, but merge non-state
// observability fields (health, exit_code, lan_address
// via installed) from the fresh scan so the UI still
// sees live readings.
let merged_entry = merge_preserving_transitional(existing_entry, pkg);
if existing.cloned() != Some(merged_entry.clone()) {
merged.insert(id.clone(), merged_entry);
changed = true;
}
false
}
}
Some(_) => {
// Not transitional: the side-table may hold a stale entry
// from a previous transition on this id; drop it.
transitional_since.remove(id);
existing != Some(pkg)
}
None => {
transitional_since.remove(id);
true
}
};
if overwrite && merged.get(id) != Some(pkg) {
merged.insert(id.clone(), pkg.clone());
changed = true;
}
@ -870,6 +983,7 @@ async fn scan_and_update_packages(
);
merged.remove(&id);
absence_tracker.remove(&id);
transitional_since.remove(&id);
changed = true;
}
}
@ -957,3 +1071,112 @@ async fn check_peer_health(state: &StateManager, data_dir: &std::path::Path) ->
Ok(())
}
#[cfg(test)]
mod merge_tests {
use super::*;
use crate::data_model::{
Description, Manifest, PackageDataEntry, PackageState, StaticFiles,
};
fn make_manifest() -> Manifest {
Manifest {
id: "lnd".to_string(),
title: "LND".to_string(),
version: "0.18.4".to_string(),
description: Description {
short: "".to_string(),
long: "".to_string(),
},
release_notes: "".to_string(),
license: "".to_string(),
wrapper_repo: "".to_string(),
upstream_repo: "".to_string(),
support_site: "".to_string(),
marketing_site: "".to_string(),
donation_url: None,
author: None,
website: None,
interfaces: None,
tier: None,
}
}
fn make_static() -> StaticFiles {
StaticFiles {
license: "".to_string(),
instructions: "".to_string(),
icon: "".to_string(),
}
}
fn make_entry(state: PackageState, health: Option<&str>) -> PackageDataEntry {
PackageDataEntry {
state,
health: health.map(|s| s.to_string()),
exit_code: None,
static_files: make_static(),
manifest: make_manifest(),
installed: None,
install_progress: None,
uninstall_stage: None,
available_update: None,
}
}
#[test]
fn preserves_transitional_state_on_merge() {
// existing: user initiated a stop, spawn_transitional set Stopping.
// fresh: podman hasn't finished the stop yet, still reports Running.
// Expected: merged state stays Stopping — podman's live view must
// not clobber the transitional state owned by the RPC spawn task.
let existing = make_entry(PackageState::Stopping, Some("healthy"));
let fresh = make_entry(PackageState::Running, Some("starting"));
let merged = merge_preserving_transitional(&existing, &fresh);
assert_eq!(merged.state, PackageState::Stopping);
}
#[test]
fn merges_fresh_observability_fields() {
// Non-state observability fields (health, exit_code, installed)
// MUST come from the fresh scan even while state is preserved —
// the UI still shows live health/health during a transition.
let mut existing = make_entry(PackageState::Stopping, Some("healthy"));
existing.exit_code = None;
let mut fresh = make_entry(PackageState::Running, Some("unhealthy"));
fresh.exit_code = Some(0);
let merged = merge_preserving_transitional(&existing, &fresh);
assert_eq!(merged.state, PackageState::Stopping);
assert_eq!(merged.health.as_deref(), Some("unhealthy"));
assert_eq!(merged.exit_code, Some(0));
}
#[test]
fn is_transitional_covers_all_variants() {
for s in [
PackageState::Installing,
PackageState::Stopping,
PackageState::Starting,
PackageState::Restarting,
PackageState::Updating,
PackageState::Removing,
PackageState::CreatingBackup,
PackageState::RestoringBackup,
PackageState::BackingUp,
] {
assert!(is_transitional(&s), "{:?} should be transitional", s);
}
for s in [
PackageState::Installed,
PackageState::Stopped,
PackageState::Exited,
PackageState::Running,
] {
assert!(
!is_transitional(&s),
"{:?} should NOT be transitional",
s
);
}
}
}