diff --git a/core/archipelago/src/server.rs b/core/archipelago/src/server.rs index 2d054ea2..01766e55 100644 --- a/core/archipelago/src/server.rs +++ b/core/archipelago/src/server.rs @@ -16,7 +16,7 @@ use hyper::service::service_fn; use std::collections::HashMap; use std::net::SocketAddr; use std::sync::Arc; -use std::time::Duration; +use std::time::{Duration, Instant}; use tokio::net::TcpListener; use tracing::{debug, error, info, warn}; @@ -322,11 +322,18 @@ impl Server { // Tracks how many consecutive scans each container has been absent from. // Prevents UI flapping when podman intermittently returns incomplete results. let mut absence_tracker: HashMap = HashMap::new(); + // Tracks when each container first entered a transitional state + // (Stopping / Starting / Restarting / ...). Used by the merge + // loop below to ignore podman's live state during a pending + // lifecycle op, and to break out if the spawned task dies + // without ever writing a final state. + let mut transitional_since: HashMap = HashMap::new(); if let Err(e) = scan_and_update_packages( &scanner, &state, identity_clone.as_ref(), &mut absence_tracker, + &mut transitional_since, ) .await { @@ -352,6 +359,7 @@ impl Server { &state, identity_clone.as_ref(), &mut absence_tracker, + &mut transitional_since, ) .await { @@ -809,11 +817,65 @@ async fn refresh_tor_address(state: &StateManager, identity: &NodeIdentity) -> R /// 3 scans × 30s = 90 seconds of absence before removal. const CONTAINER_ABSENCE_THRESHOLD: u32 = 3; +/// Maximum time a package entry may remain stuck in a transitional state +/// before the scan loop overrides it with podman's live state. +/// +/// Rationale: the longest single-container stop timeout is bitcoin-core at +/// 600s. 2× that gives the spawned task ample margin before we assume it +/// died (panic, OOM, process restart mid-stop) and fall back to the +/// scanner's authoritative view. Applies to all transitional variants. +const TRANSITIONAL_STUCK_TIMEOUT: Duration = Duration::from_secs(1200); + +/// Returns true if `state` is one of the transitional variants that a +/// `spawn_transitional`-style background task owns. While such a state is +/// set, the package scanner must not overwrite it with whatever podman +/// reports (see `merge_preserving_transitional`). +fn is_transitional(state: &crate::data_model::PackageState) -> bool { + use crate::data_model::PackageState::*; + matches!( + state, + Installing + | Stopping + | Starting + | Restarting + | Updating + | Removing + | CreatingBackup + | RestoringBackup + | BackingUp + ) +} + +/// Merge a fresh scan entry `fresh` into `existing` while preserving +/// `existing.state` (which is transitional — the RPC spawn task owns it). +/// Non-state observability fields are taken from `fresh` so the UI still +/// sees live health / exit_code / lan_address readings during a transition. +fn merge_preserving_transitional( + existing: &crate::data_model::PackageDataEntry, + fresh: &crate::data_model::PackageDataEntry, +) -> crate::data_model::PackageDataEntry { + crate::data_model::PackageDataEntry { + state: existing.state.clone(), + // install_progress and uninstall_stage are also owned by the + // initiating op (same reason as state) — keep them. + install_progress: existing.install_progress.clone(), + uninstall_stage: existing.uninstall_stage.clone(), + // Everything else comes from the fresh scan. + health: fresh.health.clone(), + exit_code: fresh.exit_code, + static_files: fresh.static_files.clone(), + manifest: fresh.manifest.clone(), + installed: fresh.installed.clone(), + available_update: fresh.available_update.clone(), + } +} + async fn scan_and_update_packages( scanner: &DockerPackageScanner, state: &StateManager, identity: &NodeIdentity, absence_tracker: &mut HashMap, + transitional_since: &mut HashMap, ) -> Result<()> { let packages = scanner.scan_containers().await?; @@ -847,10 +909,61 @@ async fn scan_and_update_packages( let mut merged = current_data.package_data.clone(); let mut changed = false; - // Update/add containers found in this scan + // Update/add containers found in this scan. + // + // Transitional states (Stopping, Starting, Restarting, Installing, + // Updating, Removing, backup variants) are owned by the RPC spawn_task + // that initiated the operation — podman's live state during the op is + // meaningless ("running" during a graceful stop, "exited" during a + // restart, etc.) and must not be written back. See + // `merge_preserving_transitional` for the exact rule. + // + // Escape hatch: if a package has been in a transitional state for + // longer than TRANSITIONAL_STUCK_TIMEOUT we assume the spawned task + // died without cleanup and let the scan override it. + let now = Instant::now(); for (id, pkg) in &packages { absence_tracker.remove(id); - if merged.get(id) != Some(pkg) { + let existing = merged.get(id); + let overwrite = match existing { + Some(existing_entry) if is_transitional(&existing_entry.state) => { + let entered = *transitional_since.entry(id.clone()).or_insert(now); + let stuck = now.duration_since(entered) > TRANSITIONAL_STUCK_TIMEOUT; + if stuck { + warn!( + "Container {} stuck in {:?} for >{}s; overriding with scan state {:?}", + id, + existing_entry.state, + TRANSITIONAL_STUCK_TIMEOUT.as_secs(), + pkg.state + ); + transitional_since.remove(id); + true + } else { + // Keep existing transitional state, but merge non-state + // observability fields (health, exit_code, lan_address + // via installed) from the fresh scan so the UI still + // sees live readings. + let merged_entry = merge_preserving_transitional(existing_entry, pkg); + if existing.cloned() != Some(merged_entry.clone()) { + merged.insert(id.clone(), merged_entry); + changed = true; + } + false + } + } + Some(_) => { + // Not transitional: the side-table may hold a stale entry + // from a previous transition on this id; drop it. + transitional_since.remove(id); + existing != Some(pkg) + } + None => { + transitional_since.remove(id); + true + } + }; + if overwrite && merged.get(id) != Some(pkg) { merged.insert(id.clone(), pkg.clone()); changed = true; } @@ -870,6 +983,7 @@ async fn scan_and_update_packages( ); merged.remove(&id); absence_tracker.remove(&id); + transitional_since.remove(&id); changed = true; } } @@ -957,3 +1071,112 @@ async fn check_peer_health(state: &StateManager, data_dir: &std::path::Path) -> Ok(()) } + +#[cfg(test)] +mod merge_tests { + use super::*; + use crate::data_model::{ + Description, Manifest, PackageDataEntry, PackageState, StaticFiles, + }; + + fn make_manifest() -> Manifest { + Manifest { + id: "lnd".to_string(), + title: "LND".to_string(), + version: "0.18.4".to_string(), + description: Description { + short: "".to_string(), + long: "".to_string(), + }, + release_notes: "".to_string(), + license: "".to_string(), + wrapper_repo: "".to_string(), + upstream_repo: "".to_string(), + support_site: "".to_string(), + marketing_site: "".to_string(), + donation_url: None, + author: None, + website: None, + interfaces: None, + tier: None, + } + } + + fn make_static() -> StaticFiles { + StaticFiles { + license: "".to_string(), + instructions: "".to_string(), + icon: "".to_string(), + } + } + + fn make_entry(state: PackageState, health: Option<&str>) -> PackageDataEntry { + PackageDataEntry { + state, + health: health.map(|s| s.to_string()), + exit_code: None, + static_files: make_static(), + manifest: make_manifest(), + installed: None, + install_progress: None, + uninstall_stage: None, + available_update: None, + } + } + + #[test] + fn preserves_transitional_state_on_merge() { + // existing: user initiated a stop, spawn_transitional set Stopping. + // fresh: podman hasn't finished the stop yet, still reports Running. + // Expected: merged state stays Stopping — podman's live view must + // not clobber the transitional state owned by the RPC spawn task. + let existing = make_entry(PackageState::Stopping, Some("healthy")); + let fresh = make_entry(PackageState::Running, Some("starting")); + let merged = merge_preserving_transitional(&existing, &fresh); + assert_eq!(merged.state, PackageState::Stopping); + } + + #[test] + fn merges_fresh_observability_fields() { + // Non-state observability fields (health, exit_code, installed) + // MUST come from the fresh scan even while state is preserved — + // the UI still shows live health/health during a transition. + let mut existing = make_entry(PackageState::Stopping, Some("healthy")); + existing.exit_code = None; + let mut fresh = make_entry(PackageState::Running, Some("unhealthy")); + fresh.exit_code = Some(0); + let merged = merge_preserving_transitional(&existing, &fresh); + assert_eq!(merged.state, PackageState::Stopping); + assert_eq!(merged.health.as_deref(), Some("unhealthy")); + assert_eq!(merged.exit_code, Some(0)); + } + + #[test] + fn is_transitional_covers_all_variants() { + for s in [ + PackageState::Installing, + PackageState::Stopping, + PackageState::Starting, + PackageState::Restarting, + PackageState::Updating, + PackageState::Removing, + PackageState::CreatingBackup, + PackageState::RestoringBackup, + PackageState::BackingUp, + ] { + assert!(is_transitional(&s), "{:?} should be transitional", s); + } + for s in [ + PackageState::Installed, + PackageState::Stopped, + PackageState::Exited, + PackageState::Running, + ] { + assert!( + !is_transitional(&s), + "{:?} should NOT be transitional", + s + ); + } + } +}