// Container Health Monitor // Checks container health every 120s, auto-restarts unhealthy containers (max 10 times) // with exponential backoff (10s..120s), dependency-aware restart ordering (deps first), // handles "created" state containers, resets dependent counters when deps recover, // and sends WebSocket notifications to the UI on failure. use crate::data_model::{Notification, NotificationLevel, PackageState}; use crate::state::StateManager; use crate::webhooks::{self, WebhookEvent}; use serde::{Deserialize, Serialize}; use std::collections::{HashMap, HashSet}; use std::path::{Path, PathBuf}; use std::sync::Arc; use std::time::Instant; use tracing::{debug, info, warn}; const MAX_RESTART_ATTEMPTS: u32 = 10; const CHECK_INTERVAL_SECS: u64 = 120; /// Backoff delays per attempt — escalating from 10s to 120s const BACKOFF_DELAYS_SECS: [u64; 10] = [10, 15, 20, 30, 30, 45, 60, 60, 90, 120]; /// Reset restart counter after 1 hour of stability const STABILITY_RESET_SECS: u64 = 3600; /// Container startup tier for dependency ordering. /// Lower tiers start first. Containers in the same tier start together. #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] enum StartupTier { /// Databases: postgres, redis, mariadb, mysql Database = 0, /// Core infrastructure: bitcoin-knots, bitcoin-core CoreInfra = 1, /// Services depending on core: lnd, electrs, nbxplorer DependentService = 2, /// Application layer: mempool-api, btcpay-server, fedimint, nextcloud, etc. 
Application = 3, /// UI/frontend containers: mempool-web, bitcoin-ui, lnd-ui Frontend = 4, } fn container_tier(name: &str) -> StartupTier { let id = name.strip_prefix("archy-").unwrap_or(name); match id { // Tier 0: Databases and data stores "btcpay-db" | "mempool-db" | "mysql-mempool" | "penpot-postgres" | "immich_postgres" | "immich_redis" | "penpot-valkey" | "endurain-db" | "nextcloud-db" | "indeedhub-postgres" | "indeedhub-redis" | "indeedhub-minio" => StartupTier::Database, // Tier 1: Core infrastructure "bitcoin-knots" | "bitcoin-core" | "bitcoin" => StartupTier::CoreInfra, // Tier 2: Dependent services (need databases or bitcoin) "lnd" | "electrumx" | "mempool-electrs" | "electrs" | "nbxplorer" | "mempool-api" | "indeedhub-api" => StartupTier::DependentService, // Tier 4: Frontend/UI "mempool-web" | "bitcoin-ui" | "lnd-ui" | "electrs-ui" | "penpot-frontend" | "penpot-exporter" | "indeedhub" => StartupTier::Frontend, // Tier 3: Application layer (everything else) _ => StartupTier::Application, } } /// Map containers to their required dependencies. /// When a dependent fails, check and restart its dependencies first. fn container_dependencies(name: &str) -> &'static [&'static str] { let id = name.strip_prefix("archy-").unwrap_or(name); match id { // Bitcoin-dependent chain "lnd" => &["bitcoin"], "electrumx" | "mempool-electrs" | "electrs" => &["bitcoin"], "nbxplorer" => &["bitcoin"], "btcpay-server" => &["btcpay-db", "nbxplorer"], "mempool-api" => &["mempool-db", "electrumx"], "mempool-web" => &["mempool-api"], "fedimint" => &["bitcoin"], "fedimint-gateway" => &["bitcoin", "fedimint"], // IndeedHub stack. The API needs MinIO (object storage) up before it // can serve — without listing it the health monitor would restart the // API while MinIO was still coming up, which is the "needs 1-2 restarts // to recover" symptom (#41). MinIO has no deps of its own, so the // monitor restarts it independently first; no deadlock. 
"indeedhub-api" => &["indeedhub-postgres", "indeedhub-redis", "indeedhub-minio"], "indeedhub" => &["indeedhub-api"], "indeedhub-relay" => &["indeedhub-postgres"], "indeedhub-ffmpeg" => &["indeedhub-api"], // Multi-container stacks "immich_server" => &["immich_postgres", "immich_redis"], "penpot-backend" => &["penpot-postgres", "penpot-valkey"], "penpot-frontend" => &["penpot-backend"], // UI containers "bitcoin-ui" => &["bitcoin"], "lnd-ui" => &["lnd"], "electrs-ui" => &["electrumx"], _ => &[], } } /// Check if all of a container's dependencies are currently running. fn deps_are_running(name: &str, containers: &[ContainerHealth]) -> bool { let deps = container_dependencies(name); if deps.is_empty() { return true; } for dep in deps { if *dep == "bitcoin" { let bitcoin_running = containers.iter().any(|c| { let c_id = c.name.strip_prefix("archy-").unwrap_or(&c.name); matches!(c_id, "bitcoin" | "bitcoin-knots" | "bitcoin-core") && c.state == "running" }); if !bitcoin_running { return false; } continue; } // Check both plain name and archy- prefixed name let dep_running = containers.iter().any(|c| { let c_id = c.name.strip_prefix("archy-").unwrap_or(&c.name); (c_id == *dep || c.name == *dep) && c.state == "running" }); if !dep_running { return false; } } true } fn conflicting_bitcoin_variant(name: &str) -> Option<&'static str> { match name.strip_prefix("archy-").unwrap_or(name) { "bitcoin-core" => Some("bitcoin-knots"), "bitcoin-knots" | "bitcoin" => Some("bitcoin-core"), _ => None, } } fn has_running_bitcoin_conflict(name: &str, containers: &[ContainerHealth]) -> bool { let Some(conflict) = conflicting_bitcoin_variant(name) else { return false; }; containers.iter().any(|c| { let id = c.name.strip_prefix("archy-").unwrap_or(&c.name); id == conflict && c.state == "running" }) } /// Track restart attempts per container with exponential backoff and stability reset. 
struct RestartTracker { attempts: HashMap, last_failure: HashMap, last_healthy: HashMap, } impl RestartTracker { fn new() -> Self { Self { attempts: HashMap::new(), last_failure: HashMap::new(), last_healthy: HashMap::new(), } } /// Record a restart attempt. Returns false if max attempts exceeded. fn record_attempt(&mut self, name: &str) -> bool { let count = self.attempts.entry(name.to_string()).or_insert(0); *count += 1; self.last_failure.insert(name.to_string(), Instant::now()); *count <= MAX_RESTART_ATTEMPTS } /// Clear restart count when a container is healthy again. fn clear(&mut self, name: &str) { self.attempts.remove(name); self.last_failure.remove(name); self.last_healthy.insert(name.to_string(), Instant::now()); } fn attempt_count(&self, name: &str) -> u32 { *self.attempts.get(name).unwrap_or(&0) } /// Get the backoff delay in seconds for the current attempt number. fn backoff_delay_secs(&self, name: &str) -> u64 { let attempts = self.attempt_count(name); if attempts == 0 { return BACKOFF_DELAYS_SECS[0]; } let idx = (attempts as usize) .saturating_sub(1) .min(BACKOFF_DELAYS_SECS.len() - 1); BACKOFF_DELAYS_SECS[idx] } /// Check if enough time has passed since last failure for the backoff delay. fn backoff_elapsed(&self, name: &str) -> bool { let delay = self.backoff_delay_secs(name); match self.last_failure.get(name) { Some(last) => last.elapsed().as_secs() >= delay, None => true, } } /// Check if a failed container should have its counter reset (1h stability window). fn should_reset_failed(&self, name: &str) -> bool { if self.attempt_count(name) < MAX_RESTART_ATTEMPTS { return false; } match self.last_failure.get(name) { Some(last) => last.elapsed().as_secs() >= STABILITY_RESET_SECS, None => false, } } } #[derive(Debug, Clone)] struct ContainerHealth { name: String, app_id: String, state: String, podman_health: Option, host_port_ready: Option, healthy: bool, } /// Track container memory usage over time for leak detection. 
struct MemoryTracker {
    /// Per-container memory samples, oldest first: (time recorded, rss_bytes)
    samples: HashMap<String, Vec<(Instant, u64)>>,
}

impl MemoryTracker {
    /// Samples kept per container. The monitor loop samples on every 5th
    /// health check, i.e. every 10 minutes at the 120 s check interval, so
    /// this is roughly 48 hours of history.
    const MAX_SAMPLES: usize = 288;
    /// Fewest samples required before growth is evaluated.
    const MIN_SAMPLES: usize = 12;
    /// Minimum age, in hours, of the oldest sample before growth is evaluated.
    const MIN_WINDOW_HOURS: f64 = 1.0;
    /// Growth (percent, oldest → latest sample) above which a leak is reported.
    const LEAK_GROWTH_PERCENT: f64 = 50.0;

    /// Create a tracker with no samples.
    fn new() -> Self {
        Self {
            samples: HashMap::new(),
        }
    }

    /// Record a memory sample for a container, stamped with the current time.
    /// Only the newest `MAX_SAMPLES` samples are retained.
    fn record(&mut self, name: &str, rss_bytes: u64) {
        let history = self.samples.entry(name.to_string()).or_default();
        history.push((Instant::now(), rss_bytes));
        if history.len() > Self::MAX_SAMPLES {
            let excess = history.len() - Self::MAX_SAMPLES;
            history.drain(..excess);
        }
    }

    /// Compare the oldest and latest retained samples for `name`.
    ///
    /// Returns `Some(growth_percent)` when memory grew by more than 50%, and
    /// `None` when growth is smaller or there is not enough data: fewer than
    /// 12 samples, an oldest sample younger than one hour, an oldest sample
    /// of 0 bytes, or an unknown container.
    fn check_leak(&self, name: &str) -> Option<f64> {
        let samples = self.samples.get(name)?;
        if samples.len() < Self::MIN_SAMPLES {
            return None;
        }
        let (oldest_time, oldest_rss) = *samples.first()?;
        let (_, latest_rss) = *samples.last()?;
        let elapsed_hours = oldest_time.elapsed().as_secs() as f64 / 3600.0;
        // A zero baseline would make the percentage meaningless (division by zero).
        if elapsed_hours < Self::MIN_WINDOW_HOURS || oldest_rss == 0 {
            return None;
        }
        let growth = (latest_rss as f64 - oldest_rss as f64) / oldest_rss as f64 * 100.0;
        (growth > Self::LEAK_GROWTH_PERCENT).then_some(growth)
    }
}

// ── Persistent restart tracking ────────────────────────────────────────
// Survives process restarts so a container can't loop infinitely by
// crashing 3 times → triggering process restart → resetting counter → repeat.
const RESTART_HISTORY_FILE: &str = "restart-tracker.json"; #[derive(Serialize, Deserialize, Default)] struct RestartHistory { containers: HashMap, } #[derive(Serialize, Deserialize, Clone)] struct ContainerRestartRecord { attempts: u32, last_failure_epoch: i64, } impl RestartHistory { async fn load(data_dir: &Path) -> Self { let path = data_dir.join(RESTART_HISTORY_FILE); match tokio::fs::read_to_string(&path).await { Ok(content) => serde_json::from_str(&content).unwrap_or_default(), Err(_) => Self::default(), } } async fn save(&self, data_dir: &Path) { let path = data_dir.join(RESTART_HISTORY_FILE); if let Ok(json) = serde_json::to_string(self) { let _ = tokio::fs::write(&path, json).await; } } /// Seed the in-memory RestartTracker from persisted history. fn seed_tracker(&self, tracker: &mut RestartTracker) { let now_epoch = chrono::Utc::now().timestamp(); for (name, record) in &self.containers { // Only seed if last failure was within the stability window let secs_since_failure = now_epoch - record.last_failure_epoch; if secs_since_failure < STABILITY_RESET_SECS as i64 && record.attempts > 0 { tracker.attempts.insert(name.clone(), record.attempts); info!( "Restored restart counter for {}: {} attempts ({}s ago)", name, record.attempts, secs_since_failure ); } } } fn record_attempt(&mut self, name: &str) { let entry = self .containers .entry(name.to_string()) .or_insert(ContainerRestartRecord { attempts: 0, last_failure_epoch: 0, }); entry.attempts += 1; entry.last_failure_epoch = chrono::Utc::now().timestamp(); } fn clear(&mut self, name: &str) { self.containers.remove(name); } } /// Query container memory stats from podman. 
async fn check_container_memory() -> HashMap { let output = match tokio::time::timeout( std::time::Duration::from_secs(30), tokio::process::Command::new("podman") .args([ "stats", "--no-stream", "--format", "{{.Name}} {{.MemUsage}}", ]) .output(), ) .await { Ok(Ok(o)) if o.status.success() => o, Ok(Err(e)) => { debug!("podman stats failed: {}", e); return HashMap::new(); } Err(_) => { debug!("podman stats timed out (30s)"); return HashMap::new(); } _ => return HashMap::new(), }; let stdout = String::from_utf8_lossy(&output.stdout); let mut result = HashMap::new(); for line in stdout.lines() { let parts: Vec<&str> = line.split_whitespace().collect(); if parts.len() >= 2 { let name = parts[0].to_string(); // Parse memory like "123.4MiB", "1.2GiB", "45.6kB" let mem_str = parts[1]; if let Some(bytes) = parse_memory_string(mem_str) { result.insert(name, bytes); } } } result } /// Parse memory string like "123.4MiB" or "1.2GiB" to bytes. fn parse_memory_string(s: &str) -> Option { let s = s.trim(); if s.ends_with("GiB") { let num: f64 = s.strip_suffix("GiB")?.parse().ok()?; Some((num * 1_073_741_824.0) as u64) } else if s.ends_with("MiB") { let num: f64 = s.strip_suffix("MiB")?.parse().ok()?; Some((num * 1_048_576.0) as u64) } else if s.ends_with("KiB") || s.ends_with("kB") { let suffix = if s.ends_with("KiB") { "KiB" } else { "kB" }; let num: f64 = s.strip_suffix(suffix)?.parse().ok()?; Some((num * 1024.0) as u64) } else if s.ends_with("B") { let num: f64 = s.strip_suffix('B')?.parse().ok()?; Some(num as u64) } else { None } } /// Query all containers and their health status. 
async fn check_containers() -> Vec { let output = match tokio::time::timeout( std::time::Duration::from_secs(60), tokio::process::Command::new("podman") .args(["ps", "-a", "--format", "json"]) .output(), ) .await { Ok(Ok(o)) if o.status.success() => o, Ok(Err(e)) => { debug!("podman ps failed: {}", e); return Vec::new(); } Err(_) => { debug!("podman ps timed out (60s)"); return Vec::new(); } _ => return Vec::new(), }; let stdout = String::from_utf8_lossy(&output.stdout); let containers: Vec = serde_json::from_str(&stdout).unwrap_or_default(); let live_container_ids = live_container_ids(&containers); cleanup_stale_podman_healthcheck_units(&live_container_ids).await; // Monitor ALL long-running containers for health — backend services (databases, // nbxplorer, mempool-api) and UI containers need auto-restart too. // Only skip ephemeral containers (build infrastructure, init one-shots). let mut out = Vec::new(); for c in &containers { let name = c.get("Names").and_then(|v| { if let Some(arr) = v.as_array() { arr.first().and_then(|n| n.as_str()).map(|s| s.to_string()) } else { v.as_str().map(|s| s.to_string()) } }); let Some(name) = name else { continue; }; // Skip podman-compose infrastructure and one-shot init containers if name.starts_with("indeedhub-build_") || name.contains("-init") { continue; } let app_id = name.strip_prefix("archy-").unwrap_or(&name).to_string(); let state = c .get("State") .and_then(|v| v.as_str()) .unwrap_or("unknown") .to_lowercase(); let podman_health = parse_podman_health(c, &state); let host_ports = host_tcp_ports_from_container(c); let host_port_ready = if host_ports.is_empty() { None } else { Some(host_ports_ready(&host_ports).await) }; let healthy = state == "running" && podman_health.as_deref() != Some("unhealthy") && host_port_ready != Some(false); out.push(ContainerHealth { name, app_id, state, podman_health, host_port_ready, healthy, }); } out } fn host_tcp_ports_from_container(c: &serde_json::Value) -> Vec { let Some(ports) = 
c.get("Ports").and_then(|v| v.as_array()) else { return Vec::new(); }; let mut out: Vec = ports .iter() .filter(|p| { p.get("protocol") .and_then(|v| v.as_str()) .unwrap_or("tcp") .eq_ignore_ascii_case("tcp") }) .filter_map(|p| { p.get("host_port") .and_then(|v| v.as_u64()) .and_then(|port| u16::try_from(port).ok()) }) .collect(); out.sort_unstable(); out.dedup(); out } async fn host_ports_ready(ports: &[u16]) -> bool { for port in ports { let ready = tokio::time::timeout( std::time::Duration::from_secs(2), tokio::net::TcpStream::connect(("127.0.0.1", *port)), ) .await .is_ok_and(|r| r.is_ok()); if !ready { return false; } } true } fn live_container_ids(containers: &[serde_json::Value]) -> HashSet { containers .iter() .filter_map(|c| { c.get("Id") .or_else(|| c.get("ID")) .and_then(|v| v.as_str()) .map(|s| s.to_string()) }) .collect() } async fn cleanup_stale_podman_healthcheck_units(live_container_ids: &HashSet) { if live_container_ids.is_empty() { return; } let mut units = stale_healthcheck_units_from_systemd(live_container_ids).await; if units.is_empty() { return; } units.sort(); units.dedup(); let mut cleaned = 0; for unit in units { let Some(container_id) = parse_podman_healthcheck_unit(&unit) else { continue; }; let service = format!("{}.service", unit.trim_end_matches(".timer")); if stop_user_unit(&unit).await { cleaned += 1; } let _ = stop_user_unit(&service).await; let _ = reset_failed_user_unit(&service).await; debug!( "Stopped stale Podman healthcheck unit {} for removed container {}", unit, container_id ); } if cleaned > 0 { info!("Cleaned {} stale Podman healthcheck timer(s)", cleaned); } } async fn stale_healthcheck_units_from_systemd(live_container_ids: &HashSet) -> Vec { let mut units = Vec::new(); for args in [ [ "--user", "list-timers", "--all", "--no-legend", "--no-pager", ] .as_slice(), ["--user", "list-units", "--all", "--no-legend", "--no-pager"].as_slice(), ] { let output = match tokio::time::timeout( std::time::Duration::from_secs(20), 
tokio::process::Command::new("systemctl") .args(args.iter().copied()) .output(), ) .await { Ok(Ok(output)) if output.status.success() => output, Ok(Ok(output)) => { let stderr = String::from_utf8_lossy(&output.stderr); debug!("systemctl {} failed: {}", args.join(" "), stderr.trim()); continue; } Ok(Err(e)) => { debug!("Failed to run systemctl {}: {}", args.join(" "), e); continue; } Err(_) => { debug!("systemctl {} timed out", args.join(" ")); continue; } }; let stdout = String::from_utf8_lossy(&output.stdout); units.extend(stale_healthcheck_units(&stdout, live_container_ids)); } units } fn stale_healthcheck_units(output: &str, live_container_ids: &HashSet) -> Vec { output .lines() .flat_map(|line| line.split_whitespace()) .filter_map(|token| { let unit = token.trim_start_matches('●'); let id = parse_podman_healthcheck_unit(unit)?; (!live_container_ids.contains(id)).then(|| unit.to_string()) }) .collect() } fn parse_podman_healthcheck_unit(unit: &str) -> Option<&str> { let unit = unit .strip_suffix(".timer") .or_else(|| unit.strip_suffix(".service"))?; let (container_id, _suffix) = unit.split_once('-')?; if container_id.len() == 64 && container_id.bytes().all(|b| b.is_ascii_hexdigit()) { Some(container_id) } else { None } } async fn stop_user_unit(unit: &str) -> bool { run_systemctl_user(["stop", unit]).await } async fn reset_failed_user_unit(unit: &str) -> bool { run_systemctl_user(["reset-failed", unit]).await } async fn run_systemctl_user(args: [&str; N]) -> bool { let output = match tokio::time::timeout( std::time::Duration::from_secs(10), tokio::process::Command::new("systemctl") .arg("--user") .args(args.iter().copied()) .output(), ) .await { Ok(Ok(output)) => output, Ok(Err(e)) => { debug!("Failed to run systemctl --user {}: {}", args.join(" "), e); return false; } Err(_) => { debug!("systemctl --user {} timed out", args.join(" ")); return false; } }; if output.status.success() { true } else { let stderr = String::from_utf8_lossy(&output.stderr); debug!( 
"systemctl --user {} failed: {}", args.join(" "), stderr.trim() ); false } } fn parse_podman_health(c: &serde_json::Value, state: &str) -> Option { c.get("Status") .and_then(|v| v.as_str()) .and_then(parse_health_from_status) .or_else(|| { c.get("State") .and_then(|v| v.get("Health")) .and_then(|v| v.get("Status")) .and_then(|v| v.as_str()) .map(|s| s.to_string()) }) .or_else(|| state.contains("unhealthy").then(|| "unhealthy".to_string())) } fn parse_health_from_status(status: &str) -> Option { let start = status.rfind('(')?; let end = status.rfind(')')?; (start < end).then(|| status[start + 1..end].to_string()) } /// Try to recover a container. Running containers need a real restart so /// rootless network helpers such as pasta are recreated; `podman start` is a /// no-op for a running container with a missing host listener. async fn restart_container(name: &str, state: &str) -> bool { let action = if state == "running" { "restart" } else { "start" }; info!("Auto-{}ing unhealthy container: {}", action, name); let result = tokio::time::timeout( std::time::Duration::from_secs(120), tokio::process::Command::new("systemd-run") .args(["--user", "--scope", "--quiet", "--collect", "podman"]) .args([action, name]) .output(), ) .await; match result { Ok(Ok(output)) if output.status.success() => { info!("Successfully recovered container: {}", name); true } Ok(Ok(output)) => { let stderr = String::from_utf8_lossy(&output.stderr); warn!("Failed to {} container {}: {}", action, name, stderr.trim()); false } Ok(Err(e)) => { warn!("Failed to execute podman {} for {}: {}", action, name, e); false } Err(_) => { warn!("Timeout {}ing container {} (120s)", action, name); false } } } /// ElectrumX/electrs on-disk data dir. Wiped to force a clean resync when its /// LevelDB is detected corrupt (see `maybe_recover_corrupt_electrumx`). 
const ELECTRUMX_DATA_DIR: &str = "/var/lib/archipelago/electrumx";

/// Attempt number at which a crash-looping ElectrumX is inspected for — and
/// recovered from — a corrupt database. Chosen late enough that a transient
/// failure does not cause a destructive resync, yet early enough to self-heal
/// before MAX_RESTART_ATTEMPTS is reached.
const ELECTRUMX_DB_RESET_ATTEMPT: u32 = 3;

/// True for the Electrum server containers (`electrumx`, `electrs`,
/// `mempool-electrs`), with or without the `archy-` prefix.
fn is_electrumx(name: &str) -> bool {
    let short = name.strip_prefix("archy-").unwrap_or(name);
    ["electrumx", "electrs", "mempool-electrs"].contains(&short)
}

/// True when the given logs carry the corrupt-LevelDB signature.
/// ElectrumX exit-loops with a plyvel error when its `hist`/`utxo` LevelDB
/// loses its CURRENT/MANIFEST pointer — typically after an unclean SIGKILL
/// (e.g. the cgroup cascade on a service restart). Only these specific
/// messages count, so an ordinary restart never triggers the destructive
/// resync.
fn looks_like_corrupt_electrumx_db(logs: &str) -> bool {
    const DIRECT_SIGNATURES: [&str; 2] = ["create_if_missing is false", "Corruption:"];
    if DIRECT_SIGNATURES.iter().any(|sig| logs.contains(sig)) {
        return true;
    }
    // plyvel reporting a missing database file
    logs.contains("plyvel") && logs.contains("does not exist")
}

/// Fetch the last 60 log lines of `name` (stdout followed by stderr) and
/// check them for the corruption signature. Any failure to obtain the logs
/// within 15 seconds counts as "not corrupt".
async fn electrumx_db_corrupt(name: &str) -> bool {
    let fetch_logs = tokio::process::Command::new("podman")
        .args(["logs", "--tail", "60", name])
        .output();
    let Ok(Ok(output)) =
        tokio::time::timeout(std::time::Duration::from_secs(15), fetch_logs).await
    else {
        return false;
    };

    let mut logs = String::from_utf8_lossy(&output.stdout).into_owned();
    logs.push_str(&String::from_utf8_lossy(&output.stderr));
    looks_like_corrupt_electrumx_db(&logs)
}

/// Wipe the ElectrumX LevelDB stores so the next start resyncs from scratch.
/// Files are owned by the container's mapped UID, so removal needs host sudo.
/// The mount point itself is preserved (the container expects it to exist).
/// Returns true if the reset succeeded.
async fn reset_electrumx_data() -> bool { let rm = format!( "rm -rf {dir}/hist {dir}/utxo {dir}/meta {dir}/COIN", dir = ELECTRUMX_DATA_DIR, ); matches!( crate::update::host_sudo(&["sh", "-c", &rm]).await, Ok(status) if status.success() ) } /// Self-heal a wedged ElectrumX: if it's exit-looping on a corrupt database, /// wipe its data dir once (at `ELECTRUMX_DB_RESET_ATTEMPT`) so the impending /// restart resyncs cleanly. Bounded to electrs containers, gated on the exact /// corruption signature, and fired once per failure streak — when the resync /// stabilises the restart tracker clears, so a future corruption can heal too. async fn maybe_recover_corrupt_electrumx(name: &str, attempt: u32) { if attempt != ELECTRUMX_DB_RESET_ATTEMPT || !is_electrumx(name) { return; } if !electrumx_db_corrupt(name).await { return; } warn!( "ElectrumX {} is exit-looping on a corrupt database — resetting its data dir to force a clean resync", name ); if reset_electrumx_data().await { info!("ElectrumX data dir reset; a fresh resync will begin on restart"); } else { warn!("Failed to reset ElectrumX data dir (host sudo rm failed) — manual recovery may be needed"); } } /// Spawn the health monitor background task. pub fn spawn_health_monitor(state: Arc, data_dir: PathBuf) { tokio::spawn(async move { // Wait for boot recovery to complete before starting health checks. // This prevents the health monitor from fighting with crash_recovery // which is starting containers in tier order. info!("Health monitor: waiting for boot recovery to complete..."); let wait_start = std::time::Instant::now(); loop { if crate::crash_recovery::is_recovery_complete() { break; } // Safety timeout: start anyway after 30 minutes even if recovery hangs. // Stack recovery can take many minutes on low-resource nodes after reboot. 
if wait_start.elapsed().as_secs() > 1800 { warn!("Health monitor: boot recovery did not complete within 30 minutes, starting anyway"); break; } tokio::time::sleep(std::time::Duration::from_secs(5)).await; } // Additional cooldown after recovery to let containers stabilize info!("Health monitor: recovery done, waiting 60s for containers to stabilize..."); tokio::time::sleep(std::time::Duration::from_secs(60)).await; info!("Health monitor: starting health checks"); let mut tracker = RestartTracker::new(); let mut mem_tracker = MemoryTracker::new(); let mut mem_check_counter: u32 = 0; let mut interval = tokio::time::interval(std::time::Duration::from_secs(CHECK_INTERVAL_SECS)); // Skip missed ticks — prevents burst of health checks after slow podman response interval.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip); // Load persistent restart history and seed the in-memory tracker let mut restart_history = RestartHistory::load(&data_dir).await; restart_history.seed_tracker(&mut tracker); #[allow(unused_assignments)] let mut history_dirty = false; loop { interval.tick().await; mem_check_counter += 1; // Check container memory every 5 minutes (every 5th health check) if mem_check_counter.is_multiple_of(5) { let mem_stats = check_container_memory().await; for (name, rss) in &mem_stats { mem_tracker.record(name, *rss); if let Some(growth) = mem_tracker.check_leak(name) { warn!( "Potential memory leak in {}: {:.0}% growth over tracking period", name, growth ); } } } let containers = check_containers().await; if containers.is_empty() { continue; } // Load user-stopped list to skip intentionally stopped containers let user_stopped = crate::crash_recovery::load_user_stopped(&data_dir).await; // Sort containers by startup tier so databases restart before dependent services let mut unhealthy: Vec<&ContainerHealth> = Vec::new(); let mut state_changed = false; let (mut data, _) = state.get_snapshot().await; for container in &containers { // Skip 
optional/marketplace containers that aren't installed if let Some(pkg) = data.package_data.get(&container.app_id) { if pkg.installed.is_none() { debug!("Skipping uninstalled container: {}", container.name); continue; } if matches!( pkg.state, PackageState::Starting | PackageState::Stopping | PackageState::Restarting ) { debug!( "Skipping container during package lifecycle transition: {} ({:?})", container.name, pkg.state ); continue; } } else { // Orphan: container exists in podman but archipelago has // no package_data entry for it. Common after a variant // switch (bitcoin-core ↔ bitcoin-knots) where the // uninstall removed the package entry but the prior // variant's container survived in stopped state. Without // this guard the health monitor pages every minute with // "Auto-restart failed (attempt N/10)" for an app the // user can no longer see in the dashboard. debug!( "Skipping orphan container (not in package_data): {}", container.name ); let before = data.notifications.len(); data.notifications.retain(|n| { n.app_id.as_deref() != Some(&container.app_id) && !n.title.contains(&container.app_id) }); if data.notifications.len() != before { state_changed = true; } continue; } if container.healthy { let before = data.notifications.len(); data.notifications.retain(|n| { n.app_id.as_deref() != Some(&container.app_id) && !n.title.contains(&container.app_id) }); if data.notifications.len() != before { state_changed = true; } if tracker.attempt_count(&container.name) > 0 { info!( "Container {} is healthy again after restart", container.name ); // Reset attempt counters for containers that depend on this one, // since their previous failures may have been caused by this // dependency being down let recovered_id = container .name .strip_prefix("archy-") .unwrap_or(&container.name) .to_string(); for other in &containers { let deps = container_dependencies(&other.name); if deps .iter() .any(|d| *d == recovered_id || *d == container.name) && 
tracker.attempt_count(&other.name) > 0 { info!( "Resetting restart counter for {} (dependency {} recovered)", other.name, container.name ); tracker.clear(&other.name); restart_history.clear(&other.name); } } tracker.clear(&container.name); restart_history.clear(&container.name); history_dirty = true; } continue; } // Handle exited, stopped, created, and Podman-unhealthy running containers. if container.podman_health.as_deref() == Some("unhealthy") || container.host_port_ready == Some(false) || container.state == "exited" || container.state == "stopped" || container.state == "created" { // Skip user-stopped containers if user_stopped.contains(&container.name) { debug!("Skipping user-stopped container: {}", container.name); continue; } unhealthy.push(container); } } let unhealthy_app_ids: HashSet<&str> = unhealthy .iter() .map(|container| container.app_id.as_str()) .collect(); let before = data.notifications.len(); data.notifications.retain(|n| { !n.id.starts_with("health-") || n.app_id .as_deref() .is_some_and(|app_id| unhealthy_app_ids.contains(app_id)) }); if data.notifications.len() != before { state_changed = true; } // Sort by startup tier: databases first, then core, then dependent, then apps, then UIs unhealthy.sort_by_key(|c| container_tier(&c.name)); let mut prev_tier: Option = None; for container in &unhealthy { let tier = container_tier(&container.name); // Reset counter after 1 hour for permanently failed containers if tracker.should_reset_failed(&container.name) { info!( "Resetting restart counter for {} after {}s stability window", container.name, STABILITY_RESET_SECS ); tracker.clear(&container.name); restart_history.clear(&container.name); history_dirty = true; } if tracker.attempt_count(&container.name) >= MAX_RESTART_ATTEMPTS { debug!( "Container {} exceeded max restart attempts ({})", container.name, MAX_RESTART_ATTEMPTS ); continue; } // Wait for backoff delay before retrying if !tracker.backoff_elapsed(&container.name) { let delay = 
tracker.backoff_delay_secs(&container.name); debug!( "Container {} waiting for backoff ({}s)", container.name, delay ); continue; } // Skip if dependencies aren't running — they need to start first if !deps_are_running(&container.name, &containers) { let deps = container_dependencies(&container.name); debug!( "Container {} waiting for dependencies {:?}", container.name, deps ); continue; } if has_running_bitcoin_conflict(&container.name, &containers) { debug!( "Skipping auto-restart for {} because the other Bitcoin implementation is running", container.name ); continue; } // When transitioning to a higher tier, wait briefly for previous tier to stabilize if let Some(prev) = prev_tier { if tier > prev { tokio::time::sleep(std::time::Duration::from_secs(5)).await; } } prev_tier = Some(tier); if tracker.record_attempt(&container.name) { restart_history.record_attempt(&container.name); history_dirty = true; let attempt = tracker.attempt_count(&container.name); info!( "Restarting {} (tier {:?}, attempt {}/{}, backoff {}s)", container.name, tier, attempt, MAX_RESTART_ATTEMPTS, BACKOFF_DELAYS_SECS .get(attempt.saturating_sub(1) as usize) .unwrap_or(&90) ); // Before restarting, self-heal a corrupt ElectrumX DB so // the restart resyncs cleanly instead of crash-looping. maybe_recover_corrupt_electrumx(&container.name, attempt).await; let restarted = restart_container(&container.name, &container.state).await; if !restarted || attempt >= MAX_RESTART_ATTEMPTS { let notification = Notification { id: format!( "health-{}-{}", container.app_id, chrono::Utc::now().timestamp() ), level: NotificationLevel::Error, title: format!("{} is unhealthy", container.app_id), message: if restarted { format!( "Container restarted ({}/{} attempts). May need manual attention.", attempt, MAX_RESTART_ATTEMPTS ) } else { format!( "Auto-restart failed (attempt {}/{}). 
Container state: {}", attempt, MAX_RESTART_ATTEMPTS, container.state ) }, timestamp: chrono::Utc::now().to_rfc3339(), app_id: Some(container.app_id.clone()), }; data.notifications.push(notification.clone()); if data.notifications.len() > 20 { data.notifications = data.notifications.split_off(data.notifications.len() - 20); } state_changed = true; let webhook_payload = webhooks::WebhookPayload { event: WebhookEvent::ContainerCrash, title: notification.title, message: notification.message, timestamp: notification.timestamp, node_id: String::new(), details: Some(serde_json::json!({ "container": container.name, "app_id": container.app_id, "state": container.state, "attempt": attempt, "tier": format!("{:?}", tier), })), }; webhooks::send_webhook(&data_dir, webhook_payload).await; } } } if state_changed { state.update_data(data).await; debug!("Health monitor: state updated with notifications"); } // Persist restart history to disk (debounced: once per check cycle) if history_dirty { restart_history.save(&data_dir).await; history_dirty = false; } } }); } #[cfg(test)] mod tests { use super::*; #[test] fn test_restart_tracker_new_is_empty() { let tracker = RestartTracker::new(); assert_eq!(tracker.attempt_count("any-container"), 0); } #[test] fn test_restart_tracker_record_attempt_increments() { let mut tracker = RestartTracker::new(); assert!(tracker.record_attempt("test-container")); assert_eq!(tracker.attempt_count("test-container"), 1); assert!(tracker.record_attempt("test-container")); assert_eq!(tracker.attempt_count("test-container"), 2); assert!(tracker.record_attempt("test-container")); assert_eq!(tracker.attempt_count("test-container"), 3); } #[test] fn test_restart_tracker_max_attempts_exceeded() { let mut tracker = RestartTracker::new(); for i in 1..=MAX_RESTART_ATTEMPTS { assert!( tracker.record_attempt("container-a"), "Attempt {} should be allowed", i ); } assert!(!tracker.record_attempt("container-a")); assert_eq!( tracker.attempt_count("container-a"), 
MAX_RESTART_ATTEMPTS + 1 ); } #[test] fn test_restart_tracker_independent_containers() { let mut tracker = RestartTracker::new(); tracker.record_attempt("container-a"); tracker.record_attempt("container-a"); tracker.record_attempt("container-b"); assert_eq!(tracker.attempt_count("container-a"), 2); assert_eq!(tracker.attempt_count("container-b"), 1); assert_eq!(tracker.attempt_count("container-c"), 0); } #[test] fn test_restart_tracker_clear_resets_count() { let mut tracker = RestartTracker::new(); tracker.record_attempt("container-x"); tracker.record_attempt("container-x"); assert_eq!(tracker.attempt_count("container-x"), 2); tracker.clear("container-x"); assert_eq!(tracker.attempt_count("container-x"), 0); } #[test] fn test_restart_tracker_clear_allows_new_attempts() { let mut tracker = RestartTracker::new(); for _ in 0..=MAX_RESTART_ATTEMPTS { tracker.record_attempt("container-y"); } assert!(!tracker.record_attempt("container-y")); tracker.clear("container-y"); assert!(tracker.record_attempt("container-y")); assert_eq!(tracker.attempt_count("container-y"), 1); } #[test] fn test_restart_tracker_clear_nonexistent_is_safe() { let mut tracker = RestartTracker::new(); tracker.clear("nonexistent"); assert_eq!(tracker.attempt_count("nonexistent"), 0); } #[test] fn test_container_health_struct() { let health = ContainerHealth { name: "archy-bitcoin-knots".to_string(), app_id: "bitcoin-knots".to_string(), state: "running".to_string(), podman_health: Some("healthy".to_string()), host_port_ready: None, healthy: true, }; assert!(health.healthy); assert_eq!(health.name, "archy-bitcoin-knots"); assert_eq!(health.app_id, "bitcoin-knots"); assert_eq!(health.state, "running"); } #[test] fn test_container_health_unhealthy() { let health = ContainerHealth { name: "archy-mempool-web".to_string(), app_id: "mempool-web".to_string(), state: "exited".to_string(), podman_health: None, host_port_ready: None, healthy: false, }; assert!(!health.healthy); assert_eq!(health.state, "exited"); 
} #[test] fn test_max_restart_attempts_constant() { assert!(MAX_RESTART_ATTEMPTS >= 1); assert!(MAX_RESTART_ATTEMPTS <= 20); assert_eq!(MAX_RESTART_ATTEMPTS, 10); } #[test] fn test_check_interval_constant() { assert_eq!(CHECK_INTERVAL_SECS, 120); } #[test] fn test_backoff_delays() { let tracker = RestartTracker::new(); // Before any attempts, delay is first backoff assert_eq!(tracker.backoff_delay_secs("test"), BACKOFF_DELAYS_SECS[0]); } #[test] fn test_backoff_delays_escalate() { let mut tracker = RestartTracker::new(); tracker.record_attempt("test"); assert_eq!(tracker.backoff_delay_secs("test"), BACKOFF_DELAYS_SECS[0]); // 10s tracker.record_attempt("test"); assert_eq!(tracker.backoff_delay_secs("test"), BACKOFF_DELAYS_SECS[1]); // 30s tracker.record_attempt("test"); assert_eq!(tracker.backoff_delay_secs("test"), BACKOFF_DELAYS_SECS[2]); // 90s } #[test] fn test_backoff_elapsed_true_for_new() { let tracker = RestartTracker::new(); assert!(tracker.backoff_elapsed("test")); } #[test] fn test_stability_reset_not_triggered_early() { let mut tracker = RestartTracker::new(); tracker.record_attempt("test"); assert!(!tracker.should_reset_failed("test")); } #[test] fn test_container_tier_database() { assert_eq!(container_tier("archy-btcpay-db"), StartupTier::Database); assert_eq!(container_tier("immich_postgres"), StartupTier::Database); assert_eq!(container_tier("penpot-valkey"), StartupTier::Database); assert_eq!(container_tier("indeedhub-postgres"), StartupTier::Database); assert_eq!(container_tier("indeedhub-redis"), StartupTier::Database); assert_eq!(container_tier("indeedhub-minio"), StartupTier::Database); } #[test] fn test_container_tier_indeedhub_api() { assert_eq!( container_tier("indeedhub-api"), StartupTier::DependentService ); } #[test] fn test_container_tier_mempool_api() { assert_eq!(container_tier("mempool-api"), StartupTier::DependentService); } #[test] fn test_container_dependencies() { assert!(container_dependencies("lnd").contains(&"bitcoin")); 
assert!(container_dependencies("indeedhub-api").contains(&"indeedhub-postgres")); assert!(container_dependencies("indeedhub-api").contains(&"indeedhub-redis")); assert!(container_dependencies("mempool-api").contains(&"mempool-db")); assert!(container_dependencies("mempool-api").contains(&"electrumx")); assert!(container_dependencies("nextcloud").is_empty()); } #[test] fn test_deps_are_running() { let containers = vec![ ContainerHealth { name: "indeedhub-postgres".into(), app_id: "indeedhub-postgres".into(), state: "running".into(), podman_health: None, host_port_ready: None, healthy: true, }, ContainerHealth { name: "indeedhub-redis".into(), app_id: "indeedhub-redis".into(), state: "running".into(), podman_health: None, host_port_ready: None, healthy: true, }, ContainerHealth { name: "indeedhub-api".into(), app_id: "indeedhub-api".into(), state: "exited".into(), podman_health: None, host_port_ready: None, healthy: false, }, ]; assert!(deps_are_running("indeedhub-api", &containers)); // Missing postgres let partial = vec![ContainerHealth { name: "indeedhub-redis".into(), app_id: "indeedhub-redis".into(), state: "running".into(), podman_health: None, host_port_ready: None, healthy: true, }]; assert!(!deps_are_running("indeedhub-api", &partial)); } #[test] fn test_bitcoin_dependency_accepts_core_or_knots() { let core = vec![ContainerHealth { name: "bitcoin-core".into(), app_id: "bitcoin-core".into(), state: "running".into(), podman_health: None, host_port_ready: None, healthy: true, }]; assert!(deps_are_running("lnd", &core)); let knots = vec![ContainerHealth { name: "bitcoin-knots".into(), app_id: "bitcoin-knots".into(), state: "running".into(), podman_health: None, host_port_ready: None, healthy: true, }]; assert!(deps_are_running("fedimint", &knots)); let stopped = vec![ContainerHealth { name: "bitcoin-core".into(), app_id: "bitcoin-core".into(), state: "stopped".into(), podman_health: None, host_port_ready: None, healthy: false, }]; 
assert!(!deps_are_running("electrumx", &stopped)); } #[test] fn test_bitcoin_conflict_detection() { let containers = vec![ContainerHealth { name: "bitcoin-core".into(), app_id: "bitcoin-core".into(), state: "running".into(), podman_health: None, host_port_ready: None, healthy: true, }]; assert!(has_running_bitcoin_conflict("bitcoin-knots", &containers)); assert!(!has_running_bitcoin_conflict("bitcoin-core", &containers)); assert!(!has_running_bitcoin_conflict("lnd", &containers)); } #[test] fn test_bitcoin_conflict_ignores_stopped_sibling() { let containers = vec![ContainerHealth { name: "bitcoin-core".into(), app_id: "bitcoin-core".into(), state: "stopped".into(), podman_health: None, host_port_ready: None, healthy: false, }]; assert!(!has_running_bitcoin_conflict("bitcoin-knots", &containers)); } #[test] fn test_container_tier_core() { assert_eq!(container_tier("bitcoin-knots"), StartupTier::CoreInfra); } #[test] fn test_container_tier_dependent() { assert_eq!(container_tier("lnd"), StartupTier::DependentService); assert_eq!( container_tier("mempool-electrs"), StartupTier::DependentService ); assert_eq!( container_tier("archy-nbxplorer"), StartupTier::DependentService ); } #[test] fn test_container_tier_frontend() { assert_eq!(container_tier("archy-mempool-web"), StartupTier::Frontend); assert_eq!(container_tier("archy-bitcoin-ui"), StartupTier::Frontend); } #[test] fn test_container_tier_application_default() { assert_eq!(container_tier("nextcloud"), StartupTier::Application); assert_eq!(container_tier("grafana"), StartupTier::Application); assert_eq!(container_tier("btcpay-server"), StartupTier::Application); } #[test] fn test_tier_ordering() { assert!(StartupTier::Database < StartupTier::CoreInfra); assert!(StartupTier::CoreInfra < StartupTier::DependentService); assert!(StartupTier::DependentService < StartupTier::Application); assert!(StartupTier::Application < StartupTier::Frontend); } #[test] fn test_parse_memory_gib() { 
assert_eq!(parse_memory_string("1.5GiB"), Some(1_610_612_736)); } #[test] fn test_parse_memory_mib() { assert_eq!(parse_memory_string("256MiB"), Some(268_435_456)); } #[test] fn test_parse_memory_kib() { assert_eq!(parse_memory_string("512KiB"), Some(524_288)); } #[test] fn test_parse_memory_invalid() { assert_eq!(parse_memory_string("abc"), None); } #[test] fn test_memory_tracker_no_leak_few_samples() { let mut tracker = MemoryTracker::new(); tracker.record("test", 100_000_000); assert!(tracker.check_leak("test").is_none()); } #[test] fn podman_unhealthy_makes_running_container_unhealthy() { let c = serde_json::json!({ "Status": "Up 5 minutes (unhealthy)" }); assert_eq!( parse_podman_health(&c, "running").as_deref(), Some("unhealthy") ); } #[test] fn parses_podman_healthcheck_systemd_units() { let id = "c1f44a6369c91d65f9e9f6134a5591aa02792cff2f1a4e0f689b5a6c03b6c77c"; assert_eq!( parse_podman_healthcheck_unit(&format!("{}-15c66ddfefa8a763.timer", id)), Some(id) ); assert_eq!( parse_podman_healthcheck_unit(&format!("{}-15c66ddfefa8a763.service", id)), Some(id) ); assert_eq!(parse_podman_healthcheck_unit("grafana.service"), None); assert_eq!( parse_podman_healthcheck_unit( "nothexzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz-x.timer" ), None ); } #[test] fn stale_healthcheck_units_filters_only_removed_container_ids() { let live = "6467e25fd87d791a63fe9dbf6e2fabc7bf26533aa2c402b1089effeacf7ebbba"; let stale = "c1f44a6369c91d65f9e9f6134a5591aa02792cff2f1a4e0f689b5a6c03b6c77c"; let mut live_ids = HashSet::new(); live_ids.insert(live.to_string()); let output = format!( " {live}-6fdc497fd3ba3b62.timer loaded active waiting\n\ ● {stale}-15c66ddfefa8a763.service loaded failed failed\n\ grafana.service loaded active running\n\ {stale}-1898d85de0bb707f.timer loaded active waiting\n" ); let mut units = stale_healthcheck_units(&output, &live_ids); units.sort(); assert_eq!( units, vec![ format!("{stale}-15c66ddfefa8a763.service"), 
format!("{stale}-1898d85de0bb707f.timer"), ] ); } #[test] fn is_electrumx_matches_electrs_variants_only() { assert!(is_electrumx("electrumx")); assert!(is_electrumx("archy-electrumx")); assert!(is_electrumx("electrs")); assert!(is_electrumx("mempool-electrs")); assert!(!is_electrumx("bitcoin-knots")); assert!(!is_electrumx("lnd")); assert!(!is_electrumx("electrs-ui")); // the web UI, not the indexer } #[test] fn corrupt_db_detection_matches_plyvel_signature() { // The exact line ElectrumX exit-loops on (observed on .116). let corrupt = "plyvel._plyvel.Error: b'Invalid argument: hist: does not exist (create_if_missing is false)'"; assert!(looks_like_corrupt_electrumx_db(corrupt)); assert!(looks_like_corrupt_electrumx_db( "Corruption: bad block in hist" )); } #[test] fn corrupt_db_detection_ignores_healthy_logs() { let healthy = "INFO:BlockProcessor:our height: 117,009 daemon: 953,480 UTXOs 28MB\n\ INFO:SessionManager:RPC server listening on 0.0.0.0:8000"; assert!(!looks_like_corrupt_electrumx_db(healthy)); // "catching up" / normal restart noise must not trigger a destructive wipe. assert!(!looks_like_corrupt_electrumx_db( "Prefetcher:catching up to daemon height 953,480" )); } }