// Container Health Monitor // Checks container health every 60s, auto-restarts unhealthy containers (max 3 times), // and sends WebSocket notifications to the UI on failure. use crate::data_model::{Notification, NotificationLevel}; use crate::state::StateManager; use crate::webhooks::{self, WebhookEvent}; use std::collections::HashMap; use std::path::PathBuf; use std::sync::Arc; use tracing::{debug, info, warn}; const MAX_RESTART_ATTEMPTS: u32 = 3; const CHECK_INTERVAL_SECS: u64 = 60; /// Track restart attempts per container to avoid infinite restart loops. struct RestartTracker { attempts: HashMap, } impl RestartTracker { fn new() -> Self { Self { attempts: HashMap::new(), } } /// Record a restart attempt. Returns false if max attempts exceeded. fn record_attempt(&mut self, name: &str) -> bool { let count = self.attempts.entry(name.to_string()).or_insert(0); *count += 1; *count <= MAX_RESTART_ATTEMPTS } /// Clear restart count when a container is healthy again. fn clear(&mut self, name: &str) { self.attempts.remove(name); } fn attempt_count(&self, name: &str) -> u32 { *self.attempts.get(name).unwrap_or(&0) } } #[derive(Debug, Clone)] struct ContainerHealth { name: String, app_id: String, state: String, healthy: bool, } /// Query all containers and their health status. async fn check_containers() -> Vec { let output = match tokio::process::Command::new("sudo") .args(["podman", "ps", "-a", "--format", "json"]) .output() .await { Ok(o) if o.status.success() => o, _ => return Vec::new(), }; let stdout = String::from_utf8_lossy(&output.stdout); let containers: Vec = serde_json::from_str(&stdout).unwrap_or_default(); // Backend services to skip let skip = [ "btcpay-db", "nbxplorer", "mempool-db", "mempool-api", "penpot-postgres", "penpot-backend", "penpot-exporter", "penpot-valkey", "penpot-mailcatch", "immich_postgres", "immich_redis", "endurain-db", "nextcloud-db", ]; containers .iter() .filter_map(|c| { let name = c.get("Names").and_then(|v| { if let Some(arr) = v.as_array() { arr.first().and_then(|n| n.as_str()).map(|s| s.to_string()) } else { v.as_str().map(|s| s.to_string()) } })?; let app_id = name .strip_prefix("archy-") .unwrap_or(&name) .to_string(); if skip.contains(&app_id.as_str()) || app_id.ends_with("-ui") { return None; } let state = c.get("State") .and_then(|v| v.as_str()) .unwrap_or("unknown") .to_lowercase(); let healthy = state == "running"; Some(ContainerHealth { name, app_id, state, healthy, }) }) .collect() } /// Try to restart a container. async fn restart_container(name: &str) -> bool { info!("Auto-restarting unhealthy container: {}", name); let result = tokio::process::Command::new("sudo") .args(["podman", "start", name]) .output() .await; match result { Ok(output) if output.status.success() => { info!("Successfully restarted container: {}", name); true } Ok(output) => { let stderr = String::from_utf8_lossy(&output.stderr); warn!("Failed to restart container {}: {}", name, stderr.trim()); false } Err(e) => { warn!("Failed to execute podman start for {}: {}", name, e); false } } } /// Spawn the health monitor background task. pub fn spawn_health_monitor(state: Arc, data_dir: PathBuf) { tokio::spawn(async move { // Wait 2 minutes for containers to start up tokio::time::sleep(std::time::Duration::from_secs(120)).await; let mut tracker = RestartTracker::new(); let mut interval = tokio::time::interval(std::time::Duration::from_secs(CHECK_INTERVAL_SECS)); loop { interval.tick().await; // Check webhook config — if webhooks are disabled or ContainerCrash // isn't subscribed, skip all health monitoring (no restarts, no notifications) let webhook_config = webhooks::load_config(&data_dir).await.unwrap_or_default(); if !webhook_config.enabled || !webhook_config.events.contains(&WebhookEvent::ContainerCrash) { debug!("Health monitor: skipping — webhooks disabled or ContainerCrash not subscribed"); continue; } let containers = check_containers().await; if containers.is_empty() { continue; } let mut state_changed = false; let (mut data, _) = state.get_snapshot().await; for container in &containers { if container.healthy { // Clear restart tracker if container recovered if tracker.attempt_count(&container.name) > 0 { info!("Container {} is healthy again after restart", container.name); tracker.clear(&container.name); } continue; } // Container is unhealthy (exited/stopped) // Only try auto-restart if we haven't exceeded max attempts if container.state == "exited" || container.state == "stopped" { let attempts = tracker.attempt_count(&container.name); if attempts >= MAX_RESTART_ATTEMPTS { // Already notified, skip debug!("Container {} exceeded max restart attempts ({})", container.name, MAX_RESTART_ATTEMPTS); continue; } if tracker.record_attempt(&container.name) { let restarted = restart_container(&container.name).await; let attempt = tracker.attempt_count(&container.name); if !restarted || attempt >= MAX_RESTART_ATTEMPTS { // Push notification to UI let notification = Notification { id: format!("health-{}-{}", container.app_id, chrono::Utc::now().timestamp()), level: NotificationLevel::Error, title: format!("{} is unhealthy", container.app_id), message: if restarted { format!( "Container restarted ({}/{} attempts). May need manual attention.", attempt, MAX_RESTART_ATTEMPTS ) } else { format!( "Auto-restart failed (attempt {}/{}). Container state: {}", attempt, MAX_RESTART_ATTEMPTS, container.state ) }, timestamp: chrono::Utc::now().to_rfc3339(), app_id: Some(container.app_id.clone()), }; // Keep only the latest 20 notifications data.notifications.push(notification); if data.notifications.len() > 20 { data.notifications = data.notifications.split_off(data.notifications.len() - 20); } state_changed = true; } } } } if state_changed { state.update_data(data).await; debug!("Health monitor: state updated with notifications"); } } }); } #[cfg(test)] mod tests { use super::*; #[test] fn test_restart_tracker_new_is_empty() { let tracker = RestartTracker::new(); assert_eq!(tracker.attempt_count("any-container"), 0); } #[test] fn test_restart_tracker_record_attempt_increments() { let mut tracker = RestartTracker::new(); assert!(tracker.record_attempt("test-container")); assert_eq!(tracker.attempt_count("test-container"), 1); assert!(tracker.record_attempt("test-container")); assert_eq!(tracker.attempt_count("test-container"), 2); assert!(tracker.record_attempt("test-container")); assert_eq!(tracker.attempt_count("test-container"), 3); } #[test] fn test_restart_tracker_max_attempts_exceeded() { let mut tracker = RestartTracker::new(); // First MAX_RESTART_ATTEMPTS attempts should return true for i in 1..=MAX_RESTART_ATTEMPTS { assert!( tracker.record_attempt("container-a"), "Attempt {} should be allowed", i ); } // Next attempt exceeds max, returns false assert!(!tracker.record_attempt("container-a")); assert_eq!(tracker.attempt_count("container-a"), MAX_RESTART_ATTEMPTS + 1); } #[test] fn test_restart_tracker_independent_containers() { let mut tracker = RestartTracker::new(); tracker.record_attempt("container-a"); tracker.record_attempt("container-a"); tracker.record_attempt("container-b"); assert_eq!(tracker.attempt_count("container-a"), 2); assert_eq!(tracker.attempt_count("container-b"), 1); assert_eq!(tracker.attempt_count("container-c"), 0); } #[test] fn test_restart_tracker_clear_resets_count() { let mut tracker = RestartTracker::new(); tracker.record_attempt("container-x"); tracker.record_attempt("container-x"); assert_eq!(tracker.attempt_count("container-x"), 2); tracker.clear("container-x"); assert_eq!(tracker.attempt_count("container-x"), 0); } #[test] fn test_restart_tracker_clear_allows_new_attempts() { let mut tracker = RestartTracker::new(); // Exhaust attempts for _ in 0..=MAX_RESTART_ATTEMPTS { tracker.record_attempt("container-y"); } assert!(!tracker.record_attempt("container-y")); // Clear and try again tracker.clear("container-y"); assert!(tracker.record_attempt("container-y")); assert_eq!(tracker.attempt_count("container-y"), 1); } #[test] fn test_restart_tracker_clear_nonexistent_is_safe() { let mut tracker = RestartTracker::new(); // Should not panic tracker.clear("nonexistent"); assert_eq!(tracker.attempt_count("nonexistent"), 0); } #[test] fn test_container_health_struct() { let health = ContainerHealth { name: "archy-bitcoin-knots".to_string(), app_id: "bitcoin-knots".to_string(), state: "running".to_string(), healthy: true, }; assert!(health.healthy); assert_eq!(health.name, "archy-bitcoin-knots"); assert_eq!(health.app_id, "bitcoin-knots"); assert_eq!(health.state, "running"); } #[test] fn test_container_health_unhealthy() { let health = ContainerHealth { name: "archy-mempool-web".to_string(), app_id: "mempool-web".to_string(), state: "exited".to_string(), healthy: false, }; assert!(!health.healthy); assert_eq!(health.state, "exited"); } #[test] fn test_max_restart_attempts_constant() { // Ensure the constant is a reasonable value (not 0, not too high) assert!(MAX_RESTART_ATTEMPTS >= 1); assert!(MAX_RESTART_ATTEMPTS <= 10); assert_eq!(MAX_RESTART_ATTEMPTS, 3); } #[test] fn test_check_interval_constant() { assert_eq!(CHECK_INTERVAL_SECS, 60); } }