// Crash Recovery Module // Detects unexpected shutdowns and restarts containers that were running before the crash. // // How it works: // 1. On startup, write a PID file as a "running" marker // 2. Periodically snapshot which containers are running to a state file // 3. On clean shutdown, remove the PID file // 4. On next startup, if the PID file exists → previous instance crashed // 5. On crash recovery: read the saved container list, restart them, log actions use anyhow::{Context, Result}; use serde::{Deserialize, Serialize}; use std::path::{Path, PathBuf}; use std::sync::atomic::{AtomicBool, Ordering}; use std::time::Instant; use tokio::fs; use tracing::{info, warn}; const PID_FILE: &str = "archipelago.pid"; const CONTAINER_STATE_FILE: &str = "running-containers.json"; const USER_STOPPED_FILE: &str = "user-stopped.json"; /// Shared flag: true once boot recovery is complete. Health monitor should wait for this. pub static RECOVERY_COMPLETE: AtomicBool = AtomicBool::new(false); /// Process start time for uptime calculation. static START_TIME: std::sync::OnceLock = std::sync::OnceLock::new(); /// Initialize the start time. Call once at startup. pub fn init_start_time() { START_TIME.get_or_init(Instant::now); } /// Get uptime in seconds since process start. pub fn uptime_seconds() -> u64 { START_TIME .get() .map(|t| t.elapsed().as_secs()) .unwrap_or(0) } /// Mark boot recovery as complete. Call after crash recovery + start_stopped_containers finish. pub fn mark_recovery_complete() { RECOVERY_COMPLETE.store(true, Ordering::SeqCst); info!("Boot recovery complete — health monitor may proceed"); } /// Check if boot recovery is done. pub fn is_recovery_complete() -> bool { RECOVERY_COMPLETE.load(Ordering::SeqCst) } // ── User-stopped tracking ─────────────────────────────────────────────── // When a user explicitly stops a container via the UI, we record it here // so crash recovery and health monitor don't auto-restart it. 
/// Load the set of user-stopped containers from disk.
///
/// Returns an empty set when the file is missing, unreadable, or not valid JSON.
/// A missing file is the normal "nothing stopped yet" case and is silent; the
/// other failures are logged because they cause user-stopped containers to be
/// treated as restartable.
pub async fn load_user_stopped(data_dir: &Path) -> std::collections::HashSet<String> {
    let path = data_dir.join(USER_STOPPED_FILE);
    let content = match fs::read_to_string(&path).await {
        Ok(content) => content,
        Err(e) => {
            if e.kind() != std::io::ErrorKind::NotFound {
                warn!("Failed to read user-stopped list: {}", e);
            }
            return std::collections::HashSet::new();
        }
    };

    match serde_json::from_str(&content) {
        Ok(stopped) => stopped,
        Err(e) => {
            warn!("Failed to parse user-stopped list: {}", e);
            std::collections::HashSet::new()
        }
    }
}

/// Save the set of user-stopped containers to disk.
///
/// Best-effort: failures are logged, never returned to the caller.
pub async fn save_user_stopped(data_dir: &Path, stopped: &std::collections::HashSet<String>) {
    let path = data_dir.join(USER_STOPPED_FILE);
    match serde_json::to_string_pretty(stopped) {
        Ok(json) => {
            if let Err(e) = fs::write(&path, json).await {
                warn!("Failed to write user-stopped list: {}", e);
            }
        }
        Err(e) => warn!("Failed to serialize user-stopped list: {}", e),
    }
}

/// Mark a container as user-stopped (won't be auto-restarted).
pub async fn mark_user_stopped(data_dir: &Path, name: &str) {
    let mut stopped = load_user_stopped(data_dir).await;
    stopped.insert(name.to_string());
    save_user_stopped(data_dir, &stopped).await;
}

/// Clear user-stopped flag (container was manually started by user).
///
/// The file is only rewritten when the name was actually present.
pub async fn clear_user_stopped(data_dir: &Path, name: &str) {
    let mut stopped = load_user_stopped(data_dir).await;
    if stopped.remove(name) {
        save_user_stopped(data_dir, &stopped).await;
    }
}

/// One container that was running when a snapshot was taken.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RunningContainerRecord {
    /// Container name as reported by podman.
    pub name: String,
    /// Image reference as reported by podman.
    pub image: String,
}

/// On-disk format of the running-containers snapshot.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct ContainerSnapshot {
    /// Seconds since the Unix epoch at which the snapshot was written.
    pub timestamp: u64,
    /// Containers that were running at that time.
    pub containers: Vec<RunningContainerRecord>,
}

/// Check if the previous instance crashed (PID file exists without a clean shutdown).
/// Returns the list of containers that were running before the crash, if any.
pub async fn check_for_crash(data_dir: &Path) -> Result>> { let pid_path = data_dir.join(PID_FILE); if !pid_path.exists() { return Ok(None); } // PID file exists — previous instance didn't shut down cleanly let old_pid = fs::read_to_string(&pid_path) .await .unwrap_or_default() .trim() .to_string(); warn!("Crash detected: previous instance (PID {}) did not shut down cleanly", old_pid); // Check if that PID is actually still running (zombie/stuck process) if !old_pid.is_empty() { if let Ok(pid) = old_pid.parse::() { if is_process_running(pid) { warn!("Previous process (PID {}) is still running — not a crash, skipping recovery", pid); // Remove stale PID file and skip recovery let _ = fs::remove_file(&pid_path).await; return Ok(None); } } } // Load the saved container snapshot let state_path = data_dir.join(CONTAINER_STATE_FILE); let containers = if state_path.exists() { match fs::read_to_string(&state_path).await { Ok(content) => { match serde_json::from_str::(&content) { Ok(snapshot) => { info!( "Found {} containers from pre-crash snapshot (saved at {})", snapshot.containers.len(), snapshot.timestamp ); snapshot.containers } Err(e) => { warn!("Failed to parse container snapshot: {}", e); Vec::new() } } } Err(e) => { warn!("Failed to read container snapshot: {}", e); Vec::new() } } } else { info!("No container snapshot found — cannot determine which containers were running"); Vec::new() }; // Clean up the stale PID file let _ = fs::remove_file(&pid_path).await; if containers.is_empty() { Ok(None) } else { Ok(Some(containers)) } } /// Write the PID file to mark the current instance as running. pub async fn write_pid_marker(data_dir: &Path) -> Result<()> { let pid = std::process::id(); let pid_path = data_dir.join(PID_FILE); fs::write(&pid_path, pid.to_string()) .await .context("Failed to write PID marker")?; Ok(()) } /// Remove the PID file on clean shutdown. 
///
/// Failure to remove the file is logged and otherwise ignored.
pub async fn remove_pid_marker(data_dir: &Path) {
    let pid_path = data_dir.join(PID_FILE);
    if let Err(e) = fs::remove_file(&pid_path).await {
        warn!("Failed to remove PID marker: {}", e);
    }
}

/// Save a snapshot of currently running containers to disk.
/// Called periodically so we know what to restart after a crash.
///
/// # Errors
/// Returns an error if `podman ps` times out (30s), cannot be spawned, exits
/// unsuccessfully, or if the snapshot cannot be serialized or written.
/// Output that does not parse as a JSON array is treated as "no containers".
pub async fn save_container_snapshot(data_dir: &Path) -> Result<()> {
    let output = tokio::time::timeout(
        std::time::Duration::from_secs(30),
        tokio::process::Command::new("podman")
            .args(["ps", "--format", "json"])
            .output(),
    )
    .await
    .context("podman ps timed out (30s)")?
    .context("Failed to run podman ps")?;

    if !output.status.success() {
        let stderr = String::from_utf8_lossy(&output.stderr);
        anyhow::bail!("podman ps failed: {}", stderr);
    }

    let stdout = String::from_utf8_lossy(&output.stdout);
    let containers: Vec<serde_json::Value> = serde_json::from_str(&stdout).unwrap_or_default();

    // Entries without a usable name are dropped; a missing image becomes "unknown".
    let records: Vec<RunningContainerRecord> = containers
        .iter()
        .filter_map(|entry| {
            let name = entry
                .get("Names")
                .and_then(|names| match names.as_array() {
                    // Podman returns Names as an array
                    Some(list) => list.first().and_then(|first| first.as_str()),
                    None => names.as_str(),
                })?
                .to_string();
            let image = entry
                .get("Image")
                .and_then(|v| v.as_str())
                .unwrap_or("unknown")
                .to_string();
            Some(RunningContainerRecord { name, image })
        })
        .collect();

    let snapshot = ContainerSnapshot {
        // A clock set before the Unix epoch yields a timestamp of 0.
        timestamp: std::time::SystemTime::now()
            .duration_since(std::time::UNIX_EPOCH)
            .unwrap_or_default()
            .as_secs(),
        containers: records,
    };

    let state_path = data_dir.join(CONTAINER_STATE_FILE);
    let json = serde_json::to_string_pretty(&snapshot)
        .context("Failed to serialize container snapshot")?;
    fs::write(&state_path, json)
        .await
        .context("Failed to write container snapshot")?;

    Ok(())
}

/// Recover containers that were running before a crash.
/// Attempts to start each container, logging success/failure.
pub async fn recover_containers(containers: &[RunningContainerRecord]) -> RecoveryReport { let mut report = RecoveryReport { total: containers.len(), recovered: 0, failed: Vec::new(), }; for (i, record) in containers.iter().enumerate() { info!("Recovering container: {} (image: {})", record.name, record.image); // Rate-limit container starts to avoid overwhelming podman on low-resource systems if i > 0 { tokio::time::sleep(std::time::Duration::from_secs(3)).await; } let result = tokio::time::timeout( std::time::Duration::from_secs(30), tokio::process::Command::new("podman") .args(["start", &record.name]) .output(), ) .await; match result { Ok(Ok(output)) if output.status.success() => { info!("Successfully restarted container: {}", record.name); report.recovered += 1; } Ok(Ok(output)) => { let stderr = String::from_utf8_lossy(&output.stderr); warn!("Failed to restart container {}: {}", record.name, stderr.trim()); report.failed.push(record.name.clone()); } Ok(Err(e)) => { warn!("Failed to execute podman start for {}: {}", record.name, e); report.failed.push(record.name.clone()); } Err(_) => { warn!("Timeout starting container {} (30s)", record.name); report.failed.push(record.name.clone()); } } } report } #[derive(Debug)] pub struct RecoveryReport { pub total: usize, pub recovered: usize, pub failed: Vec, } /// Check if a process with the given PID is still running. fn is_process_running(pid: u32) -> bool { // Check /proc/{pid} on Linux std::path::Path::new(&format!("/proc/{}", pid)).exists() } /// Start all stopped containers that were previously installed. /// Runs on every startup to ensure containers come back after clean reboots. /// The crash recovery (PID-based) handles dirty shutdowns; this handles clean ones. /// Skips containers that the user intentionally stopped via the UI. 
pub async fn start_stopped_containers(data_dir: &Path) -> RecoveryReport { let output = match tokio::time::timeout( std::time::Duration::from_secs(30), tokio::process::Command::new("podman") .args(["ps", "-a", "--filter", "status=exited", "--filter", "status=created", "--format", "{{.Names}}"]) .output(), ) .await { Ok(result) => result, Err(_) => { warn!("Timeout listing stopped containers (30s)"); return RecoveryReport { total: 0, recovered: 0, failed: Vec::new() }; } }; let all_names: Vec = match output { Ok(o) if o.status.success() => { String::from_utf8_lossy(&o.stdout) .lines() .filter(|l| !l.is_empty()) .map(|s| s.to_string()) .collect() } _ => Vec::new(), }; if all_names.is_empty() { return RecoveryReport { total: 0, recovered: 0, failed: Vec::new() }; } // Filter out user-stopped containers let user_stopped = load_user_stopped(data_dir).await; let names: Vec = all_names.into_iter() .filter(|n| { if user_stopped.contains(n) { info!("Skipping user-stopped container: {}", n); false } else { true } }) .collect(); if names.is_empty() { return RecoveryReport { total: 0, recovered: 0, failed: Vec::new() }; } // Sort by startup tier: databases first, then core, then dependent services, then apps let mut records: Vec = names.iter() .map(|n| RunningContainerRecord { name: n.clone(), image: String::new() }) .collect(); records.sort_by_key(|r| container_boot_tier(&r.name)); info!("Starting {} stopped containers after boot (skipped {} user-stopped)...", records.len(), user_stopped.len()); recover_containers(&records).await } /// Simple tier ordering for boot recovery (mirrors health_monitor tiers). 
fn container_boot_tier(name: &str) -> u8 { let id = name.strip_prefix("archy-").unwrap_or(name); match id { "btcpay-db" | "mempool-db" | "penpot-postgres" | "immich_postgres" | "immich_redis" | "penpot-valkey" => 0, "bitcoin-knots" | "bitcoin-core" | "bitcoin" => 1, "lnd" | "electrumx" | "mempool-electrs" | "electrs" | "nbxplorer" => 2, "mempool-web" | "bitcoin-ui" | "lnd-ui" | "electrs-ui" | "penpot-frontend" | "penpot-exporter" => 4, _ => 3, } } /// Spawn a background task that periodically saves the container snapshot. pub fn spawn_snapshot_task(data_dir: PathBuf) { tokio::spawn(async move { // Wait 2 minutes before first snapshot (let crash recovery finish and containers stabilize) tokio::time::sleep(std::time::Duration::from_secs(120)).await; let mut interval = tokio::time::interval(std::time::Duration::from_secs(120)); loop { interval.tick().await; if let Err(e) = save_container_snapshot(&data_dir).await { tracing::debug!("Container snapshot (non-fatal): {}", e); } } }); } #[cfg(test)] mod tests { use super::*; use tempfile::TempDir; #[tokio::test] async fn test_no_crash_without_pid_file() { let tmp = TempDir::new().unwrap(); let result = check_for_crash(tmp.path()).await.unwrap(); assert!(result.is_none()); } #[tokio::test] async fn test_crash_detected_with_pid_file() { let tmp = TempDir::new().unwrap(); // Write a PID file with a non-existent PID fs::write(tmp.path().join(PID_FILE), "999999999").await.unwrap(); let result = check_for_crash(tmp.path()).await.unwrap(); // No snapshot file → crash detected but no containers to recover assert!(result.is_none()); } #[tokio::test] async fn test_crash_with_snapshot() { let tmp = TempDir::new().unwrap(); // Write PID file fs::write(tmp.path().join(PID_FILE), "999999999").await.unwrap(); // Write container snapshot let snapshot = ContainerSnapshot { timestamp: 1000, containers: vec![ RunningContainerRecord { name: "archy-bitcoin-knots".to_string(), image: "bitcoin-knots:27.1".to_string(), }, RunningContainerRecord { 
name: "archy-mempool-web".to_string(), image: "mempool/frontend:3.0".to_string(), }, ], }; let json = serde_json::to_string(&snapshot).unwrap(); fs::write(tmp.path().join(CONTAINER_STATE_FILE), json).await.unwrap(); let result = check_for_crash(tmp.path()).await.unwrap(); assert!(result.is_some()); let containers = result.unwrap(); assert_eq!(containers.len(), 2); assert_eq!(containers[0].name, "archy-bitcoin-knots"); assert_eq!(containers[1].name, "archy-mempool-web"); } #[tokio::test] async fn test_write_and_remove_pid_marker() { let tmp = TempDir::new().unwrap(); write_pid_marker(tmp.path()).await.unwrap(); assert!(tmp.path().join(PID_FILE).exists()); remove_pid_marker(tmp.path()).await; assert!(!tmp.path().join(PID_FILE).exists()); } #[tokio::test] async fn test_pid_marker_contains_current_pid() { let tmp = TempDir::new().unwrap(); write_pid_marker(tmp.path()).await.unwrap(); let content = fs::read_to_string(tmp.path().join(PID_FILE)).await.unwrap(); let saved_pid: u32 = content.parse().unwrap(); assert_eq!(saved_pid, std::process::id()); } #[tokio::test] async fn test_clean_shutdown_no_crash_on_restart() { let tmp = TempDir::new().unwrap(); // Simulate: startup → write PID → clean shutdown → remove PID write_pid_marker(tmp.path()).await.unwrap(); remove_pid_marker(tmp.path()).await; // Next startup should detect no crash let result = check_for_crash(tmp.path()).await.unwrap(); assert!(result.is_none()); } #[tokio::test] async fn test_corrupt_snapshot_handled() { let tmp = TempDir::new().unwrap(); fs::write(tmp.path().join(PID_FILE), "999999999").await.unwrap(); fs::write(tmp.path().join(CONTAINER_STATE_FILE), "not valid json").await.unwrap(); // Should not crash, returns None (no recoverable containers) let result = check_for_crash(tmp.path()).await.unwrap(); assert!(result.is_none()); } }