// Crash Recovery Module // Detects unexpected shutdowns and restarts containers that were running before the crash. // // How it works: // 1. On startup, write a PID file as a "running" marker // 2. Periodically snapshot which containers are running to a state file // 3. On clean shutdown, remove the PID file // 4. On next startup, if the PID file exists → previous instance crashed // 5. On crash recovery: read the saved container list, restart them, log actions use anyhow::{Context, Result}; use serde::{Deserialize, Serialize}; use std::path::{Path, PathBuf}; use tokio::fs; use tracing::{info, warn}; const PID_FILE: &str = "archipelago.pid"; const CONTAINER_STATE_FILE: &str = "running-containers.json"; #[derive(Debug, Clone, Serialize, Deserialize)] pub struct RunningContainerRecord { pub name: String, pub image: String, } #[derive(Debug, Clone, Serialize, Deserialize)] struct ContainerSnapshot { pub timestamp: u64, pub containers: Vec, } /// Check if the previous instance crashed (PID file exists without a clean shutdown). /// Returns the list of containers that were running before the crash, if any. 
pub async fn check_for_crash(data_dir: &Path) -> Result>> { let pid_path = data_dir.join(PID_FILE); if !pid_path.exists() { return Ok(None); } // PID file exists — previous instance didn't shut down cleanly let old_pid = fs::read_to_string(&pid_path) .await .unwrap_or_default() .trim() .to_string(); warn!("Crash detected: previous instance (PID {}) did not shut down cleanly", old_pid); // Check if that PID is actually still running (zombie/stuck process) if !old_pid.is_empty() { if let Ok(pid) = old_pid.parse::() { if is_process_running(pid) { warn!("Previous process (PID {}) is still running — not a crash, skipping recovery", pid); // Remove stale PID file and skip recovery let _ = fs::remove_file(&pid_path).await; return Ok(None); } } } // Load the saved container snapshot let state_path = data_dir.join(CONTAINER_STATE_FILE); let containers = if state_path.exists() { match fs::read_to_string(&state_path).await { Ok(content) => { match serde_json::from_str::(&content) { Ok(snapshot) => { info!( "Found {} containers from pre-crash snapshot (saved at {})", snapshot.containers.len(), snapshot.timestamp ); snapshot.containers } Err(e) => { warn!("Failed to parse container snapshot: {}", e); Vec::new() } } } Err(e) => { warn!("Failed to read container snapshot: {}", e); Vec::new() } } } else { info!("No container snapshot found — cannot determine which containers were running"); Vec::new() }; // Clean up the stale PID file let _ = fs::remove_file(&pid_path).await; if containers.is_empty() { Ok(None) } else { Ok(Some(containers)) } } /// Write the PID file to mark the current instance as running. pub async fn write_pid_marker(data_dir: &Path) -> Result<()> { let pid = std::process::id(); let pid_path = data_dir.join(PID_FILE); fs::write(&pid_path, pid.to_string()) .await .context("Failed to write PID marker")?; Ok(()) } /// Remove the PID file on clean shutdown. 
///
/// Failure to remove the file is logged as a warning and otherwise ignored.
pub async fn remove_pid_marker(data_dir: &Path) {
    let pid_path = data_dir.join(PID_FILE);
    if let Err(e) = fs::remove_file(&pid_path).await {
        warn!("Failed to remove PID marker: {}", e);
    }
}

/// Save a snapshot of currently running containers to disk.
/// Called periodically so we know what to restart after a crash.
///
/// Runs `sudo podman ps --format json`, keeps the name and image of every listed
/// container, and stores them together with the current Unix timestamp.
///
/// # Errors
/// Returns an error if podman cannot be run or exits unsuccessfully, if its output is
/// not valid JSON, or if the snapshot cannot be serialized or written. On any error the
/// previously saved snapshot is left untouched.
pub async fn save_container_snapshot(data_dir: &Path) -> Result<()> {
    let output = tokio::process::Command::new("sudo")
        .args(["podman", "ps", "--format", "json"])
        .output()
        .await
        .context("Failed to run podman ps")?;

    if !output.status.success() {
        let stderr = String::from_utf8_lossy(&output.stderr);
        anyhow::bail!("podman ps failed: {}", stderr);
    }

    let stdout = String::from_utf8_lossy(&output.stdout);
    let listing = stdout.trim();
    // Empty output and JSON `null` both mean "no containers". Anything else that fails
    // to parse is an error: silently treating it as an empty list would replace a good
    // snapshot with one that recovers nothing.
    let containers: Vec<serde_json::Value> = if listing.is_empty() {
        Vec::new()
    } else {
        serde_json::from_str::<Option<Vec<serde_json::Value>>>(listing)
            .context("Failed to parse podman ps output")?
            .unwrap_or_default()
    };

    let records: Vec<RunningContainerRecord> = containers
        .iter()
        .filter_map(|c| {
            // Podman returns Names as an array; a bare string is accepted as well.
            // Entries without a usable name are skipped.
            let names = c.get("Names")?;
            let name = match names.as_array() {
                Some(arr) => arr.first().and_then(|n| n.as_str())?,
                None => names.as_str()?,
            };
            let image = c.get("Image").and_then(|v| v.as_str()).unwrap_or("unknown");
            Some(RunningContainerRecord {
                name: name.to_string(),
                image: image.to_string(),
            })
        })
        .collect();

    let snapshot = ContainerSnapshot {
        // A clock set before the Unix epoch yields timestamp 0 rather than an error.
        timestamp: std::time::SystemTime::now()
            .duration_since(std::time::UNIX_EPOCH)
            .unwrap_or_default()
            .as_secs(),
        containers: records,
    };

    let json = serde_json::to_string_pretty(&snapshot)
        .context("Failed to serialize container snapshot")?;

    // Write to a sibling temporary file and rename it over the real one, so that a
    // process crash in the middle of the write cannot leave a truncated snapshot in
    // place of the last good one.
    let state_path = data_dir.join(CONTAINER_STATE_FILE);
    let tmp_path = data_dir.join(format!("{}.tmp", CONTAINER_STATE_FILE));
    fs::write(&tmp_path, json)
        .await
        .context("Failed to write container snapshot")?;
    fs::rename(&tmp_path, &state_path)
        .await
        .context("Failed to replace container snapshot")?;

    Ok(())
}

/// Recover containers that were running before a crash.
/// Attempts to start each container, logging success/failure.
///
/// Containers are started one at a time with `sudo podman start <name>`. A failure for
/// one container does not stop the others; it is recorded in the returned report.
pub async fn recover_containers(containers: &[RunningContainerRecord]) -> RecoveryReport {
    let mut report = RecoveryReport {
        total: containers.len(),
        recovered: 0,
        failed: Vec::new(),
    };

    for record in containers {
        info!("Recovering container: {} (image: {})", record.name, record.image);

        let result = tokio::process::Command::new("sudo")
            .args(["podman", "start", &record.name])
            .output()
            .await;

        match result {
            Ok(output) if output.status.success() => {
                info!("Successfully restarted container: {}", record.name);
                report.recovered += 1;
            }
            // podman ran but reported a failure.
            Ok(output) => {
                let stderr = String::from_utf8_lossy(&output.stderr);
                warn!("Failed to restart container {}: {}", record.name, stderr.trim());
                report.failed.push(record.name.clone());
            }
            // The command itself could not be executed.
            Err(e) => {
                warn!("Failed to execute podman start for {}: {}", record.name, e);
                report.failed.push(record.name.clone());
            }
        }
    }

    report
}

/// Outcome of a [`recover_containers`] run.
#[derive(Debug)]
pub struct RecoveryReport {
    /// Number of containers recovery was attempted for.
    pub total: usize,
    /// Number of containers that were started successfully.
    pub recovered: usize,
    /// Names of the containers that could not be started.
    pub failed: Vec<String>,
}

/// Check if a process with the given PID is still running.
///
/// Implemented as an existence check on `/proc/{pid}`, so on systems without a
/// `/proc` filesystem this always returns `false`.
fn is_process_running(pid: u32) -> bool {
    // Check /proc/{pid} on Linux
    std::path::Path::new(&format!("/proc/{}", pid)).exists()
}

/// Spawn a background task that periodically saves the container snapshot.
pub fn spawn_snapshot_task(data_dir: PathBuf) { tokio::spawn(async move { // Wait 30s before first snapshot (let containers stabilize after startup) tokio::time::sleep(std::time::Duration::from_secs(30)).await; let mut interval = tokio::time::interval(std::time::Duration::from_secs(60)); loop { interval.tick().await; if let Err(e) = save_container_snapshot(&data_dir).await { tracing::debug!("Container snapshot (non-fatal): {}", e); } } }); } #[cfg(test)] mod tests { use super::*; use tempfile::TempDir; #[tokio::test] async fn test_no_crash_without_pid_file() { let tmp = TempDir::new().unwrap(); let result = check_for_crash(tmp.path()).await.unwrap(); assert!(result.is_none()); } #[tokio::test] async fn test_crash_detected_with_pid_file() { let tmp = TempDir::new().unwrap(); // Write a PID file with a non-existent PID fs::write(tmp.path().join(PID_FILE), "999999999").await.unwrap(); let result = check_for_crash(tmp.path()).await.unwrap(); // No snapshot file → crash detected but no containers to recover assert!(result.is_none()); } #[tokio::test] async fn test_crash_with_snapshot() { let tmp = TempDir::new().unwrap(); // Write PID file fs::write(tmp.path().join(PID_FILE), "999999999").await.unwrap(); // Write container snapshot let snapshot = ContainerSnapshot { timestamp: 1000, containers: vec![ RunningContainerRecord { name: "archy-bitcoin-knots".to_string(), image: "bitcoin-knots:27.1".to_string(), }, RunningContainerRecord { name: "archy-mempool-web".to_string(), image: "mempool/frontend:3.0".to_string(), }, ], }; let json = serde_json::to_string(&snapshot).unwrap(); fs::write(tmp.path().join(CONTAINER_STATE_FILE), json).await.unwrap(); let result = check_for_crash(tmp.path()).await.unwrap(); assert!(result.is_some()); let containers = result.unwrap(); assert_eq!(containers.len(), 2); assert_eq!(containers[0].name, "archy-bitcoin-knots"); assert_eq!(containers[1].name, "archy-mempool-web"); } #[tokio::test] async fn test_write_and_remove_pid_marker() { let tmp = 
TempDir::new().unwrap(); write_pid_marker(tmp.path()).await.unwrap(); assert!(tmp.path().join(PID_FILE).exists()); remove_pid_marker(tmp.path()).await; assert!(!tmp.path().join(PID_FILE).exists()); } #[tokio::test] async fn test_pid_marker_contains_current_pid() { let tmp = TempDir::new().unwrap(); write_pid_marker(tmp.path()).await.unwrap(); let content = fs::read_to_string(tmp.path().join(PID_FILE)).await.unwrap(); let saved_pid: u32 = content.parse().unwrap(); assert_eq!(saved_pid, std::process::id()); } #[tokio::test] async fn test_clean_shutdown_no_crash_on_restart() { let tmp = TempDir::new().unwrap(); // Simulate: startup → write PID → clean shutdown → remove PID write_pid_marker(tmp.path()).await.unwrap(); remove_pid_marker(tmp.path()).await; // Next startup should detect no crash let result = check_for_crash(tmp.path()).await.unwrap(); assert!(result.is_none()); } #[tokio::test] async fn test_corrupt_snapshot_handled() { let tmp = TempDir::new().unwrap(); fs::write(tmp.path().join(PID_FILE), "999999999").await.unwrap(); fs::write(tmp.path().join(CONTAINER_STATE_FILE), "not valid json").await.unwrap(); // Should not crash, returns None (no recoverable containers) let result = check_for_crash(tmp.path()).await.unwrap(); assert!(result.is_none()); } }