// archy/core/archipelago/src/crash_recovery.rs

// Crash Recovery Module
// Detects unexpected shutdowns and restarts containers that were running before the crash.
//
// How it works:
// 1. On startup, write a PID file as a "running" marker
// 2. Periodically snapshot which containers are running to a state file
// 3. On clean shutdown, remove the PID file
// 4. On next startup, if the PID file exists → previous instance crashed
// 5. On crash recovery: read the saved container list, restart them, log actions

use anyhow::{Context, Result};
use serde::{Deserialize, Serialize};
use std::path::{Path, PathBuf};
use tokio::fs;
use tracing::{info, warn};

/// File name (inside the data directory) of the "instance is running" marker.
/// It holds the PID of the process that wrote it and is removed on clean shutdown.
const PID_FILE: &str = "archipelago.pid";
/// File name (inside the data directory) of the JSON snapshot listing the
/// containers that were running when the snapshot was last saved.
const CONTAINER_STATE_FILE: &str = "running-containers.json";

/// One container entry in the on-disk snapshot of running containers.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RunningContainerRecord {
    /// Container name; this is the value handed to `podman start` during recovery.
    pub name: String,
    /// Image reference as reported by podman. Within this file it is only used
    /// in log output; it may be `"unknown"` or empty when no image was captured.
    pub image: String,
}

/// Serialized form of the container state file (`CONTAINER_STATE_FILE`).
#[derive(Debug, Clone, Serialize, Deserialize)]
struct ContainerSnapshot {
    /// Time the snapshot was taken, in whole seconds since the Unix epoch.
    pub timestamp: u64,
    /// Containers that were running at `timestamp`.
    pub containers: Vec<RunningContainerRecord>,
}

/// Check whether the previous instance crashed (its PID file was left behind).
///
/// Must be called before [`write_pid_marker`] on startup, otherwise the marker
/// examined here would be the one written by the current process.
///
/// # Returns
///
/// * `Ok(None)` when there is no PID file (clean shutdown or first run), when
///   the PID recorded in it belongs to another process that is still alive, or
///   when a crash was detected but no usable, non-empty container snapshot exists.
/// * `Ok(Some(containers))` with the containers recorded in the last snapshot
///   when a crash was detected.
///
/// The PID file is removed in every case where it was found, so the caller can
/// write a fresh marker afterwards. Snapshot read/parse problems are logged and
/// treated as "nothing to recover"; this function currently never returns `Err`.
pub async fn check_for_crash(data_dir: &Path) -> Result<Option<Vec<RunningContainerRecord>>> {
    let pid_path = data_dir.join(PID_FILE);

    if !pid_path.exists() {
        return Ok(None);
    }

    // PID file exists — previous instance didn't shut down cleanly.
    // An unreadable marker is treated like an empty one (unknown PID).
    let old_pid = fs::read_to_string(&pid_path)
        .await
        .unwrap_or_default()
        .trim()
        .to_string();

    // Check if that PID is actually still running (zombie/stuck process).
    // A marker holding *our own* PID cannot belong to a live previous instance:
    // PIDs get reused (routinely so inside containers, where the service tends
    // to receive the same PID on every start), and `/proc/<own pid>` always
    // exists, which would otherwise suppress recovery after a real crash.
    if let Ok(pid) = old_pid.parse::<u32>() {
        if pid != std::process::id() && is_process_running(pid) {
            warn!("Previous process (PID {}) is still running — not a crash, skipping recovery", pid);
            // Remove stale PID file and skip recovery
            let _ = fs::remove_file(&pid_path).await;
            return Ok(None);
        }
    }

    warn!("Crash detected: previous instance (PID {}) did not shut down cleanly", old_pid);

    // Load the saved container snapshot. Reading directly and inspecting the
    // error kind avoids a separate (blocking, racy) existence check.
    let state_path = data_dir.join(CONTAINER_STATE_FILE);
    let containers = match fs::read_to_string(&state_path).await {
        Ok(content) => match serde_json::from_str::<ContainerSnapshot>(&content) {
            Ok(snapshot) => {
                info!(
                    "Found {} containers from pre-crash snapshot (saved at {})",
                    snapshot.containers.len(),
                    snapshot.timestamp
                );
                snapshot.containers
            }
            Err(e) => {
                warn!("Failed to parse container snapshot: {}", e);
                Vec::new()
            }
        },
        Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
            info!("No container snapshot found — cannot determine which containers were running");
            Vec::new()
        }
        Err(e) => {
            warn!("Failed to read container snapshot: {}", e);
            Vec::new()
        }
    };

    // Clean up the stale PID file (best effort).
    let _ = fs::remove_file(&pid_path).await;

    // An empty list means there is nothing to recover.
    Ok((!containers.is_empty()).then_some(containers))
}

/// Mark the current instance as running by writing its PID to the marker file.
///
/// The marker is `PID_FILE` inside `data_dir` and contains the decimal PID of
/// this process.
///
/// # Errors
///
/// Returns an error (with context "Failed to write PID marker") when the file
/// cannot be written.
pub async fn write_pid_marker(data_dir: &Path) -> Result<()> {
    let marker = data_dir.join(PID_FILE);
    let contents = std::process::id().to_string();
    fs::write(&marker, contents)
        .await
        .context("Failed to write PID marker")
}

/// Delete the PID marker; intended to be called on clean shutdown.
///
/// Failure to delete the file is logged as a warning and otherwise ignored.
pub async fn remove_pid_marker(data_dir: &Path) {
    let marker = data_dir.join(PID_FILE);
    match fs::remove_file(&marker).await {
        Ok(()) => {}
        Err(e) => {
            warn!("Failed to remove PID marker: {}", e);
        }
    }
}

/// Save a snapshot of currently running containers to disk.
/// Called periodically so we know what to restart after a crash.
pub async fn save_container_snapshot(data_dir: &Path) -> Result<()> {
    let output = tokio::time::timeout(
        std::time::Duration::from_secs(30),
        tokio::process::Command::new("podman")
            .args(["ps", "--format", "json"])
            .output(),
    )
    .await
    .context("podman ps timed out (30s)")?
    .context("Failed to run podman ps")?;

    if !output.status.success() {
        let stderr = String::from_utf8_lossy(&output.stderr);
        anyhow::bail!("podman ps failed: {}", stderr);
    }

    let stdout = String::from_utf8_lossy(&output.stdout);
    let containers: Vec<serde_json::Value> =
        serde_json::from_str(&stdout).unwrap_or_default();

    let records: Vec<RunningContainerRecord> = containers
        .iter()
        .filter_map(|c| {
            let name = c.get("Names")
                .and_then(|v| {
                    // Podman returns Names as an array
                    if let Some(arr) = v.as_array() {
                        arr.first().and_then(|n| n.as_str()).map(|s| s.to_string())
                    } else {
                        v.as_str().map(|s| s.to_string())
                    }
                })?;
            let image = c.get("Image")
                .and_then(|v| v.as_str())
                .unwrap_or("unknown")
                .to_string();
            Some(RunningContainerRecord { name, image })
        })
        .collect();

    let snapshot = ContainerSnapshot {
        timestamp: std::time::SystemTime::now()
            .duration_since(std::time::UNIX_EPOCH)
            .unwrap_or_default()
            .as_secs(),
        containers: records,
    };

    let state_path = data_dir.join(CONTAINER_STATE_FILE);
    let json = serde_json::to_string_pretty(&snapshot)
        .context("Failed to serialize container snapshot")?;
    fs::write(&state_path, json)
        .await
        .context("Failed to write container snapshot")?;

    Ok(())
}

/// Restart the given containers with `podman start`, one after another.
///
/// Each start is given 30 seconds; a 3 second pause is inserted before every
/// start except the first. Successes and failures are logged, and the returned
/// [`RecoveryReport`] counts the containers that started and lists the names
/// of those that did not (non-zero exit, spawn error, or timeout).
pub async fn recover_containers(containers: &[RunningContainerRecord]) -> RecoveryReport {
    let start_timeout = std::time::Duration::from_secs(30);
    let pause_between_starts = std::time::Duration::from_secs(3);

    let mut recovered = 0usize;
    let mut failed: Vec<String> = Vec::new();

    for (index, container) in containers.iter().enumerate() {
        let name = &container.name;
        info!("Recovering container: {} (image: {})", name, container.image);

        // Rate-limit container starts to avoid overwhelming podman on low-resource systems
        if index != 0 {
            tokio::time::sleep(pause_between_starts).await;
        }

        let mut start_cmd = tokio::process::Command::new("podman");
        start_cmd.args(["start", name.as_str()]);
        let outcome = tokio::time::timeout(start_timeout, start_cmd.output()).await;

        // Log the outcome and reduce it to a single success flag.
        let started = match outcome {
            Err(_) => {
                warn!("Timeout starting container {} (30s)", name);
                false
            }
            Ok(Err(e)) => {
                warn!("Failed to execute podman start for {}: {}", name, e);
                false
            }
            Ok(Ok(output)) => {
                if output.status.success() {
                    info!("Successfully restarted container: {}", name);
                    true
                } else {
                    let stderr = String::from_utf8_lossy(&output.stderr);
                    warn!("Failed to restart container {}: {}", name, stderr.trim());
                    false
                }
            }
        };

        if started {
            recovered += 1;
        } else {
            failed.push(name.clone());
        }
    }

    RecoveryReport {
        total: containers.len(),
        recovered,
        failed,
    }
}

/// Summary of a container (re)start pass.
#[derive(Debug)]
pub struct RecoveryReport {
    /// Number of containers a start was attempted for.
    pub total: usize,
    /// Number of containers for which `podman start` exited successfully.
    pub recovered: usize,
    /// Names of the containers that could not be started.
    pub failed: Vec<String>,
}

/// Report whether a process with the given PID currently exists.
///
/// This is decided solely by the presence of the `/proc/<pid>` entry, so it
/// returns `false` wherever no such path exists.
fn is_process_running(pid: u32) -> bool {
    // Check /proc/{pid} on Linux
    std::path::Path::new("/proc")
        .join(pid.to_string())
        .exists()
}

/// Start all stopped containers that were previously installed.
/// Runs on every startup to ensure containers come back after clean reboots.
/// The crash recovery (PID-based) handles dirty shutdowns; this handles clean ones.
///
/// Lists containers in the `exited` or `created` state via `podman ps -a`
/// (30 s timeout) and hands their names to [`recover_containers`].
///
/// If the listing times out, cannot be executed, or exits unsuccessfully, the
/// problem is logged as a warning and an empty report is returned; an empty
/// report is also returned when there is nothing to start.
pub async fn start_stopped_containers() -> RecoveryReport {
    let empty_report = || RecoveryReport { total: 0, recovered: 0, failed: Vec::new() };

    let listing = tokio::time::timeout(
        std::time::Duration::from_secs(30),
        tokio::process::Command::new("podman")
            .args(["ps", "-a", "--filter", "status=exited", "--filter", "status=created", "--format", "{{.Names}}"])
            // Do not leave a hung `podman ps` behind once the timeout fires.
            .kill_on_drop(true)
            .output(),
    )
    .await;

    // Every failure mode is logged so a broken podman is visible in the logs
    // instead of looking like "no stopped containers".
    let output = match listing {
        Ok(Ok(o)) if o.status.success() => o,
        Ok(Ok(o)) => {
            let stderr = String::from_utf8_lossy(&o.stderr);
            warn!("Listing stopped containers failed: {}", stderr.trim());
            return empty_report();
        }
        Ok(Err(e)) => {
            warn!("Failed to execute podman ps for stopped containers: {}", e);
            return empty_report();
        }
        Err(_) => {
            warn!("Timeout listing stopped containers (30s)");
            return empty_report();
        }
    };

    // One container name per line; the image is not part of the listing, so it
    // is left empty in the records.
    let records: Vec<RunningContainerRecord> = String::from_utf8_lossy(&output.stdout)
        .lines()
        .map(str::trim)
        .filter(|line| !line.is_empty())
        .map(|name| RunningContainerRecord { name: name.to_string(), image: String::new() })
        .collect();

    if records.is_empty() {
        return empty_report();
    }

    info!("Starting {} stopped containers after boot...", records.len());
    recover_containers(&records).await
}

/// Launch a detached background task that saves the container snapshot
/// every two minutes, with the first save two minutes after this call.
///
/// Snapshot failures are logged at debug level and do not stop the task.
pub fn spawn_snapshot_task(data_dir: PathBuf) {
    let period = std::time::Duration::from_secs(120);

    tokio::spawn(async move {
        // Wait 2 minutes before first snapshot (let crash recovery finish and containers stabilize)
        tokio::time::sleep(period).await;

        // The first tick of a fresh interval completes immediately, so the
        // first snapshot is taken right after the initial delay.
        let mut ticker = tokio::time::interval(period);
        loop {
            ticker.tick().await;
            match save_container_snapshot(&data_dir).await {
                Ok(()) => {}
                Err(e) => {
                    tracing::debug!("Container snapshot (non-fatal): {}", e);
                }
            }
        }
    });
}

// Unit tests for PID-marker handling and crash detection. Each test works in
// its own temporary directory; none of them invoke podman.
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    // No PID file at all: nothing to recover.
    #[tokio::test]
    async fn test_no_crash_without_pid_file() {
        let tmp = TempDir::new().unwrap();
        let result = check_for_crash(tmp.path()).await.unwrap();
        assert!(result.is_none());
    }

    // Stale PID file but no snapshot: a crash is detected, yet there is
    // nothing to restart, so the result is still `None`.
    #[tokio::test]
    async fn test_crash_detected_with_pid_file() {
        let tmp = TempDir::new().unwrap();
        // Write a PID file with a non-existent PID
        fs::write(tmp.path().join(PID_FILE), "999999999").await.unwrap();
        let result = check_for_crash(tmp.path()).await.unwrap();
        // No snapshot file → crash detected but no containers to recover
        assert!(result.is_none());
    }

    // Stale PID file plus a valid snapshot: the snapshot's containers are
    // returned in their recorded order.
    #[tokio::test]
    async fn test_crash_with_snapshot() {
        let tmp = TempDir::new().unwrap();
        // Write PID file
        fs::write(tmp.path().join(PID_FILE), "999999999").await.unwrap();
        // Write container snapshot
        let snapshot = ContainerSnapshot {
            timestamp: 1000,
            containers: vec![
                RunningContainerRecord {
                    name: "archy-bitcoin-knots".to_string(),
                    image: "bitcoin-knots:27.1".to_string(),
                },
                RunningContainerRecord {
                    name: "archy-mempool-web".to_string(),
                    image: "mempool/frontend:3.0".to_string(),
                },
            ],
        };
        let json = serde_json::to_string(&snapshot).unwrap();
        fs::write(tmp.path().join(CONTAINER_STATE_FILE), json).await.unwrap();

        let result = check_for_crash(tmp.path()).await.unwrap();
        assert!(result.is_some());
        let containers = result.unwrap();
        assert_eq!(containers.len(), 2);
        assert_eq!(containers[0].name, "archy-bitcoin-knots");
        assert_eq!(containers[1].name, "archy-mempool-web");
    }

    // The marker file appears after writing and disappears after removal.
    #[tokio::test]
    async fn test_write_and_remove_pid_marker() {
        let tmp = TempDir::new().unwrap();
        write_pid_marker(tmp.path()).await.unwrap();
        assert!(tmp.path().join(PID_FILE).exists());

        remove_pid_marker(tmp.path()).await;
        assert!(!tmp.path().join(PID_FILE).exists());
    }

    // The marker's content is exactly the PID of the current process.
    #[tokio::test]
    async fn test_pid_marker_contains_current_pid() {
        let tmp = TempDir::new().unwrap();
        write_pid_marker(tmp.path()).await.unwrap();
        let content = fs::read_to_string(tmp.path().join(PID_FILE)).await.unwrap();
        let saved_pid: u32 = content.parse().unwrap();
        assert_eq!(saved_pid, std::process::id());
    }

    // A full write → remove cycle leaves no trace that would look like a crash.
    #[tokio::test]
    async fn test_clean_shutdown_no_crash_on_restart() {
        let tmp = TempDir::new().unwrap();

        // Simulate: startup → write PID → clean shutdown → remove PID
        write_pid_marker(tmp.path()).await.unwrap();
        remove_pid_marker(tmp.path()).await;

        // Next startup should detect no crash
        let result = check_for_crash(tmp.path()).await.unwrap();
        assert!(result.is_none());
    }

    // An unparseable snapshot is tolerated and treated as "nothing to recover".
    #[tokio::test]
    async fn test_corrupt_snapshot_handled() {
        let tmp = TempDir::new().unwrap();
        fs::write(tmp.path().join(PID_FILE), "999999999").await.unwrap();
        fs::write(tmp.path().join(CONTAINER_STATE_FILE), "not valid json").await.unwrap();

        // Should not crash, returns None (no recoverable containers)
        let result = check_for_crash(tmp.path()).await.unwrap();
        assert!(result.is_none());
    }
}