archy/core/archipelago/src/crash_recovery.rs
2026-05-05 11:29:18 -04:00

636 lines
22 KiB
Rust

// Crash Recovery Module
// Detects unexpected shutdowns and restarts containers that were running before the crash.
//
// How it works:
// 1. On startup, write a PID file as a "running" marker
// 2. Periodically snapshot which containers are running to a state file
// 3. On clean shutdown, remove the PID file
// 4. On next startup, if the PID file exists → previous instance crashed
// 5. On crash recovery: read the saved container list, restart them, log actions
use anyhow::{Context, Result};
use serde::{Deserialize, Serialize};
use std::path::{Path, PathBuf};
use std::sync::atomic::{AtomicBool, Ordering};
use std::time::Instant;
use tokio::fs;
use tracing::{info, warn};
/// File name (joined onto the data directory) of the PID marker that exists while an instance runs.
const PID_FILE: &str = "archipelago.pid";
/// File name (joined onto the data directory) of the periodic running-container snapshot.
const CONTAINER_STATE_FILE: &str = "running-containers.json";
/// File name (joined onto the data directory) of the persisted set of user-stopped container names.
const USER_STOPPED_FILE: &str = "user-stopped.json";
/// Shared flag: true once boot recovery is complete. Health monitor should wait for this.
/// Starts as `false`; `mark_recovery_complete` sets it and `is_recovery_complete` reads it.
pub static RECOVERY_COMPLETE: AtomicBool = AtomicBool::new(false);
/// Process start time for uptime calculation.
/// Unset until `init_start_time` runs; `uptime_seconds` reports 0 while it is unset.
static START_TIME: std::sync::OnceLock<Instant> = std::sync::OnceLock::new();
/// Record the process start instant used by [`uptime_seconds`].
///
/// Only the first call stores a value; later calls leave the originally
/// recorded instant untouched. Call once at startup.
pub fn init_start_time() {
    let _ = START_TIME.get_or_init(Instant::now);
}
/// Whole seconds elapsed since [`init_start_time`] recorded the start instant.
///
/// Returns 0 when the start instant has not been recorded yet.
pub fn uptime_seconds() -> u64 {
    START_TIME
        .get()
        .map_or(0, |started_at| started_at.elapsed().as_secs())
}
/// Mark boot recovery as complete. Call after crash recovery + start_stopped_containers finish.
///
/// Stores `true` into `RECOVERY_COMPLETE` with sequentially-consistent ordering
/// and emits an info-level log line.
pub fn mark_recovery_complete() {
RECOVERY_COMPLETE.store(true, Ordering::SeqCst);
info!("Boot recovery complete — health monitor may proceed");
}
/// Check if boot recovery is done.
///
/// Returns the current value of `RECOVERY_COMPLETE` (`false` until
/// `mark_recovery_complete` has been called).
pub fn is_recovery_complete() -> bool {
RECOVERY_COMPLETE.load(Ordering::SeqCst)
}
// ── User-stopped tracking ───────────────────────────────────────────────
// When a user explicitly stops a container via the UI, we record it here
// so crash recovery and health monitor don't auto-restart it.
/// Load the set of user-stopped containers from disk.
pub async fn load_user_stopped(data_dir: &Path) -> std::collections::HashSet<String> {
let path = data_dir.join(USER_STOPPED_FILE);
match fs::read_to_string(&path).await {
Ok(content) => serde_json::from_str(&content).unwrap_or_default(),
Err(_) => std::collections::HashSet::new(),
}
}
/// Save the set of user-stopped containers to disk.
pub async fn save_user_stopped(data_dir: &Path, stopped: &std::collections::HashSet<String>) {
let path = data_dir.join(USER_STOPPED_FILE);
if let Ok(json) = serde_json::to_string_pretty(stopped) {
let _ = fs::write(&path, json).await;
}
}
/// Record `name` as stopped on purpose by the user (won't be auto-restarted).
///
/// Loads the persisted set, adds the name, and writes the set back. The set
/// is rewritten even when the name was already present.
pub async fn mark_user_stopped(data_dir: &Path, name: &str) {
    let mut names = load_user_stopped(data_dir).await;
    names.insert(name.to_owned());
    save_user_stopped(data_dir, &names).await;
}
/// Forget that `name` was user-stopped (the user started it again manually).
///
/// The persisted set is only rewritten when the name was actually present.
pub async fn clear_user_stopped(data_dir: &Path, name: &str) {
    let mut names = load_user_stopped(data_dir).await;
    let was_recorded = names.remove(name);
    if !was_recorded {
        return;
    }
    save_user_stopped(data_dir, &names).await;
}
/// One container entry as stored in the running-container snapshot and as
/// passed to `recover_containers`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RunningContainerRecord {
/// Container name; this is the value passed to `podman start` during recovery.
pub name: String,
/// Image string taken from the podman listing (`"unknown"` when the listing has none).
/// Records built by `start_stopped_containers` leave this empty; it is only used for logging here.
pub image: String,
}
/// On-disk layout of the running-container snapshot file (serialized as JSON).
#[derive(Debug, Clone, Serialize, Deserialize)]
struct ContainerSnapshot {
/// Seconds since the Unix epoch at the moment the snapshot was built.
pub timestamp: u64,
/// Containers that `podman ps` listed when the snapshot was taken.
pub containers: Vec<RunningContainerRecord>,
}
/// Check if the previous instance crashed (PID file exists without a clean shutdown).
///
/// Returns `Ok(Some(containers))` with the containers recorded in the last
/// snapshot when a crash is detected and that snapshot is non-empty.
/// Returns `Ok(None)` when:
///
/// * there is no PID file (clean shutdown or first start);
/// * the recorded PID belongs to a *different* process that is still alive;
/// * the snapshot is missing, unreadable, malformed, or lists no containers.
///
/// Whenever a PID file was found it is removed before returning (removal
/// errors are ignored). The current implementation never returns `Err`.
pub async fn check_for_crash(data_dir: &Path) -> Result<Option<Vec<RunningContainerRecord>>> {
    let pid_path = data_dir.join(PID_FILE);
    if !pid_path.exists() {
        return Ok(None);
    }

    // PID file exists — previous instance didn't shut down cleanly
    let old_pid = fs::read_to_string(&pid_path)
        .await
        .unwrap_or_default()
        .trim()
        .to_string();
    warn!(
        "Crash detected: previous instance (PID {}) did not shut down cleanly",
        old_pid
    );

    // Check if that PID is actually still running (zombie/stuck process).
    // An empty or non-numeric file simply fails to parse and falls through.
    if let Ok(pid) = old_pid.parse::<u32>() {
        // A stale marker can hold the very PID this process was given (PID 1
        // inside a container, or the same PID handed out again after a
        // reboot). `/proc/<own pid>` always exists, so without this guard a
        // genuine crash would be mistaken for "still running".
        // NOTE(review): assumes this runs before `write_pid_marker` — confirm at the call site.
        let is_own_pid = pid == std::process::id();
        if !is_own_pid && is_process_running(pid) {
            warn!(
                "Previous process (PID {}) is still running — not a crash, skipping recovery",
                pid
            );
            // Remove stale PID file and skip recovery
            let _ = fs::remove_file(&pid_path).await;
            return Ok(None);
        }
    }

    let containers = load_snapshot_containers(&data_dir.join(CONTAINER_STATE_FILE)).await;

    // Clean up the stale PID file
    let _ = fs::remove_file(&pid_path).await;

    Ok((!containers.is_empty()).then_some(containers))
}

/// Read the container list from the snapshot at `state_path`.
/// A missing, unreadable, or malformed snapshot is logged and yields an empty list.
async fn load_snapshot_containers(state_path: &Path) -> Vec<RunningContainerRecord> {
    if !state_path.exists() {
        info!("No container snapshot found — cannot determine which containers were running");
        return Vec::new();
    }

    let content = match fs::read_to_string(state_path).await {
        Ok(content) => content,
        Err(e) => {
            warn!("Failed to read container snapshot: {}", e);
            return Vec::new();
        }
    };

    match serde_json::from_str::<ContainerSnapshot>(&content) {
        Ok(snapshot) => {
            info!(
                "Found {} containers from pre-crash snapshot (saved at {})",
                snapshot.containers.len(),
                snapshot.timestamp
            );
            snapshot.containers
        }
        Err(e) => {
            warn!("Failed to parse container snapshot: {}", e);
            Vec::new()
        }
    }
}
/// Create (or overwrite) the PID marker that flags this instance as running.
///
/// The file holds the current process id as decimal text.
///
/// # Errors
/// Returns an error with context `"Failed to write PID marker"` when the file
/// cannot be written.
pub async fn write_pid_marker(data_dir: &Path) -> Result<()> {
    let marker = data_dir.join(PID_FILE);
    let contents = std::process::id().to_string();
    fs::write(&marker, contents)
        .await
        .context("Failed to write PID marker")
}
/// Delete the PID marker as part of a clean shutdown.
///
/// Failure to delete (including the file not existing) is logged at warn
/// level and otherwise ignored.
pub async fn remove_pid_marker(data_dir: &Path) {
    let removal = fs::remove_file(data_dir.join(PID_FILE)).await;
    if let Err(err) = removal {
        warn!("Failed to remove PID marker: {}", err);
    }
}
/// Save a snapshot of currently running containers to disk.
/// Called periodically so we know what to restart after a crash.
///
/// Runs `podman ps --format json` (30 s timeout), extracts each container's
/// name and image, and stores them with the current Unix timestamp in
/// `CONTAINER_STATE_FILE` under `data_dir`.
///
/// The snapshot is written to a temporary sibling file and then renamed over
/// the real one, so a crash in the middle of a save cannot leave a truncated
/// snapshot behind.
///
/// # Errors
/// Returns an error — and leaves the previous snapshot untouched — when
/// podman times out, cannot be run, exits unsuccessfully, prints output that
/// is not a JSON array, or when the snapshot cannot be serialized or written.
pub async fn save_container_snapshot(data_dir: &Path) -> Result<()> {
    let output = tokio::time::timeout(
        std::time::Duration::from_secs(30),
        tokio::process::Command::new("podman")
            .args(["ps", "--format", "json"])
            .output(),
    )
    .await
    .context("podman ps timed out (30s)")?
    .context("Failed to run podman ps")?;

    if !output.status.success() {
        let stderr = String::from_utf8_lossy(&output.stderr);
        anyhow::bail!("podman ps failed: {}", stderr);
    }

    let stdout = String::from_utf8_lossy(&output.stdout);
    let listing = stdout.trim();
    // Empty output and JSON `null` both mean "nothing running". Anything else
    // that fails to parse is an error: silently treating it as an empty list
    // would overwrite a good snapshot with nothing to recover.
    let containers: Vec<serde_json::Value> = if listing.is_empty() {
        Vec::new()
    } else {
        serde_json::from_str::<Option<Vec<serde_json::Value>>>(listing)
            .context("Failed to parse podman ps output")?
            .unwrap_or_default()
    };

    let records: Vec<RunningContainerRecord> = containers
        .iter()
        .filter_map(|c| {
            // Podman returns Names as an array; a bare string is accepted too.
            // Entries without a usable name are skipped.
            let names = c.get("Names")?;
            let name = match names.as_array() {
                Some(arr) => arr.first().and_then(|n| n.as_str()),
                None => names.as_str(),
            };
            let name = name?.to_string();
            let image = c
                .get("Image")
                .and_then(|v| v.as_str())
                .unwrap_or("unknown")
                .to_string();
            Some(RunningContainerRecord { name, image })
        })
        .collect();

    let snapshot = ContainerSnapshot {
        timestamp: std::time::SystemTime::now()
            .duration_since(std::time::UNIX_EPOCH)
            .unwrap_or_default()
            .as_secs(),
        containers: records,
    };

    let json = serde_json::to_string_pretty(&snapshot)
        .context("Failed to serialize container snapshot")?;

    // Write-then-rename: readers only ever see the old or the new snapshot.
    let state_path = data_dir.join(CONTAINER_STATE_FILE);
    let tmp_path = data_dir.join(format!("{}.tmp", CONTAINER_STATE_FILE));
    fs::write(&tmp_path, json)
        .await
        .context("Failed to write container snapshot")?;
    fs::rename(&tmp_path, &state_path)
        .await
        .context("Failed to replace container snapshot")?;
    Ok(())
}
/// Restart, one after another, the containers listed in `containers`.
///
/// Starts are spaced 3 seconds apart to avoid overwhelming podman on
/// low-resource systems. Each container gets up to two `podman start`
/// attempts: the first with a 120 s timeout, then — after a 10 s pause — a
/// retry with a 180 s timeout. Every outcome is logged.
///
/// The returned report counts all input records in `total`, successful
/// starts in `recovered`, and lists in `failed` the names that did not start
/// on either attempt.
pub async fn recover_containers(containers: &[RunningContainerRecord]) -> RecoveryReport {
    // Timeout in seconds for the first attempt and for the single retry.
    const ATTEMPT_TIMEOUTS_SECS: [u64; 2] = [120, 180];

    let mut recovered = 0usize;
    let mut failed: Vec<String> = Vec::new();

    for (index, record) in containers.iter().enumerate() {
        info!(
            "Recovering container: {} (image: {})",
            record.name, record.image
        );

        // Rate-limit: pause before every container except the first.
        if index != 0 {
            tokio::time::sleep(std::time::Duration::from_secs(3)).await;
        }

        let mut started = false;
        for (attempt_no, timeout_secs) in (1u32..).zip(ATTEMPT_TIMEOUTS_SECS) {
            if attempt_no > 1 {
                info!(
                    "Retrying container {} (attempt {})",
                    record.name, attempt_no
                );
                tokio::time::sleep(std::time::Duration::from_secs(10)).await;
            }
            if try_start_container(&record.name, timeout_secs, attempt_no).await {
                started = true;
                break;
            }
        }

        if started {
            recovered += 1;
        } else {
            failed.push(record.name.clone());
        }
    }

    RecoveryReport {
        total: containers.len(),
        recovered,
        failed,
    }
}

/// Run one `podman start <name>` bounded by `timeout_secs`, log the outcome,
/// and report whether the container started. `attempt_no` is 1-based and only used in log text.
async fn try_start_container(name: &String, timeout_secs: u64, attempt_no: u32) -> bool {
    let outcome = tokio::time::timeout(
        std::time::Duration::from_secs(timeout_secs),
        tokio::process::Command::new("podman")
            .args(["start", name])
            .output(),
    )
    .await;

    match outcome {
        Ok(Ok(output)) if output.status.success() => {
            info!("Successfully restarted container: {}", name);
            true
        }
        Ok(Ok(output)) => {
            let stderr = String::from_utf8_lossy(&output.stderr);
            warn!(
                "Failed to restart container {} (attempt {}): {}",
                name,
                attempt_no,
                stderr.trim()
            );
            false
        }
        Ok(Err(e)) => {
            warn!(
                "Failed to execute podman start for {} (attempt {}): {}",
                name, attempt_no, e
            );
            false
        }
        Err(_) => {
            warn!(
                "Timeout starting container {} ({}s, attempt {})",
                name, timeout_secs, attempt_no
            );
            false
        }
    }
}
/// Outcome summary returned by `recover_containers` and `start_stopped_containers`.
#[derive(Debug)]
pub struct RecoveryReport {
/// Number of containers a start was attempted for (0 when nothing needed starting).
pub total: usize,
/// Number of containers for which `podman start` succeeded.
pub recovered: usize,
/// Names of the containers that did not start on any attempt.
pub failed: Vec<String>,
}
/// Report whether a process with the given PID currently exists.
///
/// Implemented as an existence check on `/proc/<pid>`, so on systems without
/// a Linux-style `/proc` this always returns `false`.
fn is_process_running(pid: u32) -> bool {
    let proc_entry = std::path::PathBuf::from("/proc").join(pid.to_string());
    proc_entry.exists()
}
/// Start stopped containers that generic boot recovery is responsible for.
/// Runs on every startup to ensure containers come back after clean reboots.
/// The crash recovery (PID-based) handles dirty shutdowns; this handles clean ones.
///
/// Steps:
/// 1. List containers in `exited` or `created` state via `podman ps -a` (60 s timeout).
/// 2. Drop containers the user intentionally stopped (see `load_user_stopped`).
/// 3. Keep only names accepted by `should_auto_start_stopped_container`.
/// 4. Order the rest by `container_boot_tier` and hand them to `recover_containers`.
///
/// Returns the report from `recover_containers`, or an all-zero report when
/// the listing fails or nothing is left to start. Listing failures are logged
/// at warn level rather than returned.
pub async fn start_stopped_containers(data_dir: &Path) -> RecoveryReport {
    let empty_report = || RecoveryReport {
        total: 0,
        recovered: 0,
        failed: Vec::new(),
    };

    let listing = tokio::time::timeout(
        std::time::Duration::from_secs(60),
        tokio::process::Command::new("podman")
            .args([
                "ps",
                "-a",
                "--filter",
                "status=exited",
                "--filter",
                "status=created",
                "--format",
                "{{.Names}}",
            ])
            .output(),
    )
    .await;

    let all_names: Vec<String> = match listing {
        Ok(Ok(o)) if o.status.success() => String::from_utf8_lossy(&o.stdout)
            .lines()
            .filter(|l| !l.is_empty())
            .map(String::from)
            .collect(),
        Ok(Ok(o)) => {
            warn!(
                "Failed to list stopped containers: {}",
                String::from_utf8_lossy(&o.stderr).trim()
            );
            return empty_report();
        }
        Ok(Err(e)) => {
            warn!("Failed to run podman ps for stopped containers: {}", e);
            return empty_report();
        }
        Err(_) => {
            warn!("Timeout listing stopped containers (60s)");
            return empty_report();
        }
    };
    if all_names.is_empty() {
        return empty_report();
    }

    // Filter out user-stopped containers, counting how many were actually
    // skipped so the summary log reflects this run rather than the size of
    // the whole user-stopped set.
    let user_stopped = load_user_stopped(data_dir).await;
    let mut skipped_user_stopped = 0usize;
    let mut records: Vec<RunningContainerRecord> = all_names
        .into_iter()
        .filter(|name| {
            if user_stopped.contains(name) {
                info!("Skipping user-stopped container: {}", name);
                skipped_user_stopped += 1;
                false
            } else {
                true
            }
        })
        .filter(|name| should_auto_start_stopped_container(name))
        .map(|name| RunningContainerRecord {
            name,
            image: String::new(),
        })
        .collect();
    if records.is_empty() {
        return empty_report();
    }

    // Sort by startup tier: databases first, then core, then dependent services, then apps.
    // The sort is stable, so podman's listing order is kept within a tier.
    records.sort_by_key(|r| container_boot_tier(&r.name));

    info!(
        "Starting {} stopped containers after boot (skipped {} user-stopped)...",
        records.len(),
        skipped_user_stopped
    );
    recover_containers(&records).await
}
/// Decide whether generic boot recovery may start the stopped container `name`.
///
/// Only an exact match against a short allow-list qualifies.
fn should_auto_start_stopped_container(name: &str) -> bool {
    // Keep generic boot recovery narrow. The Rust manifest reconciler owns
    // managed app stacks; starting every exited Podman container here races
    // it and resurrects legacy/orphan helper containers.
    const AUTO_START_ALLOWLIST: [&str; 2] = ["filebrowser", "nostr-rs-relay"];
    AUTO_START_ALLOWLIST.contains(&name)
}
/// Startup tier for a container during boot recovery; lower tiers start first.
/// (Per the original note, this mirrors the health_monitor tiers.)
///
/// A leading `archy-` prefix is ignored when matching. Tiers:
/// 0 = databases and data stores, 1 = core infrastructure,
/// 2 = dependent services, 3 = everything not listed, 4 = frontends/UIs.
fn container_boot_tier(name: &str) -> u8 {
    const DATA_STORES: &[&str] = &[
        "btcpay-db",
        "mempool-db",
        "mysql-mempool",
        "penpot-postgres",
        "immich_postgres",
        "immich_redis",
        "penpot-valkey",
        "endurain-db",
        "nextcloud-db",
        "indeedhub-postgres",
        "indeedhub-redis",
        "indeedhub-minio",
    ];
    const CORE_INFRA: &[&str] = &["bitcoin-knots", "bitcoin-core", "bitcoin"];
    const DEPENDENT_SERVICES: &[&str] = &[
        "lnd",
        "electrumx",
        "mempool-electrs",
        "electrs",
        "nbxplorer",
        "mempool-api",
        "indeedhub-api",
    ];
    const FRONTENDS: &[&str] = &[
        "mempool-web",
        "bitcoin-ui",
        "lnd-ui",
        "electrs-ui",
        "penpot-frontend",
        "penpot-exporter",
        "indeedhub",
    ];

    let id = name.strip_prefix("archy-").unwrap_or(name);

    if DATA_STORES.contains(&id) {
        0
    } else if CORE_INFRA.contains(&id) {
        1
    } else if DEPENDENT_SERVICES.contains(&id) {
        2
    } else if FRONTENDS.contains(&id) {
        4
    } else {
        // Anything unlisted sorts after dependent services but before frontends.
        3
    }
}
/// Run the container reconciliation script once after boot and log the outcome.
///
/// Per the original notes, the script fixes config drift so containers match
/// their canonical specs (container-specs.sh). When the script is not present
/// the step is skipped. The run is limited to 300 seconds; every outcome
/// (success, non-zero exit, spawn failure, timeout) is only logged, and on a
/// non-zero exit at most the first 500 characters of stderr are included.
#[allow(dead_code)]
pub async fn run_boot_reconciliation() {
    const SCRIPT: &str = "/home/archipelago/archy/scripts/reconcile-containers.sh";
    const TIME_LIMIT: std::time::Duration = std::time::Duration::from_secs(300);

    if !std::path::Path::new(SCRIPT).exists() {
        info!("Reconciliation script not found (dev mode?) — skipping boot reconciliation");
        return;
    }

    info!("Running boot reconciliation...");
    let outcome =
        tokio::time::timeout(TIME_LIMIT, tokio::process::Command::new(SCRIPT).output()).await;

    let output = match outcome {
        Err(_) => {
            warn!("Boot reconciliation timed out (300s)");
            return;
        }
        Ok(Err(e)) => {
            warn!("Boot reconciliation failed to run: {}", e);
            return;
        }
        Ok(Ok(output)) => output,
    };

    if output.status.success() {
        info!("Boot reconciliation complete");
        return;
    }

    // Keep the log line bounded: only the head of stderr is reported.
    let stderr = String::from_utf8_lossy(&output.stderr);
    let excerpt: String = stderr.chars().take(500).collect();
    warn!("Boot reconciliation had failures: {}", excerpt);
}
/// Launch a detached tokio task that keeps the running-container snapshot fresh.
///
/// The task waits two minutes (so crash recovery can finish and containers can
/// stabilize) and then calls `save_container_snapshot` on a two-minute
/// interval, skipping ticks that were missed. Snapshot failures are non-fatal
/// and only logged at debug level. The task loops forever and its join handle
/// is discarded.
pub fn spawn_snapshot_task(data_dir: PathBuf) {
    const PERIOD: std::time::Duration = std::time::Duration::from_secs(120);

    tokio::spawn(async move {
        // Initial grace period before the first snapshot.
        tokio::time::sleep(PERIOD).await;

        let mut ticker = tokio::time::interval(PERIOD);
        ticker.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip);

        loop {
            ticker.tick().await;
            let saved = save_container_snapshot(&data_dir).await;
            if let Err(err) = saved {
                tracing::debug!("Container snapshot (non-fatal): {}", err);
            }
        }
    });
}
// Unit tests for crash detection, the PID marker lifecycle, and the boot
// auto-start allow-list. Each filesystem test works in its own temporary directory.
#[cfg(test)]
mod tests {
use super::*;
use tempfile::TempDir;
// No PID file in the data directory → nothing to recover.
#[tokio::test]
async fn test_no_crash_without_pid_file() {
let tmp = TempDir::new().unwrap();
let result = check_for_crash(tmp.path()).await.unwrap();
assert!(result.is_none());
}
// A stale PID file without a snapshot is a crash with nothing to restart.
#[tokio::test]
async fn test_crash_detected_with_pid_file() {
let tmp = TempDir::new().unwrap();
// Write a PID file with a non-existent PID
fs::write(tmp.path().join(PID_FILE), "999999999")
.await
.unwrap();
let result = check_for_crash(tmp.path()).await.unwrap();
// No snapshot file → crash detected but no containers to recover
assert!(result.is_none());
}
// A stale PID file plus a valid snapshot returns the recorded containers in order.
#[tokio::test]
async fn test_crash_with_snapshot() {
let tmp = TempDir::new().unwrap();
// Write PID file
fs::write(tmp.path().join(PID_FILE), "999999999")
.await
.unwrap();
// Write container snapshot
let snapshot = ContainerSnapshot {
timestamp: 1000,
containers: vec![
RunningContainerRecord {
name: "archy-bitcoin-knots".to_string(),
image: "bitcoin-knots:27.1".to_string(),
},
RunningContainerRecord {
name: "archy-mempool-web".to_string(),
image: "mempool/frontend:3.0".to_string(),
},
],
};
let json = serde_json::to_string(&snapshot).unwrap();
fs::write(tmp.path().join(CONTAINER_STATE_FILE), json)
.await
.unwrap();
let result = check_for_crash(tmp.path()).await.unwrap();
assert!(result.is_some());
let containers = result.unwrap();
assert_eq!(containers.len(), 2);
assert_eq!(containers[0].name, "archy-bitcoin-knots");
assert_eq!(containers[1].name, "archy-mempool-web");
}
// Writing the marker creates the file; removing it deletes the file.
#[tokio::test]
async fn test_write_and_remove_pid_marker() {
let tmp = TempDir::new().unwrap();
write_pid_marker(tmp.path()).await.unwrap();
assert!(tmp.path().join(PID_FILE).exists());
remove_pid_marker(tmp.path()).await;
assert!(!tmp.path().join(PID_FILE).exists());
}
// The marker content parses back to this process's own PID.
#[tokio::test]
async fn test_pid_marker_contains_current_pid() {
let tmp = TempDir::new().unwrap();
write_pid_marker(tmp.path()).await.unwrap();
let content = fs::read_to_string(tmp.path().join(PID_FILE)).await.unwrap();
let saved_pid: u32 = content.parse().unwrap();
assert_eq!(saved_pid, std::process::id());
}
// A write followed by a clean removal must not look like a crash afterwards.
#[tokio::test]
async fn test_clean_shutdown_no_crash_on_restart() {
let tmp = TempDir::new().unwrap();
// Simulate: startup → write PID → clean shutdown → remove PID
write_pid_marker(tmp.path()).await.unwrap();
remove_pid_marker(tmp.path()).await;
// Next startup should detect no crash
let result = check_for_crash(tmp.path()).await.unwrap();
assert!(result.is_none());
}
// An unparseable snapshot is tolerated and treated as "nothing to recover".
#[tokio::test]
async fn test_corrupt_snapshot_handled() {
let tmp = TempDir::new().unwrap();
fs::write(tmp.path().join(PID_FILE), "999999999")
.await
.unwrap();
fs::write(tmp.path().join(CONTAINER_STATE_FILE), "not valid json")
.await
.unwrap();
// Should not crash, returns None (no recoverable containers)
let result = check_for_crash(tmp.path()).await.unwrap();
assert!(result.is_none());
}
// Only the allow-listed names are auto-started; other stacks are left alone.
#[test]
fn generic_boot_recovery_skips_manifest_owned_and_legacy_stacks() {
assert!(should_auto_start_stopped_container("filebrowser"));
assert!(should_auto_start_stopped_container("nostr-rs-relay"));
assert!(!should_auto_start_stopped_container("bitcoin-knots"));
assert!(!should_auto_start_stopped_container("lnd"));
assert!(!should_auto_start_stopped_container("indeedhub-postgres"));
}
}