- Added new dependencies: `adler2`, `crc32fast`, `flate2`, `miniz_oxide`, and `libredox`. - Updated existing dependencies: `tokio-rustls` to version 0.26.4 and `filetime` to version 0.2.27. - Removed the `backup.rs` file as it is no longer needed. - Introduced tests for configuration and credential management. - Enhanced the `identity` module to generate W3C compliant DID documents. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
336 lines
11 KiB
Rust
336 lines
11 KiB
Rust
// Crash Recovery Module
|
|
// Detects unexpected shutdowns and restarts containers that were running before the crash.
|
|
//
|
|
// How it works:
|
|
// 1. On startup, write a PID file as a "running" marker
|
|
// 2. Periodically snapshot which containers are running to a state file
|
|
// 3. On clean shutdown, remove the PID file
|
|
// 4. On next startup, if the PID file exists → previous instance crashed
|
|
// 5. On crash recovery: read the saved container list, restart them, log actions
|
|
|
|
use anyhow::{Context, Result};
|
|
use serde::{Deserialize, Serialize};
|
|
use std::path::{Path, PathBuf};
|
|
use tokio::fs;
|
|
use tracing::{info, warn};
|
|
|
|
const PID_FILE: &str = "archipelago.pid";
|
|
const CONTAINER_STATE_FILE: &str = "running-containers.json";
|
|
|
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
pub struct RunningContainerRecord {
|
|
pub name: String,
|
|
pub image: String,
|
|
}
|
|
|
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
struct ContainerSnapshot {
|
|
pub timestamp: u64,
|
|
pub containers: Vec<RunningContainerRecord>,
|
|
}
|
|
|
|
/// Check if the previous instance crashed (PID file exists without a clean shutdown).
|
|
/// Returns the list of containers that were running before the crash, if any.
|
|
pub async fn check_for_crash(data_dir: &Path) -> Result<Option<Vec<RunningContainerRecord>>> {
|
|
let pid_path = data_dir.join(PID_FILE);
|
|
|
|
if !pid_path.exists() {
|
|
return Ok(None);
|
|
}
|
|
|
|
// PID file exists — previous instance didn't shut down cleanly
|
|
let old_pid = fs::read_to_string(&pid_path)
|
|
.await
|
|
.unwrap_or_default()
|
|
.trim()
|
|
.to_string();
|
|
warn!("Crash detected: previous instance (PID {}) did not shut down cleanly", old_pid);
|
|
|
|
// Check if that PID is actually still running (zombie/stuck process)
|
|
if !old_pid.is_empty() {
|
|
if let Ok(pid) = old_pid.parse::<u32>() {
|
|
if is_process_running(pid) {
|
|
warn!("Previous process (PID {}) is still running — not a crash, skipping recovery", pid);
|
|
// Remove stale PID file and skip recovery
|
|
let _ = fs::remove_file(&pid_path).await;
|
|
return Ok(None);
|
|
}
|
|
}
|
|
}
|
|
|
|
// Load the saved container snapshot
|
|
let state_path = data_dir.join(CONTAINER_STATE_FILE);
|
|
let containers = if state_path.exists() {
|
|
match fs::read_to_string(&state_path).await {
|
|
Ok(content) => {
|
|
match serde_json::from_str::<ContainerSnapshot>(&content) {
|
|
Ok(snapshot) => {
|
|
info!(
|
|
"Found {} containers from pre-crash snapshot (saved at {})",
|
|
snapshot.containers.len(),
|
|
snapshot.timestamp
|
|
);
|
|
snapshot.containers
|
|
}
|
|
Err(e) => {
|
|
warn!("Failed to parse container snapshot: {}", e);
|
|
Vec::new()
|
|
}
|
|
}
|
|
}
|
|
Err(e) => {
|
|
warn!("Failed to read container snapshot: {}", e);
|
|
Vec::new()
|
|
}
|
|
}
|
|
} else {
|
|
info!("No container snapshot found — cannot determine which containers were running");
|
|
Vec::new()
|
|
};
|
|
|
|
// Clean up the stale PID file
|
|
let _ = fs::remove_file(&pid_path).await;
|
|
|
|
if containers.is_empty() {
|
|
Ok(None)
|
|
} else {
|
|
Ok(Some(containers))
|
|
}
|
|
}
|
|
|
|
/// Write the PID file to mark the current instance as running.
|
|
pub async fn write_pid_marker(data_dir: &Path) -> Result<()> {
|
|
let pid = std::process::id();
|
|
let pid_path = data_dir.join(PID_FILE);
|
|
fs::write(&pid_path, pid.to_string())
|
|
.await
|
|
.context("Failed to write PID marker")?;
|
|
Ok(())
|
|
}
|
|
|
|
/// Remove the PID file on clean shutdown.
|
|
pub async fn remove_pid_marker(data_dir: &Path) {
|
|
let pid_path = data_dir.join(PID_FILE);
|
|
if let Err(e) = fs::remove_file(&pid_path).await {
|
|
warn!("Failed to remove PID marker: {}", e);
|
|
}
|
|
}
|
|
|
|
/// Save a snapshot of currently running containers to disk.
|
|
/// Called periodically so we know what to restart after a crash.
|
|
pub async fn save_container_snapshot(data_dir: &Path) -> Result<()> {
|
|
let output = tokio::process::Command::new("sudo")
|
|
.args(["podman", "ps", "--format", "json"])
|
|
.output()
|
|
.await
|
|
.context("Failed to run podman ps")?;
|
|
|
|
if !output.status.success() {
|
|
let stderr = String::from_utf8_lossy(&output.stderr);
|
|
anyhow::bail!("podman ps failed: {}", stderr);
|
|
}
|
|
|
|
let stdout = String::from_utf8_lossy(&output.stdout);
|
|
let containers: Vec<serde_json::Value> =
|
|
serde_json::from_str(&stdout).unwrap_or_default();
|
|
|
|
let records: Vec<RunningContainerRecord> = containers
|
|
.iter()
|
|
.filter_map(|c| {
|
|
let name = c.get("Names")
|
|
.and_then(|v| {
|
|
// Podman returns Names as an array
|
|
if let Some(arr) = v.as_array() {
|
|
arr.first().and_then(|n| n.as_str()).map(|s| s.to_string())
|
|
} else {
|
|
v.as_str().map(|s| s.to_string())
|
|
}
|
|
})?;
|
|
let image = c.get("Image")
|
|
.and_then(|v| v.as_str())
|
|
.unwrap_or("unknown")
|
|
.to_string();
|
|
Some(RunningContainerRecord { name, image })
|
|
})
|
|
.collect();
|
|
|
|
let snapshot = ContainerSnapshot {
|
|
timestamp: std::time::SystemTime::now()
|
|
.duration_since(std::time::UNIX_EPOCH)
|
|
.unwrap_or_default()
|
|
.as_secs(),
|
|
containers: records,
|
|
};
|
|
|
|
let state_path = data_dir.join(CONTAINER_STATE_FILE);
|
|
let json = serde_json::to_string_pretty(&snapshot)
|
|
.context("Failed to serialize container snapshot")?;
|
|
fs::write(&state_path, json)
|
|
.await
|
|
.context("Failed to write container snapshot")?;
|
|
|
|
Ok(())
|
|
}
|
|
|
|
/// Recover containers that were running before a crash.
|
|
/// Attempts to start each container, logging success/failure.
|
|
pub async fn recover_containers(containers: &[RunningContainerRecord]) -> RecoveryReport {
|
|
let mut report = RecoveryReport {
|
|
total: containers.len(),
|
|
recovered: 0,
|
|
failed: Vec::new(),
|
|
};
|
|
|
|
for record in containers {
|
|
info!("Recovering container: {} (image: {})", record.name, record.image);
|
|
|
|
let result = tokio::process::Command::new("sudo")
|
|
.args(["podman", "start", &record.name])
|
|
.output()
|
|
.await;
|
|
|
|
match result {
|
|
Ok(output) if output.status.success() => {
|
|
info!("Successfully restarted container: {}", record.name);
|
|
report.recovered += 1;
|
|
}
|
|
Ok(output) => {
|
|
let stderr = String::from_utf8_lossy(&output.stderr);
|
|
warn!("Failed to restart container {}: {}", record.name, stderr.trim());
|
|
report.failed.push(record.name.clone());
|
|
}
|
|
Err(e) => {
|
|
warn!("Failed to execute podman start for {}: {}", record.name, e);
|
|
report.failed.push(record.name.clone());
|
|
}
|
|
}
|
|
}
|
|
|
|
report
|
|
}
|
|
|
|
/// Outcome of a crash-recovery pass over a list of containers.
#[derive(Debug)]
pub struct RecoveryReport {
    /// Number of containers recovery was attempted for.
    pub total: usize,
    /// Number of containers that were restarted successfully.
    pub recovered: usize,
    /// Names of the containers that could not be restarted.
    pub failed: Vec<String>,
}
|
|
|
|
/// Report whether a process with the given PID currently exists.
///
/// The check looks for the `/proc/<pid>` entry, so it only gives a meaningful
/// answer on systems that expose a Linux-style `/proc`; elsewhere it returns `false`.
fn is_process_running(pid: u32) -> bool {
    let proc_entry = std::path::Path::new("/proc").join(pid.to_string());
    proc_entry.exists()
}
|
|
|
|
/// Spawn a background task that periodically saves the container snapshot.
|
|
pub fn spawn_snapshot_task(data_dir: PathBuf) {
|
|
tokio::spawn(async move {
|
|
// Wait 30s before first snapshot (let containers stabilize after startup)
|
|
tokio::time::sleep(std::time::Duration::from_secs(30)).await;
|
|
|
|
let mut interval = tokio::time::interval(std::time::Duration::from_secs(60));
|
|
loop {
|
|
interval.tick().await;
|
|
if let Err(e) = save_container_snapshot(&data_dir).await {
|
|
tracing::debug!("Container snapshot (non-fatal): {}", e);
|
|
}
|
|
}
|
|
});
|
|
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    /// PID written into fake marker files; chosen so that no live process is
    /// expected to have it.
    const DEAD_PID: &str = "999999999";

    /// Plant a PID marker in `dir` as if an earlier instance had crashed.
    async fn write_stale_pid(dir: &Path) {
        fs::write(dir.join(PID_FILE), DEAD_PID).await.unwrap();
    }

    /// A fresh data directory has no marker, so no crash is reported.
    #[tokio::test]
    async fn test_no_crash_without_pid_file() {
        let dir = TempDir::new().unwrap();

        let outcome = check_for_crash(dir.path()).await.unwrap();

        assert!(outcome.is_none());
    }

    /// A stale marker without a snapshot means a crash with nothing to recover.
    #[tokio::test]
    async fn test_crash_detected_with_pid_file() {
        let dir = TempDir::new().unwrap();
        write_stale_pid(dir.path()).await;

        let outcome = check_for_crash(dir.path()).await.unwrap();

        // No snapshot file → crash detected but no containers to recover
        assert!(outcome.is_none());
    }

    /// A stale marker plus a snapshot yields the recorded containers, in order.
    #[tokio::test]
    async fn test_crash_with_snapshot() {
        let dir = TempDir::new().unwrap();
        write_stale_pid(dir.path()).await;

        let record = |name: &str, image: &str| RunningContainerRecord {
            name: name.to_string(),
            image: image.to_string(),
        };
        let snapshot = ContainerSnapshot {
            timestamp: 1000,
            containers: vec![
                record("archy-bitcoin-knots", "bitcoin-knots:27.1"),
                record("archy-mempool-web", "mempool/frontend:3.0"),
            ],
        };
        let json = serde_json::to_string(&snapshot).unwrap();
        fs::write(dir.path().join(CONTAINER_STATE_FILE), json)
            .await
            .unwrap();

        let containers = check_for_crash(dir.path())
            .await
            .unwrap()
            .expect("snapshot containers should be returned after a crash");

        let names: Vec<&str> = containers.iter().map(|c| c.name.as_str()).collect();
        assert_eq!(names, ["archy-bitcoin-knots", "archy-mempool-web"]);
    }

    /// Writing the marker creates the file; removing it deletes the file.
    #[tokio::test]
    async fn test_write_and_remove_pid_marker() {
        let dir = TempDir::new().unwrap();
        let marker = dir.path().join(PID_FILE);

        write_pid_marker(dir.path()).await.unwrap();
        assert!(marker.exists());

        remove_pid_marker(dir.path()).await;
        assert!(!marker.exists());
    }

    /// The marker file holds exactly the current process's PID.
    #[tokio::test]
    async fn test_pid_marker_contains_current_pid() {
        let dir = TempDir::new().unwrap();
        write_pid_marker(dir.path()).await.unwrap();

        let content = fs::read_to_string(dir.path().join(PID_FILE))
            .await
            .unwrap();

        assert_eq!(content.parse::<u32>().unwrap(), std::process::id());
    }

    /// Startup followed by a clean shutdown leaves nothing for the next startup to find.
    #[tokio::test]
    async fn test_clean_shutdown_no_crash_on_restart() {
        let dir = TempDir::new().unwrap();

        // Simulate: startup → write PID → clean shutdown → remove PID
        write_pid_marker(dir.path()).await.unwrap();
        remove_pid_marker(dir.path()).await;

        // Next startup should detect no crash
        let outcome = check_for_crash(dir.path()).await.unwrap();
        assert!(outcome.is_none());
    }

    /// An unparseable snapshot is tolerated and treated as nothing to recover.
    #[tokio::test]
    async fn test_corrupt_snapshot_handled() {
        let dir = TempDir::new().unwrap();
        write_stale_pid(dir.path()).await;
        fs::write(dir.path().join(CONTAINER_STATE_FILE), "not valid json")
            .await
            .unwrap();

        // Should not crash, returns None (no recoverable containers)
        let outcome = check_for_crash(dir.path()).await.unwrap();
        assert!(outcome.is_none());
    }
}
|