// Source: archy/core/archipelago/src/health_monitor.rs (355 lines, 12 KiB, Rust)

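// Container Health Monitor
// Checks container health every 60s, auto-restarts exited/stopped containers (max 3 attempts
// per container), and sends WebSocket notifications to the UI on failure.
// Monitoring only runs while webhooks are enabled and the ContainerCrash event is subscribed.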
use crate::data_model::{Notification, NotificationLevel};
use crate::state::StateManager;
use crate::webhooks::{self, WebhookEvent};
use std::collections::HashMap;
use std::path::PathBuf;
use std::sync::Arc;
use tracing::{debug, info, warn};
/// Maximum number of automatic restart attempts made for one container before the monitor
/// stops retrying it; the count is cleared once the container is seen running again.
const MAX_RESTART_ATTEMPTS: u32 = 3;
/// Number of seconds between two consecutive health-check passes.
const CHECK_INTERVAL_SECS: u64 = 60;
/// Per-container counter of automatic restart attempts, used to break restart loops.
struct RestartTracker {
    /// Attempts recorded so far, keyed by container name.
    attempts: HashMap<String, u32>,
}

impl RestartTracker {
    /// Create a tracker with no recorded attempts.
    fn new() -> Self {
        let attempts = HashMap::new();
        Self { attempts }
    }

    /// Count one more restart attempt for `name`.
    ///
    /// Returns `true` while the updated count is still within `MAX_RESTART_ATTEMPTS` and
    /// `false` once it has gone past it. The count is incremented on every call either way.
    fn record_attempt(&mut self, name: &str) -> bool {
        let updated = self.attempt_count(name) + 1;
        self.attempts.insert(name.to_owned(), updated);
        updated <= MAX_RESTART_ATTEMPTS
    }

    /// Forget all attempts recorded for `name`; a no-op for unknown names.
    fn clear(&mut self, name: &str) {
        let _ = self.attempts.remove(name);
    }

    /// Number of attempts recorded for `name`, or zero if there are none.
    fn attempt_count(&self, name: &str) -> u32 {
        self.attempts.get(name).copied().unwrap_or(0)
    }
}
/// Health record for a single container, as produced by `check_containers`.
#[derive(Debug, Clone)]
struct ContainerHealth {
/// Container name as reported by podman (`Names`; the first entry when it is an array).
name: String,
/// `name` with a leading `archy-` prefix removed; equal to `name` when there is no prefix.
app_id: String,
/// Lower-cased podman `State` value, or `"unknown"` when the field is missing.
state: String,
/// `true` only when `state` is `"running"`.
healthy: bool,
}
/// Query all containers and their health status.
async fn check_containers() -> Vec<ContainerHealth> {
let output = match tokio::process::Command::new("sudo")
.args(["podman", "ps", "-a", "--format", "json"])
.output()
.await
{
Ok(o) if o.status.success() => o,
_ => return Vec::new(),
};
let stdout = String::from_utf8_lossy(&output.stdout);
let containers: Vec<serde_json::Value> =
serde_json::from_str(&stdout).unwrap_or_default();
// Backend services to skip
let skip = [
"btcpay-db", "nbxplorer", "mempool-db", "mempool-api",
"penpot-postgres", "penpot-backend", "penpot-exporter", "penpot-valkey",
"penpot-mailcatch", "immich_postgres", "immich_redis",
"endurain-db", "nextcloud-db",
];
containers
.iter()
.filter_map(|c| {
let name = c.get("Names").and_then(|v| {
if let Some(arr) = v.as_array() {
arr.first().and_then(|n| n.as_str()).map(|s| s.to_string())
} else {
v.as_str().map(|s| s.to_string())
}
})?;
let app_id = name
.strip_prefix("archy-")
.unwrap_or(&name)
.to_string();
if skip.contains(&app_id.as_str()) || app_id.ends_with("-ui") {
return None;
}
let state = c.get("State")
.and_then(|v| v.as_str())
.unwrap_or("unknown")
.to_lowercase();
let healthy = state == "running";
Some(ContainerHealth {
name,
app_id,
state,
healthy,
})
})
.collect()
}
/// Attempt to bring a container back up with `sudo podman start <name>`.
///
/// Returns `true` when the command ran and exited successfully. Returns `false` when it
/// could not be executed or exited with a failure status; both cases are logged.
async fn restart_container(name: &str) -> bool {
    info!("Auto-restarting unhealthy container: {}", name);

    let outcome = tokio::process::Command::new("sudo")
        .args(["podman", "start", name])
        .output()
        .await;

    // The command could not even be spawned or awaited.
    let output = match outcome {
        Ok(output) => output,
        Err(e) => {
            warn!("Failed to execute podman start for {}: {}", name, e);
            return false;
        }
    };

    if output.status.success() {
        info!("Successfully restarted container: {}", name);
        return true;
    }

    // podman ran but reported a failure; surface what it printed on stderr.
    let stderr = String::from_utf8_lossy(&output.stderr);
    warn!("Failed to restart container {}: {}", name, stderr.trim());
    false
}
/// Spawn the health monitor background task.
pub fn spawn_health_monitor(state: Arc<StateManager>, data_dir: PathBuf) {
tokio::spawn(async move {
// Wait 2 minutes for containers to start up
tokio::time::sleep(std::time::Duration::from_secs(120)).await;
let mut tracker = RestartTracker::new();
let mut interval = tokio::time::interval(std::time::Duration::from_secs(CHECK_INTERVAL_SECS));
loop {
interval.tick().await;
// Check webhook config — if webhooks are disabled or ContainerCrash
// isn't subscribed, skip all health monitoring (no restarts, no notifications)
let webhook_config = webhooks::load_config(&data_dir).await.unwrap_or_default();
if !webhook_config.enabled || !webhook_config.events.contains(&WebhookEvent::ContainerCrash) {
debug!("Health monitor: skipping — webhooks disabled or ContainerCrash not subscribed");
continue;
}
let containers = check_containers().await;
if containers.is_empty() {
continue;
}
let mut state_changed = false;
let (mut data, _) = state.get_snapshot().await;
for container in &containers {
if container.healthy {
// Clear restart tracker if container recovered
if tracker.attempt_count(&container.name) > 0 {
info!("Container {} is healthy again after restart", container.name);
tracker.clear(&container.name);
}
continue;
}
// Container is unhealthy (exited/stopped)
// Only try auto-restart if we haven't exceeded max attempts
if container.state == "exited" || container.state == "stopped" {
let attempts = tracker.attempt_count(&container.name);
if attempts >= MAX_RESTART_ATTEMPTS {
// Already notified, skip
debug!("Container {} exceeded max restart attempts ({})", container.name, MAX_RESTART_ATTEMPTS);
continue;
}
if tracker.record_attempt(&container.name) {
let restarted = restart_container(&container.name).await;
let attempt = tracker.attempt_count(&container.name);
if !restarted || attempt >= MAX_RESTART_ATTEMPTS {
// Push notification to UI
let notification = Notification {
id: format!("health-{}-{}", container.app_id, chrono::Utc::now().timestamp()),
level: NotificationLevel::Error,
title: format!("{} is unhealthy", container.app_id),
message: if restarted {
format!(
"Container restarted ({}/{} attempts). May need manual attention.",
attempt, MAX_RESTART_ATTEMPTS
)
} else {
format!(
"Auto-restart failed (attempt {}/{}). Container state: {}",
attempt, MAX_RESTART_ATTEMPTS, container.state
)
},
timestamp: chrono::Utc::now().to_rfc3339(),
app_id: Some(container.app_id.clone()),
};
// Keep only the latest 20 notifications
data.notifications.push(notification);
if data.notifications.len() > 20 {
data.notifications = data.notifications.split_off(data.notifications.len() - 20);
}
state_changed = true;
}
}
}
}
if state_changed {
state.update_data(data).await;
debug!("Health monitor: state updated with notifications");
}
}
});
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A fresh tracker reports zero attempts for any name.
    #[test]
    fn test_restart_tracker_new_is_empty() {
        assert_eq!(RestartTracker::new().attempt_count("any-container"), 0);
    }

    /// Each recorded attempt is allowed and bumps the count by one.
    #[test]
    fn test_restart_tracker_record_attempt_increments() {
        let mut tracker = RestartTracker::new();
        for expected in 1..=3u32 {
            assert!(tracker.record_attempt("test-container"));
            assert_eq!(tracker.attempt_count("test-container"), expected);
        }
    }

    /// Attempts up to the limit are allowed; the one after is refused but still counted.
    #[test]
    fn test_restart_tracker_max_attempts_exceeded() {
        let mut tracker = RestartTracker::new();
        let mut attempt = 1;
        while attempt <= MAX_RESTART_ATTEMPTS {
            let allowed = tracker.record_attempt("container-a");
            assert!(allowed, "Attempt {} should be allowed", attempt);
            attempt += 1;
        }
        let over_limit = tracker.record_attempt("container-a");
        assert!(!over_limit);
        assert_eq!(
            tracker.attempt_count("container-a"),
            MAX_RESTART_ATTEMPTS + 1
        );
    }

    /// Counts are kept separately per container name.
    #[test]
    fn test_restart_tracker_independent_containers() {
        let mut tracker = RestartTracker::new();
        for name in ["container-a", "container-a", "container-b"] {
            tracker.record_attempt(name);
        }
        let expectations = [("container-a", 2), ("container-b", 1), ("container-c", 0)];
        for (name, expected) in expectations {
            assert_eq!(tracker.attempt_count(name), expected);
        }
    }

    /// Clearing a container drops its count back to zero.
    #[test]
    fn test_restart_tracker_clear_resets_count() {
        let name = "container-x";
        let mut tracker = RestartTracker::new();
        tracker.record_attempt(name);
        tracker.record_attempt(name);
        assert_eq!(tracker.attempt_count(name), 2);
        tracker.clear(name);
        assert_eq!(tracker.attempt_count(name), 0);
    }

    /// After a clear, an exhausted container may be retried from scratch.
    #[test]
    fn test_restart_tracker_clear_allows_new_attempts() {
        let name = "container-y";
        let mut tracker = RestartTracker::new();
        // Exhaust attempts
        (0..=MAX_RESTART_ATTEMPTS).for_each(|_| {
            tracker.record_attempt(name);
        });
        assert!(!tracker.record_attempt(name));
        // Clear and try again
        tracker.clear(name);
        assert!(tracker.record_attempt(name));
        assert_eq!(tracker.attempt_count(name), 1);
    }

    /// Clearing a name that was never recorded must not panic.
    #[test]
    fn test_restart_tracker_clear_nonexistent_is_safe() {
        let mut tracker = RestartTracker::new();
        tracker.clear("nonexistent");
        assert_eq!(tracker.attempt_count("nonexistent"), 0);
    }

    /// Fields of a healthy record read back exactly as written.
    #[test]
    fn test_container_health_struct() {
        let health = ContainerHealth {
            name: String::from("archy-bitcoin-knots"),
            app_id: String::from("bitcoin-knots"),
            state: String::from("running"),
            healthy: true,
        };
        let ContainerHealth {
            name,
            app_id,
            state,
            healthy,
        } = health;
        assert!(healthy);
        assert_eq!(name, "archy-bitcoin-knots");
        assert_eq!(app_id, "bitcoin-knots");
        assert_eq!(state, "running");
    }

    /// Fields of an unhealthy record read back exactly as written.
    #[test]
    fn test_container_health_unhealthy() {
        let health = ContainerHealth {
            name: String::from("archy-mempool-web"),
            app_id: String::from("mempool-web"),
            state: String::from("exited"),
            healthy: false,
        };
        assert!(!health.healthy);
        assert_eq!(health.state, "exited");
    }

    /// Ensure the constant is a reasonable value (not 0, not too high).
    #[test]
    fn test_max_restart_attempts_constant() {
        assert!((1..=10).contains(&MAX_RESTART_ATTEMPTS));
        assert_eq!(MAX_RESTART_ATTEMPTS, 3);
    }

    /// The check interval stays at one minute.
    #[test]
    fn test_check_interval_constant() {
        assert_eq!(CHECK_INTERVAL_SECS, 60);
    }
}