Miscellaneous improvements: botfights manifest, discover page curated apps, mobile gamepad enhancements, content HTTP handler, package install config updates, health monitor tweaks, shared content UI, container specs and image version updates. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
939 lines
35 KiB
Rust
939 lines
35 KiB
Rust
// Container Health Monitor
|
|
// Checks container health every 120s, auto-restarts unhealthy containers (max 10 times)
|
|
// with exponential backoff (10s..120s), dependency-aware restart ordering (deps first),
|
|
// handles "created" state containers, resets dependent counters when deps recover,
|
|
// and sends WebSocket notifications to the UI on failure.
|
|
|
|
use crate::data_model::{Notification, NotificationLevel};
|
|
use crate::state::StateManager;
|
|
use crate::webhooks::{self, WebhookEvent};
|
|
use serde::{Deserialize, Serialize};
|
|
use std::collections::HashMap;
|
|
use std::path::{Path, PathBuf};
|
|
use std::sync::Arc;
|
|
use std::time::Instant;
|
|
use tracing::{debug, info, warn};
|
|
|
|
const MAX_RESTART_ATTEMPTS: u32 = 10;
|
|
const CHECK_INTERVAL_SECS: u64 = 120;
|
|
/// Backoff delays per attempt — escalating from 10s to 120s
|
|
const BACKOFF_DELAYS_SECS: [u64; 10] = [10, 15, 20, 30, 30, 45, 60, 60, 90, 120];
|
|
/// Reset restart counter after 1 hour of stability
|
|
const STABILITY_RESET_SECS: u64 = 3600;
|
|
|
|
/// Container startup tier for dependency ordering.
|
|
/// Lower tiers start first. Containers in the same tier start together.
|
|
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
|
|
enum StartupTier {
|
|
/// Databases: postgres, redis, mariadb, mysql
|
|
Database = 0,
|
|
/// Core infrastructure: bitcoin-knots, bitcoin-core
|
|
CoreInfra = 1,
|
|
/// Services depending on core: lnd, electrs, nbxplorer
|
|
DependentService = 2,
|
|
/// Application layer: mempool-api, btcpay-server, fedimint, nextcloud, etc.
|
|
Application = 3,
|
|
/// UI/frontend containers: mempool-web, bitcoin-ui, lnd-ui
|
|
Frontend = 4,
|
|
}
|
|
|
|
fn container_tier(name: &str) -> StartupTier {
|
|
let id = name.strip_prefix("archy-").unwrap_or(name);
|
|
match id {
|
|
// Tier 0: Databases and data stores
|
|
"btcpay-db" | "mempool-db" | "mysql-mempool" | "penpot-postgres"
|
|
| "immich_postgres" | "immich_redis" | "penpot-valkey"
|
|
| "endurain-db" | "nextcloud-db"
|
|
| "indeedhub-postgres" | "indeedhub-redis" | "indeedhub-minio" => StartupTier::Database,
|
|
|
|
// Tier 1: Core infrastructure
|
|
"bitcoin-knots" | "bitcoin-core" | "bitcoin" => StartupTier::CoreInfra,
|
|
|
|
// Tier 2: Dependent services (need databases or bitcoin)
|
|
"lnd" | "electrumx" | "mempool-electrs" | "electrs" | "nbxplorer"
|
|
| "mempool-api" | "indeedhub-api" => StartupTier::DependentService,
|
|
|
|
// Tier 4: Frontend/UI
|
|
"mempool-web" | "bitcoin-ui" | "lnd-ui" | "electrs-ui"
|
|
| "penpot-frontend" | "penpot-exporter"
|
|
| "indeedhub" => StartupTier::Frontend,
|
|
|
|
// Tier 3: Application layer (everything else)
|
|
_ => StartupTier::Application,
|
|
}
|
|
}
|
|
|
|
/// Map containers to their required dependencies.
|
|
/// When a dependent fails, check and restart its dependencies first.
|
|
fn container_dependencies(name: &str) -> &'static [&'static str] {
|
|
let id = name.strip_prefix("archy-").unwrap_or(name);
|
|
match id {
|
|
// Bitcoin-dependent chain
|
|
"lnd" => &["bitcoin-knots"],
|
|
"electrumx" | "mempool-electrs" | "electrs" => &["bitcoin-knots"],
|
|
"nbxplorer" => &["bitcoin-knots"],
|
|
"btcpay-server" => &["btcpay-db", "nbxplorer"],
|
|
"mempool-api" => &["mempool-db", "electrumx"],
|
|
"mempool-web" => &["mempool-api"],
|
|
"fedimint" => &["bitcoin-knots"],
|
|
"fedimint-gateway" => &["bitcoin-knots", "fedimint"],
|
|
|
|
// IndeedHub stack
|
|
"indeedhub-api" => &["indeedhub-postgres", "indeedhub-redis"],
|
|
"indeedhub" => &["indeedhub-api"],
|
|
"indeedhub-relay" => &["indeedhub-postgres"],
|
|
"indeedhub-ffmpeg" => &["indeedhub-api"],
|
|
|
|
// Multi-container stacks
|
|
"immich_server" => &["immich_postgres", "immich_redis"],
|
|
"penpot-backend" => &["penpot-postgres", "penpot-valkey"],
|
|
"penpot-frontend" => &["penpot-backend"],
|
|
|
|
// UI containers
|
|
"bitcoin-ui" => &["bitcoin-knots"],
|
|
"lnd-ui" => &["lnd"],
|
|
"electrs-ui" => &["electrumx"],
|
|
|
|
_ => &[],
|
|
}
|
|
}
|
|
|
|
/// Check if all of a container's dependencies are currently running.
|
|
fn deps_are_running(name: &str, containers: &[ContainerHealth]) -> bool {
|
|
let deps = container_dependencies(name);
|
|
if deps.is_empty() {
|
|
return true;
|
|
}
|
|
for dep in deps {
|
|
// Check both plain name and archy- prefixed name
|
|
let dep_running = containers.iter().any(|c| {
|
|
let c_id = c.name.strip_prefix("archy-").unwrap_or(&c.name);
|
|
(c_id == *dep || c.name == *dep) && c.state == "running"
|
|
});
|
|
if !dep_running {
|
|
return false;
|
|
}
|
|
}
|
|
true
|
|
}
|
|
|
|
/// Track restart attempts per container with exponential backoff and stability reset.
|
|
struct RestartTracker {
|
|
attempts: HashMap<String, u32>,
|
|
last_failure: HashMap<String, Instant>,
|
|
last_healthy: HashMap<String, Instant>,
|
|
}
|
|
|
|
impl RestartTracker {
|
|
fn new() -> Self {
|
|
Self {
|
|
attempts: HashMap::new(),
|
|
last_failure: HashMap::new(),
|
|
last_healthy: HashMap::new(),
|
|
}
|
|
}
|
|
|
|
/// Record a restart attempt. Returns false if max attempts exceeded.
|
|
fn record_attempt(&mut self, name: &str) -> bool {
|
|
let count = self.attempts.entry(name.to_string()).or_insert(0);
|
|
*count += 1;
|
|
self.last_failure.insert(name.to_string(), Instant::now());
|
|
*count <= MAX_RESTART_ATTEMPTS
|
|
}
|
|
|
|
/// Clear restart count when a container is healthy again.
|
|
fn clear(&mut self, name: &str) {
|
|
self.attempts.remove(name);
|
|
self.last_failure.remove(name);
|
|
self.last_healthy.insert(name.to_string(), Instant::now());
|
|
}
|
|
|
|
fn attempt_count(&self, name: &str) -> u32 {
|
|
*self.attempts.get(name).unwrap_or(&0)
|
|
}
|
|
|
|
/// Get the backoff delay in seconds for the current attempt number.
|
|
fn backoff_delay_secs(&self, name: &str) -> u64 {
|
|
let attempts = self.attempt_count(name);
|
|
if attempts == 0 {
|
|
return BACKOFF_DELAYS_SECS[0];
|
|
}
|
|
let idx = (attempts as usize).saturating_sub(1).min(BACKOFF_DELAYS_SECS.len() - 1);
|
|
BACKOFF_DELAYS_SECS[idx]
|
|
}
|
|
|
|
/// Check if enough time has passed since last failure for the backoff delay.
|
|
fn backoff_elapsed(&self, name: &str) -> bool {
|
|
let delay = self.backoff_delay_secs(name);
|
|
match self.last_failure.get(name) {
|
|
Some(last) => last.elapsed().as_secs() >= delay,
|
|
None => true,
|
|
}
|
|
}
|
|
|
|
/// Check if a failed container should have its counter reset (1h stability window).
|
|
fn should_reset_failed(&self, name: &str) -> bool {
|
|
if self.attempt_count(name) < MAX_RESTART_ATTEMPTS {
|
|
return false;
|
|
}
|
|
match self.last_failure.get(name) {
|
|
Some(last) => last.elapsed().as_secs() >= STABILITY_RESET_SECS,
|
|
None => false,
|
|
}
|
|
}
|
|
}
|
|
|
|
#[derive(Debug, Clone)]
|
|
struct ContainerHealth {
|
|
name: String,
|
|
app_id: String,
|
|
state: String,
|
|
healthy: bool,
|
|
}
|
|
|
|
/// Track container memory usage over time for leak detection.
|
|
struct MemoryTracker {
|
|
/// Per-container memory samples: (timestamp, rss_bytes)
|
|
samples: HashMap<String, Vec<(Instant, u64)>>,
|
|
}
|
|
|
|
impl MemoryTracker {
|
|
fn new() -> Self {
|
|
Self {
|
|
samples: HashMap::new(),
|
|
}
|
|
}
|
|
|
|
/// Record a memory sample for a container.
|
|
fn record(&mut self, name: &str, rss_bytes: u64) {
|
|
let entry = self.samples.entry(name.to_string()).or_default();
|
|
entry.push((Instant::now(), rss_bytes));
|
|
// Keep only last 288 samples (24h at 5min intervals)
|
|
if entry.len() > 288 {
|
|
entry.remove(0);
|
|
}
|
|
}
|
|
|
|
/// Check if a container's memory has grown by more than 50% over the tracking period.
|
|
/// Returns Some(growth_percent) if a leak is detected, None otherwise.
|
|
fn check_leak(&self, name: &str) -> Option<f64> {
|
|
let samples = self.samples.get(name)?;
|
|
if samples.len() < 12 {
|
|
return None; // Need at least 1 hour of data
|
|
}
|
|
let (oldest_time, oldest_rss) = samples.first()?;
|
|
let (_, latest_rss) = samples.last()?;
|
|
let elapsed_hours = oldest_time.elapsed().as_secs() as f64 / 3600.0;
|
|
if elapsed_hours < 1.0 || *oldest_rss == 0 {
|
|
return None;
|
|
}
|
|
let growth = (*latest_rss as f64 - *oldest_rss as f64) / *oldest_rss as f64 * 100.0;
|
|
if growth > 50.0 {
|
|
Some(growth)
|
|
} else {
|
|
None
|
|
}
|
|
}
|
|
|
|
}
|
|
|
|
// ── Persistent restart tracking ────────────────────────────────────────
|
|
// Survives process restarts so a container can't loop infinitely by
|
|
// crashing 3 times → triggering process restart → resetting counter → repeat.
|
|
|
|
const RESTART_HISTORY_FILE: &str = "restart-tracker.json";
|
|
|
|
#[derive(Serialize, Deserialize, Default)]
|
|
struct RestartHistory {
|
|
containers: HashMap<String, ContainerRestartRecord>,
|
|
}
|
|
|
|
#[derive(Serialize, Deserialize, Clone)]
|
|
struct ContainerRestartRecord {
|
|
attempts: u32,
|
|
last_failure_epoch: i64,
|
|
}
|
|
|
|
impl RestartHistory {
|
|
async fn load(data_dir: &Path) -> Self {
|
|
let path = data_dir.join(RESTART_HISTORY_FILE);
|
|
match tokio::fs::read_to_string(&path).await {
|
|
Ok(content) => serde_json::from_str(&content).unwrap_or_default(),
|
|
Err(_) => Self::default(),
|
|
}
|
|
}
|
|
|
|
async fn save(&self, data_dir: &Path) {
|
|
let path = data_dir.join(RESTART_HISTORY_FILE);
|
|
if let Ok(json) = serde_json::to_string(self) {
|
|
let _ = tokio::fs::write(&path, json).await;
|
|
}
|
|
}
|
|
|
|
/// Seed the in-memory RestartTracker from persisted history.
|
|
fn seed_tracker(&self, tracker: &mut RestartTracker) {
|
|
let now_epoch = chrono::Utc::now().timestamp();
|
|
for (name, record) in &self.containers {
|
|
// Only seed if last failure was within the stability window
|
|
let secs_since_failure = now_epoch - record.last_failure_epoch;
|
|
if secs_since_failure < STABILITY_RESET_SECS as i64 && record.attempts > 0 {
|
|
tracker.attempts.insert(name.clone(), record.attempts);
|
|
info!(
|
|
"Restored restart counter for {}: {} attempts ({}s ago)",
|
|
name, record.attempts, secs_since_failure
|
|
);
|
|
}
|
|
}
|
|
}
|
|
|
|
fn record_attempt(&mut self, name: &str) {
|
|
let entry = self.containers.entry(name.to_string()).or_insert(ContainerRestartRecord {
|
|
attempts: 0,
|
|
last_failure_epoch: 0,
|
|
});
|
|
entry.attempts += 1;
|
|
entry.last_failure_epoch = chrono::Utc::now().timestamp();
|
|
}
|
|
|
|
fn clear(&mut self, name: &str) {
|
|
self.containers.remove(name);
|
|
}
|
|
}
|
|
|
|
/// Query container memory stats from podman.
|
|
async fn check_container_memory() -> HashMap<String, u64> {
|
|
let output = match tokio::time::timeout(
|
|
std::time::Duration::from_secs(30),
|
|
tokio::process::Command::new("podman")
|
|
.args(["stats", "--no-stream", "--format", "{{.Name}} {{.MemUsage}}"])
|
|
.output(),
|
|
)
|
|
.await
|
|
{
|
|
Ok(Ok(o)) if o.status.success() => o,
|
|
Ok(Err(e)) => {
|
|
debug!("podman stats failed: {}", e);
|
|
return HashMap::new();
|
|
}
|
|
Err(_) => {
|
|
debug!("podman stats timed out (30s)");
|
|
return HashMap::new();
|
|
}
|
|
_ => return HashMap::new(),
|
|
};
|
|
|
|
let stdout = String::from_utf8_lossy(&output.stdout);
|
|
let mut result = HashMap::new();
|
|
for line in stdout.lines() {
|
|
let parts: Vec<&str> = line.split_whitespace().collect();
|
|
if parts.len() >= 2 {
|
|
let name = parts[0].to_string();
|
|
// Parse memory like "123.4MiB", "1.2GiB", "45.6kB"
|
|
let mem_str = parts[1];
|
|
if let Some(bytes) = parse_memory_string(mem_str) {
|
|
result.insert(name, bytes);
|
|
}
|
|
}
|
|
}
|
|
result
|
|
}
|
|
|
|
/// Parse memory string like "123.4MiB" or "1.2GiB" to bytes.
|
|
fn parse_memory_string(s: &str) -> Option<u64> {
|
|
let s = s.trim();
|
|
if s.ends_with("GiB") {
|
|
let num: f64 = s.strip_suffix("GiB")?.parse().ok()?;
|
|
Some((num * 1_073_741_824.0) as u64)
|
|
} else if s.ends_with("MiB") {
|
|
let num: f64 = s.strip_suffix("MiB")?.parse().ok()?;
|
|
Some((num * 1_048_576.0) as u64)
|
|
} else if s.ends_with("KiB") || s.ends_with("kB") {
|
|
let suffix = if s.ends_with("KiB") { "KiB" } else { "kB" };
|
|
let num: f64 = s.strip_suffix(suffix)?.parse().ok()?;
|
|
Some((num * 1024.0) as u64)
|
|
} else if s.ends_with("B") {
|
|
let num: f64 = s.strip_suffix('B')?.parse().ok()?;
|
|
Some(num as u64)
|
|
} else {
|
|
None
|
|
}
|
|
}
|
|
|
|
/// Query all containers and their health status.
|
|
async fn check_containers() -> Vec<ContainerHealth> {
|
|
let output = match tokio::time::timeout(
|
|
std::time::Duration::from_secs(60),
|
|
tokio::process::Command::new("podman")
|
|
.args(["ps", "-a", "--format", "json"])
|
|
.output(),
|
|
)
|
|
.await
|
|
{
|
|
Ok(Ok(o)) if o.status.success() => o,
|
|
Ok(Err(e)) => {
|
|
debug!("podman ps failed: {}", e);
|
|
return Vec::new();
|
|
}
|
|
Err(_) => {
|
|
debug!("podman ps timed out (60s)");
|
|
return Vec::new();
|
|
}
|
|
_ => return Vec::new(),
|
|
};
|
|
|
|
let stdout = String::from_utf8_lossy(&output.stdout);
|
|
let containers: Vec<serde_json::Value> =
|
|
serde_json::from_str(&stdout).unwrap_or_default();
|
|
|
|
// Monitor ALL long-running containers for health — backend services (databases,
|
|
// nbxplorer, mempool-api) and UI containers need auto-restart too.
|
|
// Only skip ephemeral containers (build infrastructure, init one-shots).
|
|
|
|
containers
|
|
.iter()
|
|
.filter_map(|c| {
|
|
let name = c.get("Names").and_then(|v| {
|
|
if let Some(arr) = v.as_array() {
|
|
arr.first().and_then(|n| n.as_str()).map(|s| s.to_string())
|
|
} else {
|
|
v.as_str().map(|s| s.to_string())
|
|
}
|
|
})?;
|
|
|
|
// Skip podman-compose infrastructure and one-shot init containers
|
|
if name.starts_with("indeedhub-build_") || name.contains("-init") {
|
|
return None;
|
|
}
|
|
|
|
let app_id = name
|
|
.strip_prefix("archy-")
|
|
.unwrap_or(&name)
|
|
.to_string();
|
|
|
|
let state = c.get("State")
|
|
.and_then(|v| v.as_str())
|
|
.unwrap_or("unknown")
|
|
.to_lowercase();
|
|
|
|
let healthy = state == "running";
|
|
|
|
Some(ContainerHealth {
|
|
name,
|
|
app_id,
|
|
state,
|
|
healthy,
|
|
})
|
|
})
|
|
.collect()
|
|
}
|
|
|
|
/// Try to restart a container.
|
|
async fn restart_container(name: &str) -> bool {
|
|
info!("Auto-restarting unhealthy container: {}", name);
|
|
let result = tokio::time::timeout(
|
|
std::time::Duration::from_secs(120),
|
|
tokio::process::Command::new("podman")
|
|
.args(["start", name])
|
|
.output(),
|
|
)
|
|
.await;
|
|
|
|
match result {
|
|
Ok(Ok(output)) if output.status.success() => {
|
|
info!("Successfully restarted container: {}", name);
|
|
true
|
|
}
|
|
Ok(Ok(output)) => {
|
|
let stderr = String::from_utf8_lossy(&output.stderr);
|
|
warn!("Failed to restart container {}: {}", name, stderr.trim());
|
|
false
|
|
}
|
|
Ok(Err(e)) => {
|
|
warn!("Failed to execute podman start for {}: {}", name, e);
|
|
false
|
|
}
|
|
Err(_) => {
|
|
warn!("Timeout starting container {} (120s)", name);
|
|
false
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Spawn the health monitor background task.
|
|
pub fn spawn_health_monitor(state: Arc<StateManager>, data_dir: PathBuf) {
|
|
tokio::spawn(async move {
|
|
// Wait for boot recovery to complete before starting health checks.
|
|
// This prevents the health monitor from fighting with crash_recovery
|
|
// which is starting containers in tier order.
|
|
info!("Health monitor: waiting for boot recovery to complete...");
|
|
let wait_start = std::time::Instant::now();
|
|
loop {
|
|
if crate::crash_recovery::is_recovery_complete() {
|
|
break;
|
|
}
|
|
// Safety timeout: start anyway after 5 minutes even if recovery hangs
|
|
if wait_start.elapsed().as_secs() > 300 {
|
|
warn!("Health monitor: boot recovery did not complete within 5 minutes, starting anyway");
|
|
break;
|
|
}
|
|
tokio::time::sleep(std::time::Duration::from_secs(5)).await;
|
|
}
|
|
// Additional cooldown after recovery to let containers stabilize
|
|
info!("Health monitor: recovery done, waiting 60s for containers to stabilize...");
|
|
tokio::time::sleep(std::time::Duration::from_secs(60)).await;
|
|
info!("Health monitor: starting health checks");
|
|
|
|
let mut tracker = RestartTracker::new();
|
|
let mut mem_tracker = MemoryTracker::new();
|
|
let mut mem_check_counter: u32 = 0;
|
|
let mut interval = tokio::time::interval(std::time::Duration::from_secs(CHECK_INTERVAL_SECS));
|
|
// Skip missed ticks — prevents burst of health checks after slow podman response
|
|
interval.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip);
|
|
|
|
// Load persistent restart history and seed the in-memory tracker
|
|
let mut restart_history = RestartHistory::load(&data_dir).await;
|
|
restart_history.seed_tracker(&mut tracker);
|
|
#[allow(unused_assignments)]
|
|
let mut history_dirty = false;
|
|
|
|
loop {
|
|
interval.tick().await;
|
|
mem_check_counter += 1;
|
|
|
|
// Check container memory every 5 minutes (every 5th health check)
|
|
if mem_check_counter % 5 == 0 {
|
|
let mem_stats = check_container_memory().await;
|
|
for (name, rss) in &mem_stats {
|
|
mem_tracker.record(name, *rss);
|
|
if let Some(growth) = mem_tracker.check_leak(name) {
|
|
warn!("Potential memory leak in {}: {:.0}% growth over tracking period", name, growth);
|
|
}
|
|
}
|
|
}
|
|
|
|
let containers = check_containers().await;
|
|
if containers.is_empty() {
|
|
continue;
|
|
}
|
|
|
|
// Load user-stopped list to skip intentionally stopped containers
|
|
let user_stopped = crate::crash_recovery::load_user_stopped(&data_dir).await;
|
|
|
|
// Sort containers by startup tier so databases restart before dependent services
|
|
let mut unhealthy: Vec<&ContainerHealth> = Vec::new();
|
|
let mut state_changed = false;
|
|
let (mut data, _) = state.get_snapshot().await;
|
|
|
|
for container in &containers {
|
|
// Skip optional/marketplace containers that aren't installed
|
|
if let Some(pkg) = data.package_data.get(&container.app_id) {
|
|
if pkg.installed.is_none() {
|
|
debug!("Skipping uninstalled container: {}", container.name);
|
|
continue;
|
|
}
|
|
}
|
|
|
|
if container.healthy {
|
|
if tracker.attempt_count(&container.name) > 0 {
|
|
info!("Container {} is healthy again after restart", container.name);
|
|
// Reset attempt counters for containers that depend on this one,
|
|
// since their previous failures may have been caused by this
|
|
// dependency being down
|
|
let recovered_id = container.name.strip_prefix("archy-")
|
|
.unwrap_or(&container.name).to_string();
|
|
for other in &containers {
|
|
let deps = container_dependencies(&other.name);
|
|
if deps.iter().any(|d| *d == recovered_id || *d == container.name) {
|
|
if tracker.attempt_count(&other.name) > 0 {
|
|
info!("Resetting restart counter for {} (dependency {} recovered)",
|
|
other.name, container.name);
|
|
tracker.clear(&other.name);
|
|
restart_history.clear(&other.name);
|
|
}
|
|
}
|
|
}
|
|
tracker.clear(&container.name);
|
|
restart_history.clear(&container.name);
|
|
history_dirty = true;
|
|
}
|
|
continue;
|
|
}
|
|
// Handle exited, stopped, AND created state containers
|
|
if container.state == "exited" || container.state == "stopped"
|
|
|| container.state == "created"
|
|
{
|
|
// Skip user-stopped containers
|
|
if user_stopped.contains(&container.name) {
|
|
debug!("Skipping user-stopped container: {}", container.name);
|
|
continue;
|
|
}
|
|
unhealthy.push(container);
|
|
}
|
|
}
|
|
|
|
// Sort by startup tier: databases first, then core, then dependent, then apps, then UIs
|
|
unhealthy.sort_by_key(|c| container_tier(&c.name));
|
|
|
|
let mut prev_tier: Option<StartupTier> = None;
|
|
for container in &unhealthy {
|
|
let tier = container_tier(&container.name);
|
|
|
|
// Reset counter after 1 hour for permanently failed containers
|
|
if tracker.should_reset_failed(&container.name) {
|
|
info!("Resetting restart counter for {} after {}s stability window", container.name, STABILITY_RESET_SECS);
|
|
tracker.clear(&container.name);
|
|
restart_history.clear(&container.name);
|
|
history_dirty = true;
|
|
}
|
|
|
|
if tracker.attempt_count(&container.name) >= MAX_RESTART_ATTEMPTS {
|
|
debug!("Container {} exceeded max restart attempts ({})", container.name, MAX_RESTART_ATTEMPTS);
|
|
continue;
|
|
}
|
|
|
|
// Wait for backoff delay before retrying
|
|
if !tracker.backoff_elapsed(&container.name) {
|
|
let delay = tracker.backoff_delay_secs(&container.name);
|
|
debug!("Container {} waiting for backoff ({}s)", container.name, delay);
|
|
continue;
|
|
}
|
|
|
|
// Skip if dependencies aren't running — they need to start first
|
|
if !deps_are_running(&container.name, &containers) {
|
|
let deps = container_dependencies(&container.name);
|
|
debug!("Container {} waiting for dependencies {:?}", container.name, deps);
|
|
continue;
|
|
}
|
|
|
|
// When transitioning to a higher tier, wait briefly for previous tier to stabilize
|
|
if let Some(prev) = prev_tier {
|
|
if tier > prev {
|
|
tokio::time::sleep(std::time::Duration::from_secs(5)).await;
|
|
}
|
|
}
|
|
prev_tier = Some(tier);
|
|
|
|
if tracker.record_attempt(&container.name) {
|
|
restart_history.record_attempt(&container.name);
|
|
history_dirty = true;
|
|
let attempt = tracker.attempt_count(&container.name);
|
|
info!("Restarting {} (tier {:?}, attempt {}/{}, backoff {}s)",
|
|
container.name, tier, attempt, MAX_RESTART_ATTEMPTS,
|
|
BACKOFF_DELAYS_SECS.get(attempt.saturating_sub(1) as usize).unwrap_or(&90));
|
|
|
|
let restarted = restart_container(&container.name).await;
|
|
|
|
if !restarted || attempt >= MAX_RESTART_ATTEMPTS {
|
|
let notification = Notification {
|
|
id: format!("health-{}-{}", container.app_id, chrono::Utc::now().timestamp()),
|
|
level: NotificationLevel::Error,
|
|
title: format!("{} is unhealthy", container.app_id),
|
|
message: if restarted {
|
|
format!(
|
|
"Container restarted ({}/{} attempts). May need manual attention.",
|
|
attempt, MAX_RESTART_ATTEMPTS
|
|
)
|
|
} else {
|
|
format!(
|
|
"Auto-restart failed (attempt {}/{}). Container state: {}",
|
|
attempt, MAX_RESTART_ATTEMPTS, container.state
|
|
)
|
|
},
|
|
timestamp: chrono::Utc::now().to_rfc3339(),
|
|
app_id: Some(container.app_id.clone()),
|
|
};
|
|
|
|
data.notifications.push(notification.clone());
|
|
if data.notifications.len() > 20 {
|
|
data.notifications = data.notifications.split_off(data.notifications.len() - 20);
|
|
}
|
|
state_changed = true;
|
|
|
|
let webhook_payload = webhooks::WebhookPayload {
|
|
event: WebhookEvent::ContainerCrash,
|
|
title: notification.title,
|
|
message: notification.message,
|
|
timestamp: notification.timestamp,
|
|
node_id: String::new(),
|
|
details: Some(serde_json::json!({
|
|
"container": container.name,
|
|
"app_id": container.app_id,
|
|
"state": container.state,
|
|
"attempt": attempt,
|
|
"tier": format!("{:?}", tier),
|
|
})),
|
|
};
|
|
webhooks::send_webhook(&data_dir, webhook_payload).await;
|
|
}
|
|
}
|
|
}
|
|
|
|
if state_changed {
|
|
state.update_data(data).await;
|
|
debug!("Health monitor: state updated with notifications");
|
|
}
|
|
|
|
// Persist restart history to disk (debounced: once per check cycle)
|
|
if history_dirty {
|
|
restart_history.save(&data_dir).await;
|
|
history_dirty = false;
|
|
}
|
|
}
|
|
});
|
|
}
|
|
|
|
#[cfg(test)]
|
|
mod tests {
|
|
use super::*;
|
|
|
|
#[test]
|
|
fn test_restart_tracker_new_is_empty() {
|
|
let tracker = RestartTracker::new();
|
|
assert_eq!(tracker.attempt_count("any-container"), 0);
|
|
}
|
|
|
|
#[test]
|
|
fn test_restart_tracker_record_attempt_increments() {
|
|
let mut tracker = RestartTracker::new();
|
|
assert!(tracker.record_attempt("test-container"));
|
|
assert_eq!(tracker.attempt_count("test-container"), 1);
|
|
|
|
assert!(tracker.record_attempt("test-container"));
|
|
assert_eq!(tracker.attempt_count("test-container"), 2);
|
|
|
|
assert!(tracker.record_attempt("test-container"));
|
|
assert_eq!(tracker.attempt_count("test-container"), 3);
|
|
}
|
|
|
|
#[test]
|
|
fn test_restart_tracker_max_attempts_exceeded() {
|
|
let mut tracker = RestartTracker::new();
|
|
for i in 1..=MAX_RESTART_ATTEMPTS {
|
|
assert!(
|
|
tracker.record_attempt("container-a"),
|
|
"Attempt {} should be allowed",
|
|
i
|
|
);
|
|
}
|
|
assert!(!tracker.record_attempt("container-a"));
|
|
assert_eq!(tracker.attempt_count("container-a"), MAX_RESTART_ATTEMPTS + 1);
|
|
}
|
|
|
|
#[test]
|
|
fn test_restart_tracker_independent_containers() {
|
|
let mut tracker = RestartTracker::new();
|
|
tracker.record_attempt("container-a");
|
|
tracker.record_attempt("container-a");
|
|
tracker.record_attempt("container-b");
|
|
|
|
assert_eq!(tracker.attempt_count("container-a"), 2);
|
|
assert_eq!(tracker.attempt_count("container-b"), 1);
|
|
assert_eq!(tracker.attempt_count("container-c"), 0);
|
|
}
|
|
|
|
#[test]
|
|
fn test_restart_tracker_clear_resets_count() {
|
|
let mut tracker = RestartTracker::new();
|
|
tracker.record_attempt("container-x");
|
|
tracker.record_attempt("container-x");
|
|
assert_eq!(tracker.attempt_count("container-x"), 2);
|
|
|
|
tracker.clear("container-x");
|
|
assert_eq!(tracker.attempt_count("container-x"), 0);
|
|
}
|
|
|
|
#[test]
|
|
fn test_restart_tracker_clear_allows_new_attempts() {
|
|
let mut tracker = RestartTracker::new();
|
|
for _ in 0..=MAX_RESTART_ATTEMPTS {
|
|
tracker.record_attempt("container-y");
|
|
}
|
|
assert!(!tracker.record_attempt("container-y"));
|
|
|
|
tracker.clear("container-y");
|
|
assert!(tracker.record_attempt("container-y"));
|
|
assert_eq!(tracker.attempt_count("container-y"), 1);
|
|
}
|
|
|
|
#[test]
|
|
fn test_restart_tracker_clear_nonexistent_is_safe() {
|
|
let mut tracker = RestartTracker::new();
|
|
tracker.clear("nonexistent");
|
|
assert_eq!(tracker.attempt_count("nonexistent"), 0);
|
|
}
|
|
|
|
#[test]
|
|
fn test_container_health_struct() {
|
|
let health = ContainerHealth {
|
|
name: "archy-bitcoin-knots".to_string(),
|
|
app_id: "bitcoin-knots".to_string(),
|
|
state: "running".to_string(),
|
|
healthy: true,
|
|
};
|
|
assert!(health.healthy);
|
|
assert_eq!(health.name, "archy-bitcoin-knots");
|
|
assert_eq!(health.app_id, "bitcoin-knots");
|
|
assert_eq!(health.state, "running");
|
|
}
|
|
|
|
#[test]
|
|
fn test_container_health_unhealthy() {
|
|
let health = ContainerHealth {
|
|
name: "archy-mempool-web".to_string(),
|
|
app_id: "mempool-web".to_string(),
|
|
state: "exited".to_string(),
|
|
healthy: false,
|
|
};
|
|
assert!(!health.healthy);
|
|
assert_eq!(health.state, "exited");
|
|
}
|
|
|
|
#[test]
|
|
fn test_max_restart_attempts_constant() {
|
|
assert!(MAX_RESTART_ATTEMPTS >= 1);
|
|
assert!(MAX_RESTART_ATTEMPTS <= 20);
|
|
assert_eq!(MAX_RESTART_ATTEMPTS, 10);
|
|
}
|
|
|
|
#[test]
|
|
fn test_check_interval_constant() {
|
|
assert_eq!(CHECK_INTERVAL_SECS, 120);
|
|
}
|
|
|
|
#[test]
|
|
fn test_backoff_delays() {
|
|
let tracker = RestartTracker::new();
|
|
// Before any attempts, delay is first backoff
|
|
assert_eq!(tracker.backoff_delay_secs("test"), BACKOFF_DELAYS_SECS[0]);
|
|
}
|
|
|
|
#[test]
|
|
fn test_backoff_delays_escalate() {
|
|
let mut tracker = RestartTracker::new();
|
|
tracker.record_attempt("test");
|
|
assert_eq!(tracker.backoff_delay_secs("test"), BACKOFF_DELAYS_SECS[0]); // 10s
|
|
tracker.record_attempt("test");
|
|
assert_eq!(tracker.backoff_delay_secs("test"), BACKOFF_DELAYS_SECS[1]); // 30s
|
|
tracker.record_attempt("test");
|
|
assert_eq!(tracker.backoff_delay_secs("test"), BACKOFF_DELAYS_SECS[2]); // 90s
|
|
}
|
|
|
|
#[test]
|
|
fn test_backoff_elapsed_true_for_new() {
|
|
let tracker = RestartTracker::new();
|
|
assert!(tracker.backoff_elapsed("test"));
|
|
}
|
|
|
|
#[test]
|
|
fn test_stability_reset_not_triggered_early() {
|
|
let mut tracker = RestartTracker::new();
|
|
tracker.record_attempt("test");
|
|
assert!(!tracker.should_reset_failed("test"));
|
|
}
|
|
|
|
#[test]
|
|
fn test_container_tier_database() {
|
|
assert_eq!(container_tier("archy-btcpay-db"), StartupTier::Database);
|
|
assert_eq!(container_tier("immich_postgres"), StartupTier::Database);
|
|
assert_eq!(container_tier("penpot-valkey"), StartupTier::Database);
|
|
assert_eq!(container_tier("indeedhub-postgres"), StartupTier::Database);
|
|
assert_eq!(container_tier("indeedhub-redis"), StartupTier::Database);
|
|
assert_eq!(container_tier("indeedhub-minio"), StartupTier::Database);
|
|
}
|
|
|
|
#[test]
|
|
fn test_container_tier_indeedhub_api() {
|
|
assert_eq!(container_tier("indeedhub-api"), StartupTier::DependentService);
|
|
}
|
|
|
|
#[test]
|
|
fn test_container_tier_mempool_api() {
|
|
assert_eq!(container_tier("mempool-api"), StartupTier::DependentService);
|
|
}
|
|
|
|
#[test]
|
|
fn test_container_dependencies() {
|
|
assert!(container_dependencies("lnd").contains(&"bitcoin-knots"));
|
|
assert!(container_dependencies("indeedhub-api").contains(&"indeedhub-postgres"));
|
|
assert!(container_dependencies("indeedhub-api").contains(&"indeedhub-redis"));
|
|
assert!(container_dependencies("mempool-api").contains(&"mempool-db"));
|
|
assert!(container_dependencies("mempool-api").contains(&"electrumx"));
|
|
assert!(container_dependencies("nextcloud").is_empty());
|
|
}
|
|
|
|
#[test]
|
|
fn test_deps_are_running() {
|
|
let containers = vec![
|
|
ContainerHealth { name: "indeedhub-postgres".into(), app_id: "indeedhub-postgres".into(), state: "running".into(), healthy: true },
|
|
ContainerHealth { name: "indeedhub-redis".into(), app_id: "indeedhub-redis".into(), state: "running".into(), healthy: true },
|
|
ContainerHealth { name: "indeedhub-api".into(), app_id: "indeedhub-api".into(), state: "exited".into(), healthy: false },
|
|
];
|
|
assert!(deps_are_running("indeedhub-api", &containers));
|
|
// Missing postgres
|
|
let partial = vec![
|
|
ContainerHealth { name: "indeedhub-redis".into(), app_id: "indeedhub-redis".into(), state: "running".into(), healthy: true },
|
|
];
|
|
assert!(!deps_are_running("indeedhub-api", &partial));
|
|
}
|
|
|
|
#[test]
|
|
fn test_container_tier_core() {
|
|
assert_eq!(container_tier("bitcoin-knots"), StartupTier::CoreInfra);
|
|
}
|
|
|
|
#[test]
|
|
fn test_container_tier_dependent() {
|
|
assert_eq!(container_tier("lnd"), StartupTier::DependentService);
|
|
assert_eq!(container_tier("mempool-electrs"), StartupTier::DependentService);
|
|
assert_eq!(container_tier("archy-nbxplorer"), StartupTier::DependentService);
|
|
}
|
|
|
|
#[test]
|
|
fn test_container_tier_frontend() {
|
|
assert_eq!(container_tier("archy-mempool-web"), StartupTier::Frontend);
|
|
assert_eq!(container_tier("archy-bitcoin-ui"), StartupTier::Frontend);
|
|
}
|
|
|
|
#[test]
|
|
fn test_container_tier_application_default() {
|
|
assert_eq!(container_tier("nextcloud"), StartupTier::Application);
|
|
assert_eq!(container_tier("grafana"), StartupTier::Application);
|
|
assert_eq!(container_tier("btcpay-server"), StartupTier::Application);
|
|
}
|
|
|
|
#[test]
|
|
fn test_tier_ordering() {
|
|
assert!(StartupTier::Database < StartupTier::CoreInfra);
|
|
assert!(StartupTier::CoreInfra < StartupTier::DependentService);
|
|
assert!(StartupTier::DependentService < StartupTier::Application);
|
|
assert!(StartupTier::Application < StartupTier::Frontend);
|
|
}
|
|
|
|
#[test]
|
|
fn test_parse_memory_gib() {
|
|
assert_eq!(parse_memory_string("1.5GiB"), Some(1_610_612_736));
|
|
}
|
|
|
|
#[test]
|
|
fn test_parse_memory_mib() {
|
|
assert_eq!(parse_memory_string("256MiB"), Some(268_435_456));
|
|
}
|
|
|
|
#[test]
|
|
fn test_parse_memory_kib() {
|
|
assert_eq!(parse_memory_string("512KiB"), Some(524_288));
|
|
}
|
|
|
|
#[test]
|
|
fn test_parse_memory_invalid() {
|
|
assert_eq!(parse_memory_string("abc"), None);
|
|
}
|
|
|
|
#[test]
|
|
fn test_memory_tracker_no_leak_few_samples() {
|
|
let mut tracker = MemoryTracker::new();
|
|
tracker.record("test", 100_000_000);
|
|
assert!(tracker.check_leak("test").is_none());
|
|
}
|
|
}
|