From 04f1f4a20fdd0d38ec52a16b9748977d8db83a1c Mon Sep 17 00:00:00 2001 From: Dorian Date: Thu, 12 Mar 2026 22:26:58 +0000 Subject: [PATCH] fix: decouple health monitor from webhook config Health checks, auto-restarts, and WebSocket notifications now run unconditionally. Previously the entire health loop was gated on webhook config, so fresh installs (webhooks disabled) got zero container monitoring. Webhook HTTP delivery is now fire-and-forget after the notification is pushed to the UI. Co-Authored-By: Claude Opus 4.6 --- core/archipelago/src/health_monitor.rs | 25 ++++++++++++++++--------- loop/plan.md | 2 +- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/core/archipelago/src/health_monitor.rs b/core/archipelago/src/health_monitor.rs index 6c316e1e..7fe4c62a 100644 --- a/core/archipelago/src/health_monitor.rs +++ b/core/archipelago/src/health_monitor.rs @@ -147,14 +147,6 @@ pub fn spawn_health_monitor(state: Arc, data_dir: PathBuf) { loop { interval.tick().await; - // Check webhook config — if webhooks are disabled or ContainerCrash - // isn't subscribed, skip all health monitoring (no restarts, no notifications) - let webhook_config = webhooks::load_config(&data_dir).await.unwrap_or_default(); - if !webhook_config.enabled || !webhook_config.events.contains(&WebhookEvent::ContainerCrash) { - debug!("Health monitor: skipping — webhooks disabled or ContainerCrash not subscribed"); - continue; - } - let containers = check_containers().await; if containers.is_empty() { continue; @@ -210,11 +202,26 @@ pub fn spawn_health_monitor(state: Arc, data_dir: PathBuf) { }; // Keep only the latest 20 notifications - data.notifications.push(notification); + data.notifications.push(notification.clone()); if data.notifications.len() > 20 { data.notifications = data.notifications.split_off(data.notifications.len() - 20); } state_changed = true; + + // Fire-and-forget webhook delivery (checks config internally) + let webhook_payload = webhooks::WebhookPayload { + event: WebhookEvent::ContainerCrash, + title: notification.title, + message: notification.message, + timestamp: notification.timestamp, + node_id: String::new(), + details: Some(serde_json::json!({ + "container": container.name, + "app_id": container.app_id, + "state": container.state, + })), + }; + webhooks::send_webhook(&data_dir, webhook_payload).await; } } } diff --git a/loop/plan.md b/loop/plan.md index 815c1a87..fd8816b9 100644 --- a/loop/plan.md +++ b/loop/plan.md @@ -468,7 +468,7 @@ ### Sprint 40: Critical Fix & Identity Completion (April 2026 Week 1-2) -- [ ] **WHFIX-01** — Decouple health monitor from webhook config. In `core/archipelago/src/health_monitor.rs` lines 150-156, the health check loop skips ALL monitoring (restarts + WebSocket notifications) when webhooks are disabled or ContainerCrash isn't subscribed. This means fresh installs (webhooks disabled by default) get NO auto-restart and NO UI notifications. Fix: remove the webhook config gate from the main loop. Health checks, auto-restarts, and WebSocket `Notification` pushes must run unconditionally. Move the webhook gate into a separate block that only controls external HTTP webhook delivery — call `webhooks::send_webhook()` only when enabled AND the event is subscribed. Keep the existing `send_webhook()` function which already checks `config.enabled` and `config.events.contains()` internally. **Acceptance**: With webhooks disabled (default), crash a container (`sudo podman stop archy-filebrowser`), confirm health monitor detects it within 60s, auto-restarts it, and pushes a Notification visible in the Dashboard toast. With webhooks enabled + URL configured, confirm HTTP POST is also sent. Deploy and verify on 192.168.1.228. +- [x] **WHFIX-01** — Decouple health monitor from webhook config. In `core/archipelago/src/health_monitor.rs` lines 150-156, the health check loop skips ALL monitoring (restarts + WebSocket notifications) when webhooks are disabled or ContainerCrash isn't subscribed. This means fresh installs (webhooks disabled by default) get NO auto-restart and NO UI notifications. Fix: remove the webhook config gate from the main loop. Health checks, auto-restarts, and WebSocket `Notification` pushes must run unconditionally. Move the webhook gate into a separate block that only controls external HTTP webhook delivery — call `webhooks::send_webhook()` only when enabled AND the event is subscribed. Keep the existing `send_webhook()` function which already checks `config.enabled` and `config.events.contains()` internally. **Acceptance**: With webhooks disabled (default), crash a container (`sudo podman stop archy-filebrowser`), confirm health monitor detects it within 60s, auto-restarts it, and pushes a Notification visible in the Dashboard toast. With webhooks enabled + URL configured, confirm HTTP POST is also sent. Deploy and verify on 192.168.1.228. - [ ] **WHFIX-02** — Add monitoring.rs webhook integration. In `core/archipelago/src/monitoring/mod.rs`, the alert system pushes `Notification` to DataModel but never calls `webhooks::send_webhook()`. Add webhook delivery for fired alerts: when a `DiskWarning` alert fires, send `WebhookEvent::DiskWarning`; when `ContainerCrash` fires, send `WebhookEvent::ContainerCrash`. Map alert types to webhook events. The webhook call should be fire-and-forget (already is in `send_webhook`). **Acceptance**: Configure a webhook URL, trigger a disk warning (lower threshold temporarily to 1%), confirm HTTP POST received. Deploy and verify.