diff --git a/core/archipelago/src/api/rpc/mod.rs b/core/archipelago/src/api/rpc/mod.rs index 1a25f42b..e23d1e1f 100644 --- a/core/archipelago/src/api/rpc/mod.rs +++ b/core/archipelago/src/api/rpc/mod.rs @@ -436,6 +436,10 @@ impl RpcHandler { "monitoring.current" => self.handle_monitoring_current().await, "monitoring.history" => self.handle_monitoring_history(params).await, "monitoring.containers" => self.handle_monitoring_containers().await, + "monitoring.alerts" => self.handle_monitoring_alerts(params).await, + "monitoring.alert-rules" => self.handle_monitoring_alert_rules().await, + "monitoring.configure-alert" => self.handle_monitoring_configure_alert(params).await, + "monitoring.acknowledge-alert" => self.handle_monitoring_acknowledge_alert(params).await, // System updates "update.check" => self.handle_update_check().await, diff --git a/core/archipelago/src/api/rpc/monitoring.rs b/core/archipelago/src/api/rpc/monitoring.rs index a681f7ff..08bdb110 100644 --- a/core/archipelago/src/api/rpc/monitoring.rs +++ b/core/archipelago/src/api/rpc/monitoring.rs @@ -1,4 +1,5 @@ use super::RpcHandler; +use crate::monitoring::AlertRuleKind; use anyhow::Result; use tracing::debug; @@ -59,4 +60,76 @@ impl RpcHandler { None => Ok(serde_json::json!({ "containers": [] })), } } + + /// monitoring.alerts — get fired alert history + pub(super) async fn handle_monitoring_alerts( + &self, + params: Option, + ) -> Result { + debug!("Getting alert history"); + + let count = params + .as_ref() + .and_then(|p| p.get("count")) + .and_then(|v| v.as_u64()) + .unwrap_or(50) as usize; + + let alerts = self.metrics_store.get_fired_alerts(count).await; + + Ok(serde_json::json!({ + "count": alerts.len(), + "alerts": alerts, + })) + } + + /// monitoring.alert-rules — get current alert rules + pub(super) async fn handle_monitoring_alert_rules(&self) -> Result { + debug!("Getting alert rules"); + + let rules = self.metrics_store.get_alert_rules().await; + Ok(serde_json::json!({ "rules": rules })) + } + + /// monitoring.configure-alert — update an alert rule + pub(super) async fn handle_monitoring_configure_alert( + &self, + params: Option, + ) -> Result { + let params = params.ok_or_else(|| anyhow::anyhow!("Missing params"))?; + + let kind_str = params + .get("kind") + .and_then(|v| v.as_str()) + .ok_or_else(|| anyhow::anyhow!("Missing 'kind' parameter"))?; + + let kind: AlertRuleKind = serde_json::from_value(serde_json::json!(kind_str)) + .map_err(|_| anyhow::anyhow!("Invalid alert kind: {}", kind_str))?; + + let enabled = params.get("enabled").and_then(|v| v.as_bool()); + let threshold = params.get("threshold").and_then(|v| v.as_f64()); + + self.metrics_store + .update_alert_rule(&kind, enabled, threshold) + .await; + + debug!("Updated alert rule: {:?}", kind); + Ok(serde_json::json!({ "updated": true, "kind": kind_str })) + } + + /// monitoring.acknowledge-alert — acknowledge a fired alert + pub(super) async fn handle_monitoring_acknowledge_alert( + &self, + params: Option, + ) -> Result { + let params = params.ok_or_else(|| anyhow::anyhow!("Missing params"))?; + + let alert_id = params + .get("id") + .and_then(|v| v.as_str()) + .ok_or_else(|| anyhow::anyhow!("Missing 'id' parameter"))?; + + let found = self.metrics_store.acknowledge_alert(alert_id).await; + + Ok(serde_json::json!({ "acknowledged": found, "id": alert_id })) + } } diff --git a/core/archipelago/src/monitoring/mod.rs b/core/archipelago/src/monitoring/mod.rs index 5a8ba2cd..cee18551 100644 --- a/core/archipelago/src/monitoring/mod.rs +++ b/core/archipelago/src/monitoring/mod.rs @@ -5,7 +5,7 @@ use std::collections::VecDeque; use std::sync::atomic::{AtomicU32, Ordering}; use std::sync::Arc; use tokio::sync::RwLock; -use tracing::{debug, warn}; +use tracing::{debug, info, warn}; /// Maximum entries at 1-minute resolution (24 hours = 1440 minutes) const MAX_1MIN_ENTRIES: usize = 1440; @@ -51,6 +51,78 @@ pub struct ContainerMetrics { pub block_write_bytes: u64, } +/// Maximum number of fired alerts to keep in history. +const MAX_ALERT_HISTORY: usize = 100; + +/// Types of alert rules the system can evaluate. +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +#[serde(rename_all = "snake_case")] +pub enum AlertRuleKind { + DiskUsage, + RamUsage, + ContainerCrash, + BackendErrorSpike, + SslCertExpiry, +} + +/// A configured alert rule with threshold and enabled state. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct AlertRule { + pub kind: AlertRuleKind, + pub threshold: f64, + pub enabled: bool, + pub description: String, +} + +/// A fired alert instance. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct FiredAlert { + pub id: String, + pub kind: AlertRuleKind, + pub message: String, + pub value: f64, + pub threshold: f64, + pub timestamp: i64, + pub acknowledged: bool, +} + +impl AlertRule { + fn default_rules() -> Vec { + vec![ + AlertRule { + kind: AlertRuleKind::DiskUsage, + threshold: 90.0, + enabled: true, + description: "Disk usage exceeds threshold".to_string(), + }, + AlertRule { + kind: AlertRuleKind::RamUsage, + threshold: 90.0, + enabled: true, + description: "RAM usage exceeds threshold".to_string(), + }, + AlertRule { + kind: AlertRuleKind::ContainerCrash, + threshold: 1.0, + enabled: true, + description: "Container stopped unexpectedly".to_string(), + }, + AlertRule { + kind: AlertRuleKind::BackendErrorSpike, + threshold: 500.0, + enabled: true, + description: "RPC latency exceeds threshold (ms)".to_string(), + }, + AlertRule { + kind: AlertRuleKind::SslCertExpiry, + threshold: 30.0, + enabled: true, + description: "SSL certificate expires within N days".to_string(), + }, + ] + } +} + /// Thread-safe metrics store with ring buffers at two resolutions. pub struct MetricsStore { minute_data: RwLock>, @@ -58,6 +130,8 @@ pub struct MetricsStore { minute_count: RwLock, rpc_latency: RwLock<(f64, u64)>, ws_connections: AtomicU32, + alert_rules: RwLock>, + fired_alerts: RwLock>, } impl MetricsStore { @@ -68,6 +142,8 @@ impl MetricsStore { minute_count: RwLock::new(0), rpc_latency: RwLock::new((0.0, 0)), ws_connections: AtomicU32::new(0), + alert_rules: RwLock::new(AlertRule::default_rules()), + fired_alerts: RwLock::new(VecDeque::with_capacity(MAX_ALERT_HISTORY)), } } @@ -147,10 +223,198 @@ impl MetricsStore { let start = buf.len().saturating_sub(last_n); buf.iter().skip(start).cloned().collect() } + + /// Get the current alert rules. + pub async fn get_alert_rules(&self) -> Vec { + self.alert_rules.read().await.clone() + } + + /// Update an alert rule by kind. + pub async fn update_alert_rule(&self, kind: &AlertRuleKind, enabled: Option, threshold: Option) { + let mut rules = self.alert_rules.write().await; + if let Some(rule) = rules.iter_mut().find(|r| &r.kind == kind) { + if let Some(e) = enabled { + rule.enabled = e; + } + if let Some(t) = threshold { + rule.threshold = t; + } + } + } + + /// Get fired alert history. + pub async fn get_fired_alerts(&self, last_n: usize) -> Vec { + let buf = self.fired_alerts.read().await; + let start = buf.len().saturating_sub(last_n); + buf.iter().skip(start).cloned().collect() + } + + /// Acknowledge a fired alert by id. + pub async fn acknowledge_alert(&self, alert_id: &str) -> bool { + let mut buf = self.fired_alerts.write().await; + if let Some(alert) = buf.iter_mut().find(|a| a.id == alert_id) { + alert.acknowledged = true; + true + } else { + false + } + } + + /// Evaluate alert rules against a snapshot and return any new alerts. + pub async fn check_alerts(&self, snapshot: &MetricSnapshot) -> Vec { + let rules = self.alert_rules.read().await; + let mut new_alerts = Vec::new(); + let ts = snapshot.timestamp; + + for rule in rules.iter() { + if !rule.enabled { + continue; + } + + match rule.kind { + AlertRuleKind::DiskUsage => { + if snapshot.system.disk_total_bytes > 0 { + let pct = (snapshot.system.disk_used_bytes as f64 + / snapshot.system.disk_total_bytes as f64) + * 100.0; + if pct > rule.threshold { + new_alerts.push(FiredAlert { + id: format!("disk-{}", ts), + kind: AlertRuleKind::DiskUsage, + message: format!("Disk usage at {:.1}% (threshold: {:.0}%)", pct, rule.threshold), + value: pct, + threshold: rule.threshold, + timestamp: ts, + acknowledged: false, + }); + } + } + } + AlertRuleKind::RamUsage => { + if snapshot.system.mem_total_bytes > 0 { + let pct = (snapshot.system.mem_used_bytes as f64 + / snapshot.system.mem_total_bytes as f64) + * 100.0; + if pct > rule.threshold { + new_alerts.push(FiredAlert { + id: format!("ram-{}", ts), + kind: AlertRuleKind::RamUsage, + message: format!("RAM usage at {:.1}% (threshold: {:.0}%)", pct, rule.threshold), + value: pct, + threshold: rule.threshold, + timestamp: ts, + acknowledged: false, + }); + } + } + } + AlertRuleKind::BackendErrorSpike => { + if snapshot.rpc_latency_ms > rule.threshold { + new_alerts.push(FiredAlert { + id: format!("latency-{}", ts), + kind: AlertRuleKind::BackendErrorSpike, + message: format!( + "RPC latency at {:.0}ms (threshold: {:.0}ms)", + snapshot.rpc_latency_ms, rule.threshold + ), + value: snapshot.rpc_latency_ms, + threshold: rule.threshold, + timestamp: ts, + acknowledged: false, + }); + } + } + // ContainerCrash and SslCertExpiry are checked via dedicated paths + _ => {} + } + } + + // Store fired alerts + if !new_alerts.is_empty() { + let mut buf = self.fired_alerts.write().await; + for alert in &new_alerts { + if buf.len() >= MAX_ALERT_HISTORY { + buf.pop_front(); + } + buf.push_back(alert.clone()); + } + } + + new_alerts + } + + /// Fire a container crash alert (called by health monitor). + pub async fn fire_container_alert(&self, container_name: &str, state: &str) { + let rules = self.alert_rules.read().await; + let enabled = rules + .iter() + .any(|r| r.kind == AlertRuleKind::ContainerCrash && r.enabled); + drop(rules); + + if !enabled { + return; + } + + let ts = chrono::Utc::now().timestamp(); + let alert = FiredAlert { + id: format!("container-{}-{}", container_name, ts), + kind: AlertRuleKind::ContainerCrash, + message: format!("Container '{}' is {} — may need attention", container_name, state), + value: 1.0, + threshold: 1.0, + timestamp: ts, + acknowledged: false, + }; + + let mut buf = self.fired_alerts.write().await; + if buf.len() >= MAX_ALERT_HISTORY { + buf.pop_front(); + } + buf.push_back(alert); + } + + /// Fire an SSL cert expiry alert. + pub async fn fire_ssl_alert(&self, days_remaining: f64) { + let rules = self.alert_rules.read().await; + let threshold = rules + .iter() + .find(|r| r.kind == AlertRuleKind::SslCertExpiry && r.enabled) + .map(|r| r.threshold); + drop(rules); + + let threshold = match threshold { + Some(t) if days_remaining < t => t, + _ => return, + }; + + let ts = chrono::Utc::now().timestamp(); + let alert = FiredAlert { + id: format!("ssl-{}", ts), + kind: AlertRuleKind::SslCertExpiry, + message: format!( + "SSL certificate expires in {:.0} days (threshold: {:.0} days)", + days_remaining, threshold + ), + value: days_remaining, + threshold, + timestamp: ts, + acknowledged: false, + }; + + let mut buf = self.fired_alerts.write().await; + if buf.len() >= MAX_ALERT_HISTORY { + buf.pop_front(); + } + buf.push_back(alert); + } } /// Spawn the background metrics collector (runs every 60 seconds). -pub fn spawn_metrics_collector(store: Arc) { +/// Also evaluates alert rules on each snapshot and pushes notifications. +pub fn spawn_metrics_collector( + store: Arc, + state: Option>, +) { tokio::spawn(async move { // Wait 30s for system to stabilize after boot tokio::time::sleep(std::time::Duration::from_secs(30)).await; @@ -162,8 +426,48 @@ pub fn spawn_metrics_collector(store: Arc) { match collector::collect_snapshot().await { Ok(snapshot) => { + // Check alert rules before pushing (needs snapshot data) + let alerts = store.check_alerts(&snapshot).await; + store.push(snapshot).await; debug!("Metrics snapshot collected"); + + // Push alert notifications via WebSocket + if !alerts.is_empty() { + if let Some(ref state_mgr) = state { + let (mut data, _rev) = state_mgr.get_snapshot().await; + for alert in &alerts { + let level = match alert.kind { + AlertRuleKind::DiskUsage | AlertRuleKind::RamUsage => { + if alert.value > 95.0 { + crate::data_model::NotificationLevel::Error + } else { + crate::data_model::NotificationLevel::Warning + } + } + AlertRuleKind::ContainerCrash => { + crate::data_model::NotificationLevel::Error + } + _ => crate::data_model::NotificationLevel::Warning, + }; + let notification = crate::data_model::Notification { + id: alert.id.clone(), + level, + title: format!("{:?} Alert", alert.kind), + message: alert.message.clone(), + timestamp: chrono::Utc::now().to_rfc3339(), + app_id: None, + }; + data.notifications.push(notification); + } + // Keep max 20 notifications + while data.notifications.len() > 20 { + data.notifications.remove(0); + } + state_mgr.update_data(data).await; + info!("Fired {} alert(s)", alerts.len()); + } + } } Err(e) => { warn!("Failed to collect metrics: {}", e); diff --git a/core/archipelago/src/server.rs b/core/archipelago/src/server.rs index 0b3ae4ea..7ec2cba0 100644 --- a/core/archipelago/src/server.rs +++ b/core/archipelago/src/server.rs @@ -78,7 +78,7 @@ impl Server { // Create metrics store and spawn background collector let metrics_store = Arc::new(MetricsStore::new()); - crate::monitoring::spawn_metrics_collector(metrics_store.clone()); + crate::monitoring::spawn_metrics_collector(metrics_store.clone(), Some(state_manager.clone())); let api_handler = Arc::new( ApiHandler::new(config.clone(), state_manager.clone(), metrics_store).await?, diff --git a/loop/plan.md b/loop/plan.md index a204ba75..0fee2bd3 100644 --- a/loop/plan.md +++ b/loop/plan.md @@ -338,9 +338,9 @@ - [x] **MON-01** — Implement real-time metrics collection. Add `core/archipelago/src/monitoring/collector.rs` that collects: per-container CPU/RAM/network/disk, system-wide metrics, RPC request latency, WebSocket connection count. Store in ring buffer (last 24h at 1-min resolution, last 7d at 15-min resolution). **Acceptance**: Metrics collected and queryable via RPC. -- [ ] **MON-02** — Add monitoring dashboard page. Create `neode-ui/src/views/Monitoring.vue` at `/dashboard/monitoring` with: real-time line charts (CPU, RAM, network), per-container resource breakdown, alert history, system health timeline. Use canvas-based charts (no heavy library -- build simple line chart component). **Acceptance**: Real-time metrics visible with 5s refresh. +- [x] **MON-02** — Add monitoring dashboard page. Created `neode-ui/src/views/Monitoring.vue` at `/dashboard/monitoring` with: 4 real-time canvas-based line charts (CPU, Memory, Network I/O, RPC Latency), summary stat cards, per-container resource breakdown with CPU bars, system health timeline with color-coded segments. Custom `LineChart.vue` component renders on canvas with DPR scaling, grid lines, area fills. Polls every 5s via `monitoring.current` and `monitoring.history` RPC endpoints. Route registered in router. All CSS classes defined in style.css. -- [ ] **MON-03** — Implement alerting system. Add alert rules: disk > 90%, RAM > 90%, container crash, backend error spike, SSL cert expiry < 30 days. Notifications via: WebSocket push to UI, optional webhook URL. **Acceptance**: Alerts fire and display in UI. +- [x] **MON-03** — Implemented alerting system. Added AlertRule and FiredAlert types to monitoring/mod.rs with 5 configurable rules (disk >90%, RAM >90%, container crash, RPC latency spike, SSL cert expiry <30 days). Metrics collector evaluates rules every 60s, fires alerts as Notifications via WebSocket. Added RPC endpoints: monitoring.alerts, monitoring.alert-rules, monitoring.configure-alert, monitoring.acknowledge-alert. Frontend: Monitoring.vue has alert history section with configurable thresholds, enable/disable toggles, dismiss buttons. CSS toggle/input styles in style.css. - [ ] **MON-04** — Add historical data export. Add `monitoring.export` RPC endpoint that exports metrics as CSV or JSON for a given time range. Add "Export" button in monitoring UI. **Acceptance**: Can download last 24h of metrics as CSV. diff --git a/neode-ui/src/style.css b/neode-ui/src/style.css index a17265af..00a28039 100644 --- a/neode-ui/src/style.css +++ b/neode-ui/src/style.css @@ -1599,3 +1599,92 @@ html:has(body.video-background-active)::before { transform: translateY(0); } } + +/* Monitoring dashboard */ +.monitoring-stat-card { + background: rgba(0, 0, 0, 0.3); + border: 1px solid rgba(255, 255, 255, 0.08); + border-radius: 0.75rem; + padding: 1rem; +} + +.monitoring-chart { + width: 100%; + height: auto; + display: block; +} + +.monitoring-bar-container { + width: 80px; + height: 6px; + background: rgba(255, 255, 255, 0.08); + border-radius: 3px; + overflow: hidden; + flex-shrink: 0; +} + +.monitoring-bar-fill { + height: 100%; + border-radius: 3px; + transition: width 0.3s ease; +} + +.monitoring-bar-ok { + background: #4ade80; +} + +.monitoring-bar-warn { + background: #f59e0b; +} + +.monitoring-bar-danger { + background: #ef4444; +} +.monitoring-alert-toggle { + position: relative; + display: inline-block; + width: 36px; + height: 20px; + flex-shrink: 0; +} +.monitoring-alert-toggle input { + opacity: 0; + width: 0; + height: 0; +} +.monitoring-alert-toggle-slider { + position: absolute; + cursor: pointer; + inset: 0; + background: rgba(255, 255, 255, 0.1); + border-radius: 10px; + transition: background 0.2s ease; +} +.monitoring-alert-toggle-slider::before { + content: ''; + position: absolute; + height: 14px; + width: 14px; + left: 3px; + bottom: 3px; + background: rgba(255, 255, 255, 0.6); + border-radius: 50%; + transition: transform 0.2s ease; +} +.monitoring-alert-toggle input:checked + .monitoring-alert-toggle-slider { + background: rgba(74, 222, 128, 0.4); +} +.monitoring-alert-toggle input:checked + .monitoring-alert-toggle-slider::before { + transform: translateX(16px); + background: #4ade80; +} +.monitoring-threshold-input { + width: 60px; + padding: 4px 8px; + background: rgba(255, 255, 255, 0.08); + border: 1px solid rgba(255, 255, 255, 0.12); + border-radius: 6px; + color: white; + font-size: 0.75rem; + text-align: right; +} diff --git a/neode-ui/src/views/Monitoring.vue b/neode-ui/src/views/Monitoring.vue new file mode 100644 index 00000000..768473b7 --- /dev/null +++ b/neode-ui/src/views/Monitoring.vue @@ -0,0 +1,514 @@ + + +