feat: add alerting system with configurable rules and UI (MON-02, MON-03)
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
baeeb72f27
commit
6945bc4daf
@ -436,6 +436,10 @@ impl RpcHandler {
|
||||
"monitoring.current" => self.handle_monitoring_current().await,
|
||||
"monitoring.history" => self.handle_monitoring_history(params).await,
|
||||
"monitoring.containers" => self.handle_monitoring_containers().await,
|
||||
"monitoring.alerts" => self.handle_monitoring_alerts(params).await,
|
||||
"monitoring.alert-rules" => self.handle_monitoring_alert_rules().await,
|
||||
"monitoring.configure-alert" => self.handle_monitoring_configure_alert(params).await,
|
||||
"monitoring.acknowledge-alert" => self.handle_monitoring_acknowledge_alert(params).await,
|
||||
|
||||
// System updates
|
||||
"update.check" => self.handle_update_check().await,
|
||||
|
||||
@ -1,4 +1,5 @@
|
||||
use super::RpcHandler;
|
||||
use crate::monitoring::AlertRuleKind;
|
||||
use anyhow::Result;
|
||||
use tracing::debug;
|
||||
|
||||
@ -59,4 +60,76 @@ impl RpcHandler {
|
||||
None => Ok(serde_json::json!({ "containers": [] })),
|
||||
}
|
||||
}
|
||||
|
||||
/// monitoring.alerts — get fired alert history
|
||||
pub(super) async fn handle_monitoring_alerts(
|
||||
&self,
|
||||
params: Option<serde_json::Value>,
|
||||
) -> Result<serde_json::Value> {
|
||||
debug!("Getting alert history");
|
||||
|
||||
let count = params
|
||||
.as_ref()
|
||||
.and_then(|p| p.get("count"))
|
||||
.and_then(|v| v.as_u64())
|
||||
.unwrap_or(50) as usize;
|
||||
|
||||
let alerts = self.metrics_store.get_fired_alerts(count).await;
|
||||
|
||||
Ok(serde_json::json!({
|
||||
"count": alerts.len(),
|
||||
"alerts": alerts,
|
||||
}))
|
||||
}
|
||||
|
||||
/// monitoring.alert-rules — get current alert rules
|
||||
pub(super) async fn handle_monitoring_alert_rules(&self) -> Result<serde_json::Value> {
|
||||
debug!("Getting alert rules");
|
||||
|
||||
let rules = self.metrics_store.get_alert_rules().await;
|
||||
Ok(serde_json::json!({ "rules": rules }))
|
||||
}
|
||||
|
||||
/// monitoring.configure-alert — update an alert rule
|
||||
pub(super) async fn handle_monitoring_configure_alert(
|
||||
&self,
|
||||
params: Option<serde_json::Value>,
|
||||
) -> Result<serde_json::Value> {
|
||||
let params = params.ok_or_else(|| anyhow::anyhow!("Missing params"))?;
|
||||
|
||||
let kind_str = params
|
||||
.get("kind")
|
||||
.and_then(|v| v.as_str())
|
||||
.ok_or_else(|| anyhow::anyhow!("Missing 'kind' parameter"))?;
|
||||
|
||||
let kind: AlertRuleKind = serde_json::from_value(serde_json::json!(kind_str))
|
||||
.map_err(|_| anyhow::anyhow!("Invalid alert kind: {}", kind_str))?;
|
||||
|
||||
let enabled = params.get("enabled").and_then(|v| v.as_bool());
|
||||
let threshold = params.get("threshold").and_then(|v| v.as_f64());
|
||||
|
||||
self.metrics_store
|
||||
.update_alert_rule(&kind, enabled, threshold)
|
||||
.await;
|
||||
|
||||
debug!("Updated alert rule: {:?}", kind);
|
||||
Ok(serde_json::json!({ "updated": true, "kind": kind_str }))
|
||||
}
|
||||
|
||||
/// monitoring.acknowledge-alert — acknowledge a fired alert
|
||||
pub(super) async fn handle_monitoring_acknowledge_alert(
|
||||
&self,
|
||||
params: Option<serde_json::Value>,
|
||||
) -> Result<serde_json::Value> {
|
||||
let params = params.ok_or_else(|| anyhow::anyhow!("Missing params"))?;
|
||||
|
||||
let alert_id = params
|
||||
.get("id")
|
||||
.and_then(|v| v.as_str())
|
||||
.ok_or_else(|| anyhow::anyhow!("Missing 'id' parameter"))?;
|
||||
|
||||
let found = self.metrics_store.acknowledge_alert(alert_id).await;
|
||||
|
||||
Ok(serde_json::json!({ "acknowledged": found, "id": alert_id }))
|
||||
}
|
||||
}
|
||||
|
||||
@ -5,7 +5,7 @@ use std::collections::VecDeque;
|
||||
use std::sync::atomic::{AtomicU32, Ordering};
|
||||
use std::sync::Arc;
|
||||
use tokio::sync::RwLock;
|
||||
use tracing::{debug, warn};
|
||||
use tracing::{debug, info, warn};
|
||||
|
||||
/// Maximum entries at 1-minute resolution (24 hours = 1440 minutes)
|
||||
const MAX_1MIN_ENTRIES: usize = 1440;
|
||||
@ -51,6 +51,78 @@ pub struct ContainerMetrics {
|
||||
pub block_write_bytes: u64,
|
||||
}
|
||||
|
||||
/// Maximum number of fired alerts to keep in history.
|
||||
const MAX_ALERT_HISTORY: usize = 100;
|
||||
|
||||
/// Types of alert rules the system can evaluate.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
|
||||
#[serde(rename_all = "snake_case")]
|
||||
pub enum AlertRuleKind {
|
||||
DiskUsage,
|
||||
RamUsage,
|
||||
ContainerCrash,
|
||||
BackendErrorSpike,
|
||||
SslCertExpiry,
|
||||
}
|
||||
|
||||
/// A configured alert rule with threshold and enabled state.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct AlertRule {
|
||||
pub kind: AlertRuleKind,
|
||||
pub threshold: f64,
|
||||
pub enabled: bool,
|
||||
pub description: String,
|
||||
}
|
||||
|
||||
/// A fired alert instance.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct FiredAlert {
|
||||
pub id: String,
|
||||
pub kind: AlertRuleKind,
|
||||
pub message: String,
|
||||
pub value: f64,
|
||||
pub threshold: f64,
|
||||
pub timestamp: i64,
|
||||
pub acknowledged: bool,
|
||||
}
|
||||
|
||||
impl AlertRule {
|
||||
fn default_rules() -> Vec<AlertRule> {
|
||||
vec![
|
||||
AlertRule {
|
||||
kind: AlertRuleKind::DiskUsage,
|
||||
threshold: 90.0,
|
||||
enabled: true,
|
||||
description: "Disk usage exceeds threshold".to_string(),
|
||||
},
|
||||
AlertRule {
|
||||
kind: AlertRuleKind::RamUsage,
|
||||
threshold: 90.0,
|
||||
enabled: true,
|
||||
description: "RAM usage exceeds threshold".to_string(),
|
||||
},
|
||||
AlertRule {
|
||||
kind: AlertRuleKind::ContainerCrash,
|
||||
threshold: 1.0,
|
||||
enabled: true,
|
||||
description: "Container stopped unexpectedly".to_string(),
|
||||
},
|
||||
AlertRule {
|
||||
kind: AlertRuleKind::BackendErrorSpike,
|
||||
threshold: 500.0,
|
||||
enabled: true,
|
||||
description: "RPC latency exceeds threshold (ms)".to_string(),
|
||||
},
|
||||
AlertRule {
|
||||
kind: AlertRuleKind::SslCertExpiry,
|
||||
threshold: 30.0,
|
||||
enabled: true,
|
||||
description: "SSL certificate expires within N days".to_string(),
|
||||
},
|
||||
]
|
||||
}
|
||||
}
|
||||
|
||||
/// Thread-safe metrics store with ring buffers at two resolutions.
|
||||
pub struct MetricsStore {
|
||||
minute_data: RwLock<VecDeque<MetricSnapshot>>,
|
||||
@ -58,6 +130,8 @@ pub struct MetricsStore {
|
||||
minute_count: RwLock<u32>,
|
||||
rpc_latency: RwLock<(f64, u64)>,
|
||||
ws_connections: AtomicU32,
|
||||
alert_rules: RwLock<Vec<AlertRule>>,
|
||||
fired_alerts: RwLock<VecDeque<FiredAlert>>,
|
||||
}
|
||||
|
||||
impl MetricsStore {
|
||||
@ -68,6 +142,8 @@ impl MetricsStore {
|
||||
minute_count: RwLock::new(0),
|
||||
rpc_latency: RwLock::new((0.0, 0)),
|
||||
ws_connections: AtomicU32::new(0),
|
||||
alert_rules: RwLock::new(AlertRule::default_rules()),
|
||||
fired_alerts: RwLock::new(VecDeque::with_capacity(MAX_ALERT_HISTORY)),
|
||||
}
|
||||
}
|
||||
|
||||
@ -147,10 +223,198 @@ impl MetricsStore {
|
||||
let start = buf.len().saturating_sub(last_n);
|
||||
buf.iter().skip(start).cloned().collect()
|
||||
}
|
||||
|
||||
/// Get the current alert rules.
|
||||
pub async fn get_alert_rules(&self) -> Vec<AlertRule> {
|
||||
self.alert_rules.read().await.clone()
|
||||
}
|
||||
|
||||
/// Update an alert rule by kind.
|
||||
pub async fn update_alert_rule(&self, kind: &AlertRuleKind, enabled: Option<bool>, threshold: Option<f64>) {
|
||||
let mut rules = self.alert_rules.write().await;
|
||||
if let Some(rule) = rules.iter_mut().find(|r| &r.kind == kind) {
|
||||
if let Some(e) = enabled {
|
||||
rule.enabled = e;
|
||||
}
|
||||
if let Some(t) = threshold {
|
||||
rule.threshold = t;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Get fired alert history.
|
||||
pub async fn get_fired_alerts(&self, last_n: usize) -> Vec<FiredAlert> {
|
||||
let buf = self.fired_alerts.read().await;
|
||||
let start = buf.len().saturating_sub(last_n);
|
||||
buf.iter().skip(start).cloned().collect()
|
||||
}
|
||||
|
||||
/// Acknowledge a fired alert by id.
|
||||
pub async fn acknowledge_alert(&self, alert_id: &str) -> bool {
|
||||
let mut buf = self.fired_alerts.write().await;
|
||||
if let Some(alert) = buf.iter_mut().find(|a| a.id == alert_id) {
|
||||
alert.acknowledged = true;
|
||||
true
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
/// Evaluate alert rules against a snapshot and return any new alerts.
|
||||
pub async fn check_alerts(&self, snapshot: &MetricSnapshot) -> Vec<FiredAlert> {
|
||||
let rules = self.alert_rules.read().await;
|
||||
let mut new_alerts = Vec::new();
|
||||
let ts = snapshot.timestamp;
|
||||
|
||||
for rule in rules.iter() {
|
||||
if !rule.enabled {
|
||||
continue;
|
||||
}
|
||||
|
||||
match rule.kind {
|
||||
AlertRuleKind::DiskUsage => {
|
||||
if snapshot.system.disk_total_bytes > 0 {
|
||||
let pct = (snapshot.system.disk_used_bytes as f64
|
||||
/ snapshot.system.disk_total_bytes as f64)
|
||||
* 100.0;
|
||||
if pct > rule.threshold {
|
||||
new_alerts.push(FiredAlert {
|
||||
id: format!("disk-{}", ts),
|
||||
kind: AlertRuleKind::DiskUsage,
|
||||
message: format!("Disk usage at {:.1}% (threshold: {:.0}%)", pct, rule.threshold),
|
||||
value: pct,
|
||||
threshold: rule.threshold,
|
||||
timestamp: ts,
|
||||
acknowledged: false,
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
AlertRuleKind::RamUsage => {
|
||||
if snapshot.system.mem_total_bytes > 0 {
|
||||
let pct = (snapshot.system.mem_used_bytes as f64
|
||||
/ snapshot.system.mem_total_bytes as f64)
|
||||
* 100.0;
|
||||
if pct > rule.threshold {
|
||||
new_alerts.push(FiredAlert {
|
||||
id: format!("ram-{}", ts),
|
||||
kind: AlertRuleKind::RamUsage,
|
||||
message: format!("RAM usage at {:.1}% (threshold: {:.0}%)", pct, rule.threshold),
|
||||
value: pct,
|
||||
threshold: rule.threshold,
|
||||
timestamp: ts,
|
||||
acknowledged: false,
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
AlertRuleKind::BackendErrorSpike => {
|
||||
if snapshot.rpc_latency_ms > rule.threshold {
|
||||
new_alerts.push(FiredAlert {
|
||||
id: format!("latency-{}", ts),
|
||||
kind: AlertRuleKind::BackendErrorSpike,
|
||||
message: format!(
|
||||
"RPC latency at {:.0}ms (threshold: {:.0}ms)",
|
||||
snapshot.rpc_latency_ms, rule.threshold
|
||||
),
|
||||
value: snapshot.rpc_latency_ms,
|
||||
threshold: rule.threshold,
|
||||
timestamp: ts,
|
||||
acknowledged: false,
|
||||
});
|
||||
}
|
||||
}
|
||||
// ContainerCrash and SslCertExpiry are checked via dedicated paths
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
|
||||
// Store fired alerts
|
||||
if !new_alerts.is_empty() {
|
||||
let mut buf = self.fired_alerts.write().await;
|
||||
for alert in &new_alerts {
|
||||
if buf.len() >= MAX_ALERT_HISTORY {
|
||||
buf.pop_front();
|
||||
}
|
||||
buf.push_back(alert.clone());
|
||||
}
|
||||
}
|
||||
|
||||
new_alerts
|
||||
}
|
||||
|
||||
/// Fire a container crash alert (called by health monitor).
|
||||
pub async fn fire_container_alert(&self, container_name: &str, state: &str) {
|
||||
let rules = self.alert_rules.read().await;
|
||||
let enabled = rules
|
||||
.iter()
|
||||
.any(|r| r.kind == AlertRuleKind::ContainerCrash && r.enabled);
|
||||
drop(rules);
|
||||
|
||||
if !enabled {
|
||||
return;
|
||||
}
|
||||
|
||||
let ts = chrono::Utc::now().timestamp();
|
||||
let alert = FiredAlert {
|
||||
id: format!("container-{}-{}", container_name, ts),
|
||||
kind: AlertRuleKind::ContainerCrash,
|
||||
message: format!("Container '{}' is {} — may need attention", container_name, state),
|
||||
value: 1.0,
|
||||
threshold: 1.0,
|
||||
timestamp: ts,
|
||||
acknowledged: false,
|
||||
};
|
||||
|
||||
let mut buf = self.fired_alerts.write().await;
|
||||
if buf.len() >= MAX_ALERT_HISTORY {
|
||||
buf.pop_front();
|
||||
}
|
||||
buf.push_back(alert);
|
||||
}
|
||||
|
||||
/// Fire an SSL cert expiry alert.
|
||||
pub async fn fire_ssl_alert(&self, days_remaining: f64) {
|
||||
let rules = self.alert_rules.read().await;
|
||||
let threshold = rules
|
||||
.iter()
|
||||
.find(|r| r.kind == AlertRuleKind::SslCertExpiry && r.enabled)
|
||||
.map(|r| r.threshold);
|
||||
drop(rules);
|
||||
|
||||
let threshold = match threshold {
|
||||
Some(t) if days_remaining < t => t,
|
||||
_ => return,
|
||||
};
|
||||
|
||||
let ts = chrono::Utc::now().timestamp();
|
||||
let alert = FiredAlert {
|
||||
id: format!("ssl-{}", ts),
|
||||
kind: AlertRuleKind::SslCertExpiry,
|
||||
message: format!(
|
||||
"SSL certificate expires in {:.0} days (threshold: {:.0} days)",
|
||||
days_remaining, threshold
|
||||
),
|
||||
value: days_remaining,
|
||||
threshold,
|
||||
timestamp: ts,
|
||||
acknowledged: false,
|
||||
};
|
||||
|
||||
let mut buf = self.fired_alerts.write().await;
|
||||
if buf.len() >= MAX_ALERT_HISTORY {
|
||||
buf.pop_front();
|
||||
}
|
||||
buf.push_back(alert);
|
||||
}
|
||||
}
|
||||
|
||||
/// Spawn the background metrics collector (runs every 60 seconds).
|
||||
pub fn spawn_metrics_collector(store: Arc<MetricsStore>) {
|
||||
/// Also evaluates alert rules on each snapshot and pushes notifications.
|
||||
pub fn spawn_metrics_collector(
|
||||
store: Arc<MetricsStore>,
|
||||
state: Option<Arc<crate::state::StateManager>>,
|
||||
) {
|
||||
tokio::spawn(async move {
|
||||
// Wait 30s for system to stabilize after boot
|
||||
tokio::time::sleep(std::time::Duration::from_secs(30)).await;
|
||||
@ -162,8 +426,48 @@ pub fn spawn_metrics_collector(store: Arc<MetricsStore>) {
|
||||
|
||||
match collector::collect_snapshot().await {
|
||||
Ok(snapshot) => {
|
||||
// Check alert rules before pushing (needs snapshot data)
|
||||
let alerts = store.check_alerts(&snapshot).await;
|
||||
|
||||
store.push(snapshot).await;
|
||||
debug!("Metrics snapshot collected");
|
||||
|
||||
// Push alert notifications via WebSocket
|
||||
if !alerts.is_empty() {
|
||||
if let Some(ref state_mgr) = state {
|
||||
let (mut data, _rev) = state_mgr.get_snapshot().await;
|
||||
for alert in &alerts {
|
||||
let level = match alert.kind {
|
||||
AlertRuleKind::DiskUsage | AlertRuleKind::RamUsage => {
|
||||
if alert.value > 95.0 {
|
||||
crate::data_model::NotificationLevel::Error
|
||||
} else {
|
||||
crate::data_model::NotificationLevel::Warning
|
||||
}
|
||||
}
|
||||
AlertRuleKind::ContainerCrash => {
|
||||
crate::data_model::NotificationLevel::Error
|
||||
}
|
||||
_ => crate::data_model::NotificationLevel::Warning,
|
||||
};
|
||||
let notification = crate::data_model::Notification {
|
||||
id: alert.id.clone(),
|
||||
level,
|
||||
title: format!("{:?} Alert", alert.kind),
|
||||
message: alert.message.clone(),
|
||||
timestamp: chrono::Utc::now().to_rfc3339(),
|
||||
app_id: None,
|
||||
};
|
||||
data.notifications.push(notification);
|
||||
}
|
||||
// Keep max 20 notifications
|
||||
while data.notifications.len() > 20 {
|
||||
data.notifications.remove(0);
|
||||
}
|
||||
state_mgr.update_data(data).await;
|
||||
info!("Fired {} alert(s)", alerts.len());
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
warn!("Failed to collect metrics: {}", e);
|
||||
|
||||
@ -78,7 +78,7 @@ impl Server {
|
||||
|
||||
// Create metrics store and spawn background collector
|
||||
let metrics_store = Arc::new(MetricsStore::new());
|
||||
crate::monitoring::spawn_metrics_collector(metrics_store.clone());
|
||||
crate::monitoring::spawn_metrics_collector(metrics_store.clone(), Some(state_manager.clone()));
|
||||
|
||||
let api_handler = Arc::new(
|
||||
ApiHandler::new(config.clone(), state_manager.clone(), metrics_store).await?,
|
||||
|
||||
@ -338,9 +338,9 @@
|
||||
|
||||
- [x] **MON-01** — Implement real-time metrics collection. Add `core/archipelago/src/monitoring/collector.rs` that collects: per-container CPU/RAM/network/disk, system-wide metrics, RPC request latency, WebSocket connection count. Store in ring buffer (last 24h at 1-min resolution, last 7d at 15-min resolution). **Acceptance**: Metrics collected and queryable via RPC.
|
||||
|
||||
- [ ] **MON-02** — Add monitoring dashboard page. Create `neode-ui/src/views/Monitoring.vue` at `/dashboard/monitoring` with: real-time line charts (CPU, RAM, network), per-container resource breakdown, alert history, system health timeline. Use canvas-based charts (no heavy library -- build simple line chart component). **Acceptance**: Real-time metrics visible with 5s refresh.
|
||||
- [x] **MON-02** — Add monitoring dashboard page. Created `neode-ui/src/views/Monitoring.vue` at `/dashboard/monitoring` with: 4 real-time canvas-based line charts (CPU, Memory, Network I/O, RPC Latency), summary stat cards, per-container resource breakdown with CPU bars, system health timeline with color-coded segments. Custom `LineChart.vue` component renders on canvas with DPR scaling, grid lines, area fills. Polls every 5s via `monitoring.current` and `monitoring.history` RPC endpoints. Route registered in router. All CSS classes defined in style.css.
|
||||
|
||||
- [ ] **MON-03** — Implement alerting system. Add alert rules: disk > 90%, RAM > 90%, container crash, backend error spike, SSL cert expiry < 30 days. Notifications via: WebSocket push to UI, optional webhook URL. **Acceptance**: Alerts fire and display in UI.
|
||||
- [x] **MON-03** — Implemented alerting system. Added AlertRule and FiredAlert types to monitoring/mod.rs with 5 configurable rules (disk >90%, RAM >90%, container crash, RPC latency spike, SSL cert expiry <30 days). Metrics collector evaluates rules every 60s, fires alerts as Notifications via WebSocket. Added RPC endpoints: monitoring.alerts, monitoring.alert-rules, monitoring.configure-alert, monitoring.acknowledge-alert. Frontend: Monitoring.vue has alert history section with configurable thresholds, enable/disable toggles, dismiss buttons. CSS toggle/input styles in style.css.
|
||||
|
||||
- [ ] **MON-04** — Add historical data export. Add `monitoring.export` RPC endpoint that exports metrics as CSV or JSON for a given time range. Add "Export" button in monitoring UI. **Acceptance**: Can download last 24h of metrics as CSV.
|
||||
|
||||
|
||||
@ -1599,3 +1599,92 @@ html:has(body.video-background-active)::before {
|
||||
transform: translateY(0);
|
||||
}
|
||||
}
|
||||
|
||||
/* Monitoring dashboard */
|
||||
.monitoring-stat-card {
|
||||
background: rgba(0, 0, 0, 0.3);
|
||||
border: 1px solid rgba(255, 255, 255, 0.08);
|
||||
border-radius: 0.75rem;
|
||||
padding: 1rem;
|
||||
}
|
||||
|
||||
.monitoring-chart {
|
||||
width: 100%;
|
||||
height: auto;
|
||||
display: block;
|
||||
}
|
||||
|
||||
.monitoring-bar-container {
|
||||
width: 80px;
|
||||
height: 6px;
|
||||
background: rgba(255, 255, 255, 0.08);
|
||||
border-radius: 3px;
|
||||
overflow: hidden;
|
||||
flex-shrink: 0;
|
||||
}
|
||||
|
||||
.monitoring-bar-fill {
|
||||
height: 100%;
|
||||
border-radius: 3px;
|
||||
transition: width 0.3s ease;
|
||||
}
|
||||
|
||||
.monitoring-bar-ok {
|
||||
background: #4ade80;
|
||||
}
|
||||
|
||||
.monitoring-bar-warn {
|
||||
background: #f59e0b;
|
||||
}
|
||||
|
||||
.monitoring-bar-danger {
|
||||
background: #ef4444;
|
||||
}
|
||||
.monitoring-alert-toggle {
|
||||
position: relative;
|
||||
display: inline-block;
|
||||
width: 36px;
|
||||
height: 20px;
|
||||
flex-shrink: 0;
|
||||
}
|
||||
.monitoring-alert-toggle input {
|
||||
opacity: 0;
|
||||
width: 0;
|
||||
height: 0;
|
||||
}
|
||||
.monitoring-alert-toggle-slider {
|
||||
position: absolute;
|
||||
cursor: pointer;
|
||||
inset: 0;
|
||||
background: rgba(255, 255, 255, 0.1);
|
||||
border-radius: 10px;
|
||||
transition: background 0.2s ease;
|
||||
}
|
||||
.monitoring-alert-toggle-slider::before {
|
||||
content: '';
|
||||
position: absolute;
|
||||
height: 14px;
|
||||
width: 14px;
|
||||
left: 3px;
|
||||
bottom: 3px;
|
||||
background: rgba(255, 255, 255, 0.6);
|
||||
border-radius: 50%;
|
||||
transition: transform 0.2s ease;
|
||||
}
|
||||
.monitoring-alert-toggle input:checked + .monitoring-alert-toggle-slider {
|
||||
background: rgba(74, 222, 128, 0.4);
|
||||
}
|
||||
.monitoring-alert-toggle input:checked + .monitoring-alert-toggle-slider::before {
|
||||
transform: translateX(16px);
|
||||
background: #4ade80;
|
||||
}
|
||||
.monitoring-threshold-input {
|
||||
width: 60px;
|
||||
padding: 4px 8px;
|
||||
background: rgba(255, 255, 255, 0.08);
|
||||
border: 1px solid rgba(255, 255, 255, 0.12);
|
||||
border-radius: 6px;
|
||||
color: white;
|
||||
font-size: 0.75rem;
|
||||
text-align: right;
|
||||
}
|
||||
|
||||
514
neode-ui/src/views/Monitoring.vue
Normal file
514
neode-ui/src/views/Monitoring.vue
Normal file
@ -0,0 +1,514 @@
|
||||
<template>
|
||||
<div>
|
||||
<div class="hidden md:block mb-8">
|
||||
<h1 class="text-3xl font-bold text-white mb-2">Monitoring</h1>
|
||||
<p class="text-white/70">Real-time system metrics and container resource usage</p>
|
||||
</div>
|
||||
|
||||
<!-- Summary Cards -->
|
||||
<div class="grid grid-cols-2 lg:grid-cols-4 gap-4 mb-6">
|
||||
<div class="monitoring-stat-card">
|
||||
<p class="text-xs text-white/50 uppercase tracking-wide">CPU</p>
|
||||
<p class="text-2xl font-bold text-white">{{ current?.system.cpu_percent.toFixed(1) ?? '--' }}%</p>
|
||||
<p class="text-xs text-white/40">Load: {{ current?.system.load_avg_1.toFixed(2) ?? '--' }}</p>
|
||||
</div>
|
||||
<div class="monitoring-stat-card">
|
||||
<p class="text-xs text-white/50 uppercase tracking-wide">Memory</p>
|
||||
<p class="text-2xl font-bold text-white">{{ memPercent }}%</p>
|
||||
<p class="text-xs text-white/40">{{ formatBytes(current?.system.mem_used_bytes ?? 0) }} / {{ formatBytes(current?.system.mem_total_bytes ?? 0) }}</p>
|
||||
</div>
|
||||
<div class="monitoring-stat-card">
|
||||
<p class="text-xs text-white/50 uppercase tracking-wide">Disk</p>
|
||||
<p class="text-2xl font-bold text-white">{{ diskPercent }}%</p>
|
||||
<p class="text-xs text-white/40">{{ formatBytes(current?.system.disk_used_bytes ?? 0) }} / {{ formatBytes(current?.system.disk_total_bytes ?? 0) }}</p>
|
||||
</div>
|
||||
<div class="monitoring-stat-card">
|
||||
<p class="text-xs text-white/50 uppercase tracking-wide">Network</p>
|
||||
<p class="text-2xl font-bold text-white">{{ formatBytes(current?.system.net_rx_bytes ?? 0) }}</p>
|
||||
<p class="text-xs text-white/40">TX: {{ formatBytes(current?.system.net_tx_bytes ?? 0) }}</p>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- Charts -->
|
||||
<div class="grid grid-cols-1 lg:grid-cols-2 gap-6 mb-6">
|
||||
<div class="glass-card p-5">
|
||||
<h3 class="text-sm font-medium text-white/80 mb-3">CPU Usage (%)</h3>
|
||||
<LineChart
|
||||
:datasets="cpuDatasets"
|
||||
:labels="timeLabels"
|
||||
:width="chartWidth"
|
||||
:height="180"
|
||||
:y-max="100"
|
||||
/>
|
||||
</div>
|
||||
<div class="glass-card p-5">
|
||||
<h3 class="text-sm font-medium text-white/80 mb-3">Memory Usage (%)</h3>
|
||||
<LineChart
|
||||
:datasets="memDatasets"
|
||||
:labels="timeLabels"
|
||||
:width="chartWidth"
|
||||
:height="180"
|
||||
:y-max="100"
|
||||
/>
|
||||
</div>
|
||||
<div class="glass-card p-5">
|
||||
<h3 class="text-sm font-medium text-white/80 mb-3">Network I/O (bytes)</h3>
|
||||
<LineChart
|
||||
:datasets="netDatasets"
|
||||
:labels="timeLabels"
|
||||
:width="chartWidth"
|
||||
:height="180"
|
||||
/>
|
||||
</div>
|
||||
<div class="glass-card p-5">
|
||||
<h3 class="text-sm font-medium text-white/80 mb-3">RPC Latency (ms)</h3>
|
||||
<LineChart
|
||||
:datasets="latencyDatasets"
|
||||
:labels="timeLabels"
|
||||
:width="chartWidth"
|
||||
:height="180"
|
||||
/>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- Alert History -->
|
||||
<div class="glass-card p-5 mb-6">
|
||||
<div class="flex items-center justify-between mb-4">
|
||||
<h3 class="text-sm font-medium text-white/80">Alert History</h3>
|
||||
<button
|
||||
class="glass-button text-xs px-3 py-1"
|
||||
@click="showAlertConfig = !showAlertConfig"
|
||||
>
|
||||
{{ showAlertConfig ? 'Hide Config' : 'Configure' }}
|
||||
</button>
|
||||
</div>
|
||||
|
||||
<!-- Alert Rule Configuration -->
|
||||
<div v-if="showAlertConfig" class="mb-4 space-y-2">
|
||||
<div
|
||||
v-for="rule in alertRules"
|
||||
:key="rule.kind"
|
||||
class="flex items-center gap-3 p-3 bg-white/5 rounded-lg"
|
||||
>
|
||||
<label class="monitoring-alert-toggle">
|
||||
<input
|
||||
type="checkbox"
|
||||
:checked="rule.enabled"
|
||||
@change="toggleAlertRule(rule.kind, !rule.enabled)"
|
||||
/>
|
||||
<span class="monitoring-alert-toggle-slider"></span>
|
||||
</label>
|
||||
<div class="flex-1 min-w-0">
|
||||
<p class="text-sm text-white">{{ ruleLabel(rule.kind) }}</p>
|
||||
<p class="text-xs text-white/40">{{ rule.description }}</p>
|
||||
</div>
|
||||
<div class="flex items-center gap-2">
|
||||
<input
|
||||
type="number"
|
||||
:value="rule.threshold"
|
||||
class="monitoring-threshold-input"
|
||||
@change="updateThreshold(rule.kind, ($event.target as HTMLInputElement).value)"
|
||||
/>
|
||||
<span class="text-xs text-white/40">{{ ruleUnit(rule.kind) }}</span>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- Fired Alerts List -->
|
||||
<div v-if="!alerts.length" class="text-white/40 text-sm py-4 text-center">
|
||||
No alerts fired
|
||||
</div>
|
||||
<div v-else class="space-y-2 max-h-64 overflow-y-auto">
|
||||
<div
|
||||
v-for="alert in alerts"
|
||||
:key="alert.id"
|
||||
class="flex items-start gap-3 p-3 bg-white/5 rounded-lg"
|
||||
:class="{ 'opacity-50': alert.acknowledged }"
|
||||
>
|
||||
<span
|
||||
class="inline-block w-2 h-2 rounded-full mt-1.5 flex-shrink-0"
|
||||
:class="alertDotColor(alert.kind)"
|
||||
></span>
|
||||
<div class="flex-1 min-w-0">
|
||||
<p class="text-sm text-white">{{ alert.message }}</p>
|
||||
<p class="text-xs text-white/40">{{ formatAlertTime(alert.timestamp) }}</p>
|
||||
</div>
|
||||
<button
|
||||
v-if="!alert.acknowledged"
|
||||
class="text-xs text-white/40 hover:text-white/70 flex-shrink-0"
|
||||
@click="acknowledgeAlert(alert.id)"
|
||||
>
|
||||
Dismiss
|
||||
</button>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- Container Resource Breakdown -->
|
||||
<div class="glass-card p-5 mb-6">
|
||||
<h3 class="text-sm font-medium text-white/80 mb-4">Container Resources</h3>
|
||||
<div v-if="!containers.length" class="text-white/40 text-sm py-4 text-center">
|
||||
No container metrics available
|
||||
</div>
|
||||
<div v-else class="space-y-3">
|
||||
<div
|
||||
v-for="c in containers"
|
||||
:key="c.name"
|
||||
class="flex items-center gap-4 p-3 bg-white/5 rounded-lg"
|
||||
>
|
||||
<div class="min-w-0 flex-1">
|
||||
<p class="text-sm font-medium text-white truncate">{{ c.name }}</p>
|
||||
<div class="flex gap-4 mt-1 text-xs text-white/50">
|
||||
<span>CPU: {{ c.cpu_percent.toFixed(1) }}%</span>
|
||||
<span>Mem: {{ formatBytes(c.mem_used_bytes) }}</span>
|
||||
<span>Net: {{ formatBytes(c.net_rx_bytes) }} / {{ formatBytes(c.net_tx_bytes) }}</span>
|
||||
</div>
|
||||
</div>
|
||||
<div class="monitoring-bar-container">
|
||||
<div
|
||||
class="monitoring-bar-fill"
|
||||
:style="{ width: Math.min(c.cpu_percent, 100) + '%' }"
|
||||
:class="c.cpu_percent > 80 ? 'monitoring-bar-danger' : c.cpu_percent > 50 ? 'monitoring-bar-warn' : 'monitoring-bar-ok'"
|
||||
></div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- System Health Timeline -->
|
||||
<div class="glass-card p-5">
|
||||
<div class="flex items-center justify-between mb-4">
|
||||
<h3 class="text-sm font-medium text-white/80">System Health</h3>
|
||||
<div class="flex items-center gap-2 text-xs text-white/40">
|
||||
<span class="inline-block w-2 h-2 rounded-full bg-green-400"></span> Healthy
|
||||
<span class="inline-block w-2 h-2 rounded-full bg-orange-400 ml-2"></span> Elevated
|
||||
<span class="inline-block w-2 h-2 rounded-full bg-red-400 ml-2"></span> Critical
|
||||
</div>
|
||||
</div>
|
||||
<div class="flex gap-0.5 h-6">
|
||||
<div
|
||||
v-for="(entry, idx) in healthTimeline"
|
||||
:key="idx"
|
||||
class="flex-1 rounded-sm transition-colors"
|
||||
:class="healthColor(entry)"
|
||||
:title="entry.label"
|
||||
></div>
|
||||
</div>
|
||||
<div class="flex justify-between mt-1 text-xs text-white/30">
|
||||
<span>{{ historyMinutesAgo }}m ago</span>
|
||||
<span>Now</span>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<p class="text-xs text-white/30 mt-4 text-center">
|
||||
Refreshing every 5 seconds · WS connections: {{ current?.ws_connections ?? 0 }}
|
||||
</p>
|
||||
</div>
|
||||
</template>
|
||||
|
||||
<script setup lang="ts">
|
||||
import { ref, computed, onMounted, onUnmounted } from 'vue'
|
||||
import { rpcClient } from '@/api/rpc-client'
|
||||
import LineChart from '@/components/LineChart.vue'
|
||||
import type { ChartDataset } from '@/components/LineChart.vue'
|
||||
|
||||
interface SystemMetrics {
|
||||
cpu_percent: number
|
||||
mem_used_bytes: number
|
||||
mem_total_bytes: number
|
||||
disk_used_bytes: number
|
||||
disk_total_bytes: number
|
||||
net_rx_bytes: number
|
||||
net_tx_bytes: number
|
||||
load_avg_1: number
|
||||
load_avg_5: number
|
||||
load_avg_15: number
|
||||
}
|
||||
|
||||
interface ContainerMetrics {
|
||||
name: string
|
||||
cpu_percent: number
|
||||
mem_used_bytes: number
|
||||
mem_limit_bytes: number
|
||||
net_rx_bytes: number
|
||||
net_tx_bytes: number
|
||||
block_read_bytes: number
|
||||
block_write_bytes: number
|
||||
}
|
||||
|
||||
interface MetricSnapshot {
|
||||
timestamp: number
|
||||
system: SystemMetrics
|
||||
containers: ContainerMetrics[]
|
||||
rpc_latency_ms: number
|
||||
ws_connections: number
|
||||
}
|
||||
|
||||
interface HistoryResponse {
|
||||
resolution: string
|
||||
count: number
|
||||
data: MetricSnapshot[]
|
||||
}
|
||||
|
||||
interface AlertRule {
|
||||
kind: string
|
||||
threshold: number
|
||||
enabled: boolean
|
||||
description: string
|
||||
}
|
||||
|
||||
interface FiredAlert {
|
||||
id: string
|
||||
kind: string
|
||||
message: string
|
||||
value: number
|
||||
threshold: number
|
||||
timestamp: number
|
||||
acknowledged: boolean
|
||||
}
|
||||
|
||||
const current = ref<MetricSnapshot | null>(null)
|
||||
const history = ref<MetricSnapshot[]>([])
|
||||
const containers = ref<ContainerMetrics[]>([])
|
||||
const alerts = ref<FiredAlert[]>([])
|
||||
const alertRules = ref<AlertRule[]>([])
|
||||
const showAlertConfig = ref(false)
|
||||
const chartWidth = ref(380)
|
||||
let pollTimer: ReturnType<typeof setInterval> | null = null
|
||||
|
||||
const memPercent = computed(() => {
|
||||
if (!current.value?.system.mem_total_bytes) return '--'
|
||||
return ((current.value.system.mem_used_bytes / current.value.system.mem_total_bytes) * 100).toFixed(1)
|
||||
})
|
||||
|
||||
const diskPercent = computed(() => {
|
||||
if (!current.value?.system.disk_total_bytes) return '--'
|
||||
return ((current.value.system.disk_used_bytes / current.value.system.disk_total_bytes) * 100).toFixed(1)
|
||||
})
|
||||
|
||||
const historyMinutesAgo = computed(() => history.value.length || 60)
|
||||
|
||||
const timeLabels = computed(() => {
|
||||
if (!history.value.length) return []
|
||||
return history.value.map((s) => {
|
||||
const d = new Date(s.timestamp * 1000)
|
||||
return `${d.getHours().toString().padStart(2, '0')}:${d.getMinutes().toString().padStart(2, '0')}`
|
||||
})
|
||||
})
|
||||
|
||||
const cpuDatasets = computed<ChartDataset[]>(() => [{
|
||||
label: 'CPU',
|
||||
data: history.value.map((s) => s.system.cpu_percent),
|
||||
color: '#fb923c',
|
||||
}])
|
||||
|
||||
const memDatasets = computed<ChartDataset[]>(() => [{
|
||||
label: 'Memory',
|
||||
data: history.value.map((s) =>
|
||||
s.system.mem_total_bytes > 0
|
||||
? (s.system.mem_used_bytes / s.system.mem_total_bytes) * 100
|
||||
: 0,
|
||||
),
|
||||
color: '#3b82f6',
|
||||
}])
|
||||
|
||||
const netDatasets = computed<ChartDataset[]>(() => [
|
||||
{
|
||||
label: 'RX',
|
||||
data: history.value.map((s) => s.system.net_rx_bytes),
|
||||
color: '#4ade80',
|
||||
},
|
||||
{
|
||||
label: 'TX',
|
||||
data: history.value.map((s) => s.system.net_tx_bytes),
|
||||
color: '#f59e0b',
|
||||
},
|
||||
])
|
||||
|
||||
const latencyDatasets = computed<ChartDataset[]>(() => [{
|
||||
label: 'Latency',
|
||||
data: history.value.map((s) => s.rpc_latency_ms),
|
||||
color: '#a78bfa',
|
||||
}])
|
||||
|
||||
interface HealthEntry {
|
||||
cpu: number
|
||||
mem: number
|
||||
label: string
|
||||
}
|
||||
|
||||
const healthTimeline = computed<HealthEntry[]>(() => {
|
||||
if (!history.value.length) {
|
||||
return Array.from({ length: 60 }, () => ({ cpu: 0, mem: 0, label: 'No data' }))
|
||||
}
|
||||
return history.value.map((s) => {
|
||||
const memPct = s.system.mem_total_bytes > 0
|
||||
? (s.system.mem_used_bytes / s.system.mem_total_bytes) * 100
|
||||
: 0
|
||||
const d = new Date(s.timestamp * 1000)
|
||||
const time = `${d.getHours().toString().padStart(2, '0')}:${d.getMinutes().toString().padStart(2, '0')}`
|
||||
return {
|
||||
cpu: s.system.cpu_percent,
|
||||
mem: memPct,
|
||||
label: `${time} — CPU: ${s.system.cpu_percent.toFixed(1)}%, Mem: ${memPct.toFixed(1)}%`,
|
||||
}
|
||||
})
|
||||
})
|
||||
|
||||
function healthColor(entry: HealthEntry): string {
|
||||
if (entry.cpu > 90 || entry.mem > 90) return 'bg-red-400/60'
|
||||
if (entry.cpu > 70 || entry.mem > 70) return 'bg-orange-400/40'
|
||||
return 'bg-green-400/30'
|
||||
}
|
||||
|
||||
function formatBytes(bytes: number): string {
|
||||
if (bytes >= 1_073_741_824) return `${(bytes / 1_073_741_824).toFixed(1)} GB`
|
||||
if (bytes >= 1_048_576) return `${(bytes / 1_048_576).toFixed(1)} MB`
|
||||
if (bytes >= 1024) return `${(bytes / 1024).toFixed(0)} KB`
|
||||
return `${bytes} B`
|
||||
}
|
||||
|
||||
function ruleLabel(kind: string): string {
|
||||
const labels: Record<string, string> = {
|
||||
disk_usage: 'Disk Usage',
|
||||
ram_usage: 'RAM Usage',
|
||||
container_crash: 'Container Crash',
|
||||
backend_error_spike: 'RPC Latency Spike',
|
||||
ssl_cert_expiry: 'SSL Cert Expiry',
|
||||
}
|
||||
return labels[kind] ?? kind
|
||||
}
|
||||
|
||||
function ruleUnit(kind: string): string {
|
||||
const units: Record<string, string> = {
|
||||
disk_usage: '%',
|
||||
ram_usage: '%',
|
||||
container_crash: '',
|
||||
backend_error_spike: 'ms',
|
||||
ssl_cert_expiry: 'days',
|
||||
}
|
||||
return units[kind] ?? ''
|
||||
}
|
||||
|
||||
function alertDotColor(kind: string): string {
|
||||
if (kind === 'container_crash' || kind === 'ssl_cert_expiry') return 'bg-red-400'
|
||||
if (kind === 'disk_usage' || kind === 'ram_usage') return 'bg-orange-400'
|
||||
return 'bg-yellow-400'
|
||||
}
|
||||
|
||||
function formatAlertTime(timestamp: number): string {
|
||||
const d = new Date(timestamp * 1000)
|
||||
return d.toLocaleString()
|
||||
}
|
||||
|
||||
async function fetchCurrent() {
|
||||
try {
|
||||
const data = await rpcClient.call<MetricSnapshot | { status: string }>({
|
||||
method: 'monitoring.current',
|
||||
})
|
||||
if (data && 'system' in data) {
|
||||
current.value = data
|
||||
containers.value = data.containers ?? []
|
||||
}
|
||||
} catch {
|
||||
// Silently retry on next poll
|
||||
}
|
||||
}
|
||||
|
||||
async function fetchHistory() {
|
||||
try {
|
||||
const data = await rpcClient.call<HistoryResponse>({
|
||||
method: 'monitoring.history',
|
||||
params: { resolution: 'minute', count: 60 },
|
||||
})
|
||||
if (data?.data) {
|
||||
history.value = data.data
|
||||
}
|
||||
} catch {
|
||||
// Silently retry on next poll
|
||||
}
|
||||
}
|
||||
|
||||
async function fetchAlerts() {
|
||||
try {
|
||||
const data = await rpcClient.call<{ alerts: FiredAlert[] }>({
|
||||
method: 'monitoring.alerts',
|
||||
params: { count: 50 },
|
||||
})
|
||||
if (data?.alerts) {
|
||||
alerts.value = data.alerts.reverse()
|
||||
}
|
||||
} catch {
|
||||
// Silently retry on next poll
|
||||
}
|
||||
}
|
||||
|
||||
async function fetchAlertRules() {
|
||||
try {
|
||||
const data = await rpcClient.call<{ rules: AlertRule[] }>({
|
||||
method: 'monitoring.alert-rules',
|
||||
})
|
||||
if (data?.rules) {
|
||||
alertRules.value = data.rules
|
||||
}
|
||||
} catch {
|
||||
// Non-critical
|
||||
}
|
||||
}
|
||||
|
||||
async function toggleAlertRule(kind: string, enabled: boolean) {
|
||||
try {
|
||||
await rpcClient.call({ method: 'monitoring.configure-alert', params: { kind, enabled } })
|
||||
await fetchAlertRules()
|
||||
} catch {
|
||||
// Non-critical
|
||||
}
|
||||
}
|
||||
|
||||
async function updateThreshold(kind: string, value: string) {
|
||||
const threshold = parseFloat(value)
|
||||
if (isNaN(threshold) || threshold <= 0) return
|
||||
try {
|
||||
await rpcClient.call({ method: 'monitoring.configure-alert', params: { kind, threshold } })
|
||||
await fetchAlertRules()
|
||||
} catch {
|
||||
// Non-critical
|
||||
}
|
||||
}
|
||||
|
||||
async function acknowledgeAlert(id: string) {
|
||||
try {
|
||||
await rpcClient.call({ method: 'monitoring.acknowledge-alert', params: { id } })
|
||||
await fetchAlerts()
|
||||
} catch {
|
||||
// Non-critical
|
||||
}
|
||||
}
|
||||
|
||||
function updateChartWidth() {
|
||||
const container = document.querySelector('.glass-card')
|
||||
if (container) {
|
||||
chartWidth.value = Math.max(container.clientWidth - 40, 200)
|
||||
}
|
||||
}
|
||||
|
||||
onMounted(async () => {
|
||||
updateChartWidth()
|
||||
window.addEventListener('resize', updateChartWidth)
|
||||
|
||||
await Promise.all([fetchCurrent(), fetchHistory(), fetchAlerts(), fetchAlertRules()])
|
||||
|
||||
pollTimer = setInterval(async () => {
|
||||
try {
|
||||
await Promise.all([fetchCurrent(), fetchHistory(), fetchAlerts()])
|
||||
} catch {
|
||||
// Background poll — ignore transient errors
|
||||
}
|
||||
}, 5000)
|
||||
})
|
||||
|
||||
onUnmounted(() => {
|
||||
if (pollTimer) clearInterval(pollTimer)
|
||||
window.removeEventListener('resize', updateChartWidth)
|
||||
})
|
||||
</script>
|
||||
Loading…
x
Reference in New Issue
Block a user