Dorian 592548066e feat: add real-time metrics collection with ring buffer storage (MON-01)
Implements monitoring/collector.rs that collects per-container CPU/RAM/network/disk,
system-wide metrics, RPC latency, and WebSocket connection count every 60 seconds.
Data stored in dual ring buffers: 1-min resolution (24h) and 15-min resolution (7d).
Three new RPC endpoints: monitoring.current, monitoring.history, monitoring.containers.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-11 11:11:02 +00:00

365 lines
12 KiB
Rust

pub mod collector;
use serde::{Deserialize, Serialize};
use std::collections::VecDeque;
use std::sync::atomic::{AtomicU32, Ordering};
use std::sync::Arc;
use tokio::sync::RwLock;
use tracing::{debug, warn};
/// Capacity of the 1-minute-resolution ring buffer: 24 hours of one-per-minute samples.
const MAX_1MIN_ENTRIES: usize = 24 * 60;
/// Capacity of the 15-minute-resolution ring buffer: 7 days at four samples per hour.
const MAX_15MIN_ENTRIES: usize = 7 * 24 * 4;
/// A single metrics snapshot collected at a point in time.
///
/// Snapshots are handed to `MetricsStore::push`, which overwrites
/// `ws_connections` unconditionally and `rpc_latency_ms` whenever at least one
/// latency sample was recorded since the previous push.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MetricSnapshot {
/// Time the snapshot was taken.
/// NOTE(review): unit/epoch is not established in this file — presumably Unix
/// seconds; confirm against the collector.
pub timestamp: i64,
/// System-wide metrics captured with this snapshot.
pub system: SystemMetrics,
/// Per-container metrics captured with this snapshot (may be empty).
pub containers: Vec<ContainerMetrics>,
/// Mean RPC latency for the window ending at this snapshot. When set by
/// `MetricsStore::push` it is rounded to one decimal place; otherwise it keeps
/// whatever value the producer supplied.
pub rpc_latency_ms: f64,
/// WebSocket connection counter as read by `MetricsStore::push` at push time.
pub ws_connections: u32,
}
/// System-wide resource metrics.
///
/// Plain data carrier: nothing in this file computes these values.
/// NOTE(review): the units and meanings noted below are inferred from the field
/// names only — confirm against the code that fills them in (presumably the
/// `collector` module).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SystemMetrics {
/// CPU utilisation; presumably a percentage — TODO confirm scale (0–100 vs per-core).
pub cpu_percent: f64,
/// Memory in use, presumably in bytes.
pub mem_used_bytes: u64,
/// Total memory, presumably in bytes.
pub mem_total_bytes: u64,
/// Disk space in use, presumably in bytes.
pub disk_used_bytes: u64,
/// Total disk space, presumably in bytes.
pub disk_total_bytes: u64,
/// Network bytes received.
/// NOTE(review): cumulative counter vs per-interval delta is not visible here.
pub net_rx_bytes: u64,
/// Network bytes transmitted (same caveat as `net_rx_bytes`).
pub net_tx_bytes: u64,
/// Load average; presumably the 1-minute figure.
pub load_avg_1: f64,
/// Load average; presumably the 5-minute figure.
pub load_avg_5: f64,
/// Load average; presumably the 15-minute figure.
pub load_avg_15: f64,
}
/// Per-container resource metrics.
///
/// Plain data carrier: nothing in this file computes these values.
/// NOTE(review): the units and meanings noted below are inferred from the field
/// names only — confirm against the code that fills them in (presumably the
/// `collector` module).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ContainerMetrics {
/// Container name used to identify this entry.
pub name: String,
/// CPU utilisation; presumably a percentage — TODO confirm scale.
pub cpu_percent: f64,
/// Memory in use by the container, presumably in bytes.
pub mem_used_bytes: u64,
/// Memory limit for the container, presumably in bytes.
/// NOTE(review): the value used for "no limit" is not visible here.
pub mem_limit_bytes: u64,
/// Network bytes received.
/// NOTE(review): cumulative counter vs per-interval delta is not visible here.
pub net_rx_bytes: u64,
/// Network bytes transmitted (same caveat as `net_rx_bytes`).
pub net_tx_bytes: u64,
/// Block-device bytes read (same cumulative-vs-delta caveat).
pub block_read_bytes: u64,
/// Block-device bytes written (same cumulative-vs-delta caveat).
pub block_write_bytes: u64,
}
/// Thread-safe metrics store with ring buffers at two resolutions.
///
/// All state sits behind async `RwLock`s or an atomic, so every method takes
/// `&self`.
pub struct MetricsStore {
/// Every pushed snapshot, oldest first; `push` evicts from the front once
/// `MAX_1MIN_ENTRIES` is reached.
minute_data: RwLock<VecDeque<MetricSnapshot>>,
/// Every 15th pushed snapshot, oldest first; `push` evicts from the front once
/// `MAX_15MIN_ENTRIES` is reached.
quarter_hour_data: RwLock<VecDeque<MetricSnapshot>>,
/// Number of pushes since the last quarter-hour sample; reset to 0 when it
/// reaches 15.
minute_count: RwLock<u32>,
/// Running `(sum of latencies in ms, number of samples)` since the last push;
/// `push` turns it into a mean and resets it.
rpc_latency: RwLock<(f64, u64)>,
/// Live WebSocket connection counter maintained by `increment_ws` /
/// `decrement_ws`.
ws_connections: AtomicU32,
}
impl MetricsStore {
    /// Number of pushed snapshots that make up one quarter-hour sample.
    const SAMPLES_PER_QUARTER_HOUR: u32 = 15;

    /// Create an empty store with both ring buffers pre-allocated to their
    /// maximum size, no pending latency samples and a WebSocket count of zero.
    pub fn new() -> Self {
        Self {
            minute_data: RwLock::new(VecDeque::with_capacity(MAX_1MIN_ENTRIES)),
            quarter_hour_data: RwLock::new(VecDeque::with_capacity(MAX_15MIN_ENTRIES)),
            minute_count: RwLock::new(0),
            rpc_latency: RwLock::new((0.0, 0)),
            ws_connections: AtomicU32::new(0),
        }
    }

    /// Record a new metric snapshot (called every minute by the collector).
    ///
    /// Before storing, the snapshot is completed with data owned by the store:
    ///
    /// * `rpc_latency_ms` becomes the mean of the latency samples recorded since
    ///   the previous push, rounded to one decimal place, and the accumulator is
    ///   reset. If no samples were recorded the field is left as supplied.
    /// * `ws_connections` becomes the current WebSocket connection counter.
    ///
    /// The snapshot is appended to the 1-minute buffer; every 15th push also
    /// appends a copy to the quarter-hour buffer. Each buffer drops its oldest
    /// entry when full.
    pub async fn push(&self, mut snapshot: MetricSnapshot) {
        // Fold the accumulated latency samples into this snapshot.
        {
            let mut latency = self.rpc_latency.write().await;
            let (total_ms, samples) = *latency;
            if samples > 0 {
                snapshot.rpc_latency_ms = (total_ms / samples as f64 * 10.0).round() / 10.0;
                *latency = (0.0, 0);
            }
        }

        snapshot.ws_connections = self.ws_connections.load(Ordering::Relaxed);

        // Clone only on the pushes that actually feed the quarter-hour buffer,
        // so the common case moves the snapshot without copying it.
        let quarter_hour_copy = {
            let mut count = self.minute_count.write().await;
            *count += 1;
            if *count >= Self::SAMPLES_PER_QUARTER_HOUR {
                *count = 0;
                Some(snapshot.clone())
            } else {
                None
            }
        };

        {
            let mut buf = self.minute_data.write().await;
            Self::push_bounded(&mut buf, MAX_1MIN_ENTRIES, snapshot);
        }

        if let Some(copy) = quarter_hour_copy {
            let mut buf = self.quarter_hour_data.write().await;
            Self::push_bounded(&mut buf, MAX_15MIN_ENTRIES, copy);
        }
    }

    /// Record an RPC request latency sample (milliseconds).
    ///
    /// Non-finite samples (NaN, ±infinity) are ignored: a single one would
    /// otherwise turn the mean for the whole window into NaN/infinity.
    pub async fn record_rpc_latency(&self, latency_ms: f64) {
        if !latency_ms.is_finite() {
            return;
        }
        let mut data = self.rpc_latency.write().await;
        data.0 += latency_ms;
        data.1 += 1;
    }

    /// Increment WebSocket connection count (called on connect).
    pub fn increment_ws(&self) {
        self.ws_connections.fetch_add(1, Ordering::Relaxed);
    }

    /// Decrement WebSocket connection count (called on disconnect).
    ///
    /// The counter never goes below zero: an unmatched decrement is a no-op.
    pub fn decrement_ws(&self) {
        // `checked_sub` yields `None` at zero, which makes `fetch_update` leave
        // the value untouched; the returned `Err` is the expected outcome there.
        let _ = self
            .ws_connections
            .fetch_update(Ordering::Relaxed, Ordering::Relaxed, |v| v.checked_sub(1));
    }

    /// Get the most recently pushed snapshot, or `None` if nothing has been
    /// pushed yet.
    pub async fn latest(&self) -> Option<MetricSnapshot> {
        self.minute_data.read().await.back().cloned()
    }

    /// Get minute-resolution data for the last N minutes.
    ///
    /// Returns at most `last_n` snapshots, oldest first; fewer if the buffer
    /// holds fewer.
    pub async fn history_minutes(&self, last_n: usize) -> Vec<MetricSnapshot> {
        Self::tail(&*self.minute_data.read().await, last_n)
    }

    /// Get quarter-hour-resolution data for the last N entries.
    ///
    /// Returns at most `last_n` snapshots, oldest first; fewer if the buffer
    /// holds fewer.
    pub async fn history_quarter_hours(&self, last_n: usize) -> Vec<MetricSnapshot> {
        Self::tail(&*self.quarter_hour_data.read().await, last_n)
    }

    /// Append `snapshot` to `buf`, dropping the oldest entry first if `buf`
    /// already holds `capacity` entries.
    fn push_bounded(
        buf: &mut VecDeque<MetricSnapshot>,
        capacity: usize,
        snapshot: MetricSnapshot,
    ) {
        if buf.len() >= capacity {
            buf.pop_front();
        }
        buf.push_back(snapshot);
    }

    /// Clone the newest `last_n` entries of `buf`, oldest first.
    fn tail(buf: &VecDeque<MetricSnapshot>, last_n: usize) -> Vec<MetricSnapshot> {
        // `start` is always <= `buf.len()`, so the range cannot be out of bounds.
        let start = buf.len().saturating_sub(last_n);
        buf.range(start..).cloned().collect()
    }
}

impl Default for MetricsStore {
    /// Equivalent to [`MetricsStore::new`].
    fn default() -> Self {
        Self::new()
    }
}
/// Spawn the background metrics collector (runs every 60 seconds).
///
/// The spawned task waits 30 seconds, then collects a snapshot via
/// `collector::collect_snapshot` on every tick of a 60-second interval and
/// pushes it into `store`. Tokio intervals complete their first tick
/// immediately, so the first collection happens right after the initial wait.
/// Collection failures are logged at `warn` level and the loop continues.
///
/// The task is detached: it runs until the runtime shuts down.
///
/// # Panics
///
/// `tokio::spawn` panics if this is called outside a Tokio runtime.
pub fn spawn_metrics_collector(store: Arc<MetricsStore>) {
    tokio::spawn(async move {
        // Wait 30s for system to stabilize after boot
        tokio::time::sleep(std::time::Duration::from_secs(30)).await;

        let mut interval = tokio::time::interval(std::time::Duration::from_secs(60));
        // The default (`Burst`) fires missed ticks back-to-back after a slow
        // collection, which would store several near-identical snapshots and
        // throw off the 15-push quarter-hour cadence. Skip them instead and
        // resume on the next regular tick.
        interval.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip);

        loop {
            interval.tick().await;
            match collector::collect_snapshot().await {
                Ok(snapshot) => {
                    store.push(snapshot).await;
                    debug!("Metrics snapshot collected");
                }
                Err(e) => {
                    warn!("Failed to collect metrics: {}", e);
                }
            }
        }
    });
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a snapshot with the given timestamp and system CPU value; every
    /// other field is zero/empty. Keeps the tests focused on what they vary.
    fn snapshot_at(timestamp: i64, cpu_percent: f64) -> MetricSnapshot {
        MetricSnapshot {
            timestamp,
            system: SystemMetrics {
                cpu_percent,
                mem_used_bytes: 0,
                mem_total_bytes: 0,
                disk_used_bytes: 0,
                disk_total_bytes: 0,
                net_rx_bytes: 0,
                net_tx_bytes: 0,
                load_avg_1: 0.0,
                load_avg_5: 0.0,
                load_avg_15: 0.0,
            },
            containers: vec![],
            rpc_latency_ms: 0.0,
            ws_connections: 0,
        }
    }

    #[test]
    fn test_metrics_store_new() {
        let store = MetricsStore::new();
        assert_eq!(store.ws_connections.load(Ordering::Relaxed), 0);
    }

    #[test]
    fn test_ws_connection_tracking() {
        let store = MetricsStore::new();
        store.increment_ws();
        store.increment_ws();
        assert_eq!(store.ws_connections.load(Ordering::Relaxed), 2);
        store.decrement_ws();
        assert_eq!(store.ws_connections.load(Ordering::Relaxed), 1);
        store.decrement_ws();
        assert_eq!(store.ws_connections.load(Ordering::Relaxed), 0);
        // Decrement below zero should stay at 0
        store.decrement_ws();
        assert_eq!(store.ws_connections.load(Ordering::Relaxed), 0);
    }

    #[tokio::test]
    async fn test_push_and_latest() {
        let store = MetricsStore::new();
        assert!(store.latest().await.is_none());

        // Spelled out in full so at least one test pushes non-zero values.
        let snapshot = MetricSnapshot {
            timestamp: 1000,
            system: SystemMetrics {
                cpu_percent: 25.0,
                mem_used_bytes: 1_000_000,
                mem_total_bytes: 4_000_000,
                disk_used_bytes: 500_000,
                disk_total_bytes: 1_000_000,
                net_rx_bytes: 100,
                net_tx_bytes: 200,
                load_avg_1: 1.0,
                load_avg_5: 0.5,
                load_avg_15: 0.3,
            },
            containers: vec![],
            rpc_latency_ms: 0.0,
            ws_connections: 0,
        };
        store.push(snapshot).await;

        let latest = store.latest().await.unwrap();
        assert_eq!(latest.timestamp, 1000);
        assert_eq!(latest.system.cpu_percent, 25.0);
    }

    #[tokio::test]
    async fn test_rpc_latency_recording() {
        let store = MetricsStore::new();
        store.record_rpc_latency(10.0).await;
        store.record_rpc_latency(20.0).await;
        store.record_rpc_latency(30.0).await;

        store.push(snapshot_at(2000, 0.0)).await;

        let latest = store.latest().await.unwrap();
        assert_eq!(latest.rpc_latency_ms, 20.0); // mean of 10, 20 and 30
    }

    #[tokio::test]
    async fn test_history_minutes() {
        let store = MetricsStore::new();
        for i in 0..5i64 {
            store.push(snapshot_at(i * 60, i as f64)).await;
        }

        let history = store.history_minutes(3).await;
        assert_eq!(history.len(), 3);
        assert_eq!(history[0].timestamp, 120);
        assert_eq!(history[2].timestamp, 240);
    }

    #[tokio::test]
    async fn test_ring_buffer_eviction() {
        let store = MetricsStore::new();
        // Push more than MAX_1MIN_ENTRIES
        for i in 0..(MAX_1MIN_ENTRIES + 10) {
            store.push(snapshot_at(i as i64, 0.0)).await;
        }

        let all = store.history_minutes(MAX_1MIN_ENTRIES + 100).await;
        assert_eq!(all.len(), MAX_1MIN_ENTRIES);
        // Oldest entry should be 10 (first 10 were evicted)
        assert_eq!(all[0].timestamp, 10);
    }

    #[tokio::test]
    async fn test_quarter_hour_downsampling() {
        let store = MetricsStore::new();
        // Push exactly 15 entries to trigger one quarter-hour sample
        for i in 0..15i64 {
            store.push(snapshot_at(i * 60, 50.0)).await;
        }

        let qh = store.history_quarter_hours(10).await;
        assert_eq!(qh.len(), 1);
        assert_eq!(qh[0].timestamp, 14 * 60); // The 15th entry (index 14)
    }

    #[test]
    fn test_constants() {
        assert_eq!(MAX_1MIN_ENTRIES, 1440);
        assert_eq!(MAX_15MIN_ENTRIES, 672);
    }
}