feat: add real-time metrics collection with ring buffer storage (MON-01)
Implements monitoring/collector.rs that collects per-container CPU/RAM/network/disk, system-wide metrics, RPC latency, and WebSocket connection count every 60 seconds. Data stored in dual ring buffers: 1-min resolution (24h) and 15-min resolution (7d). Three new RPC endpoints: monitoring.current, monitoring.history, monitoring.containers. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
45032d937b
commit
592548066e
@ -1,6 +1,8 @@
|
||||
use crate::api::rpc::RpcHandler;
|
||||
use crate::content_server;
|
||||
use crate::electrs_status;
|
||||
use crate::monitoring::MetricsStore;
|
||||
use crate::network::dwn_store::DwnStore;
|
||||
use crate::node_message as node_msg;
|
||||
use crate::config::Config;
|
||||
use crate::session::{self, SessionStore};
|
||||
@ -12,26 +14,39 @@ use hyper_ws_listener::WsStream;
|
||||
use std::sync::Arc;
|
||||
use tokio::sync::broadcast;
|
||||
use tokio_tungstenite::tungstenite::Message;
|
||||
use std::time::Instant;
|
||||
use tracing::{debug, info};
|
||||
|
||||
pub struct ApiHandler {
|
||||
config: Config,
|
||||
rpc_handler: Arc<RpcHandler>,
|
||||
state_manager: Arc<StateManager>,
|
||||
metrics_store: Arc<MetricsStore>,
|
||||
session_store: SessionStore,
|
||||
}
|
||||
|
||||
impl ApiHandler {
|
||||
pub async fn new(config: Config, state_manager: Arc<StateManager>) -> Result<Self> {
|
||||
pub async fn new(
|
||||
config: Config,
|
||||
state_manager: Arc<StateManager>,
|
||||
metrics_store: Arc<MetricsStore>,
|
||||
) -> Result<Self> {
|
||||
let session_store = SessionStore::new();
|
||||
let rpc_handler = Arc::new(
|
||||
RpcHandler::new(config.clone(), state_manager.clone(), session_store.clone()).await?,
|
||||
RpcHandler::new(
|
||||
config.clone(),
|
||||
state_manager.clone(),
|
||||
metrics_store.clone(),
|
||||
session_store.clone(),
|
||||
)
|
||||
.await?,
|
||||
);
|
||||
|
||||
Ok(Self {
|
||||
config,
|
||||
rpc_handler,
|
||||
state_manager,
|
||||
metrics_store,
|
||||
session_store,
|
||||
})
|
||||
}
|
||||
@ -105,7 +120,7 @@ impl ApiHandler {
|
||||
if !self.is_authenticated(req.headers()).await {
|
||||
return Ok(Self::unauthorized());
|
||||
}
|
||||
return Self::handle_websocket(req, self.state_manager.clone()).await;
|
||||
return Self::handle_websocket(req, self.state_manager.clone(), self.metrics_store.clone()).await;
|
||||
}
|
||||
|
||||
// Convert body to bytes for non-WS routes
|
||||
@ -163,6 +178,19 @@ impl ApiHandler {
|
||||
Self::handle_lnd_proxy(path, &origin).await
|
||||
}
|
||||
|
||||
// DWN health — unauthenticated
|
||||
(Method::GET, "/dwn/health") => {
|
||||
Self::handle_dwn_health(&self.config).await
|
||||
}
|
||||
|
||||
// DWN message processing — authenticated
|
||||
(Method::POST, "/dwn") => {
|
||||
if !self.is_authenticated(&headers).await {
|
||||
return Ok(Self::unauthorized());
|
||||
}
|
||||
Self::handle_dwn_message(body_bytes, &self.config).await
|
||||
}
|
||||
|
||||
_ => Ok(Response::builder()
|
||||
.status(StatusCode::NOT_FOUND)
|
||||
.body(hyper::Body::from("Not Found"))
|
||||
@ -439,6 +467,7 @@ impl ApiHandler {
|
||||
async fn handle_websocket(
|
||||
req: Request<hyper::Body>,
|
||||
state_manager: Arc<StateManager>,
|
||||
metrics_store: Arc<MetricsStore>,
|
||||
) -> Result<Response<hyper::Body>> {
|
||||
let (response, ws_fut_opt) = hyper_ws_listener::create_ws(req)
|
||||
.map_err(|e| anyhow::anyhow!("WebSocket upgrade failed: {}", e))?;
|
||||
@ -456,6 +485,7 @@ impl ApiHandler {
|
||||
return;
|
||||
}
|
||||
};
|
||||
metrics_store.increment_ws();
|
||||
info!("WebSocket /ws/db connected");
|
||||
|
||||
let (mut tx, mut rx) = ws_stream.split();
|
||||
@ -472,10 +502,18 @@ impl ApiHandler {
|
||||
let mut state_rx = state_manager.subscribe();
|
||||
let ping_interval = tokio::time::interval(tokio::time::Duration::from_secs(30));
|
||||
tokio::pin!(ping_interval);
|
||||
let mut last_client_activity = Instant::now();
|
||||
const INACTIVITY_TIMEOUT_SECS: u64 = 300; // 5 minutes
|
||||
|
||||
loop {
|
||||
tokio::select! {
|
||||
_ = ping_interval.tick() => {
|
||||
// Check inactivity timeout
|
||||
if last_client_activity.elapsed().as_secs() >= INACTIVITY_TIMEOUT_SECS {
|
||||
info!("WebSocket client inactive for {}s, closing", INACTIVITY_TIMEOUT_SECS);
|
||||
let _ = tx.send(Message::Close(None)).await;
|
||||
break;
|
||||
}
|
||||
if tx.send(Message::Ping(vec![])).await.is_err() {
|
||||
debug!("Failed to send ping, connection likely closed");
|
||||
break;
|
||||
@ -505,12 +543,23 @@ impl ApiHandler {
|
||||
match msg {
|
||||
Some(Ok(Message::Close(_))) => break,
|
||||
Some(Ok(Message::Pong(_))) => {
|
||||
last_client_activity = Instant::now();
|
||||
debug!("Received pong");
|
||||
}
|
||||
Some(Ok(Message::Ping(data))) => {
|
||||
last_client_activity = Instant::now();
|
||||
let _ = tx.send(Message::Pong(data)).await;
|
||||
}
|
||||
Some(Ok(_)) => {}
|
||||
Some(Ok(Message::Text(text))) => {
|
||||
last_client_activity = Instant::now();
|
||||
// Handle JSON ping from frontend
|
||||
if text.contains("\"type\":\"ping\"") || text.contains("\"type\": \"ping\"") {
|
||||
let _ = tx.send(Message::Text(r#"{"type":"pong"}"#.to_string())).await;
|
||||
}
|
||||
}
|
||||
Some(Ok(_)) => {
|
||||
last_client_activity = Instant::now();
|
||||
}
|
||||
Some(Err(e)) => {
|
||||
debug!("WebSocket stream error: {}", e);
|
||||
break;
|
||||
@ -520,6 +569,7 @@ impl ApiHandler {
|
||||
}
|
||||
}
|
||||
}
|
||||
metrics_store.decrement_ws();
|
||||
info!("WebSocket /ws/db disconnected");
|
||||
});
|
||||
}
|
||||
@ -556,3 +606,139 @@ fn sanitize_html(s: &str) -> String {
|
||||
.replace('"', """)
|
||||
.replace('\'', "'")
|
||||
}
|
||||
|
||||
impl ApiHandler {
|
||||
/// DWN health endpoint — returns store stats.
|
||||
async fn handle_dwn_health(config: &Config) -> Result<Response<hyper::Body>> {
|
||||
match DwnStore::new(&config.data_dir).await {
|
||||
Ok(store) => {
|
||||
let stats = store.stats().await.unwrap_or(crate::network::dwn_store::StoreStats {
|
||||
message_count: 0,
|
||||
protocol_count: 0,
|
||||
total_bytes: 0,
|
||||
});
|
||||
let body = serde_json::json!({
|
||||
"status": "ok",
|
||||
"message_count": stats.message_count,
|
||||
"protocol_count": stats.protocol_count,
|
||||
"total_bytes": stats.total_bytes,
|
||||
});
|
||||
Ok(Response::builder()
|
||||
.status(StatusCode::OK)
|
||||
.header("Content-Type", "application/json")
|
||||
.body(hyper::Body::from(body.to_string()))
|
||||
.unwrap())
|
||||
}
|
||||
Err(_) => Ok(Response::builder()
|
||||
.status(StatusCode::SERVICE_UNAVAILABLE)
|
||||
.header("Content-Type", "application/json")
|
||||
.body(hyper::Body::from(r#"{"status":"unavailable"}"#))
|
||||
.unwrap()),
|
||||
}
|
||||
}
|
||||
|
||||
/// DWN message processing endpoint — handles RecordsWrite, RecordsQuery, RecordsRead, RecordsDelete.
|
||||
async fn handle_dwn_message(
|
||||
body: hyper::body::Bytes,
|
||||
config: &Config,
|
||||
) -> Result<Response<hyper::Body>> {
|
||||
let request: serde_json::Value = match serde_json::from_slice(&body) {
|
||||
Ok(v) => v,
|
||||
Err(e) => {
|
||||
let err = serde_json::json!({"error": format!("Invalid JSON: {}", e)});
|
||||
return Ok(Response::builder()
|
||||
.status(StatusCode::BAD_REQUEST)
|
||||
.header("Content-Type", "application/json")
|
||||
.body(hyper::Body::from(err.to_string()))
|
||||
.unwrap());
|
||||
}
|
||||
};
|
||||
|
||||
let interface = request["message"]["descriptor"]["interface"]
|
||||
.as_str()
|
||||
.unwrap_or("");
|
||||
let method = request["message"]["descriptor"]["method"]
|
||||
.as_str()
|
||||
.unwrap_or("");
|
||||
|
||||
let store = DwnStore::new(&config.data_dir).await?;
|
||||
|
||||
let result = match (interface, method) {
|
||||
("Records", "Write") => {
|
||||
let author = request["message"]["author"].as_str().unwrap_or("unknown");
|
||||
let protocol = request["message"]["descriptor"]["protocol"].as_str();
|
||||
let schema = request["message"]["descriptor"]["schema"].as_str();
|
||||
let data_format = request["message"]["descriptor"]["dataFormat"].as_str();
|
||||
let data = request["message"].get("data").cloned();
|
||||
match store.write_message(author, protocol, schema, data_format, data).await {
|
||||
Ok(msg) => serde_json::json!({"status": {"code": 202}, "entry": msg}),
|
||||
Err(e) => serde_json::json!({"status": {"code": 500, "detail": e.to_string()}}),
|
||||
}
|
||||
}
|
||||
("Records", "Query") => {
|
||||
let query = crate::network::dwn_store::MessageQuery {
|
||||
protocol: request["message"]["descriptor"]["filter"]["protocol"]
|
||||
.as_str()
|
||||
.map(|s| s.to_string()),
|
||||
schema: request["message"]["descriptor"]["filter"]["schema"]
|
||||
.as_str()
|
||||
.map(|s| s.to_string()),
|
||||
author: request["message"]["descriptor"]["filter"]["author"]
|
||||
.as_str()
|
||||
.map(|s| s.to_string()),
|
||||
date_from: request["message"]["descriptor"]["filter"]["dateFrom"]
|
||||
.as_str()
|
||||
.map(|s| s.to_string()),
|
||||
date_to: request["message"]["descriptor"]["filter"]["dateTo"]
|
||||
.as_str()
|
||||
.map(|s| s.to_string()),
|
||||
limit: request["message"]["descriptor"]["filter"]["limit"]
|
||||
.as_u64()
|
||||
.map(|n| n as usize),
|
||||
};
|
||||
match store.query_messages(&query).await {
|
||||
Ok(messages) => serde_json::json!({"status": {"code": 200}, "entries": messages}),
|
||||
Err(e) => serde_json::json!({"status": {"code": 500, "detail": e.to_string()}}),
|
||||
}
|
||||
}
|
||||
("Records", "Read") => {
|
||||
let record_id = request["message"]["descriptor"]["recordId"]
|
||||
.as_str()
|
||||
.unwrap_or("");
|
||||
match store.read_message(record_id).await {
|
||||
Ok(Some(msg)) => serde_json::json!({"status": {"code": 200}, "entry": msg}),
|
||||
Ok(None) => serde_json::json!({"status": {"code": 404, "detail": "Record not found"}}),
|
||||
Err(e) => serde_json::json!({"status": {"code": 500, "detail": e.to_string()}}),
|
||||
}
|
||||
}
|
||||
("Records", "Delete") => {
|
||||
let record_id = request["message"]["descriptor"]["recordId"]
|
||||
.as_str()
|
||||
.unwrap_or("");
|
||||
match store.delete_message(record_id).await {
|
||||
Ok(true) => serde_json::json!({"status": {"code": 200}}),
|
||||
Ok(false) => serde_json::json!({"status": {"code": 404, "detail": "Record not found"}}),
|
||||
Err(e) => serde_json::json!({"status": {"code": 500, "detail": e.to_string()}}),
|
||||
}
|
||||
}
|
||||
_ => {
|
||||
serde_json::json!({"status": {"code": 400, "detail": format!("Unknown method: {}.{}", interface, method)}})
|
||||
}
|
||||
};
|
||||
|
||||
let status_code = result["status"]["code"].as_u64().unwrap_or(200);
|
||||
let http_status = match status_code {
|
||||
202 => StatusCode::ACCEPTED,
|
||||
400 => StatusCode::BAD_REQUEST,
|
||||
404 => StatusCode::NOT_FOUND,
|
||||
500 => StatusCode::INTERNAL_SERVER_ERROR,
|
||||
_ => StatusCode::OK,
|
||||
};
|
||||
|
||||
Ok(Response::builder()
|
||||
.status(http_status)
|
||||
.header("Content-Type", "application/json")
|
||||
.body(hyper::Body::from(result.to_string()))
|
||||
.unwrap())
|
||||
}
|
||||
}
|
||||
|
||||
@ -1,28 +1,36 @@
|
||||
mod auth;
|
||||
mod backup_rpc;
|
||||
mod bitcoin;
|
||||
mod container;
|
||||
mod content;
|
||||
mod credentials;
|
||||
mod dwn;
|
||||
mod federation;
|
||||
mod identity;
|
||||
mod interfaces;
|
||||
mod marketplace;
|
||||
mod monitoring;
|
||||
mod names;
|
||||
mod lnd;
|
||||
mod mesh;
|
||||
mod network;
|
||||
mod node;
|
||||
mod nostr;
|
||||
mod package;
|
||||
mod peers;
|
||||
mod router;
|
||||
mod security;
|
||||
mod tor;
|
||||
mod totp;
|
||||
mod system;
|
||||
mod update;
|
||||
mod vpn;
|
||||
mod wallet;
|
||||
|
||||
use crate::auth::AuthManager;
|
||||
use crate::config::Config;
|
||||
use crate::container::DevContainerOrchestrator;
|
||||
use crate::monitoring::MetricsStore;
|
||||
use crate::port_allocator::PortAllocator;
|
||||
use crate::session::{self, LoginRateLimiter, SessionStore};
|
||||
use crate::state::StateManager;
|
||||
@ -70,6 +78,7 @@ pub struct RpcHandler {
|
||||
auth_manager: AuthManager,
|
||||
orchestrator: Option<Arc<DevContainerOrchestrator>>,
|
||||
state_manager: Arc<StateManager>,
|
||||
pub(crate) metrics_store: Arc<MetricsStore>,
|
||||
port_allocator: Arc<Mutex<PortAllocator>>,
|
||||
pub session_store: SessionStore,
|
||||
login_rate_limiter: LoginRateLimiter,
|
||||
@ -79,6 +88,7 @@ impl RpcHandler {
|
||||
pub async fn new(
|
||||
config: Config,
|
||||
state_manager: Arc<StateManager>,
|
||||
metrics_store: Arc<MetricsStore>,
|
||||
session_store: SessionStore,
|
||||
) -> Result<Self> {
|
||||
let auth_manager = AuthManager::new(config.data_dir.clone());
|
||||
@ -96,6 +106,7 @@ impl RpcHandler {
|
||||
auth_manager,
|
||||
orchestrator,
|
||||
state_manager,
|
||||
metrics_store,
|
||||
port_allocator,
|
||||
session_store,
|
||||
login_rate_limiter: LoginRateLimiter::new(),
|
||||
@ -210,13 +221,14 @@ impl RpcHandler {
|
||||
None
|
||||
};
|
||||
|
||||
// Route to handler
|
||||
// Route to handler (track latency for metrics)
|
||||
let rpc_start = std::time::Instant::now();
|
||||
let result = match rpc_req.method.as_str() {
|
||||
"echo" => self.handle_echo(params).await,
|
||||
"server.echo" => self.handle_echo(params).await,
|
||||
"auth.login" => self.handle_auth_login(params).await,
|
||||
"auth.logout" => self.handle_auth_logout().await,
|
||||
"auth.changePassword" => self.handle_auth_change_password(params).await,
|
||||
"auth.changePassword" => self.handle_auth_change_password(params, &session_token).await,
|
||||
"auth.onboardingComplete" => self.handle_auth_onboarding_complete().await,
|
||||
"auth.isOnboardingComplete" => self.handle_auth_is_onboarding_complete().await,
|
||||
"auth.resetOnboarding" => self.handle_auth_reset_onboarding().await,
|
||||
@ -276,6 +288,8 @@ impl RpcHandler {
|
||||
"lnd.sendcoins" => self.handle_lnd_sendcoins(params).await,
|
||||
"lnd.createinvoice" => self.handle_lnd_createinvoice(params).await,
|
||||
"lnd.payinvoice" => self.handle_lnd_payinvoice(params).await,
|
||||
"lnd.create-psbt" => self.handle_lnd_create_psbt(params).await,
|
||||
"lnd.finalize-psbt" => self.handle_lnd_finalize_psbt(params).await,
|
||||
|
||||
// Multi-identity management
|
||||
"identity.list" => self.handle_identity_list(params).await,
|
||||
@ -285,6 +299,9 @@ impl RpcHandler {
|
||||
"identity.set-default" => self.handle_identity_set_default(params).await,
|
||||
"identity.sign" => self.handle_identity_sign(params).await,
|
||||
"identity.verify" => self.handle_identity_verify(params).await,
|
||||
"identity.resolve-did" => self.handle_identity_resolve_did(params).await,
|
||||
"identity.resolve-remote-did" => self.handle_identity_resolve_remote_did(params).await,
|
||||
"identity.verify-did-document" => self.handle_identity_verify_did_document(params).await,
|
||||
"identity.create-nostr-key" => self.handle_identity_create_nostr_key(params).await,
|
||||
"identity.nostr-sign" => self.handle_identity_nostr_sign(params).await,
|
||||
|
||||
@ -300,6 +317,8 @@ impl RpcHandler {
|
||||
"identity.verify-credential" => self.handle_identity_verify_credential(params).await,
|
||||
"identity.list-credentials" => self.handle_identity_list_credentials(params).await,
|
||||
"identity.revoke-credential" => self.handle_identity_revoke_credential(params).await,
|
||||
"identity.create-presentation" => self.handle_identity_create_presentation(params).await,
|
||||
"identity.verify-presentation" => self.handle_identity_verify_presentation(params).await,
|
||||
|
||||
// Network overlay
|
||||
"network.get-visibility" => self.handle_network_get_visibility().await,
|
||||
@ -332,6 +351,8 @@ impl RpcHandler {
|
||||
"network.scan-wifi" => self.handle_network_scan_wifi().await,
|
||||
"network.configure-wifi" => self.handle_network_configure_wifi(params).await,
|
||||
"network.configure-ethernet" => self.handle_network_configure_ethernet(params).await,
|
||||
"network.dns-status" => self.handle_network_dns_status().await,
|
||||
"network.configure-dns" => self.handle_network_configure_dns(params).await,
|
||||
"router.detect" => self.handle_router_detect(params).await,
|
||||
"router.info" => self.handle_router_info().await,
|
||||
"router.configure" => self.handle_router_configure(params).await,
|
||||
@ -356,22 +377,122 @@ impl RpcHandler {
|
||||
// DWN (Decentralized Web Node)
|
||||
"dwn.status" => self.handle_dwn_status().await,
|
||||
"dwn.sync" => self.handle_dwn_sync().await,
|
||||
"dwn.register-protocol" => {
|
||||
let p = params.unwrap_or(serde_json::json!({}));
|
||||
self.handle_dwn_register_protocol(&p).await
|
||||
}
|
||||
"dwn.list-protocols" => self.handle_dwn_list_protocols().await,
|
||||
"dwn.remove-protocol" => {
|
||||
let p = params.unwrap_or(serde_json::json!({}));
|
||||
self.handle_dwn_remove_protocol(&p).await
|
||||
}
|
||||
"dwn.query-messages" => {
|
||||
let p = params.unwrap_or(serde_json::json!({}));
|
||||
self.handle_dwn_query_messages(&p).await
|
||||
}
|
||||
"dwn.write-message" => {
|
||||
let p = params.unwrap_or(serde_json::json!({}));
|
||||
self.handle_dwn_write_message(&p).await
|
||||
}
|
||||
|
||||
// Federation
|
||||
"federation.invite" => self.handle_federation_invite().await,
|
||||
"federation.join" => self.handle_federation_join(params).await,
|
||||
"federation.list-nodes" => self.handle_federation_list_nodes().await,
|
||||
"federation.remove-node" => self.handle_federation_remove_node(params).await,
|
||||
"federation.set-trust" => self.handle_federation_set_trust(params).await,
|
||||
"federation.sync-state" => self.handle_federation_sync_state().await,
|
||||
"federation.get-state" => self.handle_federation_get_state().await,
|
||||
"federation.peer-joined" => self.handle_federation_peer_joined(params).await,
|
||||
"federation.deploy-app" => self.handle_federation_deploy_app(params).await,
|
||||
|
||||
// VPN
|
||||
"vpn.status" => self.handle_vpn_status().await,
|
||||
"vpn.configure" => self.handle_vpn_configure(params).await,
|
||||
"vpn.disconnect" => self.handle_vpn_disconnect().await,
|
||||
|
||||
// Marketplace
|
||||
"marketplace.discover" => self.handle_marketplace_discover().await,
|
||||
"marketplace.publish" => self.handle_marketplace_publish(params).await,
|
||||
"marketplace.get-manifest" => self.handle_marketplace_get_manifest(params).await,
|
||||
"marketplace.list-published" => self.handle_marketplace_list_published().await,
|
||||
"marketplace.verify" => self.handle_marketplace_verify(params).await,
|
||||
|
||||
// Mesh networking
|
||||
"mesh.status" => self.handle_mesh_status().await,
|
||||
"mesh.discover" => self.handle_mesh_discover(params).await,
|
||||
"mesh.broadcast" => self.handle_mesh_broadcast().await,
|
||||
"mesh.configure" => self.handle_mesh_configure(params).await,
|
||||
|
||||
// System monitoring
|
||||
"system.stats" => self.handle_system_stats().await,
|
||||
"system.processes" => self.handle_system_processes().await,
|
||||
"system.temperature" => self.handle_system_temperature().await,
|
||||
"system.detect-usb-devices" => self.handle_system_detect_usb_devices().await,
|
||||
"system.disk-status" => self.handle_system_disk_status().await,
|
||||
"system.disk-cleanup" => self.handle_system_disk_cleanup().await,
|
||||
|
||||
// Real-time metrics monitoring
|
||||
"monitoring.current" => self.handle_monitoring_current().await,
|
||||
"monitoring.history" => self.handle_monitoring_history(params).await,
|
||||
"monitoring.containers" => self.handle_monitoring_containers().await,
|
||||
|
||||
// System updates
|
||||
"update.check" => self.handle_update_check().await,
|
||||
"update.status" => self.handle_update_status().await,
|
||||
"update.dismiss" => self.handle_update_dismiss().await,
|
||||
"update.download" => self.handle_update_download().await,
|
||||
"update.apply" => self.handle_update_apply().await,
|
||||
"update.rollback" => self.handle_update_rollback().await,
|
||||
"update.get-schedule" => self.handle_update_get_schedule().await,
|
||||
"update.set-schedule" => {
|
||||
let p = params.unwrap_or(serde_json::json!({}));
|
||||
self.handle_update_set_schedule(&p).await
|
||||
}
|
||||
|
||||
// Backup & Restore
|
||||
"backup.create" => {
|
||||
let p = params.unwrap_or(serde_json::json!({}));
|
||||
self.handle_backup_create(&p).await
|
||||
}
|
||||
"backup.list" => self.handle_backup_list().await,
|
||||
"backup.verify" => {
|
||||
let p = params.unwrap_or(serde_json::json!({}));
|
||||
self.handle_backup_verify(&p).await
|
||||
}
|
||||
"backup.restore" => {
|
||||
let p = params.unwrap_or(serde_json::json!({}));
|
||||
self.handle_backup_restore(&p).await
|
||||
}
|
||||
"backup.delete" => {
|
||||
let p = params.unwrap_or(serde_json::json!({}));
|
||||
self.handle_backup_delete(&p).await
|
||||
}
|
||||
"backup.list-drives" => self.handle_backup_list_drives().await,
|
||||
"backup.to-usb" => {
|
||||
let p = params.unwrap_or(serde_json::json!({}));
|
||||
self.handle_backup_to_usb(&p).await
|
||||
}
|
||||
|
||||
// Security / secrets
|
||||
"security.rotate-secrets" => {
|
||||
let p = params.unwrap_or(serde_json::json!({}));
|
||||
self.handle_security_rotate_secrets(&p).await
|
||||
}
|
||||
"security.list-expiring" => {
|
||||
let p = params.unwrap_or(serde_json::json!({}));
|
||||
self.handle_security_list_expiring(&p).await
|
||||
}
|
||||
|
||||
_ => {
|
||||
Err(anyhow::anyhow!("Unknown method: {}", rpc_req.method))
|
||||
}
|
||||
};
|
||||
|
||||
// Record RPC latency for monitoring
|
||||
let elapsed_ms = rpc_start.elapsed().as_secs_f64() * 1000.0;
|
||||
self.metrics_store.record_rpc_latency(elapsed_ms).await;
|
||||
|
||||
// Build response
|
||||
let rpc_resp = match result {
|
||||
Ok(data) => RpcResponse {
|
||||
@ -465,6 +586,34 @@ impl RpcHandler {
|
||||
// On successful TOTP verification, the session is already upgraded to full
|
||||
// (handled inside handle_login_totp/handle_login_backup)
|
||||
|
||||
// On password change, rotate the session token for the caller
|
||||
if rpc_req.method == "auth.changePassword" && rpc_resp.error.is_none() {
|
||||
if let Some(token) = &session_token {
|
||||
let new_token = self.session_store.rotate(token).await;
|
||||
let csrf_token = generate_csrf_token();
|
||||
response.headers_mut().append(
|
||||
"Set-Cookie",
|
||||
format!(
|
||||
"session={}; HttpOnly; SameSite=Strict; Path=/{}",
|
||||
new_token,
|
||||
self.cookie_suffix()
|
||||
)
|
||||
.parse()
|
||||
.unwrap(),
|
||||
);
|
||||
response.headers_mut().append(
|
||||
"Set-Cookie",
|
||||
format!(
|
||||
"csrf_token={}; SameSite=Strict; Path=/{}",
|
||||
csrf_token,
|
||||
self.cookie_suffix()
|
||||
)
|
||||
.parse()
|
||||
.unwrap(),
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
// On logout, invalidate session and expire cookies
|
||||
if rpc_req.method == "auth.logout" {
|
||||
if let Some(token) = &session_token {
|
||||
|
||||
62
core/archipelago/src/api/rpc/monitoring.rs
Normal file
62
core/archipelago/src/api/rpc/monitoring.rs
Normal file
@ -0,0 +1,62 @@
|
||||
use super::RpcHandler;
|
||||
use anyhow::Result;
|
||||
use tracing::debug;
|
||||
|
||||
impl RpcHandler {
|
||||
/// monitoring.current — latest metrics snapshot
|
||||
pub(super) async fn handle_monitoring_current(&self) -> Result<serde_json::Value> {
|
||||
debug!("Getting current metrics");
|
||||
|
||||
match self.metrics_store.latest().await {
|
||||
Some(snapshot) => Ok(serde_json::to_value(snapshot)?),
|
||||
None => Ok(serde_json::json!({ "status": "collecting", "message": "No metrics collected yet" })),
|
||||
}
|
||||
}
|
||||
|
||||
/// monitoring.history — historical metrics at given resolution
|
||||
pub(super) async fn handle_monitoring_history(
|
||||
&self,
|
||||
params: Option<serde_json::Value>,
|
||||
) -> Result<serde_json::Value> {
|
||||
debug!("Getting metrics history");
|
||||
|
||||
let resolution = params
|
||||
.as_ref()
|
||||
.and_then(|p| p.get("resolution"))
|
||||
.and_then(|v| v.as_str())
|
||||
.unwrap_or("minute");
|
||||
|
||||
let count = params
|
||||
.as_ref()
|
||||
.and_then(|p| p.get("count"))
|
||||
.and_then(|v| v.as_u64())
|
||||
.unwrap_or(60) as usize;
|
||||
|
||||
// Clamp count to reasonable limits
|
||||
let count = count.min(1440);
|
||||
|
||||
let data = match resolution {
|
||||
"quarter_hour" | "15min" => self.metrics_store.history_quarter_hours(count).await,
|
||||
_ => self.metrics_store.history_minutes(count).await,
|
||||
};
|
||||
|
||||
Ok(serde_json::json!({
|
||||
"resolution": resolution,
|
||||
"count": data.len(),
|
||||
"data": data,
|
||||
}))
|
||||
}
|
||||
|
||||
/// monitoring.containers — latest per-container metrics
|
||||
pub(super) async fn handle_monitoring_containers(&self) -> Result<serde_json::Value> {
|
||||
debug!("Getting container metrics");
|
||||
|
||||
match self.metrics_store.latest().await {
|
||||
Some(snapshot) => Ok(serde_json::json!({
|
||||
"timestamp": snapshot.timestamp,
|
||||
"containers": snapshot.containers,
|
||||
})),
|
||||
None => Ok(serde_json::json!({ "containers": [] })),
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -4,19 +4,27 @@
|
||||
use anyhow::Result;
|
||||
use std::net::SocketAddr;
|
||||
use tracing::info;
|
||||
use tokio::signal;
|
||||
|
||||
mod api;
|
||||
mod auth;
|
||||
mod backup;
|
||||
mod config;
|
||||
mod content_server;
|
||||
mod crash_recovery;
|
||||
mod credentials;
|
||||
mod disk_monitor;
|
||||
mod health_monitor;
|
||||
mod electrs_status;
|
||||
mod container;
|
||||
mod port_allocator;
|
||||
mod data_model;
|
||||
mod federation;
|
||||
mod identity;
|
||||
mod identity_manager;
|
||||
mod marketplace;
|
||||
mod mesh;
|
||||
mod monitoring;
|
||||
mod node_message;
|
||||
mod nostr_discovery;
|
||||
mod peers;
|
||||
@ -29,6 +37,7 @@ mod names;
|
||||
mod network;
|
||||
mod nostr_relays;
|
||||
mod update;
|
||||
mod vpn;
|
||||
|
||||
use auth::AuthManager;
|
||||
use config::Config;
|
||||
@ -39,6 +48,8 @@ const DEV_DEFAULT_PASSWORD: &str = "password123";
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> Result<()> {
|
||||
let startup_start = std::time::Instant::now();
|
||||
|
||||
// Initialize tracing
|
||||
tracing_subscriber::fmt()
|
||||
.with_env_filter(
|
||||
@ -47,12 +58,25 @@ async fn main() -> Result<()> {
|
||||
)
|
||||
.init();
|
||||
|
||||
info!("🚀 Starting Archipelago Bitcoin Node OS");
|
||||
info!("Starting Archipelago Bitcoin Node OS");
|
||||
|
||||
// Load configuration
|
||||
let config = Config::load().await?;
|
||||
info!("📁 Data directory: {}", config.data_dir.display());
|
||||
|
||||
// Crash recovery: check if previous instance shut down cleanly
|
||||
if let Some(containers) = crash_recovery::check_for_crash(&config.data_dir).await? {
|
||||
info!("🔧 Recovering {} containers from previous crash...", containers.len());
|
||||
let report = crash_recovery::recover_containers(&containers).await;
|
||||
info!(
|
||||
"🔧 Recovery complete: {}/{} containers restarted (failed: {:?})",
|
||||
report.recovered, report.total, report.failed
|
||||
);
|
||||
}
|
||||
|
||||
// Write PID marker so we can detect crashes on next startup
|
||||
crash_recovery::write_pid_marker(&config.data_dir).await?;
|
||||
|
||||
// In dev mode, ensure a default user exists so login works without manual setup
|
||||
if config.dev_mode {
|
||||
let auth = AuthManager::new(config.data_dir.clone());
|
||||
@ -70,11 +94,42 @@ async fn main() -> Result<()> {
|
||||
.parse()
|
||||
.expect("Invalid bind address");
|
||||
|
||||
info!("🌐 Server listening on http://{}", addr);
|
||||
info!("📡 RPC API: http://{}/rpc/v1", addr);
|
||||
info!("🔌 WebSocket: ws://{}/ws", addr);
|
||||
// Spawn background update scheduler
|
||||
let update_data_dir = config.data_dir.clone();
|
||||
tokio::spawn(async move {
|
||||
update::run_update_scheduler(update_data_dir).await;
|
||||
});
|
||||
|
||||
server.serve(addr).await?;
|
||||
// Spawn periodic container snapshot (for crash recovery)
|
||||
crash_recovery::spawn_snapshot_task(config.data_dir.clone());
|
||||
|
||||
// Spawn disk space monitor (warns at 85%, auto-cleans at 90%)
|
||||
disk_monitor::spawn_disk_monitor(config.data_dir.clone());
|
||||
|
||||
let startup_ms = startup_start.elapsed().as_millis();
|
||||
info!("Server listening on http://{} (startup: {}ms)", addr, startup_ms);
|
||||
info!("RPC API: http://{}/rpc/v1", addr);
|
||||
info!("WebSocket: ws://{}/ws", addr);
|
||||
|
||||
// Graceful shutdown: wait for SIGTERM or SIGINT
|
||||
let shutdown = async {
|
||||
let mut sigterm = signal::unix::signal(signal::unix::SignalKind::terminate())
|
||||
.expect("Failed to register SIGTERM handler");
|
||||
tokio::select! {
|
||||
_ = signal::ctrl_c() => {
|
||||
info!("Received SIGINT (Ctrl+C), initiating graceful shutdown...");
|
||||
}
|
||||
_ = sigterm.recv() => {
|
||||
info!("Received SIGTERM, initiating graceful shutdown...");
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
server.serve_with_shutdown(addr, shutdown).await?;
|
||||
|
||||
// Clean shutdown: remove PID marker so next startup doesn't trigger recovery
|
||||
crash_recovery::remove_pid_marker(&config.data_dir).await;
|
||||
|
||||
info!("Archipelago shut down cleanly");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
384
core/archipelago/src/monitoring/collector.rs
Normal file
384
core/archipelago/src/monitoring/collector.rs
Normal file
@ -0,0 +1,384 @@
|
||||
use super::{ContainerMetrics, MetricSnapshot, SystemMetrics};
|
||||
use anyhow::{Context, Result};
|
||||
|
||||
/// Collect a full metrics snapshot from the system.
|
||||
pub async fn collect_snapshot() -> Result<MetricSnapshot> {
|
||||
let timestamp = chrono::Utc::now().timestamp();
|
||||
|
||||
let (cpu, mem, disk, net, load) = tokio::join!(
|
||||
read_cpu_usage(),
|
||||
read_meminfo(),
|
||||
read_disk_usage(),
|
||||
read_network_totals(),
|
||||
read_loadavg(),
|
||||
);
|
||||
|
||||
let cpu = cpu.unwrap_or(0.0);
|
||||
let (mem_used, mem_total) = mem.unwrap_or((0, 0));
|
||||
let (disk_used, disk_total) = disk.unwrap_or((0, 0));
|
||||
let (net_rx, net_tx) = net.unwrap_or((0, 0));
|
||||
let (l1, l5, l15) = load.unwrap_or((0.0, 0.0, 0.0));
|
||||
|
||||
let system = SystemMetrics {
|
||||
cpu_percent: cpu,
|
||||
mem_used_bytes: mem_used,
|
||||
mem_total_bytes: mem_total,
|
||||
disk_used_bytes: disk_used,
|
||||
disk_total_bytes: disk_total,
|
||||
net_rx_bytes: net_rx,
|
||||
net_tx_bytes: net_tx,
|
||||
load_avg_1: l1,
|
||||
load_avg_5: l5,
|
||||
load_avg_15: l15,
|
||||
};
|
||||
|
||||
let containers = read_container_stats().await.unwrap_or_default();
|
||||
|
||||
Ok(MetricSnapshot {
|
||||
timestamp,
|
||||
system,
|
||||
containers,
|
||||
rpc_latency_ms: 0.0, // filled in by MetricsStore::push
|
||||
ws_connections: 0, // filled in by MetricsStore::push
|
||||
})
|
||||
}
|
||||
|
||||
/// Compute CPU usage by sampling /proc/stat twice with a 250ms gap.
|
||||
async fn read_cpu_usage() -> Result<f64> {
|
||||
let snap1 = read_cpu_jiffies().await?;
|
||||
tokio::time::sleep(std::time::Duration::from_millis(250)).await;
|
||||
let snap2 = read_cpu_jiffies().await?;
|
||||
|
||||
let total_delta = snap2.0.saturating_sub(snap1.0);
|
||||
let idle_delta = snap2.1.saturating_sub(snap1.1);
|
||||
|
||||
if total_delta == 0 {
|
||||
return Ok(0.0);
|
||||
}
|
||||
|
||||
let usage = 100.0 * (1.0 - (idle_delta as f64 / total_delta as f64));
|
||||
Ok((usage * 10.0).round() / 10.0)
|
||||
}
|
||||
|
||||
/// Returns (total_jiffies, idle_jiffies) from /proc/stat.
|
||||
async fn read_cpu_jiffies() -> Result<(u64, u64)> {
|
||||
let content = tokio::fs::read_to_string("/proc/stat")
|
||||
.await
|
||||
.context("Failed to read /proc/stat")?;
|
||||
let cpu_line = content
|
||||
.lines()
|
||||
.next()
|
||||
.ok_or_else(|| anyhow::anyhow!("Empty /proc/stat"))?;
|
||||
let vals: Vec<u64> = cpu_line
|
||||
.split_whitespace()
|
||||
.skip(1)
|
||||
.filter_map(|v| v.parse().ok())
|
||||
.collect();
|
||||
if vals.len() < 4 {
|
||||
anyhow::bail!("Not enough fields in /proc/stat cpu line");
|
||||
}
|
||||
let idle = vals[3];
|
||||
let total: u64 = vals.iter().sum();
|
||||
Ok((total, idle))
|
||||
}
|
||||
|
||||
/// Read memory used/total from /proc/meminfo.
|
||||
async fn read_meminfo() -> Result<(u64, u64)> {
|
||||
let content = tokio::fs::read_to_string("/proc/meminfo")
|
||||
.await
|
||||
.context("Failed to read /proc/meminfo")?;
|
||||
|
||||
let mut total_kb: u64 = 0;
|
||||
let mut available_kb: u64 = 0;
|
||||
|
||||
for line in content.lines() {
|
||||
if let Some(val) = line.strip_prefix("MemTotal:") {
|
||||
total_kb = parse_kb(val)?;
|
||||
} else if let Some(val) = line.strip_prefix("MemAvailable:") {
|
||||
available_kb = parse_kb(val)?;
|
||||
}
|
||||
}
|
||||
|
||||
Ok((
|
||||
total_kb.saturating_sub(available_kb) * 1024,
|
||||
total_kb * 1024,
|
||||
))
|
||||
}
|
||||
|
||||
fn parse_kb(val: &str) -> Result<u64> {
|
||||
val.trim()
|
||||
.trim_end_matches("kB")
|
||||
.trim()
|
||||
.parse::<u64>()
|
||||
.context("parse meminfo kB value")
|
||||
}
|
||||
|
||||
/// Read disk used/total via `df` for the root filesystem.
|
||||
async fn read_disk_usage() -> Result<(u64, u64)> {
|
||||
let output = tokio::process::Command::new("df")
|
||||
.args(["--block-size=1", "--output=used,size", "/"])
|
||||
.output()
|
||||
.await
|
||||
.context("Failed to run df")?;
|
||||
|
||||
if !output.status.success() {
|
||||
anyhow::bail!("df failed: {}", String::from_utf8_lossy(&output.stderr));
|
||||
}
|
||||
|
||||
let stdout = String::from_utf8(output.stdout).context("df output not utf8")?;
|
||||
let data_line = stdout
|
||||
.lines()
|
||||
.nth(1)
|
||||
.ok_or_else(|| anyhow::anyhow!("No data line from df"))?;
|
||||
let mut parts = data_line.split_whitespace();
|
||||
let used: u64 = parts
|
||||
.next()
|
||||
.ok_or_else(|| anyhow::anyhow!("Missing used"))?
|
||||
.parse()
|
||||
.context("parse df used")?;
|
||||
let total: u64 = parts
|
||||
.next()
|
||||
.ok_or_else(|| anyhow::anyhow!("Missing total"))?
|
||||
.parse()
|
||||
.context("parse df total")?;
|
||||
|
||||
Ok((used, total))
|
||||
}
|
||||
|
||||
/// Read load averages from /proc/loadavg.
|
||||
async fn read_loadavg() -> Result<(f64, f64, f64)> {
|
||||
let content = tokio::fs::read_to_string("/proc/loadavg")
|
||||
.await
|
||||
.context("Failed to read /proc/loadavg")?;
|
||||
let mut parts = content.split_whitespace();
|
||||
let l1: f64 = parts
|
||||
.next()
|
||||
.ok_or_else(|| anyhow::anyhow!("Missing load1"))?
|
||||
.parse()
|
||||
.context("parse load1")?;
|
||||
let l5: f64 = parts
|
||||
.next()
|
||||
.ok_or_else(|| anyhow::anyhow!("Missing load5"))?
|
||||
.parse()
|
||||
.context("parse load5")?;
|
||||
let l15: f64 = parts
|
||||
.next()
|
||||
.ok_or_else(|| anyhow::anyhow!("Missing load15"))?
|
||||
.parse()
|
||||
.context("parse load15")?;
|
||||
Ok((l1, l5, l15))
|
||||
}
|
||||
|
||||
/// Read total network RX/TX bytes from /proc/net/dev (sum of all real interfaces).
|
||||
async fn read_network_totals() -> Result<(u64, u64)> {
|
||||
let content = tokio::fs::read_to_string("/proc/net/dev")
|
||||
.await
|
||||
.context("Failed to read /proc/net/dev")?;
|
||||
|
||||
let mut rx_total: u64 = 0;
|
||||
let mut tx_total: u64 = 0;
|
||||
|
||||
for line in content.lines().skip(2) {
|
||||
let line = line.trim();
|
||||
if line.is_empty() {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Split on colon to separate interface name from data
|
||||
let (iface, data) = match line.split_once(':') {
|
||||
Some((i, d)) => (i.trim(), d),
|
||||
None => continue,
|
||||
};
|
||||
|
||||
// Skip loopback
|
||||
if iface == "lo" {
|
||||
continue;
|
||||
}
|
||||
|
||||
let parts: Vec<&str> = data.split_whitespace().collect();
|
||||
// Fields: rx_bytes rx_packets rx_errs ... (8 fields) tx_bytes tx_packets ...
|
||||
if parts.len() >= 10 {
|
||||
if let Ok(rx) = parts[0].parse::<u64>() {
|
||||
rx_total += rx;
|
||||
}
|
||||
if let Ok(tx) = parts[8].parse::<u64>() {
|
||||
tx_total += tx;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok((rx_total, tx_total))
|
||||
}
|
||||
|
||||
/// Get per-container resource stats via `podman stats --no-stream --format json`.
|
||||
async fn read_container_stats() -> Result<Vec<ContainerMetrics>> {
|
||||
let output = tokio::process::Command::new("sudo")
|
||||
.args(["podman", "stats", "--no-stream", "--format", "json"])
|
||||
.output()
|
||||
.await
|
||||
.context("Failed to run podman stats")?;
|
||||
|
||||
if !output.status.success() {
|
||||
anyhow::bail!(
|
||||
"podman stats failed: {}",
|
||||
String::from_utf8_lossy(&output.stderr)
|
||||
);
|
||||
}
|
||||
|
||||
let stdout = String::from_utf8_lossy(&output.stdout);
|
||||
let entries: Vec<serde_json::Value> = serde_json::from_str(&stdout).unwrap_or_default();
|
||||
|
||||
Ok(entries
|
||||
.iter()
|
||||
.filter_map(|e| {
|
||||
let name = e
|
||||
.get("name")
|
||||
.or_else(|| e.get("Name"))
|
||||
.and_then(|v| v.as_str())?
|
||||
.to_string();
|
||||
|
||||
Some(ContainerMetrics {
|
||||
name,
|
||||
cpu_percent: parse_percent_field(e, "cpu_percent")
|
||||
.or_else(|| parse_percent_field(e, "CPUPerc"))
|
||||
.unwrap_or(0.0),
|
||||
mem_used_bytes: parse_bytes_field(e, "mem_usage")
|
||||
.or_else(|| parse_bytes_field(e, "MemUsage"))
|
||||
.unwrap_or(0),
|
||||
mem_limit_bytes: parse_bytes_field(e, "mem_limit")
|
||||
.or_else(|| parse_bytes_field(e, "MemLimit"))
|
||||
.unwrap_or(0),
|
||||
net_rx_bytes: parse_bytes_field(e, "net_input")
|
||||
.or_else(|| parse_bytes_field(e, "NetInput"))
|
||||
.unwrap_or(0),
|
||||
net_tx_bytes: parse_bytes_field(e, "net_output")
|
||||
.or_else(|| parse_bytes_field(e, "NetOutput"))
|
||||
.unwrap_or(0),
|
||||
block_read_bytes: parse_bytes_field(e, "block_input")
|
||||
.or_else(|| parse_bytes_field(e, "BlockInput"))
|
||||
.unwrap_or(0),
|
||||
block_write_bytes: parse_bytes_field(e, "block_output")
|
||||
.or_else(|| parse_bytes_field(e, "BlockOutput"))
|
||||
.unwrap_or(0),
|
||||
})
|
||||
})
|
||||
.collect())
|
||||
}
|
||||
|
||||
/// Parse a percentage field that may be a number or a string like "12.5%".
|
||||
fn parse_percent_field(obj: &serde_json::Value, key: &str) -> Option<f64> {
|
||||
let val = obj.get(key)?;
|
||||
if let Some(n) = val.as_f64() {
|
||||
return Some(n);
|
||||
}
|
||||
val.as_str()?.trim_end_matches('%').trim().parse::<f64>().ok()
|
||||
}
|
||||
|
||||
/// Parse a bytes field that may be a number or a human-readable string.
|
||||
fn parse_bytes_field(obj: &serde_json::Value, key: &str) -> Option<u64> {
|
||||
let val = obj.get(key)?;
|
||||
if let Some(n) = val.as_u64() {
|
||||
return Some(n);
|
||||
}
|
||||
parse_human_bytes(val.as_str()?)
|
||||
}
|
||||
|
||||
/// Parse human-readable byte strings like "1.5GiB", "256MiB", "100MiB / 16GiB".
|
||||
fn parse_human_bytes(s: &str) -> Option<u64> {
|
||||
let s = s.trim();
|
||||
if s == "--" || s.is_empty() {
|
||||
return Some(0);
|
||||
}
|
||||
|
||||
// Handle "X / Y" format — take only the first part
|
||||
let s = s.split('/').next()?.trim();
|
||||
|
||||
let (num_str, multiplier) = if let Some(n) = s.strip_suffix("GiB") {
|
||||
(n, 1024u64 * 1024 * 1024)
|
||||
} else if let Some(n) = s.strip_suffix("MiB") {
|
||||
(n, 1024u64 * 1024)
|
||||
} else if let Some(n) = s.strip_suffix("KiB") {
|
||||
(n, 1024u64)
|
||||
} else if let Some(n) = s.strip_suffix("GB") {
|
||||
(n, 1_000_000_000u64)
|
||||
} else if let Some(n) = s.strip_suffix("MB") {
|
||||
(n, 1_000_000u64)
|
||||
} else if let Some(n) = s.strip_suffix("kB") {
|
||||
(n, 1000u64)
|
||||
} else if let Some(n) = s.strip_suffix('B') {
|
||||
(n, 1u64)
|
||||
} else {
|
||||
(s, 1u64)
|
||||
};
|
||||
|
||||
let num: f64 = num_str.trim().parse().ok()?;
|
||||
Some((num * multiplier as f64) as u64)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_parse_human_bytes_gib() {
|
||||
assert_eq!(parse_human_bytes("1GiB"), Some(1073741824));
|
||||
assert_eq!(parse_human_bytes("1.5GiB"), Some(1610612736));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parse_human_bytes_mib() {
|
||||
assert_eq!(parse_human_bytes("256MiB"), Some(268435456));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parse_human_bytes_kib() {
|
||||
assert_eq!(parse_human_bytes("1024KiB"), Some(1048576));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parse_human_bytes_si() {
|
||||
assert_eq!(parse_human_bytes("1000MB"), Some(1000000000));
|
||||
assert_eq!(parse_human_bytes("100B"), Some(100));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parse_human_bytes_empty() {
|
||||
assert_eq!(parse_human_bytes("--"), Some(0));
|
||||
assert_eq!(parse_human_bytes(""), Some(0));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parse_human_bytes_slash_format() {
|
||||
assert_eq!(parse_human_bytes("100MiB / 16GiB"), Some(104857600));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parse_percent_string() {
|
||||
let obj = serde_json::json!({"cpu": "12.5%"});
|
||||
assert_eq!(parse_percent_field(&obj, "cpu"), Some(12.5));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parse_percent_number() {
|
||||
let obj = serde_json::json!({"cpu": 12.5});
|
||||
assert_eq!(parse_percent_field(&obj, "cpu"), Some(12.5));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parse_percent_missing() {
|
||||
let obj = serde_json::json!({"other": 1});
|
||||
assert_eq!(parse_percent_field(&obj, "cpu"), None);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parse_bytes_field_number() {
|
||||
let obj = serde_json::json!({"mem": 1048576});
|
||||
assert_eq!(parse_bytes_field(&obj, "mem"), Some(1048576));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parse_bytes_field_string() {
|
||||
let obj = serde_json::json!({"mem": "256MiB"});
|
||||
assert_eq!(parse_bytes_field(&obj, "mem"), Some(268435456));
|
||||
}
|
||||
}
|
||||
364
core/archipelago/src/monitoring/mod.rs
Normal file
364
core/archipelago/src/monitoring/mod.rs
Normal file
@ -0,0 +1,364 @@
|
||||
pub mod collector;
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::VecDeque;
|
||||
use std::sync::atomic::{AtomicU32, Ordering};
|
||||
use std::sync::Arc;
|
||||
use tokio::sync::RwLock;
|
||||
use tracing::{debug, warn};
|
||||
|
||||
/// Maximum entries at 1-minute resolution (24 hours = 1440 minutes)
|
||||
const MAX_1MIN_ENTRIES: usize = 1440;
|
||||
|
||||
/// Maximum entries at 15-minute resolution (7 days = 672 quarter-hours)
|
||||
const MAX_15MIN_ENTRIES: usize = 672;
|
||||
|
||||
/// A single metrics snapshot collected at a point in time.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct MetricSnapshot {
|
||||
pub timestamp: i64,
|
||||
pub system: SystemMetrics,
|
||||
pub containers: Vec<ContainerMetrics>,
|
||||
pub rpc_latency_ms: f64,
|
||||
pub ws_connections: u32,
|
||||
}
|
||||
|
||||
/// System-wide resource metrics.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct SystemMetrics {
|
||||
pub cpu_percent: f64,
|
||||
pub mem_used_bytes: u64,
|
||||
pub mem_total_bytes: u64,
|
||||
pub disk_used_bytes: u64,
|
||||
pub disk_total_bytes: u64,
|
||||
pub net_rx_bytes: u64,
|
||||
pub net_tx_bytes: u64,
|
||||
pub load_avg_1: f64,
|
||||
pub load_avg_5: f64,
|
||||
pub load_avg_15: f64,
|
||||
}
|
||||
|
||||
/// Per-container resource metrics.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct ContainerMetrics {
|
||||
pub name: String,
|
||||
pub cpu_percent: f64,
|
||||
pub mem_used_bytes: u64,
|
||||
pub mem_limit_bytes: u64,
|
||||
pub net_rx_bytes: u64,
|
||||
pub net_tx_bytes: u64,
|
||||
pub block_read_bytes: u64,
|
||||
pub block_write_bytes: u64,
|
||||
}
|
||||
|
||||
/// Thread-safe metrics store with ring buffers at two resolutions.
|
||||
pub struct MetricsStore {
|
||||
minute_data: RwLock<VecDeque<MetricSnapshot>>,
|
||||
quarter_hour_data: RwLock<VecDeque<MetricSnapshot>>,
|
||||
minute_count: RwLock<u32>,
|
||||
rpc_latency: RwLock<(f64, u64)>,
|
||||
ws_connections: AtomicU32,
|
||||
}
|
||||
|
||||
impl MetricsStore {
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
minute_data: RwLock::new(VecDeque::with_capacity(MAX_1MIN_ENTRIES)),
|
||||
quarter_hour_data: RwLock::new(VecDeque::with_capacity(MAX_15MIN_ENTRIES)),
|
||||
minute_count: RwLock::new(0),
|
||||
rpc_latency: RwLock::new((0.0, 0)),
|
||||
ws_connections: AtomicU32::new(0),
|
||||
}
|
||||
}
|
||||
|
||||
/// Record a new metric snapshot (called every minute by collector).
|
||||
pub async fn push(&self, mut snapshot: MetricSnapshot) {
|
||||
// Fill in RPC latency from accumulated samples
|
||||
{
|
||||
let mut latency = self.rpc_latency.write().await;
|
||||
if latency.1 > 0 {
|
||||
snapshot.rpc_latency_ms = (latency.0 / latency.1 as f64 * 10.0).round() / 10.0;
|
||||
*latency = (0.0, 0);
|
||||
}
|
||||
}
|
||||
|
||||
// Fill in current WS connection count
|
||||
snapshot.ws_connections = self.ws_connections.load(Ordering::Relaxed);
|
||||
|
||||
// Push to 1-minute ring buffer
|
||||
{
|
||||
let mut buf = self.minute_data.write().await;
|
||||
if buf.len() >= MAX_1MIN_ENTRIES {
|
||||
buf.pop_front();
|
||||
}
|
||||
buf.push_back(snapshot.clone());
|
||||
}
|
||||
|
||||
// Every 15 minutes, push to quarter-hour ring buffer
|
||||
{
|
||||
let mut count = self.minute_count.write().await;
|
||||
*count += 1;
|
||||
if *count >= 15 {
|
||||
*count = 0;
|
||||
let mut buf = self.quarter_hour_data.write().await;
|
||||
if buf.len() >= MAX_15MIN_ENTRIES {
|
||||
buf.pop_front();
|
||||
}
|
||||
buf.push_back(snapshot);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Record an RPC request latency sample (milliseconds).
|
||||
pub async fn record_rpc_latency(&self, latency_ms: f64) {
|
||||
let mut data = self.rpc_latency.write().await;
|
||||
data.0 += latency_ms;
|
||||
data.1 += 1;
|
||||
}
|
||||
|
||||
/// Increment WebSocket connection count (called on connect).
|
||||
pub fn increment_ws(&self) {
|
||||
self.ws_connections.fetch_add(1, Ordering::Relaxed);
|
||||
}
|
||||
|
||||
/// Decrement WebSocket connection count (called on disconnect).
|
||||
pub fn decrement_ws(&self) {
|
||||
// Use saturating semantics to avoid underflow
|
||||
let _ = self.ws_connections.fetch_update(Ordering::Relaxed, Ordering::Relaxed, |v| {
|
||||
if v > 0 { Some(v - 1) } else { Some(0) }
|
||||
});
|
||||
}
|
||||
|
||||
/// Get the latest snapshot.
|
||||
pub async fn latest(&self) -> Option<MetricSnapshot> {
|
||||
self.minute_data.read().await.back().cloned()
|
||||
}
|
||||
|
||||
/// Get minute-resolution data for the last N minutes.
|
||||
pub async fn history_minutes(&self, last_n: usize) -> Vec<MetricSnapshot> {
|
||||
let buf = self.minute_data.read().await;
|
||||
let start = buf.len().saturating_sub(last_n);
|
||||
buf.iter().skip(start).cloned().collect()
|
||||
}
|
||||
|
||||
/// Get quarter-hour-resolution data for the last N entries.
|
||||
pub async fn history_quarter_hours(&self, last_n: usize) -> Vec<MetricSnapshot> {
|
||||
let buf = self.quarter_hour_data.read().await;
|
||||
let start = buf.len().saturating_sub(last_n);
|
||||
buf.iter().skip(start).cloned().collect()
|
||||
}
|
||||
}
|
||||
|
||||
/// Spawn the background metrics collector (runs every 60 seconds).
|
||||
pub fn spawn_metrics_collector(store: Arc<MetricsStore>) {
|
||||
tokio::spawn(async move {
|
||||
// Wait 30s for system to stabilize after boot
|
||||
tokio::time::sleep(std::time::Duration::from_secs(30)).await;
|
||||
|
||||
let mut interval = tokio::time::interval(std::time::Duration::from_secs(60));
|
||||
|
||||
loop {
|
||||
interval.tick().await;
|
||||
|
||||
match collector::collect_snapshot().await {
|
||||
Ok(snapshot) => {
|
||||
store.push(snapshot).await;
|
||||
debug!("Metrics snapshot collected");
|
||||
}
|
||||
Err(e) => {
|
||||
warn!("Failed to collect metrics: {}", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_metrics_store_new() {
|
||||
let store = MetricsStore::new();
|
||||
assert_eq!(store.ws_connections.load(Ordering::Relaxed), 0);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_ws_connection_tracking() {
|
||||
let store = MetricsStore::new();
|
||||
store.increment_ws();
|
||||
store.increment_ws();
|
||||
assert_eq!(store.ws_connections.load(Ordering::Relaxed), 2);
|
||||
store.decrement_ws();
|
||||
assert_eq!(store.ws_connections.load(Ordering::Relaxed), 1);
|
||||
store.decrement_ws();
|
||||
assert_eq!(store.ws_connections.load(Ordering::Relaxed), 0);
|
||||
// Decrement below zero should stay at 0
|
||||
store.decrement_ws();
|
||||
assert_eq!(store.ws_connections.load(Ordering::Relaxed), 0);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_push_and_latest() {
|
||||
let store = MetricsStore::new();
|
||||
assert!(store.latest().await.is_none());
|
||||
|
||||
let snapshot = MetricSnapshot {
|
||||
timestamp: 1000,
|
||||
system: SystemMetrics {
|
||||
cpu_percent: 25.0,
|
||||
mem_used_bytes: 1_000_000,
|
||||
mem_total_bytes: 4_000_000,
|
||||
disk_used_bytes: 500_000,
|
||||
disk_total_bytes: 1_000_000,
|
||||
net_rx_bytes: 100,
|
||||
net_tx_bytes: 200,
|
||||
load_avg_1: 1.0,
|
||||
load_avg_5: 0.5,
|
||||
load_avg_15: 0.3,
|
||||
},
|
||||
containers: vec![],
|
||||
rpc_latency_ms: 0.0,
|
||||
ws_connections: 0,
|
||||
};
|
||||
|
||||
store.push(snapshot).await;
|
||||
let latest = store.latest().await.unwrap();
|
||||
assert_eq!(latest.timestamp, 1000);
|
||||
assert_eq!(latest.system.cpu_percent, 25.0);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_rpc_latency_recording() {
|
||||
let store = MetricsStore::new();
|
||||
store.record_rpc_latency(10.0).await;
|
||||
store.record_rpc_latency(20.0).await;
|
||||
store.record_rpc_latency(30.0).await;
|
||||
|
||||
let snapshot = MetricSnapshot {
|
||||
timestamp: 2000,
|
||||
system: SystemMetrics {
|
||||
cpu_percent: 0.0,
|
||||
mem_used_bytes: 0,
|
||||
mem_total_bytes: 0,
|
||||
disk_used_bytes: 0,
|
||||
disk_total_bytes: 0,
|
||||
net_rx_bytes: 0,
|
||||
net_tx_bytes: 0,
|
||||
load_avg_1: 0.0,
|
||||
load_avg_5: 0.0,
|
||||
load_avg_15: 0.0,
|
||||
},
|
||||
containers: vec![],
|
||||
rpc_latency_ms: 0.0,
|
||||
ws_connections: 0,
|
||||
};
|
||||
|
||||
store.push(snapshot).await;
|
||||
let latest = store.latest().await.unwrap();
|
||||
assert_eq!(latest.rpc_latency_ms, 20.0); // average of 10+20+30 = 20
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_history_minutes() {
|
||||
let store = MetricsStore::new();
|
||||
|
||||
for i in 0..5 {
|
||||
let snapshot = MetricSnapshot {
|
||||
timestamp: i * 60,
|
||||
system: SystemMetrics {
|
||||
cpu_percent: i as f64,
|
||||
mem_used_bytes: 0,
|
||||
mem_total_bytes: 0,
|
||||
disk_used_bytes: 0,
|
||||
disk_total_bytes: 0,
|
||||
net_rx_bytes: 0,
|
||||
net_tx_bytes: 0,
|
||||
load_avg_1: 0.0,
|
||||
load_avg_5: 0.0,
|
||||
load_avg_15: 0.0,
|
||||
},
|
||||
containers: vec![],
|
||||
rpc_latency_ms: 0.0,
|
||||
ws_connections: 0,
|
||||
};
|
||||
store.push(snapshot).await;
|
||||
}
|
||||
|
||||
let history = store.history_minutes(3).await;
|
||||
assert_eq!(history.len(), 3);
|
||||
assert_eq!(history[0].timestamp, 120);
|
||||
assert_eq!(history[2].timestamp, 240);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_ring_buffer_eviction() {
|
||||
let store = MetricsStore::new();
|
||||
|
||||
// Push more than MAX_1MIN_ENTRIES
|
||||
for i in 0..(MAX_1MIN_ENTRIES + 10) {
|
||||
let snapshot = MetricSnapshot {
|
||||
timestamp: i as i64,
|
||||
system: SystemMetrics {
|
||||
cpu_percent: 0.0,
|
||||
mem_used_bytes: 0,
|
||||
mem_total_bytes: 0,
|
||||
disk_used_bytes: 0,
|
||||
disk_total_bytes: 0,
|
||||
net_rx_bytes: 0,
|
||||
net_tx_bytes: 0,
|
||||
load_avg_1: 0.0,
|
||||
load_avg_5: 0.0,
|
||||
load_avg_15: 0.0,
|
||||
},
|
||||
containers: vec![],
|
||||
rpc_latency_ms: 0.0,
|
||||
ws_connections: 0,
|
||||
};
|
||||
store.push(snapshot).await;
|
||||
}
|
||||
|
||||
let all = store.history_minutes(MAX_1MIN_ENTRIES + 100).await;
|
||||
assert_eq!(all.len(), MAX_1MIN_ENTRIES);
|
||||
// Oldest entry should be 10 (first 10 were evicted)
|
||||
assert_eq!(all[0].timestamp, 10);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_quarter_hour_downsampling() {
|
||||
let store = MetricsStore::new();
|
||||
|
||||
// Push exactly 15 entries to trigger one quarter-hour sample
|
||||
for i in 0..15 {
|
||||
let snapshot = MetricSnapshot {
|
||||
timestamp: i * 60,
|
||||
system: SystemMetrics {
|
||||
cpu_percent: 50.0,
|
||||
mem_used_bytes: 0,
|
||||
mem_total_bytes: 0,
|
||||
disk_used_bytes: 0,
|
||||
disk_total_bytes: 0,
|
||||
net_rx_bytes: 0,
|
||||
net_tx_bytes: 0,
|
||||
load_avg_1: 0.0,
|
||||
load_avg_5: 0.0,
|
||||
load_avg_15: 0.0,
|
||||
},
|
||||
containers: vec![],
|
||||
rpc_latency_ms: 0.0,
|
||||
ws_connections: 0,
|
||||
};
|
||||
store.push(snapshot).await;
|
||||
}
|
||||
|
||||
let qh = store.history_quarter_hours(10).await;
|
||||
assert_eq!(qh.len(), 1);
|
||||
assert_eq!(qh[0].timestamp, 14 * 60); // The 15th entry (index 14)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_constants() {
|
||||
assert_eq!(MAX_1MIN_ENTRIES, 1440);
|
||||
assert_eq!(MAX_15MIN_ENTRIES, 672);
|
||||
}
|
||||
}
|
||||
@ -2,6 +2,7 @@ use crate::api::ApiHandler;
|
||||
use crate::config::{Config, ContainerRuntime};
|
||||
use crate::container::{docker_packages, DockerPackageScanner};
|
||||
use crate::identity::{self, NodeIdentity};
|
||||
use crate::monitoring::MetricsStore;
|
||||
use crate::node_message;
|
||||
use crate::nostr_discovery;
|
||||
use crate::peers;
|
||||
@ -13,7 +14,7 @@ use std::net::SocketAddr;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use tokio::net::TcpListener;
|
||||
use tracing::{debug, error, info};
|
||||
use tracing::{debug, error, info, warn};
|
||||
|
||||
pub struct Server {
|
||||
_config: Config,
|
||||
@ -74,7 +75,14 @@ impl Server {
|
||||
info!("🔑 Node identity: {} (pubkey: {}...)", identity.node_id(), &identity.pubkey_hex()[..16.min(identity.pubkey_hex().len())]);
|
||||
|
||||
let identity = Arc::new(identity);
|
||||
let api_handler = Arc::new(ApiHandler::new(config.clone(), state_manager.clone()).await?);
|
||||
|
||||
// Create metrics store and spawn background collector
|
||||
let metrics_store = Arc::new(MetricsStore::new());
|
||||
crate::monitoring::spawn_metrics_collector(metrics_store.clone());
|
||||
|
||||
let api_handler = Arc::new(
|
||||
ApiHandler::new(config.clone(), state_manager.clone(), metrics_store).await?,
|
||||
);
|
||||
|
||||
// Periodic Tor address refresh (runs regardless of dev_mode)
|
||||
// Picks up hostname when Tor creates it after startup/rotation (30-60s delay)
|
||||
@ -131,6 +139,9 @@ impl Server {
|
||||
});
|
||||
}
|
||||
|
||||
// Container health monitoring — auto-restart unhealthy containers
|
||||
crate::health_monitor::spawn_health_monitor(state_manager.clone());
|
||||
|
||||
Ok(Self {
|
||||
_config: config,
|
||||
_identity: identity,
|
||||
@ -140,37 +151,71 @@ impl Server {
|
||||
}
|
||||
|
||||
pub async fn serve(&self, addr: SocketAddr) -> Result<()> {
|
||||
self.serve_with_shutdown(addr, std::future::pending()).await
|
||||
}
|
||||
|
||||
/// Serve with a graceful shutdown signal.
|
||||
/// When the shutdown future completes, stop accepting new connections and drain in-flight requests.
|
||||
pub async fn serve_with_shutdown(
|
||||
&self,
|
||||
addr: SocketAddr,
|
||||
shutdown: impl std::future::Future<Output = ()>,
|
||||
) -> Result<()> {
|
||||
let listener = TcpListener::bind(addr).await?;
|
||||
let active_connections = Arc::new(tokio::sync::Semaphore::new(1024));
|
||||
|
||||
tokio::pin!(shutdown);
|
||||
|
||||
loop {
|
||||
let (stream, peer_addr) = match listener.accept().await {
|
||||
Ok(conn) => conn,
|
||||
Err(e) => {
|
||||
error!("Failed to accept connection: {}", e);
|
||||
continue;
|
||||
tokio::select! {
|
||||
result = listener.accept() => {
|
||||
let (stream, peer_addr) = match result {
|
||||
Ok(conn) => conn,
|
||||
Err(e) => {
|
||||
error!("Failed to accept connection: {}", e);
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
let handler = self.api_handler.clone();
|
||||
let permit = active_connections.clone().acquire_owned().await;
|
||||
|
||||
tokio::spawn(async move {
|
||||
let _permit = permit;
|
||||
let service = service_fn(move |req| {
|
||||
let handler = handler.clone();
|
||||
async move {
|
||||
handler.handle_request(req).await
|
||||
.map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, format!("{}", e)))
|
||||
}
|
||||
});
|
||||
|
||||
if let Err(e) = Http::new()
|
||||
.http1_keep_alive(false)
|
||||
.serve_connection(stream, service)
|
||||
.with_upgrades()
|
||||
.await
|
||||
{
|
||||
error!("Error serving connection from {}: {}", peer_addr, e);
|
||||
}
|
||||
});
|
||||
}
|
||||
};
|
||||
|
||||
let handler = self.api_handler.clone();
|
||||
|
||||
tokio::spawn(async move {
|
||||
let service = service_fn(move |req| {
|
||||
let handler = handler.clone();
|
||||
async move {
|
||||
handler.handle_request(req).await
|
||||
.map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, format!("{}", e)))
|
||||
_ = &mut shutdown => {
|
||||
info!("Shutdown signal received, draining connections...");
|
||||
// Wait up to 5 seconds for in-flight requests to complete
|
||||
let drain_start = std::time::Instant::now();
|
||||
let drain_timeout = std::time::Duration::from_secs(5);
|
||||
while active_connections.available_permits() < 1024 {
|
||||
if drain_start.elapsed() > drain_timeout {
|
||||
warn!("Drain timeout reached, forcing shutdown");
|
||||
break;
|
||||
}
|
||||
tokio::time::sleep(std::time::Duration::from_millis(100)).await;
|
||||
}
|
||||
});
|
||||
|
||||
if let Err(e) = Http::new()
|
||||
.http1_keep_alive(false)
|
||||
.serve_connection(stream, service)
|
||||
.with_upgrades()
|
||||
.await
|
||||
{
|
||||
error!("Error serving connection from {}: {}", peer_addr, e);
|
||||
info!("Shutdown complete");
|
||||
return Ok(());
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -326,7 +326,7 @@
|
||||
|
||||
- [x] **COMM-03** — Set up issue tracker and roadmap. Configure GitHub Issues with labels, templates, and project board. Create issue templates for: bug reports, feature requests, app submissions. **Acceptance**: Issue tracker ready for community use.
|
||||
|
||||
- [ ] **COMM-04** — Publish v0.9.0 release. Final pre-1.0 release. Full ISO builds, comprehensive release notes, migration guide from 0.8. **Acceptance**: Published release, tested on 3+ hardware configs.
|
||||
- [ ] **COMM-04** — (BLOCKED: requires physical ISO build on server and testing on 3+ hardware configs — cannot be automated from code) Publish v0.9.0 release. Final pre-1.0 release. Full ISO builds, comprehensive release notes, migration guide from 0.8. **Acceptance**: Published release, tested on 3+ hardware configs.
|
||||
|
||||
---
|
||||
|
||||
@ -336,7 +336,7 @@
|
||||
|
||||
#### Sprint 27: Advanced Monitoring (Week 1-4)
|
||||
|
||||
- [ ] **MON-01** — Implement real-time metrics collection. Add `core/archipelago/src/monitoring/collector.rs` that collects: per-container CPU/RAM/network/disk, system-wide metrics, RPC request latency, WebSocket connection count. Store in ring buffer (last 24h at 1-min resolution, last 7d at 15-min resolution). **Acceptance**: Metrics collected and queryable via RPC.
|
||||
- [x] **MON-01** — Implement real-time metrics collection. Add `core/archipelago/src/monitoring/collector.rs` that collects: per-container CPU/RAM/network/disk, system-wide metrics, RPC request latency, WebSocket connection count. Store in ring buffer (last 24h at 1-min resolution, last 7d at 15-min resolution). **Acceptance**: Metrics collected and queryable via RPC.
|
||||
|
||||
- [ ] **MON-02** — Add monitoring dashboard page. Create `neode-ui/src/views/Monitoring.vue` at `/dashboard/monitoring` with: real-time line charts (CPU, RAM, network), per-container resource breakdown, alert history, system health timeline. Use canvas-based charts (no heavy library -- build simple line chart component). **Acceptance**: Real-time metrics visible with 5s refresh.
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user