feat: add real-time metrics collection with ring buffer storage (MON-01)

Implements monitoring/collector.rs that collects per-container CPU/RAM/network/disk,
system-wide metrics, RPC latency, and WebSocket connection count every 60 seconds.
Data stored in dual ring buffers: 1-min resolution (24h) and 15-min resolution (7d).
Three new RPC endpoints: monitoring.current, monitoring.history, monitoring.containers.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Dorian 2026-03-11 11:11:02 +00:00
parent 45032d937b
commit 592548066e
8 changed files with 1285 additions and 40 deletions

View File

@ -1,6 +1,8 @@
use crate::api::rpc::RpcHandler;
use crate::content_server;
use crate::electrs_status;
use crate::monitoring::MetricsStore;
use crate::network::dwn_store::DwnStore;
use crate::node_message as node_msg;
use crate::config::Config;
use crate::session::{self, SessionStore};
@ -12,26 +14,39 @@ use hyper_ws_listener::WsStream;
use std::sync::Arc;
use tokio::sync::broadcast;
use tokio_tungstenite::tungstenite::Message;
use std::time::Instant;
use tracing::{debug, info};
pub struct ApiHandler {
config: Config,
rpc_handler: Arc<RpcHandler>,
state_manager: Arc<StateManager>,
metrics_store: Arc<MetricsStore>,
session_store: SessionStore,
}
impl ApiHandler {
pub async fn new(config: Config, state_manager: Arc<StateManager>) -> Result<Self> {
pub async fn new(
config: Config,
state_manager: Arc<StateManager>,
metrics_store: Arc<MetricsStore>,
) -> Result<Self> {
let session_store = SessionStore::new();
let rpc_handler = Arc::new(
RpcHandler::new(config.clone(), state_manager.clone(), session_store.clone()).await?,
RpcHandler::new(
config.clone(),
state_manager.clone(),
metrics_store.clone(),
session_store.clone(),
)
.await?,
);
Ok(Self {
config,
rpc_handler,
state_manager,
metrics_store,
session_store,
})
}
@ -105,7 +120,7 @@ impl ApiHandler {
if !self.is_authenticated(req.headers()).await {
return Ok(Self::unauthorized());
}
return Self::handle_websocket(req, self.state_manager.clone()).await;
return Self::handle_websocket(req, self.state_manager.clone(), self.metrics_store.clone()).await;
}
// Convert body to bytes for non-WS routes
@ -163,6 +178,19 @@ impl ApiHandler {
Self::handle_lnd_proxy(path, &origin).await
}
// DWN health — unauthenticated
(Method::GET, "/dwn/health") => {
Self::handle_dwn_health(&self.config).await
}
// DWN message processing — authenticated
(Method::POST, "/dwn") => {
if !self.is_authenticated(&headers).await {
return Ok(Self::unauthorized());
}
Self::handle_dwn_message(body_bytes, &self.config).await
}
_ => Ok(Response::builder()
.status(StatusCode::NOT_FOUND)
.body(hyper::Body::from("Not Found"))
@ -439,6 +467,7 @@ impl ApiHandler {
async fn handle_websocket(
req: Request<hyper::Body>,
state_manager: Arc<StateManager>,
metrics_store: Arc<MetricsStore>,
) -> Result<Response<hyper::Body>> {
let (response, ws_fut_opt) = hyper_ws_listener::create_ws(req)
.map_err(|e| anyhow::anyhow!("WebSocket upgrade failed: {}", e))?;
@ -456,6 +485,7 @@ impl ApiHandler {
return;
}
};
metrics_store.increment_ws();
info!("WebSocket /ws/db connected");
let (mut tx, mut rx) = ws_stream.split();
@ -472,10 +502,18 @@ impl ApiHandler {
let mut state_rx = state_manager.subscribe();
let ping_interval = tokio::time::interval(tokio::time::Duration::from_secs(30));
tokio::pin!(ping_interval);
let mut last_client_activity = Instant::now();
const INACTIVITY_TIMEOUT_SECS: u64 = 300; // 5 minutes
loop {
tokio::select! {
_ = ping_interval.tick() => {
// Check inactivity timeout
if last_client_activity.elapsed().as_secs() >= INACTIVITY_TIMEOUT_SECS {
info!("WebSocket client inactive for {}s, closing", INACTIVITY_TIMEOUT_SECS);
let _ = tx.send(Message::Close(None)).await;
break;
}
if tx.send(Message::Ping(vec![])).await.is_err() {
debug!("Failed to send ping, connection likely closed");
break;
@ -505,12 +543,23 @@ impl ApiHandler {
match msg {
Some(Ok(Message::Close(_))) => break,
Some(Ok(Message::Pong(_))) => {
last_client_activity = Instant::now();
debug!("Received pong");
}
Some(Ok(Message::Ping(data))) => {
last_client_activity = Instant::now();
let _ = tx.send(Message::Pong(data)).await;
}
Some(Ok(_)) => {}
Some(Ok(Message::Text(text))) => {
last_client_activity = Instant::now();
// Handle JSON ping from frontend
if text.contains("\"type\":\"ping\"") || text.contains("\"type\": \"ping\"") {
let _ = tx.send(Message::Text(r#"{"type":"pong"}"#.to_string())).await;
}
}
Some(Ok(_)) => {
last_client_activity = Instant::now();
}
Some(Err(e)) => {
debug!("WebSocket stream error: {}", e);
break;
@ -520,6 +569,7 @@ impl ApiHandler {
}
}
}
metrics_store.decrement_ws();
info!("WebSocket /ws/db disconnected");
});
}
@ -556,3 +606,139 @@ fn sanitize_html(s: &str) -> String {
.replace('"', "&quot;")
.replace('\'', "&#x27;")
}
impl ApiHandler {
/// DWN health endpoint — returns store stats.
async fn handle_dwn_health(config: &Config) -> Result<Response<hyper::Body>> {
match DwnStore::new(&config.data_dir).await {
Ok(store) => {
let stats = store.stats().await.unwrap_or(crate::network::dwn_store::StoreStats {
message_count: 0,
protocol_count: 0,
total_bytes: 0,
});
let body = serde_json::json!({
"status": "ok",
"message_count": stats.message_count,
"protocol_count": stats.protocol_count,
"total_bytes": stats.total_bytes,
});
Ok(Response::builder()
.status(StatusCode::OK)
.header("Content-Type", "application/json")
.body(hyper::Body::from(body.to_string()))
.unwrap())
}
Err(_) => Ok(Response::builder()
.status(StatusCode::SERVICE_UNAVAILABLE)
.header("Content-Type", "application/json")
.body(hyper::Body::from(r#"{"status":"unavailable"}"#))
.unwrap()),
}
}
/// DWN message processing endpoint — handles RecordsWrite, RecordsQuery, RecordsRead, RecordsDelete.
async fn handle_dwn_message(
body: hyper::body::Bytes,
config: &Config,
) -> Result<Response<hyper::Body>> {
let request: serde_json::Value = match serde_json::from_slice(&body) {
Ok(v) => v,
Err(e) => {
let err = serde_json::json!({"error": format!("Invalid JSON: {}", e)});
return Ok(Response::builder()
.status(StatusCode::BAD_REQUEST)
.header("Content-Type", "application/json")
.body(hyper::Body::from(err.to_string()))
.unwrap());
}
};
let interface = request["message"]["descriptor"]["interface"]
.as_str()
.unwrap_or("");
let method = request["message"]["descriptor"]["method"]
.as_str()
.unwrap_or("");
let store = DwnStore::new(&config.data_dir).await?;
let result = match (interface, method) {
("Records", "Write") => {
let author = request["message"]["author"].as_str().unwrap_or("unknown");
let protocol = request["message"]["descriptor"]["protocol"].as_str();
let schema = request["message"]["descriptor"]["schema"].as_str();
let data_format = request["message"]["descriptor"]["dataFormat"].as_str();
let data = request["message"].get("data").cloned();
match store.write_message(author, protocol, schema, data_format, data).await {
Ok(msg) => serde_json::json!({"status": {"code": 202}, "entry": msg}),
Err(e) => serde_json::json!({"status": {"code": 500, "detail": e.to_string()}}),
}
}
("Records", "Query") => {
let query = crate::network::dwn_store::MessageQuery {
protocol: request["message"]["descriptor"]["filter"]["protocol"]
.as_str()
.map(|s| s.to_string()),
schema: request["message"]["descriptor"]["filter"]["schema"]
.as_str()
.map(|s| s.to_string()),
author: request["message"]["descriptor"]["filter"]["author"]
.as_str()
.map(|s| s.to_string()),
date_from: request["message"]["descriptor"]["filter"]["dateFrom"]
.as_str()
.map(|s| s.to_string()),
date_to: request["message"]["descriptor"]["filter"]["dateTo"]
.as_str()
.map(|s| s.to_string()),
limit: request["message"]["descriptor"]["filter"]["limit"]
.as_u64()
.map(|n| n as usize),
};
match store.query_messages(&query).await {
Ok(messages) => serde_json::json!({"status": {"code": 200}, "entries": messages}),
Err(e) => serde_json::json!({"status": {"code": 500, "detail": e.to_string()}}),
}
}
("Records", "Read") => {
let record_id = request["message"]["descriptor"]["recordId"]
.as_str()
.unwrap_or("");
match store.read_message(record_id).await {
Ok(Some(msg)) => serde_json::json!({"status": {"code": 200}, "entry": msg}),
Ok(None) => serde_json::json!({"status": {"code": 404, "detail": "Record not found"}}),
Err(e) => serde_json::json!({"status": {"code": 500, "detail": e.to_string()}}),
}
}
("Records", "Delete") => {
let record_id = request["message"]["descriptor"]["recordId"]
.as_str()
.unwrap_or("");
match store.delete_message(record_id).await {
Ok(true) => serde_json::json!({"status": {"code": 200}}),
Ok(false) => serde_json::json!({"status": {"code": 404, "detail": "Record not found"}}),
Err(e) => serde_json::json!({"status": {"code": 500, "detail": e.to_string()}}),
}
}
_ => {
serde_json::json!({"status": {"code": 400, "detail": format!("Unknown method: {}.{}", interface, method)}})
}
};
let status_code = result["status"]["code"].as_u64().unwrap_or(200);
let http_status = match status_code {
202 => StatusCode::ACCEPTED,
400 => StatusCode::BAD_REQUEST,
404 => StatusCode::NOT_FOUND,
500 => StatusCode::INTERNAL_SERVER_ERROR,
_ => StatusCode::OK,
};
Ok(Response::builder()
.status(http_status)
.header("Content-Type", "application/json")
.body(hyper::Body::from(result.to_string()))
.unwrap())
}
}

View File

@ -1,28 +1,36 @@
mod auth;
mod backup_rpc;
mod bitcoin;
mod container;
mod content;
mod credentials;
mod dwn;
mod federation;
mod identity;
mod interfaces;
mod marketplace;
mod monitoring;
mod names;
mod lnd;
mod mesh;
mod network;
mod node;
mod nostr;
mod package;
mod peers;
mod router;
mod security;
mod tor;
mod totp;
mod system;
mod update;
mod vpn;
mod wallet;
use crate::auth::AuthManager;
use crate::config::Config;
use crate::container::DevContainerOrchestrator;
use crate::monitoring::MetricsStore;
use crate::port_allocator::PortAllocator;
use crate::session::{self, LoginRateLimiter, SessionStore};
use crate::state::StateManager;
@ -70,6 +78,7 @@ pub struct RpcHandler {
auth_manager: AuthManager,
orchestrator: Option<Arc<DevContainerOrchestrator>>,
state_manager: Arc<StateManager>,
pub(crate) metrics_store: Arc<MetricsStore>,
port_allocator: Arc<Mutex<PortAllocator>>,
pub session_store: SessionStore,
login_rate_limiter: LoginRateLimiter,
@ -79,6 +88,7 @@ impl RpcHandler {
pub async fn new(
config: Config,
state_manager: Arc<StateManager>,
metrics_store: Arc<MetricsStore>,
session_store: SessionStore,
) -> Result<Self> {
let auth_manager = AuthManager::new(config.data_dir.clone());
@ -96,6 +106,7 @@ impl RpcHandler {
auth_manager,
orchestrator,
state_manager,
metrics_store,
port_allocator,
session_store,
login_rate_limiter: LoginRateLimiter::new(),
@ -210,13 +221,14 @@ impl RpcHandler {
None
};
// Route to handler
// Route to handler (track latency for metrics)
let rpc_start = std::time::Instant::now();
let result = match rpc_req.method.as_str() {
"echo" => self.handle_echo(params).await,
"server.echo" => self.handle_echo(params).await,
"auth.login" => self.handle_auth_login(params).await,
"auth.logout" => self.handle_auth_logout().await,
"auth.changePassword" => self.handle_auth_change_password(params).await,
"auth.changePassword" => self.handle_auth_change_password(params, &session_token).await,
"auth.onboardingComplete" => self.handle_auth_onboarding_complete().await,
"auth.isOnboardingComplete" => self.handle_auth_is_onboarding_complete().await,
"auth.resetOnboarding" => self.handle_auth_reset_onboarding().await,
@ -276,6 +288,8 @@ impl RpcHandler {
"lnd.sendcoins" => self.handle_lnd_sendcoins(params).await,
"lnd.createinvoice" => self.handle_lnd_createinvoice(params).await,
"lnd.payinvoice" => self.handle_lnd_payinvoice(params).await,
"lnd.create-psbt" => self.handle_lnd_create_psbt(params).await,
"lnd.finalize-psbt" => self.handle_lnd_finalize_psbt(params).await,
// Multi-identity management
"identity.list" => self.handle_identity_list(params).await,
@ -285,6 +299,9 @@ impl RpcHandler {
"identity.set-default" => self.handle_identity_set_default(params).await,
"identity.sign" => self.handle_identity_sign(params).await,
"identity.verify" => self.handle_identity_verify(params).await,
"identity.resolve-did" => self.handle_identity_resolve_did(params).await,
"identity.resolve-remote-did" => self.handle_identity_resolve_remote_did(params).await,
"identity.verify-did-document" => self.handle_identity_verify_did_document(params).await,
"identity.create-nostr-key" => self.handle_identity_create_nostr_key(params).await,
"identity.nostr-sign" => self.handle_identity_nostr_sign(params).await,
@ -300,6 +317,8 @@ impl RpcHandler {
"identity.verify-credential" => self.handle_identity_verify_credential(params).await,
"identity.list-credentials" => self.handle_identity_list_credentials(params).await,
"identity.revoke-credential" => self.handle_identity_revoke_credential(params).await,
"identity.create-presentation" => self.handle_identity_create_presentation(params).await,
"identity.verify-presentation" => self.handle_identity_verify_presentation(params).await,
// Network overlay
"network.get-visibility" => self.handle_network_get_visibility().await,
@ -332,6 +351,8 @@ impl RpcHandler {
"network.scan-wifi" => self.handle_network_scan_wifi().await,
"network.configure-wifi" => self.handle_network_configure_wifi(params).await,
"network.configure-ethernet" => self.handle_network_configure_ethernet(params).await,
"network.dns-status" => self.handle_network_dns_status().await,
"network.configure-dns" => self.handle_network_configure_dns(params).await,
"router.detect" => self.handle_router_detect(params).await,
"router.info" => self.handle_router_info().await,
"router.configure" => self.handle_router_configure(params).await,
@ -356,22 +377,122 @@ impl RpcHandler {
// DWN (Decentralized Web Node)
"dwn.status" => self.handle_dwn_status().await,
"dwn.sync" => self.handle_dwn_sync().await,
"dwn.register-protocol" => {
let p = params.unwrap_or(serde_json::json!({}));
self.handle_dwn_register_protocol(&p).await
}
"dwn.list-protocols" => self.handle_dwn_list_protocols().await,
"dwn.remove-protocol" => {
let p = params.unwrap_or(serde_json::json!({}));
self.handle_dwn_remove_protocol(&p).await
}
"dwn.query-messages" => {
let p = params.unwrap_or(serde_json::json!({}));
self.handle_dwn_query_messages(&p).await
}
"dwn.write-message" => {
let p = params.unwrap_or(serde_json::json!({}));
self.handle_dwn_write_message(&p).await
}
// Federation
"federation.invite" => self.handle_federation_invite().await,
"federation.join" => self.handle_federation_join(params).await,
"federation.list-nodes" => self.handle_federation_list_nodes().await,
"federation.remove-node" => self.handle_federation_remove_node(params).await,
"federation.set-trust" => self.handle_federation_set_trust(params).await,
"federation.sync-state" => self.handle_federation_sync_state().await,
"federation.get-state" => self.handle_federation_get_state().await,
"federation.peer-joined" => self.handle_federation_peer_joined(params).await,
"federation.deploy-app" => self.handle_federation_deploy_app(params).await,
// VPN
"vpn.status" => self.handle_vpn_status().await,
"vpn.configure" => self.handle_vpn_configure(params).await,
"vpn.disconnect" => self.handle_vpn_disconnect().await,
// Marketplace
"marketplace.discover" => self.handle_marketplace_discover().await,
"marketplace.publish" => self.handle_marketplace_publish(params).await,
"marketplace.get-manifest" => self.handle_marketplace_get_manifest(params).await,
"marketplace.list-published" => self.handle_marketplace_list_published().await,
"marketplace.verify" => self.handle_marketplace_verify(params).await,
// Mesh networking
"mesh.status" => self.handle_mesh_status().await,
"mesh.discover" => self.handle_mesh_discover(params).await,
"mesh.broadcast" => self.handle_mesh_broadcast().await,
"mesh.configure" => self.handle_mesh_configure(params).await,
// System monitoring
"system.stats" => self.handle_system_stats().await,
"system.processes" => self.handle_system_processes().await,
"system.temperature" => self.handle_system_temperature().await,
"system.detect-usb-devices" => self.handle_system_detect_usb_devices().await,
"system.disk-status" => self.handle_system_disk_status().await,
"system.disk-cleanup" => self.handle_system_disk_cleanup().await,
// Real-time metrics monitoring
"monitoring.current" => self.handle_monitoring_current().await,
"monitoring.history" => self.handle_monitoring_history(params).await,
"monitoring.containers" => self.handle_monitoring_containers().await,
// System updates
"update.check" => self.handle_update_check().await,
"update.status" => self.handle_update_status().await,
"update.dismiss" => self.handle_update_dismiss().await,
"update.download" => self.handle_update_download().await,
"update.apply" => self.handle_update_apply().await,
"update.rollback" => self.handle_update_rollback().await,
"update.get-schedule" => self.handle_update_get_schedule().await,
"update.set-schedule" => {
let p = params.unwrap_or(serde_json::json!({}));
self.handle_update_set_schedule(&p).await
}
// Backup & Restore
"backup.create" => {
let p = params.unwrap_or(serde_json::json!({}));
self.handle_backup_create(&p).await
}
"backup.list" => self.handle_backup_list().await,
"backup.verify" => {
let p = params.unwrap_or(serde_json::json!({}));
self.handle_backup_verify(&p).await
}
"backup.restore" => {
let p = params.unwrap_or(serde_json::json!({}));
self.handle_backup_restore(&p).await
}
"backup.delete" => {
let p = params.unwrap_or(serde_json::json!({}));
self.handle_backup_delete(&p).await
}
"backup.list-drives" => self.handle_backup_list_drives().await,
"backup.to-usb" => {
let p = params.unwrap_or(serde_json::json!({}));
self.handle_backup_to_usb(&p).await
}
// Security / secrets
"security.rotate-secrets" => {
let p = params.unwrap_or(serde_json::json!({}));
self.handle_security_rotate_secrets(&p).await
}
"security.list-expiring" => {
let p = params.unwrap_or(serde_json::json!({}));
self.handle_security_list_expiring(&p).await
}
_ => {
Err(anyhow::anyhow!("Unknown method: {}", rpc_req.method))
}
};
// Record RPC latency for monitoring
let elapsed_ms = rpc_start.elapsed().as_secs_f64() * 1000.0;
self.metrics_store.record_rpc_latency(elapsed_ms).await;
// Build response
let rpc_resp = match result {
Ok(data) => RpcResponse {
@ -465,6 +586,34 @@ impl RpcHandler {
// On successful TOTP verification, the session is already upgraded to full
// (handled inside handle_login_totp/handle_login_backup)
// On password change, rotate the session token for the caller
if rpc_req.method == "auth.changePassword" && rpc_resp.error.is_none() {
if let Some(token) = &session_token {
let new_token = self.session_store.rotate(token).await;
let csrf_token = generate_csrf_token();
response.headers_mut().append(
"Set-Cookie",
format!(
"session={}; HttpOnly; SameSite=Strict; Path=/{}",
new_token,
self.cookie_suffix()
)
.parse()
.unwrap(),
);
response.headers_mut().append(
"Set-Cookie",
format!(
"csrf_token={}; SameSite=Strict; Path=/{}",
csrf_token,
self.cookie_suffix()
)
.parse()
.unwrap(),
);
}
}
// On logout, invalidate session and expire cookies
if rpc_req.method == "auth.logout" {
if let Some(token) = &session_token {

View File

@ -0,0 +1,62 @@
use super::RpcHandler;
use anyhow::Result;
use tracing::debug;
impl RpcHandler {
/// monitoring.current — latest metrics snapshot
pub(super) async fn handle_monitoring_current(&self) -> Result<serde_json::Value> {
debug!("Getting current metrics");
match self.metrics_store.latest().await {
Some(snapshot) => Ok(serde_json::to_value(snapshot)?),
None => Ok(serde_json::json!({ "status": "collecting", "message": "No metrics collected yet" })),
}
}
/// monitoring.history — historical metrics at given resolution
pub(super) async fn handle_monitoring_history(
&self,
params: Option<serde_json::Value>,
) -> Result<serde_json::Value> {
debug!("Getting metrics history");
let resolution = params
.as_ref()
.and_then(|p| p.get("resolution"))
.and_then(|v| v.as_str())
.unwrap_or("minute");
let count = params
.as_ref()
.and_then(|p| p.get("count"))
.and_then(|v| v.as_u64())
.unwrap_or(60) as usize;
// Clamp count to reasonable limits
let count = count.min(1440);
let data = match resolution {
"quarter_hour" | "15min" => self.metrics_store.history_quarter_hours(count).await,
_ => self.metrics_store.history_minutes(count).await,
};
Ok(serde_json::json!({
"resolution": resolution,
"count": data.len(),
"data": data,
}))
}
/// monitoring.containers — latest per-container metrics
pub(super) async fn handle_monitoring_containers(&self) -> Result<serde_json::Value> {
debug!("Getting container metrics");
match self.metrics_store.latest().await {
Some(snapshot) => Ok(serde_json::json!({
"timestamp": snapshot.timestamp,
"containers": snapshot.containers,
})),
None => Ok(serde_json::json!({ "containers": [] })),
}
}
}

View File

@ -4,19 +4,27 @@
use anyhow::Result;
use std::net::SocketAddr;
use tracing::info;
use tokio::signal;
mod api;
mod auth;
mod backup;
mod config;
mod content_server;
mod crash_recovery;
mod credentials;
mod disk_monitor;
mod health_monitor;
mod electrs_status;
mod container;
mod port_allocator;
mod data_model;
mod federation;
mod identity;
mod identity_manager;
mod marketplace;
mod mesh;
mod monitoring;
mod node_message;
mod nostr_discovery;
mod peers;
@ -29,6 +37,7 @@ mod names;
mod network;
mod nostr_relays;
mod update;
mod vpn;
use auth::AuthManager;
use config::Config;
@ -39,6 +48,8 @@ const DEV_DEFAULT_PASSWORD: &str = "password123";
#[tokio::main]
async fn main() -> Result<()> {
let startup_start = std::time::Instant::now();
// Initialize tracing
tracing_subscriber::fmt()
.with_env_filter(
@ -47,12 +58,25 @@ async fn main() -> Result<()> {
)
.init();
info!("🚀 Starting Archipelago Bitcoin Node OS");
info!("Starting Archipelago Bitcoin Node OS");
// Load configuration
let config = Config::load().await?;
info!("📁 Data directory: {}", config.data_dir.display());
// Crash recovery: check if previous instance shut down cleanly
if let Some(containers) = crash_recovery::check_for_crash(&config.data_dir).await? {
info!("🔧 Recovering {} containers from previous crash...", containers.len());
let report = crash_recovery::recover_containers(&containers).await;
info!(
"🔧 Recovery complete: {}/{} containers restarted (failed: {:?})",
report.recovered, report.total, report.failed
);
}
// Write PID marker so we can detect crashes on next startup
crash_recovery::write_pid_marker(&config.data_dir).await?;
// In dev mode, ensure a default user exists so login works without manual setup
if config.dev_mode {
let auth = AuthManager::new(config.data_dir.clone());
@ -70,11 +94,42 @@ async fn main() -> Result<()> {
.parse()
.expect("Invalid bind address");
info!("🌐 Server listening on http://{}", addr);
info!("📡 RPC API: http://{}/rpc/v1", addr);
info!("🔌 WebSocket: ws://{}/ws", addr);
// Spawn background update scheduler
let update_data_dir = config.data_dir.clone();
tokio::spawn(async move {
update::run_update_scheduler(update_data_dir).await;
});
server.serve(addr).await?;
// Spawn periodic container snapshot (for crash recovery)
crash_recovery::spawn_snapshot_task(config.data_dir.clone());
// Spawn disk space monitor (warns at 85%, auto-cleans at 90%)
disk_monitor::spawn_disk_monitor(config.data_dir.clone());
let startup_ms = startup_start.elapsed().as_millis();
info!("Server listening on http://{} (startup: {}ms)", addr, startup_ms);
info!("RPC API: http://{}/rpc/v1", addr);
info!("WebSocket: ws://{}/ws", addr);
// Graceful shutdown: wait for SIGTERM or SIGINT
let shutdown = async {
let mut sigterm = signal::unix::signal(signal::unix::SignalKind::terminate())
.expect("Failed to register SIGTERM handler");
tokio::select! {
_ = signal::ctrl_c() => {
info!("Received SIGINT (Ctrl+C), initiating graceful shutdown...");
}
_ = sigterm.recv() => {
info!("Received SIGTERM, initiating graceful shutdown...");
}
}
};
server.serve_with_shutdown(addr, shutdown).await?;
// Clean shutdown: remove PID marker so next startup doesn't trigger recovery
crash_recovery::remove_pid_marker(&config.data_dir).await;
info!("Archipelago shut down cleanly");
Ok(())
}

View File

@ -0,0 +1,384 @@
use super::{ContainerMetrics, MetricSnapshot, SystemMetrics};
use anyhow::{Context, Result};
/// Collect a full metrics snapshot from the system.
pub async fn collect_snapshot() -> Result<MetricSnapshot> {
let timestamp = chrono::Utc::now().timestamp();
let (cpu, mem, disk, net, load) = tokio::join!(
read_cpu_usage(),
read_meminfo(),
read_disk_usage(),
read_network_totals(),
read_loadavg(),
);
let cpu = cpu.unwrap_or(0.0);
let (mem_used, mem_total) = mem.unwrap_or((0, 0));
let (disk_used, disk_total) = disk.unwrap_or((0, 0));
let (net_rx, net_tx) = net.unwrap_or((0, 0));
let (l1, l5, l15) = load.unwrap_or((0.0, 0.0, 0.0));
let system = SystemMetrics {
cpu_percent: cpu,
mem_used_bytes: mem_used,
mem_total_bytes: mem_total,
disk_used_bytes: disk_used,
disk_total_bytes: disk_total,
net_rx_bytes: net_rx,
net_tx_bytes: net_tx,
load_avg_1: l1,
load_avg_5: l5,
load_avg_15: l15,
};
let containers = read_container_stats().await.unwrap_or_default();
Ok(MetricSnapshot {
timestamp,
system,
containers,
rpc_latency_ms: 0.0, // filled in by MetricsStore::push
ws_connections: 0, // filled in by MetricsStore::push
})
}
/// Compute CPU usage by sampling /proc/stat twice with a 250ms gap.
async fn read_cpu_usage() -> Result<f64> {
let snap1 = read_cpu_jiffies().await?;
tokio::time::sleep(std::time::Duration::from_millis(250)).await;
let snap2 = read_cpu_jiffies().await?;
let total_delta = snap2.0.saturating_sub(snap1.0);
let idle_delta = snap2.1.saturating_sub(snap1.1);
if total_delta == 0 {
return Ok(0.0);
}
let usage = 100.0 * (1.0 - (idle_delta as f64 / total_delta as f64));
Ok((usage * 10.0).round() / 10.0)
}
/// Returns (total_jiffies, idle_jiffies) from /proc/stat.
async fn read_cpu_jiffies() -> Result<(u64, u64)> {
let content = tokio::fs::read_to_string("/proc/stat")
.await
.context("Failed to read /proc/stat")?;
let cpu_line = content
.lines()
.next()
.ok_or_else(|| anyhow::anyhow!("Empty /proc/stat"))?;
let vals: Vec<u64> = cpu_line
.split_whitespace()
.skip(1)
.filter_map(|v| v.parse().ok())
.collect();
if vals.len() < 4 {
anyhow::bail!("Not enough fields in /proc/stat cpu line");
}
let idle = vals[3];
let total: u64 = vals.iter().sum();
Ok((total, idle))
}
/// Read memory used/total from /proc/meminfo.
async fn read_meminfo() -> Result<(u64, u64)> {
let content = tokio::fs::read_to_string("/proc/meminfo")
.await
.context("Failed to read /proc/meminfo")?;
let mut total_kb: u64 = 0;
let mut available_kb: u64 = 0;
for line in content.lines() {
if let Some(val) = line.strip_prefix("MemTotal:") {
total_kb = parse_kb(val)?;
} else if let Some(val) = line.strip_prefix("MemAvailable:") {
available_kb = parse_kb(val)?;
}
}
Ok((
total_kb.saturating_sub(available_kb) * 1024,
total_kb * 1024,
))
}
fn parse_kb(val: &str) -> Result<u64> {
val.trim()
.trim_end_matches("kB")
.trim()
.parse::<u64>()
.context("parse meminfo kB value")
}
/// Read disk used/total via `df` for the root filesystem.
async fn read_disk_usage() -> Result<(u64, u64)> {
let output = tokio::process::Command::new("df")
.args(["--block-size=1", "--output=used,size", "/"])
.output()
.await
.context("Failed to run df")?;
if !output.status.success() {
anyhow::bail!("df failed: {}", String::from_utf8_lossy(&output.stderr));
}
let stdout = String::from_utf8(output.stdout).context("df output not utf8")?;
let data_line = stdout
.lines()
.nth(1)
.ok_or_else(|| anyhow::anyhow!("No data line from df"))?;
let mut parts = data_line.split_whitespace();
let used: u64 = parts
.next()
.ok_or_else(|| anyhow::anyhow!("Missing used"))?
.parse()
.context("parse df used")?;
let total: u64 = parts
.next()
.ok_or_else(|| anyhow::anyhow!("Missing total"))?
.parse()
.context("parse df total")?;
Ok((used, total))
}
/// Read load averages from /proc/loadavg.
async fn read_loadavg() -> Result<(f64, f64, f64)> {
let content = tokio::fs::read_to_string("/proc/loadavg")
.await
.context("Failed to read /proc/loadavg")?;
let mut parts = content.split_whitespace();
let l1: f64 = parts
.next()
.ok_or_else(|| anyhow::anyhow!("Missing load1"))?
.parse()
.context("parse load1")?;
let l5: f64 = parts
.next()
.ok_or_else(|| anyhow::anyhow!("Missing load5"))?
.parse()
.context("parse load5")?;
let l15: f64 = parts
.next()
.ok_or_else(|| anyhow::anyhow!("Missing load15"))?
.parse()
.context("parse load15")?;
Ok((l1, l5, l15))
}
/// Read total network RX/TX bytes from /proc/net/dev (sum of all real interfaces).
async fn read_network_totals() -> Result<(u64, u64)> {
let content = tokio::fs::read_to_string("/proc/net/dev")
.await
.context("Failed to read /proc/net/dev")?;
let mut rx_total: u64 = 0;
let mut tx_total: u64 = 0;
for line in content.lines().skip(2) {
let line = line.trim();
if line.is_empty() {
continue;
}
// Split on colon to separate interface name from data
let (iface, data) = match line.split_once(':') {
Some((i, d)) => (i.trim(), d),
None => continue,
};
// Skip loopback
if iface == "lo" {
continue;
}
let parts: Vec<&str> = data.split_whitespace().collect();
// Fields: rx_bytes rx_packets rx_errs ... (8 fields) tx_bytes tx_packets ...
if parts.len() >= 10 {
if let Ok(rx) = parts[0].parse::<u64>() {
rx_total += rx;
}
if let Ok(tx) = parts[8].parse::<u64>() {
tx_total += tx;
}
}
}
Ok((rx_total, tx_total))
}
/// Get per-container resource stats via `podman stats --no-stream --format json`.
async fn read_container_stats() -> Result<Vec<ContainerMetrics>> {
let output = tokio::process::Command::new("sudo")
.args(["podman", "stats", "--no-stream", "--format", "json"])
.output()
.await
.context("Failed to run podman stats")?;
if !output.status.success() {
anyhow::bail!(
"podman stats failed: {}",
String::from_utf8_lossy(&output.stderr)
);
}
let stdout = String::from_utf8_lossy(&output.stdout);
let entries: Vec<serde_json::Value> = serde_json::from_str(&stdout).unwrap_or_default();
Ok(entries
.iter()
.filter_map(|e| {
let name = e
.get("name")
.or_else(|| e.get("Name"))
.and_then(|v| v.as_str())?
.to_string();
Some(ContainerMetrics {
name,
cpu_percent: parse_percent_field(e, "cpu_percent")
.or_else(|| parse_percent_field(e, "CPUPerc"))
.unwrap_or(0.0),
mem_used_bytes: parse_bytes_field(e, "mem_usage")
.or_else(|| parse_bytes_field(e, "MemUsage"))
.unwrap_or(0),
mem_limit_bytes: parse_bytes_field(e, "mem_limit")
.or_else(|| parse_bytes_field(e, "MemLimit"))
.unwrap_or(0),
net_rx_bytes: parse_bytes_field(e, "net_input")
.or_else(|| parse_bytes_field(e, "NetInput"))
.unwrap_or(0),
net_tx_bytes: parse_bytes_field(e, "net_output")
.or_else(|| parse_bytes_field(e, "NetOutput"))
.unwrap_or(0),
block_read_bytes: parse_bytes_field(e, "block_input")
.or_else(|| parse_bytes_field(e, "BlockInput"))
.unwrap_or(0),
block_write_bytes: parse_bytes_field(e, "block_output")
.or_else(|| parse_bytes_field(e, "BlockOutput"))
.unwrap_or(0),
})
})
.collect())
}
/// Parse a percentage field that may be a number or a string like "12.5%".
fn parse_percent_field(obj: &serde_json::Value, key: &str) -> Option<f64> {
let val = obj.get(key)?;
if let Some(n) = val.as_f64() {
return Some(n);
}
val.as_str()?.trim_end_matches('%').trim().parse::<f64>().ok()
}
/// Parse a bytes field that may be a number or a human-readable string.
fn parse_bytes_field(obj: &serde_json::Value, key: &str) -> Option<u64> {
let val = obj.get(key)?;
if let Some(n) = val.as_u64() {
return Some(n);
}
parse_human_bytes(val.as_str()?)
}
/// Parse human-readable byte strings like "1.5GiB", "256MiB", "100MiB / 16GiB".
fn parse_human_bytes(s: &str) -> Option<u64> {
let s = s.trim();
if s == "--" || s.is_empty() {
return Some(0);
}
// Handle "X / Y" format — take only the first part
let s = s.split('/').next()?.trim();
let (num_str, multiplier) = if let Some(n) = s.strip_suffix("GiB") {
(n, 1024u64 * 1024 * 1024)
} else if let Some(n) = s.strip_suffix("MiB") {
(n, 1024u64 * 1024)
} else if let Some(n) = s.strip_suffix("KiB") {
(n, 1024u64)
} else if let Some(n) = s.strip_suffix("GB") {
(n, 1_000_000_000u64)
} else if let Some(n) = s.strip_suffix("MB") {
(n, 1_000_000u64)
} else if let Some(n) = s.strip_suffix("kB") {
(n, 1000u64)
} else if let Some(n) = s.strip_suffix('B') {
(n, 1u64)
} else {
(s, 1u64)
};
let num: f64 = num_str.trim().parse().ok()?;
Some((num * multiplier as f64) as u64)
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_parse_human_bytes_gib() {
assert_eq!(parse_human_bytes("1GiB"), Some(1073741824));
assert_eq!(parse_human_bytes("1.5GiB"), Some(1610612736));
}
#[test]
fn test_parse_human_bytes_mib() {
assert_eq!(parse_human_bytes("256MiB"), Some(268435456));
}
#[test]
fn test_parse_human_bytes_kib() {
assert_eq!(parse_human_bytes("1024KiB"), Some(1048576));
}
#[test]
fn test_parse_human_bytes_si() {
assert_eq!(parse_human_bytes("1000MB"), Some(1000000000));
assert_eq!(parse_human_bytes("100B"), Some(100));
}
#[test]
fn test_parse_human_bytes_empty() {
assert_eq!(parse_human_bytes("--"), Some(0));
assert_eq!(parse_human_bytes(""), Some(0));
}
#[test]
fn test_parse_human_bytes_slash_format() {
assert_eq!(parse_human_bytes("100MiB / 16GiB"), Some(104857600));
}
#[test]
fn test_parse_percent_string() {
let obj = serde_json::json!({"cpu": "12.5%"});
assert_eq!(parse_percent_field(&obj, "cpu"), Some(12.5));
}
#[test]
fn test_parse_percent_number() {
let obj = serde_json::json!({"cpu": 12.5});
assert_eq!(parse_percent_field(&obj, "cpu"), Some(12.5));
}
#[test]
fn test_parse_percent_missing() {
let obj = serde_json::json!({"other": 1});
assert_eq!(parse_percent_field(&obj, "cpu"), None);
}
#[test]
fn test_parse_bytes_field_number() {
let obj = serde_json::json!({"mem": 1048576});
assert_eq!(parse_bytes_field(&obj, "mem"), Some(1048576));
}
#[test]
fn test_parse_bytes_field_string() {
let obj = serde_json::json!({"mem": "256MiB"});
assert_eq!(parse_bytes_field(&obj, "mem"), Some(268435456));
}
}

View File

@ -0,0 +1,364 @@
pub mod collector;
use serde::{Deserialize, Serialize};
use std::collections::VecDeque;
use std::sync::atomic::{AtomicU32, Ordering};
use std::sync::Arc;
use tokio::sync::RwLock;
use tracing::{debug, warn};
/// Maximum entries at 1-minute resolution (24 hours = 1440 minutes)
const MAX_1MIN_ENTRIES: usize = 1440;
/// Maximum entries at 15-minute resolution (7 days = 672 quarter-hours)
const MAX_15MIN_ENTRIES: usize = 672;
/// A single metrics snapshot collected at a point in time.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MetricSnapshot {
pub timestamp: i64,
pub system: SystemMetrics,
pub containers: Vec<ContainerMetrics>,
pub rpc_latency_ms: f64,
pub ws_connections: u32,
}
/// System-wide resource metrics.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SystemMetrics {
pub cpu_percent: f64,
pub mem_used_bytes: u64,
pub mem_total_bytes: u64,
pub disk_used_bytes: u64,
pub disk_total_bytes: u64,
pub net_rx_bytes: u64,
pub net_tx_bytes: u64,
pub load_avg_1: f64,
pub load_avg_5: f64,
pub load_avg_15: f64,
}
/// Per-container resource metrics.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ContainerMetrics {
pub name: String,
pub cpu_percent: f64,
pub mem_used_bytes: u64,
pub mem_limit_bytes: u64,
pub net_rx_bytes: u64,
pub net_tx_bytes: u64,
pub block_read_bytes: u64,
pub block_write_bytes: u64,
}
/// Thread-safe metrics store with ring buffers at two resolutions.
pub struct MetricsStore {
minute_data: RwLock<VecDeque<MetricSnapshot>>,
quarter_hour_data: RwLock<VecDeque<MetricSnapshot>>,
minute_count: RwLock<u32>,
rpc_latency: RwLock<(f64, u64)>,
ws_connections: AtomicU32,
}
impl MetricsStore {
pub fn new() -> Self {
Self {
minute_data: RwLock::new(VecDeque::with_capacity(MAX_1MIN_ENTRIES)),
quarter_hour_data: RwLock::new(VecDeque::with_capacity(MAX_15MIN_ENTRIES)),
minute_count: RwLock::new(0),
rpc_latency: RwLock::new((0.0, 0)),
ws_connections: AtomicU32::new(0),
}
}
/// Record a new metric snapshot (called every minute by collector).
pub async fn push(&self, mut snapshot: MetricSnapshot) {
// Fill in RPC latency from accumulated samples
{
let mut latency = self.rpc_latency.write().await;
if latency.1 > 0 {
snapshot.rpc_latency_ms = (latency.0 / latency.1 as f64 * 10.0).round() / 10.0;
*latency = (0.0, 0);
}
}
// Fill in current WS connection count
snapshot.ws_connections = self.ws_connections.load(Ordering::Relaxed);
// Push to 1-minute ring buffer
{
let mut buf = self.minute_data.write().await;
if buf.len() >= MAX_1MIN_ENTRIES {
buf.pop_front();
}
buf.push_back(snapshot.clone());
}
// Every 15 minutes, push to quarter-hour ring buffer
{
let mut count = self.minute_count.write().await;
*count += 1;
if *count >= 15 {
*count = 0;
let mut buf = self.quarter_hour_data.write().await;
if buf.len() >= MAX_15MIN_ENTRIES {
buf.pop_front();
}
buf.push_back(snapshot);
}
}
}
/// Record an RPC request latency sample (milliseconds).
pub async fn record_rpc_latency(&self, latency_ms: f64) {
let mut data = self.rpc_latency.write().await;
data.0 += latency_ms;
data.1 += 1;
}
/// Increment WebSocket connection count (called on connect).
pub fn increment_ws(&self) {
self.ws_connections.fetch_add(1, Ordering::Relaxed);
}
/// Decrement WebSocket connection count (called on disconnect).
pub fn decrement_ws(&self) {
// Use saturating semantics to avoid underflow
let _ = self.ws_connections.fetch_update(Ordering::Relaxed, Ordering::Relaxed, |v| {
if v > 0 { Some(v - 1) } else { Some(0) }
});
}
/// Get the latest snapshot.
pub async fn latest(&self) -> Option<MetricSnapshot> {
self.minute_data.read().await.back().cloned()
}
/// Get minute-resolution data for the last N minutes.
pub async fn history_minutes(&self, last_n: usize) -> Vec<MetricSnapshot> {
let buf = self.minute_data.read().await;
let start = buf.len().saturating_sub(last_n);
buf.iter().skip(start).cloned().collect()
}
/// Get quarter-hour-resolution data for the last N entries.
pub async fn history_quarter_hours(&self, last_n: usize) -> Vec<MetricSnapshot> {
let buf = self.quarter_hour_data.read().await;
let start = buf.len().saturating_sub(last_n);
buf.iter().skip(start).cloned().collect()
}
}
/// Spawn the background metrics collector (runs every 60 seconds).
pub fn spawn_metrics_collector(store: Arc<MetricsStore>) {
tokio::spawn(async move {
// Wait 30s for system to stabilize after boot
tokio::time::sleep(std::time::Duration::from_secs(30)).await;
let mut interval = tokio::time::interval(std::time::Duration::from_secs(60));
loop {
interval.tick().await;
match collector::collect_snapshot().await {
Ok(snapshot) => {
store.push(snapshot).await;
debug!("Metrics snapshot collected");
}
Err(e) => {
warn!("Failed to collect metrics: {}", e);
}
}
}
});
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_metrics_store_new() {
let store = MetricsStore::new();
assert_eq!(store.ws_connections.load(Ordering::Relaxed), 0);
}
#[test]
fn test_ws_connection_tracking() {
let store = MetricsStore::new();
store.increment_ws();
store.increment_ws();
assert_eq!(store.ws_connections.load(Ordering::Relaxed), 2);
store.decrement_ws();
assert_eq!(store.ws_connections.load(Ordering::Relaxed), 1);
store.decrement_ws();
assert_eq!(store.ws_connections.load(Ordering::Relaxed), 0);
// Decrement below zero should stay at 0
store.decrement_ws();
assert_eq!(store.ws_connections.load(Ordering::Relaxed), 0);
}
#[tokio::test]
async fn test_push_and_latest() {
let store = MetricsStore::new();
assert!(store.latest().await.is_none());
let snapshot = MetricSnapshot {
timestamp: 1000,
system: SystemMetrics {
cpu_percent: 25.0,
mem_used_bytes: 1_000_000,
mem_total_bytes: 4_000_000,
disk_used_bytes: 500_000,
disk_total_bytes: 1_000_000,
net_rx_bytes: 100,
net_tx_bytes: 200,
load_avg_1: 1.0,
load_avg_5: 0.5,
load_avg_15: 0.3,
},
containers: vec![],
rpc_latency_ms: 0.0,
ws_connections: 0,
};
store.push(snapshot).await;
let latest = store.latest().await.unwrap();
assert_eq!(latest.timestamp, 1000);
assert_eq!(latest.system.cpu_percent, 25.0);
}
#[tokio::test]
async fn test_rpc_latency_recording() {
let store = MetricsStore::new();
store.record_rpc_latency(10.0).await;
store.record_rpc_latency(20.0).await;
store.record_rpc_latency(30.0).await;
let snapshot = MetricSnapshot {
timestamp: 2000,
system: SystemMetrics {
cpu_percent: 0.0,
mem_used_bytes: 0,
mem_total_bytes: 0,
disk_used_bytes: 0,
disk_total_bytes: 0,
net_rx_bytes: 0,
net_tx_bytes: 0,
load_avg_1: 0.0,
load_avg_5: 0.0,
load_avg_15: 0.0,
},
containers: vec![],
rpc_latency_ms: 0.0,
ws_connections: 0,
};
store.push(snapshot).await;
let latest = store.latest().await.unwrap();
assert_eq!(latest.rpc_latency_ms, 20.0); // average of 10+20+30 = 20
}
#[tokio::test]
async fn test_history_minutes() {
let store = MetricsStore::new();
for i in 0..5 {
let snapshot = MetricSnapshot {
timestamp: i * 60,
system: SystemMetrics {
cpu_percent: i as f64,
mem_used_bytes: 0,
mem_total_bytes: 0,
disk_used_bytes: 0,
disk_total_bytes: 0,
net_rx_bytes: 0,
net_tx_bytes: 0,
load_avg_1: 0.0,
load_avg_5: 0.0,
load_avg_15: 0.0,
},
containers: vec![],
rpc_latency_ms: 0.0,
ws_connections: 0,
};
store.push(snapshot).await;
}
let history = store.history_minutes(3).await;
assert_eq!(history.len(), 3);
assert_eq!(history[0].timestamp, 120);
assert_eq!(history[2].timestamp, 240);
}
#[tokio::test]
async fn test_ring_buffer_eviction() {
let store = MetricsStore::new();
// Push more than MAX_1MIN_ENTRIES
for i in 0..(MAX_1MIN_ENTRIES + 10) {
let snapshot = MetricSnapshot {
timestamp: i as i64,
system: SystemMetrics {
cpu_percent: 0.0,
mem_used_bytes: 0,
mem_total_bytes: 0,
disk_used_bytes: 0,
disk_total_bytes: 0,
net_rx_bytes: 0,
net_tx_bytes: 0,
load_avg_1: 0.0,
load_avg_5: 0.0,
load_avg_15: 0.0,
},
containers: vec![],
rpc_latency_ms: 0.0,
ws_connections: 0,
};
store.push(snapshot).await;
}
let all = store.history_minutes(MAX_1MIN_ENTRIES + 100).await;
assert_eq!(all.len(), MAX_1MIN_ENTRIES);
// Oldest entry should be 10 (first 10 were evicted)
assert_eq!(all[0].timestamp, 10);
}
#[tokio::test]
async fn test_quarter_hour_downsampling() {
let store = MetricsStore::new();
// Push exactly 15 entries to trigger one quarter-hour sample
for i in 0..15 {
let snapshot = MetricSnapshot {
timestamp: i * 60,
system: SystemMetrics {
cpu_percent: 50.0,
mem_used_bytes: 0,
mem_total_bytes: 0,
disk_used_bytes: 0,
disk_total_bytes: 0,
net_rx_bytes: 0,
net_tx_bytes: 0,
load_avg_1: 0.0,
load_avg_5: 0.0,
load_avg_15: 0.0,
},
containers: vec![],
rpc_latency_ms: 0.0,
ws_connections: 0,
};
store.push(snapshot).await;
}
let qh = store.history_quarter_hours(10).await;
assert_eq!(qh.len(), 1);
assert_eq!(qh[0].timestamp, 14 * 60); // The 15th entry (index 14)
}
#[test]
fn test_constants() {
assert_eq!(MAX_1MIN_ENTRIES, 1440);
assert_eq!(MAX_15MIN_ENTRIES, 672);
}
}

View File

@ -2,6 +2,7 @@ use crate::api::ApiHandler;
use crate::config::{Config, ContainerRuntime};
use crate::container::{docker_packages, DockerPackageScanner};
use crate::identity::{self, NodeIdentity};
use crate::monitoring::MetricsStore;
use crate::node_message;
use crate::nostr_discovery;
use crate::peers;
@ -13,7 +14,7 @@ use std::net::SocketAddr;
use std::sync::Arc;
use std::time::Duration;
use tokio::net::TcpListener;
use tracing::{debug, error, info};
use tracing::{debug, error, info, warn};
pub struct Server {
_config: Config,
@ -74,7 +75,14 @@ impl Server {
info!("🔑 Node identity: {} (pubkey: {}...)", identity.node_id(), &identity.pubkey_hex()[..16.min(identity.pubkey_hex().len())]);
let identity = Arc::new(identity);
let api_handler = Arc::new(ApiHandler::new(config.clone(), state_manager.clone()).await?);
// Create metrics store and spawn background collector
let metrics_store = Arc::new(MetricsStore::new());
crate::monitoring::spawn_metrics_collector(metrics_store.clone());
let api_handler = Arc::new(
ApiHandler::new(config.clone(), state_manager.clone(), metrics_store).await?,
);
// Periodic Tor address refresh (runs regardless of dev_mode)
// Picks up hostname when Tor creates it after startup/rotation (30-60s delay)
@ -131,6 +139,9 @@ impl Server {
});
}
// Container health monitoring — auto-restart unhealthy containers
crate::health_monitor::spawn_health_monitor(state_manager.clone());
Ok(Self {
_config: config,
_identity: identity,
@ -140,37 +151,71 @@ impl Server {
}
pub async fn serve(&self, addr: SocketAddr) -> Result<()> {
self.serve_with_shutdown(addr, std::future::pending()).await
}
/// Serve with a graceful shutdown signal.
/// When the shutdown future completes, stop accepting new connections and drain in-flight requests.
pub async fn serve_with_shutdown(
&self,
addr: SocketAddr,
shutdown: impl std::future::Future<Output = ()>,
) -> Result<()> {
let listener = TcpListener::bind(addr).await?;
let active_connections = Arc::new(tokio::sync::Semaphore::new(1024));
tokio::pin!(shutdown);
loop {
let (stream, peer_addr) = match listener.accept().await {
Ok(conn) => conn,
Err(e) => {
error!("Failed to accept connection: {}", e);
continue;
tokio::select! {
result = listener.accept() => {
let (stream, peer_addr) = match result {
Ok(conn) => conn,
Err(e) => {
error!("Failed to accept connection: {}", e);
continue;
}
};
let handler = self.api_handler.clone();
let permit = active_connections.clone().acquire_owned().await;
tokio::spawn(async move {
let _permit = permit;
let service = service_fn(move |req| {
let handler = handler.clone();
async move {
handler.handle_request(req).await
.map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, format!("{}", e)))
}
});
if let Err(e) = Http::new()
.http1_keep_alive(false)
.serve_connection(stream, service)
.with_upgrades()
.await
{
error!("Error serving connection from {}: {}", peer_addr, e);
}
});
}
};
let handler = self.api_handler.clone();
tokio::spawn(async move {
let service = service_fn(move |req| {
let handler = handler.clone();
async move {
handler.handle_request(req).await
.map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, format!("{}", e)))
_ = &mut shutdown => {
info!("Shutdown signal received, draining connections...");
// Wait up to 5 seconds for in-flight requests to complete
let drain_start = std::time::Instant::now();
let drain_timeout = std::time::Duration::from_secs(5);
while active_connections.available_permits() < 1024 {
if drain_start.elapsed() > drain_timeout {
warn!("Drain timeout reached, forcing shutdown");
break;
}
tokio::time::sleep(std::time::Duration::from_millis(100)).await;
}
});
if let Err(e) = Http::new()
.http1_keep_alive(false)
.serve_connection(stream, service)
.with_upgrades()
.await
{
error!("Error serving connection from {}: {}", peer_addr, e);
info!("Shutdown complete");
return Ok(());
}
});
}
}
}
}

View File

@ -326,7 +326,7 @@
- [x] **COMM-03** — Set up issue tracker and roadmap. Configure GitHub Issues with labels, templates, and project board. Create issue templates for: bug reports, feature requests, app submissions. **Acceptance**: Issue tracker ready for community use.
- [ ] **COMM-04** — Publish v0.9.0 release. Final pre-1.0 release. Full ISO builds, comprehensive release notes, migration guide from 0.8. **Acceptance**: Published release, tested on 3+ hardware configs.
- [ ] **COMM-04**(BLOCKED: requires physical ISO build on server and testing on 3+ hardware configs — cannot be automated from code) Publish v0.9.0 release. Final pre-1.0 release. Full ISO builds, comprehensive release notes, migration guide from 0.8. **Acceptance**: Published release, tested on 3+ hardware configs.
---
@ -336,7 +336,7 @@
#### Sprint 27: Advanced Monitoring (Week 1-4)
- [ ] **MON-01** — Implement real-time metrics collection. Add `core/archipelago/src/monitoring/collector.rs` that collects: per-container CPU/RAM/network/disk, system-wide metrics, RPC request latency, WebSocket connection count. Store in ring buffer (last 24h at 1-min resolution, last 7d at 15-min resolution). **Acceptance**: Metrics collected and queryable via RPC.
- [x] **MON-01** — Implement real-time metrics collection. Add `core/archipelago/src/monitoring/collector.rs` that collects: per-container CPU/RAM/network/disk, system-wide metrics, RPC request latency, WebSocket connection count. Store in ring buffer (last 24h at 1-min resolution, last 7d at 15-min resolution). **Acceptance**: Metrics collected and queryable via RPC.
- [ ] **MON-02** — Add monitoring dashboard page. Create `neode-ui/src/views/Monitoring.vue` at `/dashboard/monitoring` with: real-time line charts (CPU, RAM, network), per-container resource breakdown, alert history, system health timeline. Use canvas-based charts (no heavy library -- build simple line chart component). **Acceptance**: Real-time metrics visible with 5s refresh.