archy/core/archipelago/src/monitoring/mod.rs

// Snapshot collection; `collector::collect_snapshot` drives the loop in
// `spawn_metrics_collector` below.
pub mod collector;
// Crate-visible only. Not referenced directly in this file; presumably holds
// the alert rule logic — confirm against alerts.rs.
pub(crate) mod alerts;
// Private alert delivery helpers (`push_alert_notifications`,
// `deliver_alert_webhooks`) called from the collector loop.
mod notifications;
// Defines `MetricsStore`, re-exported below.
pub mod store;
// Private; only `spawn_telemetry_reporter` is exposed, via the re-export below.
mod telemetry;
// Shared monitoring types, glob re-exported below.
pub mod types;

// Re-export public types for external consumers so they can be imported
// from this module directly instead of from the individual submodules.
pub use store::MetricsStore;
pub use telemetry::spawn_telemetry_reporter;
pub use types::*;

use std::path::PathBuf;
use std::sync::Arc;
use tracing::{debug, warn};

/// Launches a detached Tokio task that periodically samples system metrics.
///
/// The task first sleeps for 60 seconds so the system can settle after boot,
/// then collects a snapshot every 300 seconds (5 minutes). Missed ticks are
/// skipped instead of being replayed in a burst.
///
/// For every successfully collected snapshot, `store.check_alerts` is run
/// against it, the snapshot is pushed into `store`, and any resulting alerts
/// are forwarded:
/// - through `notifications::push_alert_notifications` when `state` is `Some`;
/// - through `notifications::deliver_alert_webhooks` when `data_dir` is `Some`.
///
/// A failed collection is logged as a warning and the loop simply waits for
/// the next tick.
///
/// Note: health_monitor.rs polls container state at 120s intervals. This
/// collector covers system-level metrics (CPU, disk, network) and only
/// calls podman stats every 5 minutes to avoid duplicate subprocess overhead.
pub fn spawn_metrics_collector(
    store: Arc<MetricsStore>,
    state: Option<Arc<crate::state::StateManager>>,
    data_dir: Option<PathBuf>,
) {
    // Grace period after boot before the first sample is taken.
    const STARTUP_DELAY: std::time::Duration = std::time::Duration::from_secs(60);
    // Time between consecutive samples (5 minutes).
    const SAMPLE_PERIOD: std::time::Duration = std::time::Duration::from_secs(300);

    tokio::spawn(async move {
        tokio::time::sleep(STARTUP_DELAY).await;

        // The first tick of a Tokio interval completes immediately, so the
        // first sample is taken right after the startup delay.
        let mut ticker = tokio::time::interval(SAMPLE_PERIOD);
        ticker.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip);

        loop {
            ticker.tick().await;

            let snapshot = match collector::collect_snapshot().await {
                Ok(snapshot) => snapshot,
                Err(e) => {
                    warn!("Failed to collect metrics: {}", e);
                    continue;
                }
            };

            // Alerts are evaluated before the snapshot is moved into the store.
            let alerts = store.check_alerts(&snapshot).await;
            store.push(snapshot).await;
            debug!("Metrics snapshot collected");

            // Nothing fired: no notifications to dispatch this round.
            if alerts.is_empty() {
                continue;
            }

            // Each delivery channel is optional and independent of the other.
            if let Some(state_mgr) = state.as_ref() {
                notifications::push_alert_notifications(state_mgr, &alerts).await;
            }
            if let Some(dir) = data_dir.as_ref() {
                notifications::deliver_alert_webhooks(dir, &alerts).await;
            }
        }
    });
}
feat: add real-time metrics collection with ring buffer storage (MON-01) Implements monitoring/collector.rs that collects per-container CPU/RAM/network/disk, system-wide metrics, RPC latency, and WebSocket connection count every 60 seconds. Data stored in dual ring buffers: 1-min resolution (24h) and 15-min resolution (7d). Three new RPC endpoints: monitoring.current, monitoring.history, monitoring.containers. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> 2026-03-11 11:11:02 +00:00			`pub mod collector;`
bug fixing and deploy and build diagnostics 2026-03-22 03:30:21 +00:00			`pub(crate) mod alerts;`
			`mod notifications;`
			`pub mod store;`
			`mod telemetry;`
			`pub mod types;`

			`// Re-export public types for external consumers`
			`pub use store::MetricsStore;`
			`pub use telemetry::spawn_telemetry_reporter;`
			`pub use types::*;`
feat: add real-time metrics collection with ring buffer storage (MON-01) Implements monitoring/collector.rs that collects per-container CPU/RAM/network/disk, system-wide metrics, RPC latency, and WebSocket connection count every 60 seconds. Data stored in dual ring buffers: 1-min resolution (24h) and 15-min resolution (7d). Three new RPC endpoints: monitoring.current, monitoring.history, monitoring.containers. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> 2026-03-11 11:11:02 +00:00
patches on sxsw ai working api key working container hardened plus many more 2026-03-12 22:19:04 +00:00			`use std::path::PathBuf;`
feat: add real-time metrics collection with ring buffer storage (MON-01) Implements monitoring/collector.rs that collects per-container CPU/RAM/network/disk, system-wide metrics, RPC latency, and WebSocket connection count every 60 seconds. Data stored in dual ring buffers: 1-min resolution (24h) and 15-min resolution (7d). Three new RPC endpoints: monitoring.current, monitoring.history, monitoring.containers. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> 2026-03-11 11:11:02 +00:00			`use std::sync::Arc;`
bug fixing and deploy and build diagnostics 2026-03-22 03:30:21 +00:00			`use tracing::{debug, warn};`
patches on sxsw ai working api key working container hardened plus many more 2026-03-12 22:19:04 +00:00
fix: overhaul container lifecycle — recovery, health, uninstall, UI state Container recovery: - Health monitor: MAX_RESTART_ATTEMPTS 3→10, interval 60s→120s - Dependency-aware restarts: won't restart services before their deps - Reset dependent counters when a dependency recovers - Handle "created" state containers (were invisible to health monitor) - Added IndeedHub, mempool-api, mysql to tier system - Crash recovery: podman start timeout 30s→120s with retry - Podman client: socket timeout 5s→30s, added restart policy UI state representation: - Exit code 0 shows "stopped" (gray), not "crashed" (red) - Exit code 137 shows "killed (OOM)" - Non-zero exit shows "crashed" (red) - Added exit_code field to PackageDataEntry Install/uninstall fixes: - Install returns error when container doesn't start (was silent success) - Post-install hooks awaited instead of fire-and-forget tokio::spawn - Uninstall: graceful rm before force, volume prune, network cleanup - Uninstall returns error on partial failure (was 200 OK) Config consistency: - DB passwords read from /var/lib/archipelago/secrets/ (was hardcoded) - Bitcoin: added ZMQ ports 28332/28333 for LND block notifications - IndeedHub port 7777→8190 (was conflicting with strfry) - Marketplace versions: LND 0.17.4→0.18.4, Mempool 2.5.0→3.0.0 Performance: - Metrics collector interval 60s→300s (was duplicating health monitor) - Podman client: proper error propagation instead of unwrap_or_default Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> 2026-03-31 07:03:57 +01:00			`/// Spawn the background metrics collector (runs every 300 seconds / 5 minutes).`
bug fixing and deploy and build diagnostics 2026-03-22 03:30:21 +00:00			`/// Evaluates alert rules on each snapshot and dispatches notifications.`
fix: overhaul container lifecycle — recovery, health, uninstall, UI state Container recovery: - Health monitor: MAX_RESTART_ATTEMPTS 3→10, interval 60s→120s - Dependency-aware restarts: won't restart services before their deps - Reset dependent counters when a dependency recovers - Handle "created" state containers (were invisible to health monitor) - Added IndeedHub, mempool-api, mysql to tier system - Crash recovery: podman start timeout 30s→120s with retry - Podman client: socket timeout 5s→30s, added restart policy UI state representation: - Exit code 0 shows "stopped" (gray), not "crashed" (red) - Exit code 137 shows "killed (OOM)" - Non-zero exit shows "crashed" (red) - Added exit_code field to PackageDataEntry Install/uninstall fixes: - Install returns error when container doesn't start (was silent success) - Post-install hooks awaited instead of fire-and-forget tokio::spawn - Uninstall: graceful rm before force, volume prune, network cleanup - Uninstall returns error on partial failure (was 200 OK) Config consistency: - DB passwords read from /var/lib/archipelago/secrets/ (was hardcoded) - Bitcoin: added ZMQ ports 28332/28333 for LND block notifications - IndeedHub port 7777→8190 (was conflicting with strfry) - Marketplace versions: LND 0.17.4→0.18.4, Mempool 2.5.0→3.0.0 Performance: - Metrics collector interval 60s→300s (was duplicating health monitor) - Podman client: proper error propagation instead of unwrap_or_default Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> 2026-03-31 07:03:57 +01:00			`/// Note: health_monitor.rs handles container state polling at 120s intervals.`
			`/// This collector handles system-level metrics (CPU, disk, network) and only`
			`/// calls podman stats every 5 minutes to avoid duplicate subprocess overhead.`
feat: add alerting system with configurable rules and UI (MON-02, MON-03) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> 2026-03-11 12:28:44 +00:00			`pub fn spawn_metrics_collector(`
			`store: Arc<MetricsStore>,`
			`state: Option<Arc<crate::state::StateManager>>,`
fix: add webhook delivery for monitoring alerts DiskUsage and ContainerCrash alerts now fire webhooks via send_webhook() after pushing WebSocket notifications. Added data_dir parameter to spawn_metrics_collector for webhook config access. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> 2026-03-12 22:32:19 +00:00			`data_dir: Option<PathBuf>,`
feat: add alerting system with configurable rules and UI (MON-02, MON-03) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> 2026-03-11 12:28:44 +00:00			`) {`
feat: add real-time metrics collection with ring buffer storage (MON-01) Implements monitoring/collector.rs that collects per-container CPU/RAM/network/disk, system-wide metrics, RPC latency, and WebSocket connection count every 60 seconds. Data stored in dual ring buffers: 1-min resolution (24h) and 15-min resolution (7d). Three new RPC endpoints: monitoring.current, monitoring.history, monitoring.containers. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> 2026-03-11 11:11:02 +00:00			`tokio::spawn(async move {`
fix: overhaul container lifecycle — recovery, health, uninstall, UI state Container recovery: - Health monitor: MAX_RESTART_ATTEMPTS 3→10, interval 60s→120s - Dependency-aware restarts: won't restart services before their deps - Reset dependent counters when a dependency recovers - Handle "created" state containers (were invisible to health monitor) - Added IndeedHub, mempool-api, mysql to tier system - Crash recovery: podman start timeout 30s→120s with retry - Podman client: socket timeout 5s→30s, added restart policy UI state representation: - Exit code 0 shows "stopped" (gray), not "crashed" (red) - Exit code 137 shows "killed (OOM)" - Non-zero exit shows "crashed" (red) - Added exit_code field to PackageDataEntry Install/uninstall fixes: - Install returns error when container doesn't start (was silent success) - Post-install hooks awaited instead of fire-and-forget tokio::spawn - Uninstall: graceful rm before force, volume prune, network cleanup - Uninstall returns error on partial failure (was 200 OK) Config consistency: - DB passwords read from /var/lib/archipelago/secrets/ (was hardcoded) - Bitcoin: added ZMQ ports 28332/28333 for LND block notifications - IndeedHub port 7777→8190 (was conflicting with strfry) - Marketplace versions: LND 0.17.4→0.18.4, Mempool 2.5.0→3.0.0 Performance: - Metrics collector interval 60s→300s (was duplicating health monitor) - Podman client: proper error propagation instead of unwrap_or_default Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> 2026-03-31 07:03:57 +01:00			`// Wait 60s for system to stabilize after boot`
			`tokio::time::sleep(std::time::Duration::from_secs(60)).await;`
feat: add real-time metrics collection with ring buffer storage (MON-01) Implements monitoring/collector.rs that collects per-container CPU/RAM/network/disk, system-wide metrics, RPC latency, and WebSocket connection count every 60 seconds. Data stored in dual ring buffers: 1-min resolution (24h) and 15-min resolution (7d). Three new RPC endpoints: monitoring.current, monitoring.history, monitoring.containers. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> 2026-03-11 11:11:02 +00:00
fix: overhaul container lifecycle — recovery, health, uninstall, UI state Container recovery: - Health monitor: MAX_RESTART_ATTEMPTS 3→10, interval 60s→120s - Dependency-aware restarts: won't restart services before their deps - Reset dependent counters when a dependency recovers - Handle "created" state containers (were invisible to health monitor) - Added IndeedHub, mempool-api, mysql to tier system - Crash recovery: podman start timeout 30s→120s with retry - Podman client: socket timeout 5s→30s, added restart policy UI state representation: - Exit code 0 shows "stopped" (gray), not "crashed" (red) - Exit code 137 shows "killed (OOM)" - Non-zero exit shows "crashed" (red) - Added exit_code field to PackageDataEntry Install/uninstall fixes: - Install returns error when container doesn't start (was silent success) - Post-install hooks awaited instead of fire-and-forget tokio::spawn - Uninstall: graceful rm before force, volume prune, network cleanup - Uninstall returns error on partial failure (was 200 OK) Config consistency: - DB passwords read from /var/lib/archipelago/secrets/ (was hardcoded) - Bitcoin: added ZMQ ports 28332/28333 for LND block notifications - IndeedHub port 7777→8190 (was conflicting with strfry) - Marketplace versions: LND 0.17.4→0.18.4, Mempool 2.5.0→3.0.0 Performance: - Metrics collector interval 60s→300s (was duplicating health monitor) - Podman client: proper error propagation instead of unwrap_or_default Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> 2026-03-31 07:03:57 +01:00			`let mut interval = tokio::time::interval(std::time::Duration::from_secs(300));`
perf: skip missed ticks on all intervals, reduce scan frequency Prevents burst of health checks, scans, and snapshots after slow podman responses by using MissedTickBehavior::Skip. Bumps container scan interval from 30s to 60s to reduce DB lock contention. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> 2026-04-07 20:25:09 +01:00			`interval.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip);`
feat: add real-time metrics collection with ring buffer storage (MON-01) Implements monitoring/collector.rs that collects per-container CPU/RAM/network/disk, system-wide metrics, RPC latency, and WebSocket connection count every 60 seconds. Data stored in dual ring buffers: 1-min resolution (24h) and 15-min resolution (7d). Three new RPC endpoints: monitoring.current, monitoring.history, monitoring.containers. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> 2026-03-11 11:11:02 +00:00
			`loop {`
			`interval.tick().await;`

			`match collector::collect_snapshot().await {`
			`Ok(snapshot) => {`
feat: add alerting system with configurable rules and UI (MON-02, MON-03) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> 2026-03-11 12:28:44 +00:00			`let alerts = store.check_alerts(&snapshot).await;`
feat: add real-time metrics collection with ring buffer storage (MON-01) Implements monitoring/collector.rs that collects per-container CPU/RAM/network/disk, system-wide metrics, RPC latency, and WebSocket connection count every 60 seconds. Data stored in dual ring buffers: 1-min resolution (24h) and 15-min resolution (7d). Three new RPC endpoints: monitoring.current, monitoring.history, monitoring.containers. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> 2026-03-11 11:11:02 +00:00			`store.push(snapshot).await;`
			`debug!("Metrics snapshot collected");`
feat: add alerting system with configurable rules and UI (MON-02, MON-03) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> 2026-03-11 12:28:44 +00:00
			`if !alerts.is_empty() {`
			`if let Some(ref state_mgr) = state {`
bug fixing and deploy and build diagnostics 2026-03-22 03:30:21 +00:00			`notifications::push_alert_notifications(state_mgr, &alerts).await;`
feat: add alerting system with configurable rules and UI (MON-02, MON-03) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> 2026-03-11 12:28:44 +00:00			`}`
fix: add webhook delivery for monitoring alerts DiskUsage and ContainerCrash alerts now fire webhooks via send_webhook() after pushing WebSocket notifications. Added data_dir parameter to spawn_metrics_collector for webhook config access. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> 2026-03-12 22:32:19 +00:00			`if let Some(ref dir) = data_dir {`
bug fixing and deploy and build diagnostics 2026-03-22 03:30:21 +00:00			`notifications::deliver_alert_webhooks(dir, &alerts).await;`
fix: add webhook delivery for monitoring alerts DiskUsage and ContainerCrash alerts now fire webhooks via send_webhook() after pushing WebSocket notifications. Added data_dir parameter to spawn_metrics_collector for webhook config access. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> 2026-03-12 22:32:19 +00:00			`}`
feat: add alerting system with configurable rules and UI (MON-02, MON-03) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> 2026-03-11 12:28:44 +00:00			`}`
feat: add real-time metrics collection with ring buffer storage (MON-01) Implements monitoring/collector.rs that collects per-container CPU/RAM/network/disk, system-wide metrics, RPC latency, and WebSocket connection count every 60 seconds. Data stored in dual ring buffers: 1-min resolution (24h) and 15-min resolution (7d). Three new RPC endpoints: monitoring.current, monitoring.history, monitoring.containers. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> 2026-03-11 11:11:02 +00:00			`}`
			`Err(e) => {`
			`warn!("Failed to collect metrics: {}", e);`
			`}`
			`}`
			`}`
			`});`
			`}`