2026-03-11 11:11:02 +00:00
|
|
|
pub mod collector;
|
2026-03-22 03:30:21 +00:00
|
|
|
pub(crate) mod alerts;
|
|
|
|
|
mod notifications;
|
|
|
|
|
pub mod store;
|
|
|
|
|
mod telemetry;
|
|
|
|
|
pub mod types;
|
|
|
|
|
|
|
|
|
|
// Re-export public types for external consumers
|
|
|
|
|
pub use store::MetricsStore;
|
|
|
|
|
pub use telemetry::spawn_telemetry_reporter;
|
|
|
|
|
pub use types::*;
|
2026-03-11 11:11:02 +00:00
|
|
|
|
2026-03-12 22:19:04 +00:00
|
|
|
use std::path::PathBuf;
|
2026-03-11 11:11:02 +00:00
|
|
|
use std::sync::Arc;
|
2026-03-22 03:30:21 +00:00
|
|
|
use tracing::{debug, warn};
|
2026-03-12 22:19:04 +00:00
|
|
|
|
fix: overhaul container lifecycle — recovery, health, uninstall, UI state
Container recovery:
- Health monitor: MAX_RESTART_ATTEMPTS 3→10, interval 60s→120s
- Dependency-aware restarts: won't restart services before their deps
- Reset dependent counters when a dependency recovers
- Handle "created" state containers (were invisible to health monitor)
- Added IndeedHub, mempool-api, mysql to tier system
- Crash recovery: podman start timeout 30s→120s with retry
- Podman client: socket timeout 5s→30s, added restart policy
UI state representation:
- Exit code 0 shows "stopped" (gray), not "crashed" (red)
- Exit code 137 shows "killed (OOM)"
- Non-zero exit shows "crashed" (red)
- Added exit_code field to PackageDataEntry
Install/uninstall fixes:
- Install returns error when container doesn't start (was silent success)
- Post-install hooks awaited instead of fire-and-forget tokio::spawn
- Uninstall: graceful rm before force, volume prune, network cleanup
- Uninstall returns error on partial failure (was 200 OK)
Config consistency:
- DB passwords read from /var/lib/archipelago/secrets/ (was hardcoded)
- Bitcoin: added ZMQ ports 28332/28333 for LND block notifications
- IndeedHub port 7777→8190 (was conflicting with strfry)
- Marketplace versions: LND 0.17.4→0.18.4, Mempool 2.5.0→3.0.0
Performance:
- Metrics collector interval 60s→300s (was duplicating health monitor)
- Podman client: proper error propagation instead of unwrap_or_default
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-31 07:03:57 +01:00
|
|
|
/// Spawn the background metrics collector (runs every 300 seconds / 5 minutes).
|
2026-03-22 03:30:21 +00:00
|
|
|
/// Evaluates alert rules on each snapshot and dispatches notifications.
|
fix: overhaul container lifecycle — recovery, health, uninstall, UI state
Container recovery:
- Health monitor: MAX_RESTART_ATTEMPTS 3→10, interval 60s→120s
- Dependency-aware restarts: won't restart services before their deps
- Reset dependent counters when a dependency recovers
- Handle "created" state containers (were invisible to health monitor)
- Added IndeedHub, mempool-api, mysql to tier system
- Crash recovery: podman start timeout 30s→120s with retry
- Podman client: socket timeout 5s→30s, added restart policy
UI state representation:
- Exit code 0 shows "stopped" (gray), not "crashed" (red)
- Exit code 137 shows "killed (OOM)"
- Non-zero exit shows "crashed" (red)
- Added exit_code field to PackageDataEntry
Install/uninstall fixes:
- Install returns error when container doesn't start (was silent success)
- Post-install hooks awaited instead of fire-and-forget tokio::spawn
- Uninstall: graceful rm before force, volume prune, network cleanup
- Uninstall returns error on partial failure (was 200 OK)
Config consistency:
- DB passwords read from /var/lib/archipelago/secrets/ (was hardcoded)
- Bitcoin: added ZMQ ports 28332/28333 for LND block notifications
- IndeedHub port 7777→8190 (was conflicting with strfry)
- Marketplace versions: LND 0.17.4→0.18.4, Mempool 2.5.0→3.0.0
Performance:
- Metrics collector interval 60s→300s (was duplicating health monitor)
- Podman client: proper error propagation instead of unwrap_or_default
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-31 07:03:57 +01:00
|
|
|
/// Note: health_monitor.rs handles container state polling at 120s intervals.
|
|
|
|
|
/// This collector handles system-level metrics (CPU, disk, network) and only
|
|
|
|
|
/// calls podman stats every 5 minutes to avoid duplicate subprocess overhead.
|
2026-03-11 12:28:44 +00:00
|
|
|
pub fn spawn_metrics_collector(
|
|
|
|
|
store: Arc<MetricsStore>,
|
|
|
|
|
state: Option<Arc<crate::state::StateManager>>,
|
2026-03-12 22:32:19 +00:00
|
|
|
data_dir: Option<PathBuf>,
|
2026-03-11 12:28:44 +00:00
|
|
|
) {
|
2026-03-11 11:11:02 +00:00
|
|
|
tokio::spawn(async move {
|
fix: overhaul container lifecycle — recovery, health, uninstall, UI state
Container recovery:
- Health monitor: MAX_RESTART_ATTEMPTS 3→10, interval 60s→120s
- Dependency-aware restarts: won't restart services before their deps
- Reset dependent counters when a dependency recovers
- Handle "created" state containers (were invisible to health monitor)
- Added IndeedHub, mempool-api, mysql to tier system
- Crash recovery: podman start timeout 30s→120s with retry
- Podman client: socket timeout 5s→30s, added restart policy
UI state representation:
- Exit code 0 shows "stopped" (gray), not "crashed" (red)
- Exit code 137 shows "killed (OOM)"
- Non-zero exit shows "crashed" (red)
- Added exit_code field to PackageDataEntry
Install/uninstall fixes:
- Install returns error when container doesn't start (was silent success)
- Post-install hooks awaited instead of fire-and-forget tokio::spawn
- Uninstall: graceful rm before force, volume prune, network cleanup
- Uninstall returns error on partial failure (was 200 OK)
Config consistency:
- DB passwords read from /var/lib/archipelago/secrets/ (was hardcoded)
- Bitcoin: added ZMQ ports 28332/28333 for LND block notifications
- IndeedHub port 7777→8190 (was conflicting with strfry)
- Marketplace versions: LND 0.17.4→0.18.4, Mempool 2.5.0→3.0.0
Performance:
- Metrics collector interval 60s→300s (was duplicating health monitor)
- Podman client: proper error propagation instead of unwrap_or_default
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-31 07:03:57 +01:00
|
|
|
// Wait 60s for system to stabilize after boot
|
|
|
|
|
tokio::time::sleep(std::time::Duration::from_secs(60)).await;
|
2026-03-11 11:11:02 +00:00
|
|
|
|
fix: overhaul container lifecycle — recovery, health, uninstall, UI state
Container recovery:
- Health monitor: MAX_RESTART_ATTEMPTS 3→10, interval 60s→120s
- Dependency-aware restarts: won't restart services before their deps
- Reset dependent counters when a dependency recovers
- Handle "created" state containers (were invisible to health monitor)
- Added IndeedHub, mempool-api, mysql to tier system
- Crash recovery: podman start timeout 30s→120s with retry
- Podman client: socket timeout 5s→30s, added restart policy
UI state representation:
- Exit code 0 shows "stopped" (gray), not "crashed" (red)
- Exit code 137 shows "killed (OOM)"
- Non-zero exit shows "crashed" (red)
- Added exit_code field to PackageDataEntry
Install/uninstall fixes:
- Install returns error when container doesn't start (was silent success)
- Post-install hooks awaited instead of fire-and-forget tokio::spawn
- Uninstall: graceful rm before force, volume prune, network cleanup
- Uninstall returns error on partial failure (was 200 OK)
Config consistency:
- DB passwords read from /var/lib/archipelago/secrets/ (was hardcoded)
- Bitcoin: added ZMQ ports 28332/28333 for LND block notifications
- IndeedHub port 7777→8190 (was conflicting with strfry)
- Marketplace versions: LND 0.17.4→0.18.4, Mempool 2.5.0→3.0.0
Performance:
- Metrics collector interval 60s→300s (was duplicating health monitor)
- Podman client: proper error propagation instead of unwrap_or_default
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-31 07:03:57 +01:00
|
|
|
let mut interval = tokio::time::interval(std::time::Duration::from_secs(300));
|
2026-04-07 20:25:09 +01:00
|
|
|
interval.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip);
|
2026-03-11 11:11:02 +00:00
|
|
|
|
|
|
|
|
loop {
|
|
|
|
|
interval.tick().await;
|
|
|
|
|
|
|
|
|
|
match collector::collect_snapshot().await {
|
|
|
|
|
Ok(snapshot) => {
|
2026-03-11 12:28:44 +00:00
|
|
|
let alerts = store.check_alerts(&snapshot).await;
|
2026-03-11 11:11:02 +00:00
|
|
|
store.push(snapshot).await;
|
|
|
|
|
debug!("Metrics snapshot collected");
|
2026-03-11 12:28:44 +00:00
|
|
|
|
|
|
|
|
if !alerts.is_empty() {
|
|
|
|
|
if let Some(ref state_mgr) = state {
|
2026-03-22 03:30:21 +00:00
|
|
|
notifications::push_alert_notifications(state_mgr, &alerts).await;
|
2026-03-11 12:28:44 +00:00
|
|
|
}
|
2026-03-12 22:32:19 +00:00
|
|
|
if let Some(ref dir) = data_dir {
|
2026-03-22 03:30:21 +00:00
|
|
|
notifications::deliver_alert_webhooks(dir, &alerts).await;
|
2026-03-12 22:32:19 +00:00
|
|
|
}
|
2026-03-11 12:28:44 +00:00
|
|
|
}
|
2026-03-11 11:11:02 +00:00
|
|
|
}
|
|
|
|
|
Err(e) => {
|
|
|
|
|
warn!("Failed to collect metrics: {}", e);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
});
|
|
|
|
|
}
|