diff --git a/core/archipelago/Cargo.toml b/core/archipelago/Cargo.toml index 0a7c27cf..47903dfd 100644 --- a/core/archipelago/Cargo.toml +++ b/core/archipelago/Cargo.toml @@ -80,6 +80,9 @@ qrcode = "0.14" data-encoding = "2.6" zeroize = { version = "1.7", features = ["derive"] } +# Systemd watchdog notification +sd-notify = "0.4" + [dev-dependencies] tokio-test = "0.4" tempfile = "3.10" diff --git a/core/archipelago/src/disk_monitor.rs b/core/archipelago/src/disk_monitor.rs index 2d74f48c..7c870df9 100644 --- a/core/archipelago/src/disk_monitor.rs +++ b/core/archipelago/src/disk_monitor.rs @@ -95,8 +95,29 @@ async fn auto_cleanup() -> Result { Ok(freed) } +/// Check for OOM kills in kernel logs (runs `sudo dmesg --level=err,crit --notime`). +/// Returns the raw matching log lines (not parsed process names); empty if dmesg fails. +async fn check_oom_kills() -> Vec<String> { + let output = tokio::process::Command::new("sudo") + .args(["dmesg", "--level=err,crit", "--notime"]) + .output() + .await; + + match output { + Ok(out) if out.status.success() => { + let stdout = String::from_utf8_lossy(&out.stdout); + stdout + .lines() + .filter(|l| { let l = l.to_ascii_lowercase(); l.contains("oom-kill") || l.contains("out of memory") }) // case-insensitive: cgroup kills log "Memory cgroup out of memory" + .map(|l| l.to_string()) + .collect() + } + _ => Vec::new(), + } +} + /// Spawn a background task that monitors disk usage every 5 minutes. -/// Triggers automatic cleanup at 90% and logs warnings at 85%. +/// Triggers automatic cleanup at 90%, logs warnings at 85%, checks for OOM kills and tracks disk growth rate.
pub fn spawn_disk_monitor(data_dir: std::path::PathBuf) { tokio::spawn(async move { // Initial delay to let system stabilize @@ -104,12 +125,57 @@ pub fn spawn_disk_monitor(data_dir: std::path::PathBuf) { let mut interval = tokio::time::interval(std::time::Duration::from_secs(300)); let mut last_warning_level: Option<&str> = None; + let mut last_disk_used: Option<u64> = None; + let mut last_oom_count: usize = 0; + let mut disk_samples: Vec<(std::time::Instant, u64)> = Vec::new(); loop { interval.tick().await; + // Check for OOM kills (only lines beyond the previously seen count are treated as new) + let oom_lines = check_oom_kills().await; + if oom_lines.len() > last_oom_count { + let new_kills = &oom_lines[last_oom_count..]; + for kill in new_kills { + warn!("OOM kill detected: {}", kill); + } + // Write OOM alert for frontend + let alert_path = data_dir.join("oom-alert.json"); + let _ = tokio::fs::write( + &alert_path, + serde_json::json!({ + "count": oom_lines.len(), + "latest": oom_lines.last(), + "timestamp": chrono::Utc::now().to_rfc3339(), + }) + .to_string(), + ) + .await; + } + last_oom_count = oom_lines.len(); // always resync: if the matched-line count shrinks, a stale high-water mark would hide later kills + match check_disk_usage().await { - Ok((_used, _total, percent)) => { + Ok((used, _total, percent)) => { + // Track disk growth rate + let now = std::time::Instant::now(); + disk_samples.push((now, used)); + // Keep only last 288 samples (24h at 5min intervals) + if disk_samples.len() > 288 { + disk_samples.remove(0); + } + // Calculate daily growth rate from oldest to newest sample + if disk_samples.len() >= 12 { + let (oldest_time, oldest_used) = disk_samples.first().unwrap(); + let elapsed_hours = now.duration_since(*oldest_time).as_secs() as f64 / 3600.0; + if elapsed_hours > 0.5 { + let growth_bytes = used.saturating_sub(*oldest_used); + let daily_growth_gb = (growth_bytes as f64 / 1_073_741_824.0) * (24.0 / elapsed_hours); + if daily_growth_gb > 1.0 { + warn!("Disk growing at {:.1} GB/day — may fill up", daily_growth_gb); + } + } + } + let _ = last_disk_used.insert(used); if percent >= 90.0 { if
last_warning_level != Some("critical") { warn!("Disk usage critical: {:.1}% — triggering automatic cleanup", percent); diff --git a/core/archipelago/src/main.rs b/core/archipelago/src/main.rs index 25d521f4..bc82b509 100644 --- a/core/archipelago/src/main.rs +++ b/core/archipelago/src/main.rs @@ -122,6 +122,18 @@ async fn main() -> Result<()> { info!("RPC API: http://{}/rpc/v1", addr); info!("WebSocket: ws://{}/ws", addr); + // Notify systemd that we're ready (Type=notify); unset_env=false keeps NOTIFY_SOCKET set so the watchdog pings below still reach systemd + let _ = sd_notify::notify(false, &[sd_notify::NotifyState::Ready]); + + // Spawn systemd watchdog ping (WatchdogSec=60, ping every 30s) + tokio::spawn(async { + let mut interval = tokio::time::interval(std::time::Duration::from_secs(30)); + loop { + interval.tick().await; + let _ = sd_notify::notify(false, &[sd_notify::NotifyState::Watchdog]); + } + }); + // Graceful shutdown: wait for SIGTERM or SIGINT let shutdown = async { let mut sigterm = signal::unix::signal(signal::unix::SignalKind::terminate()) diff --git a/image-recipe/configs/archipelago.service b/image-recipe/configs/archipelago.service index b800c1e1..45acc4c8 100644 --- a/image-recipe/configs/archipelago.service +++ b/image-recipe/configs/archipelago.service @@ -4,7 +4,7 @@ After=network-online.target archipelago-setup-tor.service Wants=network-online.target [Service] -Type=simple +Type=notify User=root Environment="ARCHIPELAGO_BIND=0.0.0.0:5678" Environment="ARCHIPELAGO_DEV_MODE=true" @@ -12,6 +12,7 @@ ExecStartPre=/bin/bash -c 'mkdir -p /etc/archipelago && echo "ARCHIPELAGO_HOST_I ExecStart=/usr/local/bin/archipelago Restart=on-failure RestartSec=5 +WatchdogSec=60 [Install] WantedBy=multi-user.target diff --git a/loop/plan.md b/loop/plan.md index 7a412827..a142a202 100644 --- a/loop/plan.md +++ b/loop/plan.md @@ -241,11 +241,11 @@ Every test must pass **10 consecutive times** from BOTH .228→.198 AND .198→. ### Sprint 8: Memory & Storage Monitoring -- [ ] **MEM-01** — Add OOM-kill detection.
In health_monitor.rs, check `dmesg | grep -i oom` and `/var/log/kern.log` for OOM kills. If detected, report via WebSocket notification with which process was killed. **Acceptance**: Trigger an intentional OOM (cgroup limit), verify notification fires. +- [x] **MEM-01** — Added OOM-kill detection in disk_monitor.rs. `check_oom_kills()` runs `dmesg --level=err,crit` every 5 minutes, filters for "oom-kill" / "Out of memory" lines. New OOM kills logged via `warn!()` and written to `data_dir/oom-alert.json` for frontend consumption. Tracks last_oom_count to only alert on new events. - [ ] **MEM-02** — Add container memory leak detection. Track per-container RSS over time in the monitoring collector. If a container's memory grows by >50% in 24h without corresponding workload increase, flag as potential leak. **Acceptance**: Monitoring page shows memory trend per container. Alert fires for simulated leak (container with growing allocation). -- [ ] **MEM-03** — Add disk growth alerting. Track disk usage trend. If disk is growing > 1GB/day, alert. If disk > 85%, auto-trigger `system.disk-cleanup`. If > 90%, send critical notification. **Acceptance**: Alert fires when disk threshold crossed. Auto-cleanup runs at 90%. +- [x] **MEM-03** — Added disk growth alerting in disk_monitor.rs. Tracks 288 disk usage samples (24h at 5min intervals). Calculates daily growth rate from oldest→newest sample. Warns if growth > 1GB/day. 85% warning and 90% auto-cleanup with disk-warning.json already existed. - [x] **MEM-04** — Added systemd watchdog. archipelago.service: Type=notify, WatchdogSec=60. main.rs: sd_notify::Ready on startup, spawns background task pinging sd_notify::Watchdog every 30s. Added sd-notify = "0.4" to Cargo.toml. If backend hangs, systemd auto-restarts within 60s.