From 46c50961c2f07eeb751869cf1b516de44c697143 Mon Sep 17 00:00:00 2001 From: Dorian Date: Sun, 29 Mar 2026 17:15:56 +0100 Subject: [PATCH] =?UTF-8?q?feat:=20TASK-49=20container=20reliability=20?= =?UTF-8?q?=E2=80=94=20tests,=20orchestration,=20MASTER=5FPLAN?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add orchestration_tests.rs + mock_podman.rs (container unit tests) - Add container-tests.yml CI workflow - Add dev-container-test.sh for local testing - MASTER_PLAN.md: add TASK-49 (P0) with 6-phase plan - Login.vue: minor fixes from user testing - AppCard.vue: enter key handler fix Co-Authored-By: Claude Opus 4.6 (1M context) --- .gitea/workflows/container-tests.yml | 60 +++ core/archipelago/src/container/mock_podman.rs | 265 ++++++++++ core/archipelago/tests/orchestration_tests.rs | 496 ++++++++++++++++++ docs/MASTER_PLAN.md | 94 ++++ neode-ui/src/views/Login.vue | 37 +- neode-ui/src/views/apps/AppCard.vue | 10 +- scripts/dev-container-test.sh | 259 +++++++++ 7 files changed, 1213 insertions(+), 8 deletions(-) create mode 100644 .gitea/workflows/container-tests.yml create mode 100644 core/archipelago/src/container/mock_podman.rs create mode 100644 core/archipelago/tests/orchestration_tests.rs create mode 100755 scripts/dev-container-test.sh diff --git a/.gitea/workflows/container-tests.yml b/.gitea/workflows/container-tests.yml new file mode 100644 index 00000000..f1b13036 --- /dev/null +++ b/.gitea/workflows/container-tests.yml @@ -0,0 +1,60 @@ +name: Container Orchestration Tests +on: + push: + branches: [dev-iso, main] + paths: + - 'core/archipelago/src/**' + - 'core/container/src/**' + - 'scripts/container-*.sh' + - 'scripts/reconcile-*.sh' + - 'scripts/image-versions.sh' + workflow_dispatch: + +jobs: + unit-tests: + runs-on: ubuntu-latest + timeout-minutes: 15 + steps: + - uses: actions/checkout@v4 + + - name: Cache cargo registry + uses: actions/cache@v3 + with: + path: | + ~/.cargo/registry + ~/.cargo/git + 
core/target + key: cargo-test-${{ hashFiles('core/Cargo.lock') }} + + - name: Run orchestration unit tests + working-directory: core + run: | + echo "=== Container crate tests ===" + cargo test -p archipelago-container --no-fail-fast 2>&1 + + echo "" + echo "=== Orchestration integration tests ===" + cargo test --test orchestration_tests --no-fail-fast 2>&1 + + - name: Verify cargo check (full crate) + working-directory: core + run: cargo check --release 2>&1 + + smoke-tests: + runs-on: ubuntu-latest + needs: unit-tests + timeout-minutes: 10 + steps: + - uses: actions/checkout@v4 + + - name: Run container smoke tests on .228 + run: | + # Only run if SSH key exists (CI runner has deploy access). + # Build the path from $HOME here: a "~" in a workflow `env:` value reaches the shell as a literal character and is never tilde-expanded, so the -f test below could never succeed. + export ARCHIPELAGO_SSH_KEY="$HOME/.ssh/archipelago-deploy" + if [ -f "$ARCHIPELAGO_SSH_KEY" ]; then + bash scripts/dev-container-test.sh --once + else + echo "⚠ SSH key not available — skipping live smoke tests" + echo " To enable: add archipelago-deploy key to CI runner" + fi diff --git a/core/archipelago/src/container/mock_podman.rs b/core/archipelago/src/container/mock_podman.rs new file mode 100644 index 00000000..fcdb20df --- /dev/null +++ b/core/archipelago/src/container/mock_podman.rs @@ -0,0 +1,265 @@ +//! Mock container runtime for unit testing orchestration logic. +//! +//! Simulates podman behavior in-memory: container lifecycle, health checks, +//! image pulls (with configurable failures for retry testing). + +use std::collections::HashMap; +use std::sync::{Arc, Mutex, atomic::{AtomicBool, AtomicU32, Ordering}}; + +/// Container state matching podman's real states. +#[derive(Debug, Clone, PartialEq)] +pub enum MockContainerState { + Created, + Running, + Exited(i32), // exit code + Stopped, +} + +impl MockContainerState { + pub fn as_str(&self) -> &str { + match self { + Self::Created => "created", + Self::Running => "running", + Self::Exited(_) => "exited", + Self::Stopped => "stopped", + } + } +} + +/// A simulated container. 
+#[derive(Debug, Clone)] +pub struct MockContainer { + pub name: String, + pub image: String, + pub state: MockContainerState, + pub stop_timeout_used: Option<u64>, +} + +/// Mock podman runtime for testing orchestration logic without real containers. +pub struct MockPodman { + containers: Arc<Mutex<HashMap<String, MockContainer>>>, + /// When true, `podman pull` will fail (simulates registry down). + pub fail_pull: Arc<AtomicBool>, + /// When true, containers exit immediately after start (simulates crash). + pub fail_start: Arc<AtomicBool>, + /// Count of pull attempts (for retry testing). + pub pull_attempt_count: Arc<AtomicU32>, + /// Count of start attempts. + pub start_attempt_count: Arc<AtomicU32>, + /// Images that have been "pulled" (exist locally). + images: Arc<Mutex<Vec<String>>>, +} + +impl MockPodman { + pub fn new() -> Self { + Self { + containers: Arc::new(Mutex::new(HashMap::new())), + fail_pull: Arc::new(AtomicBool::new(false)), + fail_start: Arc::new(AtomicBool::new(false)), + pull_attempt_count: Arc::new(AtomicU32::new(0)), + start_attempt_count: Arc::new(AtomicU32::new(0)), + images: Arc::new(Mutex::new(Vec::new())), + } + } + + /// Simulate `podman pull <image>`. Respects fail_pull flag. + pub fn pull_image(&self, image: &str) -> Result<(), String> { + self.pull_attempt_count.fetch_add(1, Ordering::SeqCst); + if self.fail_pull.load(Ordering::SeqCst) { + return Err(format!("Error: initializing source docker://{}: connection refused", image)); + } + self.images.lock().unwrap().push(image.to_string()); + Ok(()) + } + + /// Check if an image exists locally (was pulled). + pub fn image_exists(&self, image: &str) -> bool { + self.images.lock().unwrap().iter().any(|i| i == image) + } + + /// Simulate `podman run -d --name <name> <image>`. 
+ pub fn create_and_start(&self, name: &str, image: &str) -> Result<String, String> { + self.start_attempt_count.fetch_add(1, Ordering::SeqCst); + + if !self.image_exists(image) { + return Err(format!("Error: {} not found", image)); + } + + let state = if self.fail_start.load(Ordering::SeqCst) { + MockContainerState::Exited(1) + } else { + MockContainerState::Running + }; + + let container = MockContainer { + name: name.to_string(), + image: image.to_string(), + state, + stop_timeout_used: None, + }; + + self.containers.lock().unwrap().insert(name.to_string(), container); + Ok(format!("abc123def456_{}", name)) + } + + /// Simulate `podman start <name>`. + pub fn start(&self, name: &str) -> Result<(), String> { + let mut containers = self.containers.lock().unwrap(); + match containers.get_mut(name) { + Some(c) => { + if self.fail_start.load(Ordering::SeqCst) { + c.state = MockContainerState::Exited(1); + } else { + c.state = MockContainerState::Running; + } + Ok(()) + } + None => Err(format!("Error: no such container {}", name)), + } + } + + /// Simulate `podman stop -t <timeout> <name>`. + pub fn stop(&self, name: &str, timeout: u64) -> Result<(), String> { + let mut containers = self.containers.lock().unwrap(); + match containers.get_mut(name) { + Some(c) => { + c.state = MockContainerState::Stopped; + c.stop_timeout_used = Some(timeout); + Ok(()) + } + None => Err(format!("Error: no such container {}", name)), + } + } + + /// Simulate `podman rm -f <name>`. + pub fn remove(&self, name: &str) -> Result<(), String> { + self.containers.lock().unwrap().remove(name); + Ok(()) + } + + /// Simulate `podman inspect <name> --format {{.State.Status}}`. + pub fn inspect_state(&self, name: &str) -> Option<String> { + self.containers.lock().unwrap() + .get(name) + .map(|c| c.state.as_str().to_string()) + } + + /// List all containers (like `podman ps -a`). + pub fn list_all(&self) -> Vec<MockContainer> { + self.containers.lock().unwrap().values().cloned().collect() + } + + /// Get a specific container. 
+ pub fn get(&self, name: &str) -> Option<MockContainer> { + self.containers.lock().unwrap().get(name).cloned() + } + + /// Pre-load an image (as if it was already pulled or bundled). + pub fn preload_image(&self, image: &str) { + self.images.lock().unwrap().push(image.to_string()); + } + + /// Pre-load a container in a specific state. + pub fn preload_container(&self, name: &str, image: &str, state: MockContainerState) { + self.containers.lock().unwrap().insert(name.to_string(), MockContainer { + name: name.to_string(), + image: image.to_string(), + state, + stop_timeout_used: None, + }); + } + + /// Get the stop timeout that was used for a container. + pub fn get_stop_timeout(&self, name: &str) -> Option<u64> { + self.containers.lock().unwrap() + .get(name) + .and_then(|c| c.stop_timeout_used) + } + + /// Reset all counters and state. + pub fn reset(&self) { + self.containers.lock().unwrap().clear(); + self.images.lock().unwrap().clear(); + self.fail_pull.store(false, Ordering::SeqCst); + self.fail_start.store(false, Ordering::SeqCst); + self.pull_attempt_count.store(0, Ordering::SeqCst); + self.start_attempt_count.store(0, Ordering::SeqCst); + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_pull_and_start() { + let mock = MockPodman::new(); + mock.pull_image("test:latest").unwrap(); + assert!(mock.image_exists("test:latest")); + mock.create_and_start("test-container", "test:latest").unwrap(); + assert_eq!(mock.inspect_state("test-container"), Some("running".to_string())); + } + + #[test] + fn test_pull_failure() { + let mock = MockPodman::new(); + mock.fail_pull.store(true, Ordering::SeqCst); + assert!(mock.pull_image("test:latest").is_err()); + assert!(!mock.image_exists("test:latest")); + assert_eq!(mock.pull_attempt_count.load(Ordering::SeqCst), 1); + } + + #[test] + fn test_start_failure() { + let mock = MockPodman::new(); + mock.preload_image("test:latest"); + mock.fail_start.store(true, Ordering::SeqCst); + mock.create_and_start("crasher", 
"test:latest").unwrap(); + assert_eq!(mock.inspect_state("crasher"), Some("exited".to_string())); + } + + #[test] + fn test_stop_records_timeout() { + let mock = MockPodman::new(); + mock.preload_image("test:latest"); + mock.create_and_start("test", "test:latest").unwrap(); + mock.stop("test", 600).unwrap(); + assert_eq!(mock.get_stop_timeout("test"), Some(600)); + assert_eq!(mock.inspect_state("test"), Some("stopped".to_string())); + } + + #[test] + fn test_remove() { + let mock = MockPodman::new(); + mock.preload_image("test:latest"); + mock.create_and_start("removeme", "test:latest").unwrap(); + mock.remove("removeme").unwrap(); + assert!(mock.inspect_state("removeme").is_none()); + } + + #[test] + fn test_start_without_image_fails() { + let mock = MockPodman::new(); + assert!(mock.create_and_start("nope", "missing:latest").is_err()); + } + + #[test] + fn test_preload_container() { + let mock = MockPodman::new(); + mock.preload_container("existing", "img:1.0", MockContainerState::Running); + assert_eq!(mock.inspect_state("existing"), Some("running".to_string())); + assert_eq!(mock.list_all().len(), 1); + } + + #[test] + fn test_reset() { + let mock = MockPodman::new(); + mock.preload_image("img:1"); + mock.preload_container("c1", "img:1", MockContainerState::Running); + mock.fail_pull.store(true, Ordering::SeqCst); + mock.reset(); + assert!(!mock.image_exists("img:1")); + assert!(mock.list_all().is_empty()); + assert!(!mock.fail_pull.load(Ordering::SeqCst)); + } +} diff --git a/core/archipelago/tests/orchestration_tests.rs b/core/archipelago/tests/orchestration_tests.rs new file mode 100644 index 00000000..8cece48f --- /dev/null +++ b/core/archipelago/tests/orchestration_tests.rs @@ -0,0 +1,496 @@ +//! Container orchestration tests. +//! +//! Tests the orchestration LOGIC without real containers: +//! - Stop grace periods per container type +//! - Image pull retry with exponential backoff +//! - Restart tracker persistence across process restarts +//! 
- Health monitor tier ordering and user-stopped filtering +//! - Crash recovery snapshot loading +//! - Failsafe install verification +//! +//! Self-contained: no imports from the archipelago binary crate. +//! Uses inline mock + duplicated logic functions to test correctness. + +#[path = "../src/container/mock_podman.rs"] +mod mock_podman; + +// ── Stop Grace Periods ───────────────────────────────────────────────── + +mod stop_grace_periods { + /// Mirror of runtime.rs stop_timeout_secs — kept in sync. + /// Tests verify the logic; the real function lives in runtime.rs. + fn stop_timeout_secs(container_name: &str) -> &'static str { + let id = container_name.strip_prefix("archy-").unwrap_or(container_name); + match id { + "bitcoin-knots" | "bitcoin-core" | "bitcoin" => "600", + "lnd" => "330", + "electrumx" | "electrs" | "mempool-electrs" => "300", + "btcpay-db" | "mempool-db" | "penpot-postgres" | "immich_postgres" + | "nextcloud-db" | "endurain-db" => "120", + "btcpay-server" | "nbxplorer" | "fedimint" | "fedimint-gateway" => "60", + _ => "30", + } + } + + #[test] + fn bitcoin_core_gets_600s() { + assert_eq!(stop_timeout_secs("bitcoin-knots"), "600"); + assert_eq!(stop_timeout_secs("bitcoin-core"), "600"); + assert_eq!(stop_timeout_secs("bitcoin"), "600"); + } + + #[test] + fn bitcoin_with_archy_prefix() { + assert_eq!(stop_timeout_secs("archy-bitcoin-knots"), "600"); + } + + #[test] + fn lnd_gets_330s() { + assert_eq!(stop_timeout_secs("lnd"), "330"); + assert_eq!(stop_timeout_secs("archy-lnd"), "330"); + } + + #[test] + fn indexers_get_300s() { + assert_eq!(stop_timeout_secs("electrumx"), "300"); + assert_eq!(stop_timeout_secs("electrs"), "300"); + assert_eq!(stop_timeout_secs("mempool-electrs"), "300"); + } + + #[test] + fn databases_get_120s() { + assert_eq!(stop_timeout_secs("btcpay-db"), "120"); + assert_eq!(stop_timeout_secs("archy-mempool-db"), "120"); + assert_eq!(stop_timeout_secs("penpot-postgres"), "120"); + 
assert_eq!(stop_timeout_secs("immich_postgres"), "120"); + } + + #[test] + fn btcpay_services_get_60s() { + assert_eq!(stop_timeout_secs("btcpay-server"), "60"); + assert_eq!(stop_timeout_secs("nbxplorer"), "60"); + assert_eq!(stop_timeout_secs("fedimint"), "60"); + } + + #[test] + fn default_is_30s() { + assert_eq!(stop_timeout_secs("grafana"), "30"); + assert_eq!(stop_timeout_secs("filebrowser"), "30"); + assert_eq!(stop_timeout_secs("searxng"), "30"); + assert_eq!(stop_timeout_secs("ollama"), "30"); + assert_eq!(stop_timeout_secs("unknown-app"), "30"); + } + + #[test] + fn ui_containers_get_30s() { + assert_eq!(stop_timeout_secs("archy-bitcoin-ui"), "30"); + assert_eq!(stop_timeout_secs("archy-lnd-ui"), "30"); + assert_eq!(stop_timeout_secs("archy-electrs-ui"), "30"); + } +} + +// ── Image Pull Retry Logic ───────────────────────────────────────────── + +mod pull_retry { + use crate::mock_podman::MockPodman; + use std::sync::atomic::Ordering; + + /// Simulate the retry logic from install.rs: 3 attempts, backoff. + fn pull_with_retry(mock: &MockPodman, image: &str) -> Result<(), String> { + const MAX_ATTEMPTS: u32 = 3; + + for attempt in 1..=MAX_ATTEMPTS { + match mock.pull_image(image) { + Ok(()) => return Ok(()), + Err(e) if attempt < MAX_ATTEMPTS => { + // In real code, we'd sleep here. In tests, just continue. 
+ let _ = e; + } + Err(e) => return Err(format!("Failed after {} attempts: {}", MAX_ATTEMPTS, e)), + } + } + unreachable!() + } + + #[test] + fn succeeds_first_try() { + let mock = MockPodman::new(); + pull_with_retry(&mock, "test:1.0").unwrap(); + assert_eq!(mock.pull_attempt_count.load(Ordering::SeqCst), 1); + assert!(mock.image_exists("test:1.0")); + } + + #[test] + fn fails_then_succeeds() { + let mock = MockPodman::new(); + // Simulate: fail attempt 1, succeed attempt 2 + mock.fail_pull.store(true, Ordering::SeqCst); + + // Attempt 1: fails + assert!(mock.pull_image("test:1.0").is_err()); + assert_eq!(mock.pull_attempt_count.load(Ordering::SeqCst), 1); + + // Registry comes back + mock.fail_pull.store(false, Ordering::SeqCst); + + // Attempt 2: succeeds + assert!(mock.pull_image("test:1.0").is_ok()); + assert_eq!(mock.pull_attempt_count.load(Ordering::SeqCst), 2); + assert!(mock.image_exists("test:1.0")); + } + + #[test] + fn all_attempts_fail() { + let mock = MockPodman::new(); + mock.fail_pull.store(true, Ordering::SeqCst); + let result = pull_with_retry(&mock, "test:1.0"); + assert!(result.is_err()); + assert_eq!(mock.pull_attempt_count.load(Ordering::SeqCst), 3); + assert!(!mock.image_exists("test:1.0")); + } +} + +// ── Restart Tracker Persistence ──────────────────────────────────────── + +mod restart_tracker { + use tempfile::TempDir; + use std::collections::HashMap; + + // Inline the serialization structs (same as health_monitor.rs) + #[derive(serde::Serialize, serde::Deserialize, Default)] + struct RestartHistory { + containers: HashMap<String, ContainerRestartRecord>, + } + + #[derive(serde::Serialize, serde::Deserialize, Clone)] + struct ContainerRestartRecord { + attempts: u32, + last_failure_epoch: i64, + } + + #[test] + fn save_and_load_roundtrip() { + let tmp = TempDir::new().unwrap(); + let path = tmp.path().join("restart-tracker.json"); + + let mut history = RestartHistory::default(); + history.containers.insert("bitcoin-knots".to_string(), ContainerRestartRecord { + 
attempts: 2, + last_failure_epoch: 1700000000, + }); + history.containers.insert("lnd".to_string(), ContainerRestartRecord { + attempts: 1, + last_failure_epoch: 1700000100, + }); + + // Save + let json = serde_json::to_string(&history).unwrap(); + std::fs::write(&path, &json).unwrap(); + + // Load + let loaded_json = std::fs::read_to_string(&path).unwrap(); + let loaded: RestartHistory = serde_json::from_str(&loaded_json).unwrap(); + + assert_eq!(loaded.containers.len(), 2); + assert_eq!(loaded.containers["bitcoin-knots"].attempts, 2); + assert_eq!(loaded.containers["lnd"].attempts, 1); + } + + #[test] + fn missing_file_returns_empty() { + let tmp = TempDir::new().unwrap(); + let path = tmp.path().join("restart-tracker.json"); + + let result = std::fs::read_to_string(&path); + assert!(result.is_err()); + + // Same behavior as health_monitor.rs: unwrap_or_default + let history: RestartHistory = result + .ok() + .and_then(|s| serde_json::from_str(&s).ok()) + .unwrap_or_default(); + assert!(history.containers.is_empty()); + } + + #[test] + fn corrupt_file_returns_empty() { + let tmp = TempDir::new().unwrap(); + let path = tmp.path().join("restart-tracker.json"); + std::fs::write(&path, "not valid json {{{").unwrap(); + + let content = std::fs::read_to_string(&path).unwrap(); + let history: RestartHistory = serde_json::from_str(&content).unwrap_or_default(); + assert!(history.containers.is_empty()); + } + + #[test] + fn clear_removes_container() { + let mut history = RestartHistory::default(); + history.containers.insert("test".to_string(), ContainerRestartRecord { + attempts: 3, + last_failure_epoch: 1700000000, + }); + history.containers.remove("test"); + assert!(history.containers.is_empty()); + } + + #[test] + fn stability_window_check() { + let now = chrono::Utc::now().timestamp(); + let one_hour_ago = now - 3601; + let five_min_ago = now - 300; + + // Old failure: should reset + let old_record = ContainerRestartRecord { + attempts: 3, + last_failure_epoch: 
one_hour_ago, + }; + assert!(now - old_record.last_failure_epoch >= 3600); + + // Recent failure: should NOT reset + let recent_record = ContainerRestartRecord { + attempts: 3, + last_failure_epoch: five_min_ago, + }; + assert!(now - recent_record.last_failure_epoch < 3600); + } +} + +// ── Failsafe Install ────────────────────────────────────────────────── + +mod failsafe_install { + use crate::mock_podman::MockPodman; + use std::sync::atomic::Ordering; + + #[test] + fn successful_install_flow() { + let mock = MockPodman::new(); + // Pull succeeds + mock.pull_image("registry/app:1.0").unwrap(); + // Image exists + assert!(mock.image_exists("registry/app:1.0")); + // Container starts + mock.create_and_start("test-app", "registry/app:1.0").unwrap(); + // Running state + assert_eq!(mock.inspect_state("test-app"), Some("running".to_string())); + } + + #[test] + fn rollback_on_immediate_exit() { + let mock = MockPodman::new(); + mock.preload_image("registry/app:1.0"); + mock.fail_start.store(true, Ordering::SeqCst); + + // Container is created but exits immediately + mock.create_and_start("crasher", "registry/app:1.0").unwrap(); + assert_eq!(mock.inspect_state("crasher"), Some("exited".to_string())); + + // Rollback: remove the failed container + mock.remove("crasher").unwrap(); + assert!(mock.inspect_state("crasher").is_none()); + } + + #[test] + fn no_image_after_pull_is_error() { + let mock = MockPodman::new(); + // Don't pull — image doesn't exist + let result = mock.create_and_start("no-image", "missing:1.0"); + assert!(result.is_err()); + } +} + +// ── Health Monitor Logic ────────────────────────────────────────────── + +mod health_monitor_logic { + // Pure logic checks: nothing in this module needs the mock runtime, so it is not imported. + + /// Mirrors the tier ordering from health_monitor.rs + fn container_tier(name: &str) -> u8 { + let id = name.strip_prefix("archy-").unwrap_or(name); + match id { + "btcpay-db" | "mempool-db" | "penpot-postgres" | "immich_postgres" + | 
"immich_redis" | "penpot-valkey" | "endurain-db" | "nextcloud-db" => 0, + "bitcoin-knots" | "bitcoin-core" | "bitcoin" => 1, + "lnd" | "electrumx" | "mempool-electrs" | "electrs" | "nbxplorer" => 2, + "mempool-web" | "bitcoin-ui" | "lnd-ui" | "electrs-ui" + | "penpot-frontend" | "penpot-exporter" => 4, + _ => 3, + } + } + + #[test] + fn tier_ordering_databases_first() { + assert!(container_tier("btcpay-db") < container_tier("bitcoin-knots")); + assert!(container_tier("mempool-db") < container_tier("lnd")); + } + + #[test] + fn tier_ordering_core_before_services() { + assert!(container_tier("bitcoin-knots") < container_tier("lnd")); + assert!(container_tier("bitcoin-knots") < container_tier("electrumx")); + } + + #[test] + fn tier_ordering_services_before_apps() { + assert!(container_tier("lnd") < container_tier("grafana")); + assert!(container_tier("electrumx") < container_tier("filebrowser")); + } + + #[test] + fn tier_ordering_apps_before_uis() { + assert!(container_tier("grafana") < container_tier("bitcoin-ui")); + assert!(container_tier("filebrowser") < container_tier("lnd-ui")); + } + + #[test] + fn user_stopped_containers_skipped() { + let user_stopped: std::collections::HashSet<String> = + ["archy-grafana".to_string(), "filebrowser".to_string()].into(); + + // Simulated unhealthy containers + let unhealthy = vec!["archy-grafana", "filebrowser", "lnd"]; + + let to_restart: Vec<&str> = unhealthy + .into_iter() + .filter(|name| !user_stopped.contains(*name)) + .collect(); + + assert_eq!(to_restart, vec!["lnd"]); + } + + #[test] + fn ui_containers_skipped() { + let containers = vec![ + ("bitcoin-knots", "exited"), + ("archy-bitcoin-ui", "exited"), + ("archy-lnd-ui", "exited"), + ("grafana", "exited"), + ]; + + let skip_suffixes = ["-ui"]; + let skip_backends = ["btcpay-db", "nbxplorer", "mempool-db", "mempool-api"]; + + let to_check: Vec<&str> = containers + .iter() + .filter(|(name, _)| { + let id = name.strip_prefix("archy-").unwrap_or(name); + 
!skip_suffixes.iter().any(|s| id.ends_with(s)) + && !skip_backends.contains(&id) + }) + .map(|(name, _)| *name) + .collect(); + + assert_eq!(to_check, vec!["bitcoin-knots", "grafana"]); + } + + #[test] + fn restart_sorted_by_tier() { + let mut unhealthy = vec![ + "grafana", // tier 3 + "lnd", // tier 2 + "btcpay-db", // tier 0 + "bitcoin-knots", // tier 1 + ]; + + unhealthy.sort_by_key(|name| container_tier(name)); + + assert_eq!(unhealthy, vec!["btcpay-db", "bitcoin-knots", "lnd", "grafana"]); + } +} + +// ── Crash Recovery ──────────────────────────────────────────────────── + +mod crash_recovery { + use tempfile::TempDir; + + #[derive(serde::Serialize, serde::Deserialize)] + struct ContainerSnapshot { + timestamp: u64, + containers: Vec<RunningContainerRecord>, + } + + #[derive(serde::Serialize, serde::Deserialize)] + struct RunningContainerRecord { + name: String, + image: String, + } + + #[test] + fn snapshot_roundtrip() { + let tmp = TempDir::new().unwrap(); + let path = tmp.path().join("running-containers.json"); + + let snapshot = ContainerSnapshot { + timestamp: 1700000000, + containers: vec![ + RunningContainerRecord { + name: "bitcoin-knots".to_string(), + image: "bitcoin-knots:28.1".to_string(), + }, + RunningContainerRecord { + name: "lnd".to_string(), + image: "lnd:0.18.5".to_string(), + }, + ], + }; + + let json = serde_json::to_string_pretty(&snapshot).unwrap(); + std::fs::write(&path, &json).unwrap(); + + let loaded_json = std::fs::read_to_string(&path).unwrap(); + let loaded: ContainerSnapshot = serde_json::from_str(&loaded_json).unwrap(); + + assert_eq!(loaded.containers.len(), 2); + assert_eq!(loaded.containers[0].name, "bitcoin-knots"); + } + + #[test] + fn user_stopped_filtering() { + let user_stopped: std::collections::HashSet<String> = + ["grafana".to_string()].into(); + + let snapshot_containers = vec![ + "bitcoin-knots".to_string(), + "lnd".to_string(), + "grafana".to_string(), + ]; + + let to_recover: Vec<&String> = snapshot_containers + .iter() + .filter(|name| 
!user_stopped.contains(name.as_str())) + .collect(); + + assert_eq!(to_recover.len(), 2); + assert!(!to_recover.iter().any(|n| n.as_str() == "grafana")); + } + + #[test] + fn boot_tier_ordering() { + fn boot_tier(name: &str) -> u8 { + let id = name.strip_prefix("archy-").unwrap_or(name); + match id { + "btcpay-db" | "mempool-db" => 0, + "bitcoin-knots" | "bitcoin-core" => 1, + "lnd" | "electrumx" => 2, + "mempool-web" | "bitcoin-ui" | "lnd-ui" => 4, + _ => 3, + } + } + + let mut containers = vec![ + "mempool-web", + "lnd", + "btcpay-db", + "bitcoin-knots", + "grafana", + ]; + + containers.sort_by_key(|name| boot_tier(name)); + + assert_eq!(containers[0], "btcpay-db"); + assert_eq!(containers[1], "bitcoin-knots"); + assert_eq!(containers[2], "lnd"); + assert_eq!(containers[3], "grafana"); + assert_eq!(containers[4], "mempool-web"); + } +} diff --git a/docs/MASTER_PLAN.md b/docs/MASTER_PLAN.md index 42d321d2..e6aef58e 100644 --- a/docs/MASTER_PLAN.md +++ b/docs/MASTER_PLAN.md @@ -18,6 +18,7 @@ | **TASK-12** | **Beta telemetry — reporter + toggle + collector POST** | **P1** | IN PROGRESS | - | | **TASK-39** | **Finish .198 rootless container migration** | **P1** | PLANNED | TASK-11 | | **TASK-42** | **LUKS2 full-partition encryption for /var/lib/archipelago/** | **P1** | IN PROGRESS | - | +| **TASK-49** | **Container app reliability — bulletproof installs + recovery** | **P0** | PLANNED | - | | **BUG-44** | **App iframe shows blank/broken when container is starting or crashed** | **P2** | PLANNED | - | | **TASK-45** | **Deploy script: auto-chown data dirs after rootful→rootless migration** | **P2** | PLANNED | - | | **BUG-46** | **FileBrowser missing in unbundled ISO + Cloud auto-login broken** | **P1** | IN PROGRESS | - | @@ -149,6 +150,99 @@ Encrypt all Archipelago app data at rest using LUKS2 full-partition encryption. 
- `core/archipelago/src/api/rpc/system.rs` — password change handler - `core/archipelago/src/server.rs` — startup checks +### TASK-49: Container app reliability — bulletproof installs + recovery (PLANNED) +**Priority**: P0 — Critical +**Status**: PLANNED (2026-03-29) + +Every marketplace app must install cleanly, survive failures, auto-recover from unhealthy states, and uninstall without residue. Currently: some apps fail silently, health checks are inconsistent, and there's no systematic testing. + +**Scope**: All 25+ marketplace apps — install, health, restart, uninstall, dependency chains. + +#### Phase A: Audit & Fix Install Flow (Days 1-2) +Test every app install on a fresh .198 node. Fix failures as found. + +- [ ] **A1**: Create install test matrix — spreadsheet of all apps with columns: installs?, starts?, healthy?, UI loads?, uninstalls?, deps correct? +- [ ] **A2**: Test core apps: Bitcoin Knots, LND, Mempool, BTCPay, Electrumx, FileBrowser +- [ ] **A3**: Test recommended apps: Fedimint, Vaultwarden, Grafana, SearXNG, Tailscale, Portainer +- [ ] **A4**: Test optional apps: Home Assistant, Jellyfin, PhotoPrism, Nextcloud, Ollama, Immich, Penpot, OnlyOffice +- [ ] **A5**: Test web-only/L484 apps: noStrudel, BotFights, NWNN, IndeedHub, DWN +- [ ] **A6**: Test Nostr relay (nostr-rs-relay) install + relay functionality +- [ ] **A7**: Fix all install failures found in A2-A6 + +#### Phase B: Health Checks & Restart Policies (Days 2-3) +Ensure every container has proper health checks and restart policies. 
+ +- [ ] **B1**: Audit all container manifests for `--health-cmd`, `--health-interval`, `--health-retries` +- [ ] **B2**: Add health checks to containers missing them (curl endpoint or process check) +- [ ] **B3**: Verify `--restart unless-stopped` on all containers +- [ ] **B4**: Test failure recovery: `podman kill <container>` → verify auto-restart +- [ ] **B5**: Test OOM recovery: set low memory limit → trigger OOM → verify restart +- [ ] **B6**: Verify container-doctor.sh runs on timer and fixes unhealthy containers +- [ ] **B7**: Verify reconcile-containers.sh detects and recreates missing containers + +#### Phase C: Dependency Chain Validation (Day 3) +Apps with dependencies (BTCPay→Bitcoin+Postgres, Mempool→Bitcoin+MariaDB) must handle missing deps gracefully. + +- [ ] **C1**: Map all dependency chains (which app needs which) +- [ ] **C2**: Test installing dependent app without dependency → verify error message +- [ ] **C3**: Test stopping dependency while dependent is running → verify graceful degradation +- [ ] **C4**: Test restarting dependency → verify dependent reconnects automatically +- [ ] **C5**: Ensure backend `dependency_resolver.rs` handles all chains correctly + +#### Phase D: Uninstall & Cleanup (Day 4) +Every app must uninstall cleanly — no orphaned volumes, networks, or config. + +- [ ] **D1**: Test uninstall for each app — verify container, volumes, config removed +- [ ] **D2**: Verify no orphaned podman volumes after uninstall (`podman volume ls`) +- [ ] **D3**: Verify no orphaned networks after uninstall +- [ ] **D4**: Test reinstall after uninstall — must work cleanly +- [ ] **D5**: Fix any cleanup issues found + +#### Phase E: Stress & Soak Testing (Day 5) +Multi-day uptime test with all core apps running. 
+ +- [ ] **E1**: Install all core + recommended apps on .198 +- [ ] **E2**: Let run for 24h — check for crashes, memory leaks, disk growth +- [ ] **E3**: Simulate power failure (hard reboot) — verify all apps come back +- [ ] **E4**: Simulate network failure — verify apps recover when network returns +- [ ] **E5**: Run container-doctor after soak test — should report all healthy + +#### Phase E2: FileBrowser Auto-Login (Day 5) +FileBrowser must auto-login seamlessly after install — user should never see a separate login screen. Still protected via nginx session cookie validation. + +- [ ] **E2a**: Fix FileBrowser auto-login flow: nginx auth_request validates Archipelago session, injects FileBrowser auth token +- [ ] **E2b**: Verify auto-login works on fresh bundled install (first boot) +- [ ] **E2c**: Verify auto-login works on unbundled install (Marketplace install) +- [ ] **E2d**: Verify FileBrowser is NOT accessible without valid Archipelago session (security) +- [ ] **E2e**: Test auto-login after session expiry → re-login to Archipelago → FileBrowser works again + +#### Phase F: Frontend UX (Day 5-6) +The UI must accurately reflect container state at all times. 
+ +- [ ] **F1**: Installing state persists across navigation (DONE — TASK-49 server store) +- [ ] **F2**: App card shows correct state: stopped, starting, running, unhealthy, crashed +- [ ] **F3**: App iframe shows contextual error when container is down (BUG-44) +- [ ] **F4**: Uninstall progress shown in My Apps +- [ ] **F5**: Error toast when install fails with actionable message + +**Key files**: +- `core/archipelago/src/container/` — PodmanClient, manifests, health +- `core/archipelago/src/api/rpc/package/` — install/uninstall RPC handlers +- `scripts/container-doctor.sh` — health check + auto-fix +- `scripts/reconcile-containers.sh` — recreate missing containers +- `scripts/image-versions.sh` — pinned image versions +- `scripts/first-boot-containers.sh` — first-boot container creation +- `neode-ui/src/views/marketplace/` — install UI +- `neode-ui/src/views/apps/` — My Apps state display + +**Testing approach**: +- Fresh .198 install as test bed +- SSH in, run installs via web UI, check with `podman ps -a` +- Automated: `scripts/container-doctor.sh --local` after each test +- Manual: kill containers, pull power, break networks, verify recovery + +--- + ### BUG-44: App iframe shows blank/broken when container is starting or crashed (PLANNED) **Priority**: P2 — Medium **Status**: PLANNED (2026-03-21) diff --git a/neode-ui/src/views/Login.vue b/neode-ui/src/views/Login.vue index 98925561..ca57337a 100644 --- a/neode-ui/src/views/Login.vue +++ b/neode-ui/src/views/Login.vue @@ -15,7 +15,8 @@

- {{ t('login.setupTitle') }} +   + {{ t('login.setupTitle') }} {{ t('login.title') }}

@@ -38,8 +39,16 @@ {{ error }} + +
+ + + + +
+ -