diff --git a/core/archipelago/src/api/rpc/container.rs b/core/archipelago/src/api/rpc/container.rs index a5fba05c..dafcbfa3 100644 --- a/core/archipelago/src/api/rpc/container.rs +++ b/core/archipelago/src/api/rpc/container.rs @@ -1,4 +1,5 @@ use super::package::validate_app_id; +use super::transitional::Op; use super::RpcHandler; use anyhow::{Context, Result}; @@ -62,11 +63,6 @@ impl RpcHandler { &self, params: Option, ) -> Result { - let orchestrator = self - .orchestrator - .as_ref() - .ok_or_else(|| anyhow::anyhow!("Container orchestrator not available"))?; - let params = params.ok_or_else(|| anyhow::anyhow!("Missing params"))?; let app_id = params .get("app_id") @@ -74,23 +70,23 @@ impl RpcHandler { .ok_or_else(|| anyhow::anyhow!("Missing app_id"))?; validate_app_id(app_id)?; - orchestrator - .start(app_id) - .await - .context("Failed to start container")?; + // User explicitly started the app — clear the user-stopped marker so + // crash recovery / health monitor won't second-guess it. Must happen + // BEFORE the spawn (see runtime.rs:145-148 for the symmetric stop + // side and the ordering contract crash recovery depends on). + crate::crash_recovery::clear_user_stopped(&self.config.data_dir, app_id).await; - Ok(serde_json::json!({ "status": "started" })) + // spawn_transitional returns as soon as the background task is + // launched (<1s). The UI sees Starting… immediately via WebSocket. + self.spawn_transitional(Op::Start, app_id.to_string()).await?; + + Ok(serde_json::json!({ "status": "starting" })) } pub(super) async fn handle_container_stop( &self, params: Option, ) -> Result { - let orchestrator = self - .orchestrator - .as_ref() - .ok_or_else(|| anyhow::anyhow!("Container orchestrator not available"))?; - let params = params.ok_or_else(|| anyhow::anyhow!("Missing params"))?; let app_id = params .get("app_id") @@ -98,12 +94,40 @@ impl RpcHandler { .ok_or_else(|| anyhow::anyhow!("Missing app_id"))?; validate_app_id(app_id)?; - orchestrator - .stop(app_id) - .await - .context("Failed to stop container")?; + // Mark as user-stopped BEFORE the spawn — ordering is load-bearing + // (crash recovery / health monitor inspect this flag concurrently + // with the in-flight stop; see runtime.rs:145-148 for the package + // path that also writes this in the same order). + crate::crash_recovery::mark_user_stopped(&self.config.data_dir, app_id).await; - Ok(serde_json::json!({ "status": "stopped" })) + // podman stop -t 600 (bitcoin-core) / -t 330 (lnd) runs in the + // background; the RPC returns now with "stopping". + self.spawn_transitional(Op::Stop, app_id.to_string()).await?; + + Ok(serde_json::json!({ "status": "stopping" })) + } + + pub(super) async fn handle_container_restart( + &self, + params: Option, + ) -> Result { + let params = params.ok_or_else(|| anyhow::anyhow!("Missing params"))?; + let app_id = params + .get("app_id") + .and_then(|v| v.as_str()) + .ok_or_else(|| anyhow::anyhow!("Missing app_id"))?; + validate_app_id(app_id)?; + + // Restart does not mark user-stopped (the user wants the app to + // keep running). Clear the marker as a defensive measure in case a + // prior stop left it set and the restart is intended to revive the + // normal running state. + crate::crash_recovery::clear_user_stopped(&self.config.data_dir, app_id).await; + + self.spawn_transitional(Op::Restart, app_id.to_string()) + .await?; + + Ok(serde_json::json!({ "status": "restarting" })) } pub(super) async fn handle_container_remove( @@ -145,12 +169,25 @@ impl RpcHandler { .package_data .iter() .map(|(id, pkg)| { + // Keep this mapping in sync with the UI's + // ContainerStatus.state union in + // neode-ui/src/api/container-client.ts. The UI maps + // transitional variants to single-button labels + // (Stopping… / Starting… / Restarting…). let state = match &pkg.state { crate::data_model::PackageState::Running => "running", crate::data_model::PackageState::Stopped => "stopped", crate::data_model::PackageState::Exited => "exited", - crate::data_model::PackageState::Starting => "created", - _ => "unknown", + crate::data_model::PackageState::Starting => "starting", + crate::data_model::PackageState::Stopping => "stopping", + crate::data_model::PackageState::Restarting => "restarting", + crate::data_model::PackageState::Installing => "installing", + crate::data_model::PackageState::Installed => "installed", + crate::data_model::PackageState::Updating => "updating", + crate::data_model::PackageState::Removing => "removing", + crate::data_model::PackageState::CreatingBackup => "creating-backup", + crate::data_model::PackageState::RestoringBackup => "restoring-backup", + crate::data_model::PackageState::BackingUp => "backing-up", }; let lan = pkg .installed diff --git a/core/archipelago/src/api/rpc/dispatcher.rs b/core/archipelago/src/api/rpc/dispatcher.rs index 4536cfc7..d96dc7a2 100644 --- a/core/archipelago/src/api/rpc/dispatcher.rs +++ b/core/archipelago/src/api/rpc/dispatcher.rs @@ -36,6 +36,7 @@ impl RpcHandler { "container-install" => self.handle_container_install(params).await, "container-start" => self.handle_container_start(params).await, "container-stop" => self.handle_container_stop(params).await, + "container-restart" => self.handle_container_restart(params).await, "container-remove" => self.handle_container_remove(params).await, "container-list" => self.handle_container_list().await, "container-status" => self.handle_container_status(params).await, diff --git a/core/archipelago/src/api/rpc/package/runtime.rs b/core/archipelago/src/api/rpc/package/runtime.rs index 3253c11a..dd54105b 100644 --- a/core/archipelago/src/api/rpc/package/runtime.rs +++ b/core/archipelago/src/api/rpc/package/runtime.rs @@ -3,7 +3,10 @@ use super::dependencies::ordered_containers_for_start; use super::install::install_log; use super::validation::validate_app_id; use crate::api::rpc::RpcHandler; +use crate::data_model::PackageState; use anyhow::{Context, Result}; +use std::sync::Arc; +use tracing::warn; /// Per-container graceful shutdown timeout in seconds. /// Bitcoin Core needs 600s to flush UTXO set, LND 330s for channel state, @@ -25,6 +28,12 @@ pub fn stop_timeout_secs(container_name: &str) -> &'static str { impl RpcHandler { /// Start a package: start all containers in dependency order. + /// + /// Returns immediately with `{ "status": "starting" }` after flipping + /// the package state to `Starting` in the StateManager. The actual + /// podman-start sequence + post-start exit verification runs in a + /// background task. On success the state becomes `Running`; on error + /// it reverts to the pre-transition state. pub(in crate::api::rpc) async fn handle_package_start( &self, params: Option, @@ -42,83 +51,52 @@ impl RpcHandler { return Err(anyhow::anyhow!("No containers found for {}", package_id)); } - // Clear user-stopped flag — user explicitly started this app + // Clear user-stopped flag — user explicitly started this app. + // Must happen BEFORE the spawn (ordering contract with crash recovery). crate::crash_recovery::clear_user_stopped(&self.config.data_dir, package_id).await; for name in &to_start { crate::crash_recovery::clear_user_stopped(&self.config.data_dir, name).await; } + let package_id_owned = package_id.to_string(); + let state_manager = Arc::clone(&self.state_manager); + let pre_state = + flip_package_state(&state_manager, &package_id_owned, PackageState::Starting).await; + install_log(&format!( "START: {} (containers: {:?})", - package_id, to_start + package_id_owned, to_start )) .await; - let mut errors = Vec::new(); - for (i, name) in to_start.iter().enumerate() { - // Brief delay between dependent containers to allow initialization - if i > 0 { - tokio::time::sleep(std::time::Duration::from_secs(2)).await; - } - tracing::info!("Starting container: {}", name); - let out = tokio::process::Command::new("podman") - .args(["start", name]) - .output() - .await - .context(format!("Failed to exec podman start {}", name))?; - if !out.status.success() { - let stderr = String::from_utf8_lossy(&out.stderr).trim().to_string(); - tracing::error!("Failed to start {}: {}", name, stderr); - install_log(&format!("START FAIL: {} — {}", name, stderr)).await; - errors.push(format!("{}: {}", name, stderr)); - } - } - if !errors.is_empty() { - return Err(anyhow::anyhow!("Start failed: {}", errors.join("; "))); - } - - // Verify containers actually reached running state (podman start can - // succeed even if the container exits immediately after) - tokio::time::sleep(std::time::Duration::from_secs(3)).await; - for name in &to_start { - let status = tokio::process::Command::new("podman") - .args(["inspect", name, "--format", "{{.State.Status}}"]) - .output() - .await; - if let Ok(o) = status { - let state = String::from_utf8_lossy(&o.stdout).trim().to_string(); - if state == "exited" { - let logs = tokio::process::Command::new("podman") - .args(["logs", "--tail", "5", name]) - .output() + tokio::spawn(async move { + match do_package_start(&to_start).await { + Ok(()) => { + set_package_state(&state_manager, &package_id_owned, PackageState::Running) .await; - let log_text = logs - .map(|o| { - let combined = format!( - "{}{}", - String::from_utf8_lossy(&o.stdout), - String::from_utf8_lossy(&o.stderr) - ); - combined.chars().take(200).collect::() - }) - .unwrap_or_default(); - tracing::error!("Container {} exited after start: {}", name, log_text); - install_log(&format!("START EXITED: {} — {}", name, log_text)).await; - errors.push(format!("{}: exited after start", name)); + } + Err(e) => { + tracing::error!("package.start {} failed: {:#}", package_id_owned, e); + install_log(&format!("START FAIL: {} — {:#}", package_id_owned, e)).await; + if let Some(prev) = pre_state { + set_package_state(&state_manager, &package_id_owned, prev).await; + } else { + warn!( + "package.start {}: no pre-state recorded; relying on next scan", + package_id_owned + ); + } } } - } + }); - if !errors.is_empty() { - return Err(anyhow::anyhow!( - "Containers exited after start: {}", - errors.join("; ") - )); - } - Ok(serde_json::Value::Null) + Ok(serde_json::json!({ "status": "starting" })) } /// Stop a package: mark as user-stopped and stop all containers. + /// + /// Returns immediately with `{ "status": "stopping" }`. podman stop + /// (up to 600s for bitcoin-core) runs in the background. pub(in crate::api::rpc) async fn handle_package_stop( &self, params: Option, @@ -136,43 +114,48 @@ impl RpcHandler { return Err(anyhow::anyhow!("No containers found for {}", package_id)); } - install_log(&format!( - "STOP: {} (containers: {:?})", - package_id, containers - )) - .await; - // Mark as user-stopped so health monitor and crash recovery don't auto-restart + // Mark as user-stopped BEFORE the spawn so health monitor and + // crash recovery don't auto-restart mid-flight. Ordering is + // load-bearing — see runtime.rs:145-148 original note. crate::crash_recovery::mark_user_stopped(&self.config.data_dir, package_id).await; for name in &containers { crate::crash_recovery::mark_user_stopped(&self.config.data_dir, name).await; } - let mut errors = Vec::new(); - for name in &containers { - tracing::info!( - "Stopping container: {} (timeout: {}s)", - name, - stop_timeout_secs(name) - ); - let out = tokio::process::Command::new("podman") - .args(["stop", "-t", stop_timeout_secs(name), name]) - .output() - .await - .context(format!("Failed to exec podman stop {}", name))?; - if !out.status.success() { - let stderr = String::from_utf8_lossy(&out.stderr).trim().to_string(); - tracing::error!("Failed to stop {}: {}", name, stderr); - errors.push(format!("{}: {}", name, stderr)); - } - } + let package_id_owned = package_id.to_string(); + let state_manager = Arc::clone(&self.state_manager); + let pre_state = + flip_package_state(&state_manager, &package_id_owned, PackageState::Stopping).await; - if !errors.is_empty() { - return Err(anyhow::anyhow!("Stop failed: {}", errors.join("; "))); - } - Ok(serde_json::Value::Null) + install_log(&format!( + "STOP: {} (containers: {:?})", + package_id_owned, containers + )) + .await; + + tokio::spawn(async move { + match do_package_stop(&containers).await { + Ok(()) => { + set_package_state(&state_manager, &package_id_owned, PackageState::Stopped) + .await; + } + Err(e) => { + tracing::error!("package.stop {} failed: {:#}", package_id_owned, e); + install_log(&format!("STOP FAIL: {} — {:#}", package_id_owned, e)).await; + if let Some(prev) = pre_state { + set_package_state(&state_manager, &package_id_owned, prev).await; + } + } + } + }); + + Ok(serde_json::json!({ "status": "stopping" })) } /// Restart a package: restart all containers. + /// + /// Returns immediately with `{ "status": "restarting" }`. The restart + /// (up to 600s per container for bitcoin-core) runs in the background. pub(in crate::api::rpc) async fn handle_package_restart( &self, params: Option, @@ -190,55 +173,42 @@ impl RpcHandler { return Err(anyhow::anyhow!("No containers found for {}", package_id)); } + // Restart does not mark user-stopped; user wants the app to keep + // running. Clear any lingering marker so downstream layers don't + // interpret the brief podman stop as user intent. + crate::crash_recovery::clear_user_stopped(&self.config.data_dir, package_id).await; + for name in &containers { + crate::crash_recovery::clear_user_stopped(&self.config.data_dir, name).await; + } + + let package_id_owned = package_id.to_string(); + let state_manager = Arc::clone(&self.state_manager); + let pre_state = + flip_package_state(&state_manager, &package_id_owned, PackageState::Restarting).await; + install_log(&format!( "RESTART: {} (containers: {:?})", - package_id, containers + package_id_owned, containers )) .await; - let mut errors = Vec::new(); - for name in &containers { - tracing::info!("Restarting container: {}", name); - let out = tokio::process::Command::new("podman") - .args(["restart", "-t", stop_timeout_secs(name), name]) - .output() - .await - .context(format!("Failed to exec podman restart {}", name))?; - if !out.status.success() { - let stderr = String::from_utf8_lossy(&out.stderr).trim().to_string(); - tracing::warn!( - "podman restart {} failed: {}, trying stop+start", - name, - stderr - ); - - // Fallback: stop then start (handles rootless podman loopback issues) - let _ = tokio::process::Command::new("podman") - .args(["stop", "-t", stop_timeout_secs(name), name]) - .output() - .await; - let start_out = tokio::process::Command::new("podman") - .args(["start", name]) - .output() - .await - .context(format!("Failed to exec podman start {}", name))?; - - if !start_out.status.success() { - let start_err = String::from_utf8_lossy(&start_out.stderr) - .trim() - .to_string(); - tracing::error!("stop+start {} also failed: {}", name, start_err); - errors.push(format!("{}: {}", name, start_err)); - } else { - tracing::info!("Restarted {} via stop+start fallback", name); + tokio::spawn(async move { + match do_package_restart(&containers).await { + Ok(()) => { + set_package_state(&state_manager, &package_id_owned, PackageState::Running) + .await; + } + Err(e) => { + tracing::error!("package.restart {} failed: {:#}", package_id_owned, e); + install_log(&format!("RESTART FAIL: {} — {:#}", package_id_owned, e)).await; + if let Some(prev) = pre_state { + set_package_state(&state_manager, &package_id_owned, prev).await; + } } } - } + }); - if !errors.is_empty() { - return Err(anyhow::anyhow!("Restart failed: {}", errors.join("; "))); - } - Ok(serde_json::Value::Null) + Ok(serde_json::json!({ "status": "restarting" })) } /// Uninstall a package: stop and remove all related containers, clean data. @@ -579,3 +549,186 @@ impl RpcHandler { Ok(serde_json::json!({ "status": "stopped", "app_id": app_id })) } } + +// --------------------------------------------------------------------------- +// Background workers for async package lifecycle RPCs. +// +// Extracted from the pre-async RPC handlers so the transitional state is +// visible to the UI immediately. Each worker is pure IO over podman + the +// crash_recovery helpers — no StateManager access here so we don't need +// a handler reference. The caller does state flipping before/after. +// --------------------------------------------------------------------------- + +/// Start containers in dependency order. Includes the post-start 3s wait + +/// exit-check verification from the original synchronous handler (critical +/// for catching "podman start succeeded but container immediately exited" +/// failure modes). +async fn do_package_start(to_start: &[String]) -> Result<()> { + let mut errors = Vec::new(); + for (i, name) in to_start.iter().enumerate() { + if i > 0 { + tokio::time::sleep(std::time::Duration::from_secs(2)).await; + } + tracing::info!("Starting container: {}", name); + let out = tokio::process::Command::new("podman") + .args(["start", name]) + .output() + .await + .context(format!("Failed to exec podman start {}", name))?; + if !out.status.success() { + let stderr = String::from_utf8_lossy(&out.stderr).trim().to_string(); + tracing::error!("Failed to start {}: {}", name, stderr); + install_log(&format!("START FAIL: {} — {}", name, stderr)).await; + errors.push(format!("{}: {}", name, stderr)); + } + } + if !errors.is_empty() { + return Err(anyhow::anyhow!("Start failed: {}", errors.join("; "))); + } + + // Post-start exit verification (podman start can succeed even if the + // container exits immediately after). + tokio::time::sleep(std::time::Duration::from_secs(3)).await; + for name in to_start { + let status = tokio::process::Command::new("podman") + .args(["inspect", name, "--format", "{{.State.Status}}"]) + .output() + .await; + if let Ok(o) = status { + let state = String::from_utf8_lossy(&o.stdout).trim().to_string(); + if state == "exited" { + let logs = tokio::process::Command::new("podman") + .args(["logs", "--tail", "5", name]) + .output() + .await; + let log_text = logs + .map(|o| { + let combined = format!( + "{}{}", + String::from_utf8_lossy(&o.stdout), + String::from_utf8_lossy(&o.stderr) + ); + combined.chars().take(200).collect::() + }) + .unwrap_or_default(); + tracing::error!("Container {} exited after start: {}", name, log_text); + install_log(&format!("START EXITED: {} — {}", name, log_text)).await; + errors.push(format!("{}: exited after start", name)); + } + } + } + if !errors.is_empty() { + return Err(anyhow::anyhow!( + "Containers exited after start: {}", + errors.join("; ") + )); + } + Ok(()) +} + +/// Stop all containers with their per-container graceful-shutdown timeout. +async fn do_package_stop(containers: &[String]) -> Result<()> { + let mut errors = Vec::new(); + for name in containers { + tracing::info!( + "Stopping container: {} (timeout: {}s)", + name, + stop_timeout_secs(name) + ); + let out = tokio::process::Command::new("podman") + .args(["stop", "-t", stop_timeout_secs(name), name]) + .output() + .await + .context(format!("Failed to exec podman stop {}", name))?; + if !out.status.success() { + let stderr = String::from_utf8_lossy(&out.stderr).trim().to_string(); + tracing::error!("Failed to stop {}: {}", name, stderr); + errors.push(format!("{}: {}", name, stderr)); + } + } + if !errors.is_empty() { + return Err(anyhow::anyhow!("Stop failed: {}", errors.join("; "))); + } + Ok(()) +} + +/// Restart via `podman restart`, falling back to stop+start when restart +/// fails (rootless podman loopback issues). +async fn do_package_restart(containers: &[String]) -> Result<()> { + let mut errors = Vec::new(); + for name in containers { + tracing::info!("Restarting container: {}", name); + let out = tokio::process::Command::new("podman") + .args(["restart", "-t", stop_timeout_secs(name), name]) + .output() + .await + .context(format!("Failed to exec podman restart {}", name))?; + + if !out.status.success() { + let stderr = String::from_utf8_lossy(&out.stderr).trim().to_string(); + tracing::warn!( + "podman restart {} failed: {}, trying stop+start", + name, + stderr + ); + // Fallback: stop then start + let _ = tokio::process::Command::new("podman") + .args(["stop", "-t", stop_timeout_secs(name), name]) + .output() + .await; + let start_out = tokio::process::Command::new("podman") + .args(["start", name]) + .output() + .await + .context(format!("Failed to exec podman start {}", name))?; + if !start_out.status.success() { + let start_err = String::from_utf8_lossy(&start_out.stderr) + .trim() + .to_string(); + tracing::error!("stop+start {} also failed: {}", name, start_err); + errors.push(format!("{}: {}", name, start_err)); + } else { + tracing::info!("Restarted {} via stop+start fallback", name); + } + } + } + if !errors.is_empty() { + return Err(anyhow::anyhow!("Restart failed: {}", errors.join("; "))); + } + Ok(()) +} + +/// Flip the primary package entry's state and return the pre-transition +/// state for revert on error. Mirrors `transitional::flip_to_transitional` +/// but lives here because the package path keys by `package_id` (which may +/// differ from the container name used by orchestrator-level entries). +async fn flip_package_state( + state_manager: &crate::state::StateManager, + package_id: &str, + transitional: PackageState, +) -> Option { + let (mut data, _) = state_manager.get_snapshot().await; + let prev = data.package_data.get(package_id).map(|e| e.state.clone()); + if let Some(entry) = data.package_data.get_mut(package_id) { + entry.state = transitional; + state_manager.update_data(data).await; + } + prev +} + +/// Write the package entry's final state. No-op if the entry has since +/// been removed (uninstall race). +async fn set_package_state( + state_manager: &crate::state::StateManager, + package_id: &str, + new_state: PackageState, +) { + let (mut data, _) = state_manager.get_snapshot().await; + if let Some(entry) = data.package_data.get_mut(package_id) { + if entry.state != new_state { + entry.state = new_state; + state_manager.update_data(data).await; + } + } +} +