diff --git a/core/archipelago/src/api/rpc/package/install.rs b/core/archipelago/src/api/rpc/package/install.rs index e7cda442..c3eda09b 100644 --- a/core/archipelago/src/api/rpc/package/install.rs +++ b/core/archipelago/src/api/rpc/package/install.rs @@ -120,113 +120,133 @@ impl RpcHandler { false }; - // Check if container already exists - let check_output = tokio::process::Command::new("podman") - .args([ - "ps", - "-a", - "--format", - "{{.Names}}", - "--filter", - &format!("name=^{}$", package_id), - ]) - .output() - .await - .context("Failed to check existing containers")?; + // For orchestrator-managed apps, skip the legacy "container exists → + // adopt + return" probe entirely. The orchestrator's own install path + // (below) calls ensure_running which: + // - no-ops if the container is already up and healthy, + // - removes + reinstalls if the container is broken, + // - actually verifies health via the manifest's health check + // (whereas the legacy adopt block returns success on a podman + // `start` exit-0, even if the process inside crashed seconds + // later — the .228 bitcoin "running but unreachable" failure + // mode). + // The adoption block is being phased out as apps move to the + // orchestrator path. Non-orchestrator apps still hit it. + let orchestrator_managed = + should_try_orchestrator_install(package_id, self.orchestrator.is_some()); - if !String::from_utf8_lossy(&check_output.stdout) - .trim() - .is_empty() - { - // Container already exists (e.g. created by first-boot) — adopt it - info!( - "Container {} already exists, adopting as installed", - package_id - ); - install_log(&format!( - "INSTALL ADOPT: {} — container already exists", - package_id - )) - .await; - - // Check container state - let state_output = tokio::process::Command::new("podman") - .args(["inspect", package_id, "--format", "{{.State.Status}}"]) + // Check if container already exists (legacy adoption — non-orchestrator + // apps only). + if !orchestrator_managed { + let check_output = tokio::process::Command::new("podman") + .args([ + "ps", + "-a", + "--format", + "{{.Names}}", + "--filter", + &format!("name=^{}$", package_id), + ]) .output() .await - .context("Failed to inspect existing container")?; - let state = String::from_utf8_lossy(&state_output.stdout) - .trim() - .to_string(); + .context("Failed to check existing containers")?; - if state == "running" && repaired_bitcoin_conf { + if !String::from_utf8_lossy(&check_output.stdout) + .trim() + .is_empty() + { + // Container already exists (e.g. created by first-boot) — adopt it info!( - "Restarting existing container {} after bitcoin.conf RPC repair", + "Container {} already exists, adopting as installed", package_id ); - let restart_output = tokio::process::Command::new("podman") - .args(["restart", package_id]) + install_log(&format!( + "INSTALL ADOPT: {} — container already exists", + package_id + )) + .await; + + // Check container state + let state_output = tokio::process::Command::new("podman") + .args(["inspect", package_id, "--format", "{{.State.Status}}"]) .output() .await - .context("Failed to restart existing container after bitcoin.conf repair")?; - if !restart_output.status.success() { - let stderr = String::from_utf8_lossy(&restart_output.stderr); - install_log(&format!( - "INSTALL ADOPT FAIL: {} - restart after RPC repair failed: {}", - package_id, stderr - )) - .await; - return Err(anyhow::anyhow!( - "Container {} exists but failed to restart after RPC repair: {}", - package_id, - stderr - )); - } - let _ = tokio::process::Command::new("podman") - .args(["restart", "archy-bitcoin-ui"]) - .output() - .await; - wait_for_adopted_container(package_id, package_id).await?; - } else if state != "running" { - // Start the stopped/exited container - info!("Starting existing container {} (was {})", package_id, state); - let start_output = tokio::process::Command::new("podman") - .args(["start", package_id]) - .output() - .await - .context("Failed to start existing container")?; - if !start_output.status.success() { - let stderr = String::from_utf8_lossy(&start_output.stderr); - install_log(&format!( - "INSTALL ADOPT FAIL: {} — start failed: {}", - package_id, stderr - )) - .await; - return Err(anyhow::anyhow!( - "Container {} exists but failed to start: {}", - package_id, - stderr - )); + .context("Failed to inspect existing container")?; + let state = String::from_utf8_lossy(&state_output.stdout) + .trim() + .to_string(); + + if state == "running" && repaired_bitcoin_conf { + info!( + "Restarting existing container {} after bitcoin.conf RPC repair", + package_id + ); + let restart_output = tokio::process::Command::new("podman") + .args(["restart", package_id]) + .output() + .await + .context( + "Failed to restart existing container after bitcoin.conf repair", + )?; + if !restart_output.status.success() { + let stderr = String::from_utf8_lossy(&restart_output.stderr); + install_log(&format!( + "INSTALL ADOPT FAIL: {} - restart after RPC repair failed: {}", + package_id, stderr + )) + .await; + return Err(anyhow::anyhow!( + "Container {} exists but failed to restart after RPC repair: {}", + package_id, + stderr + )); + } + let _ = tokio::process::Command::new("podman") + .args(["restart", "archy-bitcoin-ui"]) + .output() + .await; + wait_for_adopted_container(package_id, package_id).await?; + } else if state != "running" { + // Start the stopped/exited container + info!("Starting existing container {} (was {})", package_id, state); + let start_output = tokio::process::Command::new("podman") + .args(["start", package_id]) + .output() + .await + .context("Failed to start existing container")?; + if !start_output.status.success() { + let stderr = String::from_utf8_lossy(&start_output.stderr); + install_log(&format!( + "INSTALL ADOPT FAIL: {} — start failed: {}", + package_id, stderr + )) + .await; + return Err(anyhow::anyhow!( + "Container {} exists but failed to start: {}", + package_id, + stderr + )); + } + + wait_for_adopted_container(package_id, package_id).await?; } - wait_for_adopted_container(package_id, package_id).await?; + install_log(&format!( + "INSTALL ADOPT OK: {} — already running", + package_id + )) + .await; + return Ok(serde_json::json!({ + "success": true, + "package_id": package_id, + "message": format!("Package {} already installed and running", package_id) + })); } - - install_log(&format!( - "INSTALL ADOPT OK: {} — already running", - package_id - )) - .await; - return Ok(serde_json::json!({ - "success": true, - "package_id": package_id, - "message": format!("Package {} already installed and running", package_id) - })); } // Preferred path for apps already modeled in the production orchestrator. // Keep legacy install flow as default while migration is in progress. - if should_try_orchestrator_install(package_id, self.orchestrator.is_some()) { + if orchestrator_managed { let orchestrator_app_id = orchestrator_install_app_id(package_id); self.set_install_phase(package_id, InstallPhase::CreatingContainer) .await; diff --git a/core/archipelago/src/bitcoin_status.rs b/core/archipelago/src/bitcoin_status.rs index d4ecf282..c0e16107 100644 --- a/core/archipelago/src/bitcoin_status.rs +++ b/core/archipelago/src/bitcoin_status.rs @@ -132,11 +132,10 @@ async fn fetch_bitcoin_status() -> Result { .await .context("getindexinfo") .ok(); - let zmq_notifications = - bitcoin_rpc_call(&client, "getzmqnotifications", serde_json::json!([])) - .await - .context("getzmqnotifications") - .ok(); + let zmq_notifications = bitcoin_rpc_call(&client, "getzmqnotifications", serde_json::json!([])) + .await + .context("getzmqnotifications") + .ok(); Ok(BitcoinNodeStatus { ok: true, diff --git a/core/archipelago/src/container/boot_reconciler.rs b/core/archipelago/src/container/boot_reconciler.rs index 44727f9e..3678edbf 100644 --- a/core/archipelago/src/container/boot_reconciler.rs +++ b/core/archipelago/src/container/boot_reconciler.rs @@ -103,9 +103,7 @@ impl BootReconciler { return; } let installed = self.orchestrator.manifest_ids().await; - for (companion, err) in - crate::container::companion::reconcile(&installed).await - { + for (companion, err) in crate::container::companion::reconcile(&installed).await { tracing::warn!( companion = %companion, error = %err, @@ -260,7 +258,8 @@ mod tests { let orch = orch_with_one_running_manifest(rt.clone()).await; let shutdown = Arc::new(Notify::new()); let reconciler = - BootReconciler::new(orch.clone(), Duration::from_secs(30), shutdown.clone()).without_companion_stage(); + BootReconciler::new(orch.clone(), Duration::from_secs(30), shutdown.clone()) + .without_companion_stage(); let handle = tokio::spawn(reconciler.run_forever()); // Yield so the spawned task gets CPU to run its initial reconcile. @@ -284,7 +283,8 @@ mod tests { let orch = orch_with_one_running_manifest(rt.clone()).await; let shutdown = Arc::new(Notify::new()); let reconciler = - BootReconciler::new(orch.clone(), Duration::from_secs(30), shutdown.clone()).without_companion_stage(); + BootReconciler::new(orch.clone(), Duration::from_secs(30), shutdown.clone()) + .without_companion_stage(); let handle = tokio::spawn(reconciler.run_forever()); tokio::task::yield_now().await; @@ -313,7 +313,8 @@ mod tests { let orch = orch_with_one_running_manifest(rt.clone()).await; let shutdown = Arc::new(Notify::new()); let reconciler = - BootReconciler::new(orch.clone(), Duration::from_secs(30), shutdown.clone()).without_companion_stage(); + BootReconciler::new(orch.clone(), Duration::from_secs(30), shutdown.clone()) + .without_companion_stage(); let handle = tokio::spawn(reconciler.run_forever()); tokio::task::yield_now().await; tokio::task::yield_now().await; @@ -347,7 +348,8 @@ mod tests { .await; let shutdown = Arc::new(Notify::new()); let reconciler = - BootReconciler::new(orch.clone(), Duration::from_secs(30), shutdown.clone()).without_companion_stage(); + BootReconciler::new(orch.clone(), Duration::from_secs(30), shutdown.clone()) + .without_companion_stage(); let handle = tokio::spawn(reconciler.run_forever()); tokio::task::yield_now().await; diff --git a/core/archipelago/src/container/companion.rs b/core/archipelago/src/container/companion.rs index 1bb8e7e2..d394929f 100644 --- a/core/archipelago/src/container/companion.rs +++ b/core/archipelago/src/container/companion.rs @@ -26,9 +26,7 @@ use tokio::fs; use tokio::process::Command; use tracing::{info, warn}; -use crate::container::quadlet::{ - self, BindMount, NetworkMode, QuadletUnit, -}; +use crate::container::quadlet::{self, BindMount, NetworkMode, QuadletUnit}; use archipelago_container::image_uses_insecure_registry; const COMPANION_REGISTRY: &str = "146.59.87.168:3000/lfg2025"; @@ -152,7 +150,10 @@ pub async fn remove_for(package_id: &str) { pub async fn install_one(spec: &CompanionSpec) -> Result<()> { if let Some(hook) = spec.pre_start { hook().await.with_context(|| { - format!("pre-start hook failed for {} — companion will not start", spec.name) + format!( + "pre-start hook failed for {} — companion will not start", + spec.name + ) })?; } let image = ensure_image_present(spec).await?; @@ -284,7 +285,10 @@ pub async fn reconcile(installed_apps: &[String]) -> Vec<(String, anyhow::Error) match needs_repair(spec).await { Ok(false) => {} Ok(true) => { - info!(companion = spec.name, "reconcile: companion not active, repairing"); + info!( + companion = spec.name, + "reconcile: companion not active, repairing" + ); if let Err(e) = install_one(spec).await { failures.push((spec.name.to_string(), e)); } diff --git a/core/archipelago/src/container/prod_orchestrator.rs b/core/archipelago/src/container/prod_orchestrator.rs index 381e3d1b..d4daf77b 100644 --- a/core/archipelago/src/container/prod_orchestrator.rs +++ b/core/archipelago/src/container/prod_orchestrator.rs @@ -707,11 +707,41 @@ enum HookOutcome { #[async_trait] impl ContainerOrchestrator for ProdContainerOrchestrator { async fn install(&self, app_id: &str) -> Result { + // Idempotent: if the container is already up and healthy, just + // refresh hooks and return. If it's stopped, start it. If it's + // missing or in a wedged state, install fresh. + // + // The RPC install handler used to do its own "container exists → + // adopt + return" probe before calling here, but that path skipped + // health verification (the .228 "running but unreachable" failure + // mode). Routing every install through here means the orchestrator + // is the one source of truth for what "installed" means. let lm = self.loaded(app_id).await?; - let lock = self.app_lock(app_id).await; - let _guard = lock.lock().await; - self.install_fresh(&lm).await?; - Ok(compute_container_name(&lm.manifest)) + let name = compute_container_name(&lm.manifest); + // ensure_running takes the per-app lock itself; release the install + // path lock first if we hold one (we don't — install is the entry + // point). Just delegate. + let action = self.ensure_running(&lm).await?; + match action { + ReconcileAction::NoOp + | ReconcileAction::Started + | ReconcileAction::Installed => Ok(name), + ReconcileAction::Left(state) => { + // Container is in a wedged state (Created / Paused / Unknown). + // Force-recreate so the install RPC has a clean outcome. + tracing::warn!( + app_id = %app_id, + state = %state, + "install: container in wedged state, force-recreating" + ); + let lock = self.app_lock(app_id).await; + let _guard = lock.lock().await; + let _ = self.runtime.stop_container(&name).await; + let _ = self.runtime.remove_container(&name).await; + self.install_fresh(&lm).await?; + Ok(name) + } + } } async fn start(&self, app_id: &str) -> Result<()> { diff --git a/core/archipelago/src/electrs_status.rs b/core/archipelago/src/electrs_status.rs index ecc6662c..c046637c 100644 --- a/core/archipelago/src/electrs_status.rs +++ b/core/archipelago/src/electrs_status.rs @@ -86,11 +86,9 @@ pub fn spawn_status_cache() { fresh.progress_pct = cached.progress_pct; } fresh.stale = true; - fresh.error = Some( - fresh - .error - .unwrap_or_else(|| "ElectrumX is reconnecting; showing last known indexed height.".to_string()), - ); + fresh.error = Some(fresh.error.unwrap_or_else(|| { + "ElectrumX is reconnecting; showing last known indexed height.".to_string() + })); } *cached = fresh; drop(cached); @@ -373,16 +371,16 @@ async fn fetch_electrs_sync_status() -> ElectrsSyncStatus { Err(e) => match electrumx_log_indexed_height().await { Ok(h) if h > 0 => h, _ => { - let err_msg = e.to_string(); - if is_transient_error(&err_msg) { - // ElectrumX is starting up or busy — estimate from data size - let progress_pct = if data_bytes > 0 { - ((data_bytes as f64 / ESTIMATED_FULL_INDEX_BYTES) * 100.0).min(99.0) - } else { - 0.0 - }; - let size_str = index_size.clone().unwrap_or_else(|| "0 MB".to_string()); - return ElectrsSyncStatus { + let err_msg = e.to_string(); + if is_transient_error(&err_msg) { + // ElectrumX is starting up or busy — estimate from data size + let progress_pct = if data_bytes > 0 { + ((data_bytes as f64 / ESTIMATED_FULL_INDEX_BYTES) * 100.0).min(99.0) + } else { + 0.0 + }; + let size_str = index_size.clone().unwrap_or_else(|| "0 MB".to_string()); + return ElectrsSyncStatus { indexed_height: 0, bitcoin_height: bitcoin_blocks, network_height, @@ -396,21 +394,21 @@ async fn fetch_electrs_sync_status() -> ElectrsSyncStatus { index_size, tor_onion, }; + } + // Genuine unexpected error + warn!("ElectrumX status: unexpected error: {}", err_msg); + return ElectrsSyncStatus { + indexed_height: 0, + bitcoin_height: bitcoin_blocks, + network_height, + progress_pct: 0.0, + status: "error".to_string(), + stale: false, + error: Some(format!("ElectrumX: {}", err_msg)), + index_size, + tor_onion, + }; } - // Genuine unexpected error - warn!("ElectrumX status: unexpected error: {}", err_msg); - return ElectrsSyncStatus { - indexed_height: 0, - bitcoin_height: bitcoin_blocks, - network_height, - progress_pct: 0.0, - status: "error".to_string(), - stale: false, - error: Some(format!("ElectrumX: {}", err_msg)), - index_size, - tor_onion, - }; - } }, };