diff --git a/apps/electrumx/manifest.yml b/apps/electrumx/manifest.yml index 14ce4c46..5b985b7b 100644 --- a/apps/electrumx/manifest.yml +++ b/apps/electrumx/manifest.yml @@ -51,6 +51,20 @@ app: - CACHE_MB=1024 - MAX_SEND=10000000 + # The ElectrumX dashboard tile is served by the host-networked companion UI + # (archy-electrs-ui) on port 50002, NOT by this container. Declaring it here + # lets the catalog generator emit electrumx -> 50002 into GENERATED_APP_PORTS + # so the tile resolves a launch URL without relying on the hand-maintained + # override in appSessionConfig.ts (which the generator can clobber). The + # backend only validates this block — it does not proxy/health-check it. + interfaces: + main: + name: Web UI + description: ElectrumX server status and connection details + type: ui + port: 50002 + protocol: http + health_check: type: tcp endpoint: localhost:50001 diff --git a/core/archipelago/src/api/mod.rs b/core/archipelago/src/api/mod.rs index 47d8c90c..47740a4e 100644 --- a/core/archipelago/src/api/mod.rs +++ b/core/archipelago/src/api/mod.rs @@ -1,4 +1,4 @@ mod handler; -mod rpc; +pub(crate) mod rpc; pub use handler::ApiHandler; diff --git a/core/archipelago/src/api/rpc/bitcoin_relay.rs b/core/archipelago/src/api/rpc/bitcoin_relay.rs index 57a222e0..b7eaf421 100644 --- a/core/archipelago/src/api/rpc/bitcoin_relay.rs +++ b/core/archipelago/src/api/rpc/bitcoin_relay.rs @@ -107,7 +107,7 @@ struct TrustedRelayPeer { } #[derive(Debug, Clone)] -struct TxRelayCredentials { +pub(crate) struct TxRelayCredentials { username: String, password: String, } @@ -648,7 +648,13 @@ async fn txrelay_credentials_available(data_dir: &Path) -> bool { && fs::metadata(&client_env_path).await.is_ok() } -async fn ensure_txrelay_credentials(data_dir: &Path) -> Result { +/// Idempotently ensure the tx-relay credential trio exists in the secrets dir: +/// the random password, its derived `rpcauth` line, and the client env file. +/// Bitcoin backend manifests reference `bitcoin-rpc-txrelay-rpcauth` as a +/// required `secret_env`, so this must run before bitcoind starts — otherwise +/// secret resolution hard-fails and the whole Bitcoin stack cascades (the .198 +/// failure). Safe to call repeatedly; it only writes what's missing or stale. +pub(crate) async fn ensure_txrelay_credentials(data_dir: &Path) -> Result { let (password_path, rpcauth_path, client_env_path) = txrelay_secret_paths(data_dir); let password = match read_trimmed(&password_path).await { Some(value) => value, diff --git a/core/archipelago/src/api/rpc/lnd/wallet.rs b/core/archipelago/src/api/rpc/lnd/wallet.rs index 84193973..2501ddaa 100644 --- a/core/archipelago/src/api/rpc/lnd/wallet.rs +++ b/core/archipelago/src/api/rpc/lnd/wallet.rs @@ -9,9 +9,15 @@ use super::LND_REST_BASE_URL; impl RpcHandler { /// Generate a new on-chain Bitcoin address. pub(in crate::api::rpc) async fn handle_lnd_newaddress(&self) -> Result { - let (client, macaroon_hex) = self.lnd_client().await?; + let (client, macaroon_hex) = self.lnd_client().await.map_err(|e| { + tracing::warn!(error = %format!("{e:#}"), "LND newaddress: client/macaroon unavailable"); + receive_error( + RECEIVE_WALLET_UNINITIALIZED, + "The Lightning wallet isn't set up on this node yet. Finish wallet setup, then try again.", + ) + })?; - let resp = client + let resp = match client .get(format!("{LND_REST_BASE_URL}/v1/newaddress")) // LND's REST gateway parses `type` as the AddressType enum by its // proto name (or integer), NOT the lncli aliases. "p2wkh" is not a @@ -21,13 +27,26 @@ impl RpcHandler { .header("Grpc-Metadata-macaroon", &macaroon_hex) .send() .await - .context("LND REST connection failed")?; + { + Ok(resp) => resp, + Err(e) => { + // The .116 case: LND container is up but its REST endpoint isn't + // reachable on the expected port (e.g. published-port drift), so + // the connection is refused. This is NOT a locked wallet — emit a + // distinct code so the UI stops mislabelling it. + tracing::warn!(error = %format!("{e:#}"), "LND newaddress: REST connection failed"); + return Err(receive_error( + RECEIVE_REST_UNREACHABLE, + "The Lightning wallet service isn't reachable yet. It may be starting up or recovering — please try again in a moment.", + )); + } + }; let status = resp.status(); let raw_body = resp .text() .await - .context("LND address response could not be read")?; + .context("Bitcoin address response could not be read")?; let body: serde_json::Value = serde_json::from_str(&raw_body).unwrap_or_else(|_| { serde_json::json!({ "raw": raw_body, @@ -36,11 +55,9 @@ impl RpcHandler { if !status.is_success() { let message = lnd_error_message(&body); - anyhow::bail!( - "Bitcoin address generation failed ({}): {}", - status, - message - ); + let code = classify_lnd_address_error(&message); + tracing::warn!(%status, lnd_message = %message, code, "LND newaddress returned an error"); + return Err(receive_error(code, default_receive_detail(code))); } if let Some(error) = body @@ -48,14 +65,21 @@ impl RpcHandler { .or_else(|| body.get("message")) .and_then(|v| v.as_str()) { - anyhow::bail!("Bitcoin address generation failed: {}", error); + let code = classify_lnd_address_error(error); + tracing::warn!(lnd_message = %error, code, "LND newaddress returned an error body"); + return Err(receive_error(code, default_receive_detail(code))); } let address = body .get("address") .and_then(|v| v.as_str()) .filter(|addr| !addr.trim().is_empty()) - .ok_or_else(|| anyhow::anyhow!("Bitcoin address generation failed: LND did not return a Bitcoin address. The wallet may still be locked, uninitialized, or waiting for Bitcoin to sync."))? + .ok_or_else(|| { + receive_error( + RECEIVE_WALLET_UNINITIALIZED, + "The wallet didn't return an address yet. It may still be unlocking or waiting for Bitcoin to sync — please try again shortly.", + ) + })? .to_string(); Ok(serde_json::json!({ "address": address })) @@ -583,9 +607,75 @@ fn lnd_error_message(body: &serde_json::Value) -> String { .to_string() } +// Stable, machine-readable reason codes for receive-address failures. They are +// embedded in the error message as a `[CODE]` token so the frontend +// (neode-ui/src/utils/bitcoinReceive.ts) can show an accurate explanation +// instead of guessing wallet state by substring-matching — which is what made +// .228 report "wallet is locked" when LND's REST was merely unreachable. +// +// Every receive error string starts with "Bitcoin address" so it survives the +// RPC error sanitizer (api/rpc/middleware.rs) unchanged rather than being +// flattened to the generic "Operation failed" message (the .116 symptom). +pub(crate) const RECEIVE_REST_UNREACHABLE: &str = "LND_REST_UNREACHABLE"; +pub(crate) const RECEIVE_WALLET_LOCKED: &str = "LND_WALLET_LOCKED"; +pub(crate) const RECEIVE_WALLET_UNINITIALIZED: &str = "LND_WALLET_UNINITIALIZED"; +pub(crate) const RECEIVE_SYNCING: &str = "LND_SYNCING"; +pub(crate) const RECEIVE_LND_ERROR: &str = "LND_ERROR"; + +/// Build a receive-address error carrying a reason code the UI can map. +fn receive_error(code: &str, detail: &str) -> anyhow::Error { + anyhow::anyhow!("Bitcoin address unavailable [{code}]: {detail}") +} + +/// A sensible default human message per code (used for non-UI callers and logs; +/// the frontend renders its own copy from the code). +fn default_receive_detail(code: &str) -> &'static str { + match code { + RECEIVE_REST_UNREACHABLE => { + "The Lightning wallet service isn't reachable yet. It may be starting up or recovering — please try again in a moment." + } + RECEIVE_WALLET_LOCKED => { + "The Lightning wallet is locked. Unlock it (or finish wallet setup), then try again." + } + RECEIVE_WALLET_UNINITIALIZED => { + "The Lightning wallet isn't set up yet. Finish wallet setup, then try again." + } + RECEIVE_SYNCING => { + "The wallet is still syncing with the Bitcoin network. Please try again once it has caught up." + } + _ => "Couldn't generate a Bitcoin address right now. Please try again shortly.", + } +} + +/// Classify a non-2xx LND error body/message into a reason code. The wording of +/// LND's REST errors is stable enough to bucket: a locked wallet, an +/// uninitialized wallet, a syncing chain, or some other failure. +fn classify_lnd_address_error(message: &str) -> &'static str { + let m = message.to_lowercase(); + if m.contains("locked") || m.contains("unlock") { + RECEIVE_WALLET_LOCKED + } else if m.contains("synchroniz") + || m.contains("syncing") + || m.contains("not yet ready") + || m.contains("in the process of starting") + { + RECEIVE_SYNCING + } else if m.contains("wallet not found") + || m.contains("not exist") + || m.contains("uninitialized") + || m.contains("not initialized") + || m.contains("create a wallet") + || m.contains("no wallet") + { + RECEIVE_WALLET_UNINITIALIZED + } else { + RECEIVE_LND_ERROR + } +} + #[cfg(test)] mod tests { - use super::lnd_error_message; + use super::*; #[test] fn lnd_error_message_prefers_message_field() { @@ -604,4 +694,46 @@ mod tests { "unknown LND error" ); } + + #[test] + fn classify_locked_wallet() { + assert_eq!( + classify_lnd_address_error("wallet locked, please unlock"), + RECEIVE_WALLET_LOCKED + ); + } + + #[test] + fn classify_uninitialized_wallet() { + assert_eq!( + classify_lnd_address_error("wallet not found, create a wallet first"), + RECEIVE_WALLET_UNINITIALIZED + ); + } + + #[test] + fn classify_syncing() { + assert_eq!( + classify_lnd_address_error("server is still in the process of starting"), + RECEIVE_SYNCING + ); + } + + #[test] + fn classify_unknown_is_generic_error() { + assert_eq!( + classify_lnd_address_error("some other failure"), + RECEIVE_LND_ERROR + ); + } + + #[test] + fn receive_error_starts_with_sanitizer_safe_prefix_and_embeds_code() { + // Must start with "Bitcoin address" (survives the RPC error sanitizer) + // and carry the [CODE] token the frontend parses. + let err = receive_error(RECEIVE_REST_UNREACHABLE, "unreachable"); + let s = format!("{err}"); + assert!(s.starts_with("Bitcoin address"), "got: {s}"); + assert!(s.contains("[LND_REST_UNREACHABLE]"), "got: {s}"); + } } diff --git a/core/archipelago/src/container/prod_orchestrator.rs b/core/archipelago/src/container/prod_orchestrator.rs index db4153a6..132289a9 100644 --- a/core/archipelago/src/container/prod_orchestrator.rs +++ b/core/archipelago/src/container/prod_orchestrator.rs @@ -297,6 +297,53 @@ async fn wait_for_manifest_host_ports(manifest: &AppManifest, timeout_secs: u64) Ok(()) } +/// Pure published-port drift check. `port_bindings_json` is the JSON that +/// `podman inspect --format '{{json .HostConfig.PortBindings}}'` emits, e.g. +/// `{"8080/tcp":[{"HostIp":"","HostPort":"18080"}]}`. Returns true only when a +/// manifest container-port is positively published to a *different* host port +/// than the manifest now asks for. Absence of a binding is deliberately NOT +/// treated as drift here — that case is handled by the host-port repair/restart +/// path and by host-networked apps that publish nothing — so we never trigger a +/// destructive recreate on a false positive. +fn host_port_bindings_drifted( + port_bindings_json: &str, + manifest_ports: &[archipelago_container::manifest::PortMapping], +) -> bool { + let parsed: serde_json::Value = match serde_json::from_str(port_bindings_json) { + Ok(v) => v, + Err(_) => return false, + }; + let Some(map) = parsed.as_object() else { + return false; + }; + for port in manifest_ports { + let proto = if port.protocol.is_empty() { + "tcp" + } else { + port.protocol.as_str() + }; + let key = format!("{}/{}", port.container, proto); + let Some(bindings) = map.get(&key).and_then(|b| b.as_array()) else { + // Container-port not currently published — not our case. + continue; + }; + if bindings.is_empty() { + continue; + } + let expected = port.host.to_string(); + let matches_expected = bindings.iter().any(|b| { + b.get("HostPort") + .and_then(|h| h.as_str()) + .map(|h| h == expected) + .unwrap_or(false) + }); + if !matches_expected { + return true; + } + } + false +} + async fn ensure_user_podman_socket() -> Result<()> { let socket_path = "/run/user/1000/podman/podman.sock"; if podman_socket_accepts_connections(socket_path).await { @@ -734,8 +781,19 @@ struct FileSecretsProvider { impl SecretsProvider for FileSecretsProvider { fn read(&self, name: &str) -> std::result::Result { let path = self.root.join(name); - let data = std::fs::read_to_string(&path).map_err(ManifestError::Io)?; - Ok(data.trim().to_string()) + match std::fs::read_to_string(&path) { + Ok(data) => Ok(data.trim().to_string()), + // Name the missing secret explicitly so the failure is actionable + // instead of a bare "IO error: No such file or directory" that hides + // which secret (and which app) is blocked. + Err(e) if e.kind() == std::io::ErrorKind::NotFound => { + Err(ManifestError::Invalid(format!( + "required secret '{name}' is missing (expected at {}); the app cannot start until it is generated", + path.display() + ))) + } + Err(e) => Err(ManifestError::Io(e)), + } } } @@ -1054,6 +1112,7 @@ impl ProdContainerOrchestrator { let lock = self.app_lock(&app_id).await; let _guard = lock.lock().await; + self.ensure_app_secrets(&app_id).await?; let mut resolved_manifest = lm.manifest.clone(); self.resolve_dynamic_env(&mut resolved_manifest)?; let name = compute_container_name(&lm.manifest); @@ -1094,6 +1153,22 @@ impl ProdContainerOrchestrator { self.run_post_start_hooks(&app_id).await?; return Ok(ReconcileAction::Started); } + // Published-port drift means the container exists but maps + // its ports to the wrong host ports (e.g. lnd REST stuck on + // host 8080 while the manifest/clients expect 18080, as seen + // on .116). The container is already non-functional, so + // recreate it even for restart-sensitive apps during boot — + // leaving it "untouched" would perpetuate the breakage. + if self + .container_ports_drifted(&name, &resolved_manifest) + .await + { + tracing::info!(app_id = %app_id, container = %name, "container published-port drift detected — recreating"); + let _ = self.runtime.stop_container(&name).await; + let _ = self.runtime.remove_container(&name).await; + self.install_fresh(lm).await?; + return Ok(ReconcileAction::Installed); + } if self.container_env_drifted(&name, &resolved_manifest).await { if mode == ReconcileMode::ExistingOnly && is_restart_sensitive_app(&app_id) @@ -1154,8 +1229,12 @@ impl ProdContainerOrchestrator { // reading it. A Rewritten outcome is fine here — we're // about to start from a stopped state anyway. self.prepare_for_start(&resolved_manifest).await?; - if self.container_env_drifted(&name, &resolved_manifest).await { - tracing::info!(app_id = %app_id, container = %name, "stopped container env drift detected — recreating"); + if self.container_env_drifted(&name, &resolved_manifest).await + || self + .container_ports_drifted(&name, &resolved_manifest) + .await + { + tracing::info!(app_id = %app_id, container = %name, "stopped container env/port drift detected — recreating"); let _ = self.runtime.remove_container(&name).await; self.install_fresh(lm).await?; return Ok(ReconcileAction::Installed); @@ -1297,6 +1376,7 @@ impl ProdContainerOrchestrator { /// Build-or-pull, create, start. Assumes the per-app mutex is already held. async fn install_fresh(&self, lm: &LoadedManifest) -> Result<()> { + self.ensure_app_secrets(&lm.manifest.app.id).await?; let mut resolved_manifest = lm.manifest.clone(); self.resolve_dynamic_env(&mut resolved_manifest)?; @@ -2300,6 +2380,25 @@ impl ProdContainerOrchestrator { Self::detect_disk_gb() } + /// Ensure app-specific secrets exist *before* env resolution. The Bitcoin + /// backends reference `bitcoin-rpc-txrelay-rpcauth` as a required + /// `secret_env`; it is normally created by the tx-relay flow, so nodes that + /// never used tx-relay lack it and `resolve_secret_env` hard-fails — taking + /// bitcoind (and everything that depends on it) down. Generating it here, + /// idempotently, lets reconcile/install self-heal that state (the .198 case) + /// instead of cascading. Must be called before every `resolve_dynamic_env`. + async fn ensure_app_secrets(&self, app_id: &str) -> Result<()> { + if cfg!(test) { + return Ok(()); + } + if matches!(app_id, "bitcoin-knots" | "bitcoin-core" | "bitcoin") { + crate::api::rpc::bitcoin_relay::ensure_txrelay_credentials(&self.data_dir) + .await + .context("ensuring bitcoin tx-relay credentials")?; + } + Ok(()) + } + fn resolve_dynamic_env(&self, manifest: &mut AppManifest) -> Result<()> { let facts = self.detect_host_facts(); let mut env = manifest.app.environment.clone(); @@ -2443,6 +2542,42 @@ impl ProdContainerOrchestrator { false } + /// True when a *running* container publishes a manifest container-port to a + /// different host port than the manifest now asks for (published-port + /// drift). This catches the class of failure seen on .116, where `lnd` was + /// created mapping host 8080 -> container 8080 but the current manifest maps + /// host 18080 -> container 8080, so every in-process REST client (which + /// connects to the manifest port) gets connection-refused forever while the + /// container looks "Up". `container_env_drifted` never inspects ports, and + /// `wait_for_manifest_host_ports` only restarts the stale container (which + /// republishes the wrong mapping), so without this check the drift is + /// self-perpetuating. + async fn container_ports_drifted(&self, name: &str, manifest: &AppManifest) -> bool { + if cfg!(test) { + return false; + } + if manifest.app.ports.is_empty() { + return false; + } + let inspect = tokio::process::Command::new("podman") + .args([ + "inspect", + name, + "--format", + "{{json .HostConfig.PortBindings}}", + ]) + .output() + .await; + let Ok(output) = inspect else { + return false; + }; + if !output.status.success() { + return false; + } + let bindings = String::from_utf8_lossy(&output.stdout); + host_port_bindings_drifted(&bindings, &manifest.app.ports) + } + async fn apply_data_uid(&self, manifest: &AppManifest) -> Result<()> { let Some(uid_gid) = manifest.app.container.data_uid.as_ref() else { return Ok(()); @@ -2752,6 +2887,7 @@ impl ContainerOrchestrator for ProdContainerOrchestrator { let lm = self.loaded(app_id).await?; let lock = self.app_lock(app_id).await; let _guard = lock.lock().await; + self.ensure_app_secrets(app_id).await?; let name = compute_container_name(&lm.manifest); let mut resolved_manifest = lm.manifest.clone(); self.resolve_dynamic_env(&mut resolved_manifest)?; @@ -2913,6 +3049,61 @@ mod tests { use async_trait::async_trait; use std::sync::Mutex as StdMutex; + fn port(host: u16, container: u16) -> archipelago_container::manifest::PortMapping { + archipelago_container::manifest::PortMapping { + host, + container, + protocol: "tcp".to_string(), + } + } + + #[test] + fn port_drift_detected_when_host_port_differs() { + // The .116 case: container publishes container-port 8080 on host 8080, + // but the manifest now asks for host 18080. + let bindings = r#"{"8080/tcp":[{"HostIp":"","HostPort":"8080"}]}"#; + assert!(host_port_bindings_drifted(bindings, &[port(18080, 8080)])); + } + + #[test] + fn no_drift_when_host_port_matches() { + let bindings = r#"{"8080/tcp":[{"HostIp":"0.0.0.0","HostPort":"18080"}]}"#; + assert!(!host_port_bindings_drifted(bindings, &[port(18080, 8080)])); + } + + #[test] + fn no_drift_when_binding_absent() { + // Absence is handled elsewhere (host-port repair / host-networked apps); + // never treat it as drift to avoid a destructive recreate on a false + // positive. + assert!(!host_port_bindings_drifted("{}", &[port(18080, 8080)])); + assert!(!host_port_bindings_drifted("null", &[port(18080, 8080)])); + } + + #[test] + fn no_drift_on_unparseable_bindings() { + assert!(!host_port_bindings_drifted( + "not json", + &[port(18080, 8080)] + )); + } + + #[test] + fn missing_secret_error_names_the_secret() { + use archipelago_container::manifest::SecretsProvider; + let provider = FileSecretsProvider { + root: PathBuf::from("/nonexistent-secrets-dir-xyz"), + }; + let err = provider + .read("bitcoin-rpc-txrelay-rpcauth") + .expect_err("missing secret must error"); + let msg = err.to_string(); + assert!( + msg.contains("bitcoin-rpc-txrelay-rpcauth"), + "error should name the missing secret, got: {msg}" + ); + } + /// Instrumented in-memory runtime. Every call is recorded so tests can assert /// the exact sequence of side effects. #[derive(Default)] diff --git a/neode-ui/src/utils/__tests__/bitcoinReceive.test.ts b/neode-ui/src/utils/__tests__/bitcoinReceive.test.ts index dbc74c16..393eac8d 100644 --- a/neode-ui/src/utils/__tests__/bitcoinReceive.test.ts +++ b/neode-ui/src/utils/__tests__/bitcoinReceive.test.ts @@ -21,4 +21,40 @@ describe('explainReceiveAddressFailure', () => { it('keeps bitcoin address generation failures visible', () => { expect(explainReceiveAddressFailure(new Error('Bitcoin address generation failed: LND is not ready'))).toContain('Bitcoin address generation failed') }) + + describe('structured reason codes (backend [CODE] token)', () => { + it('maps an unreachable REST endpoint to a starting/recovering message — NOT locked (.228 regression)', () => { + const msg = explainReceiveAddressFailure( + new Error('Bitcoin address unavailable [LND_REST_UNREACHABLE]: service not reachable'), + ) + expect(msg).toContain('starting up or recovering') + expect(msg.toLowerCase()).not.toContain('locked') + }) + + it('maps a genuinely locked wallet to the locked message', () => { + expect( + explainReceiveAddressFailure(new Error('Bitcoin address unavailable [LND_WALLET_LOCKED]: locked')), + ).toContain('locked') + }) + + it('maps an uninitialized wallet to the setup message', () => { + expect( + explainReceiveAddressFailure(new Error('Bitcoin address unavailable [LND_WALLET_UNINITIALIZED]: x')), + ).toContain('has not been set up') + }) + + it('maps a syncing wallet to the syncing message', () => { + expect( + explainReceiveAddressFailure(new Error('Bitcoin address unavailable [LND_SYNCING]: x')), + ).toContain('syncing') + }) + + it('prefers the code over substrings even when the detail text is misleading', () => { + // Detail mentions neither "locked" nor "unlock"; code must still win. + const msg = explainReceiveAddressFailure( + new Error('Bitcoin address unavailable [LND_REST_UNREACHABLE]: wallet service down'), + ) + expect(msg).toContain('starting up or recovering') + }) + }) }) diff --git a/neode-ui/src/utils/bitcoinReceive.ts b/neode-ui/src/utils/bitcoinReceive.ts index deb87ee5..71a133ef 100644 --- a/neode-ui/src/utils/bitcoinReceive.ts +++ b/neode-ui/src/utils/bitcoinReceive.ts @@ -1,5 +1,29 @@ +// Machine-readable reason codes the backend embeds as a `[CODE]` token in +// receive-address errors (see core .../api/rpc/lnd/wallet.rs). Mapping the code +// directly is precise — unlike the substring heuristics below, it cannot +// mislabel an unreachable-REST failure as "wallet is locked" (the .228 bug). +const RECEIVE_CODE_MESSAGES: Record = { + LND_REST_UNREACHABLE: + 'Bitcoin address is not ready yet because the Lightning wallet service is still starting up or recovering. Please try again in a moment.', + LND_WALLET_LOCKED: + 'Bitcoin address is not ready because the Lightning wallet is locked. Unlock or initialize LND first.', + LND_WALLET_UNINITIALIZED: + 'Bitcoin address is not ready because the Lightning wallet has not been set up yet. Finish wallet setup, then try again.', + LND_SYNCING: + 'Bitcoin address is not ready while the wallet is still syncing with the Bitcoin network. Try again once sync has progressed.', + LND_ERROR: + 'Bitcoin address is not ready yet. Check that the Lightning app is healthy, then try again.', +} + export function explainReceiveAddressFailure(error: unknown): string { const message = error instanceof Error ? error.message : String(error || '') + + // Prefer the structured reason code when present. + const codeMatch = message.match(/\[([A-Z_]+)\]/) + if (codeMatch && RECEIVE_CODE_MESSAGES[codeMatch[1]]) { + return RECEIVE_CODE_MESSAGES[codeMatch[1]] + } + const lower = message.toLowerCase() if (lower.includes('wallet') && (lower.includes('locked') || lower.includes('unlock'))) { diff --git a/neode-ui/src/views/appSession/generatedAppSessionConfig.ts b/neode-ui/src/views/appSession/generatedAppSessionConfig.ts index de37486b..fc59c34b 100644 --- a/neode-ui/src/views/appSession/generatedAppSessionConfig.ts +++ b/neode-ui/src/views/appSession/generatedAppSessionConfig.ts @@ -7,6 +7,7 @@ export const GENERATED_APP_PORTS: Record = { "botfights": 9100, "btcpay-server": 23000, "did-wallet": 8083, + "electrumx": 50002, "fedimint": 8175, "filebrowser": 8083, "gitea": 3001, diff --git a/neode-ui/src/views/apps/appsConfig.ts b/neode-ui/src/views/apps/appsConfig.ts index 9d934721..8efc3f64 100644 --- a/neode-ui/src/views/apps/appsConfig.ts +++ b/neode-ui/src/views/apps/appsConfig.ts @@ -189,7 +189,14 @@ export function canLaunch(pkg: PackageDataEntry): boolean { if ((pkg.manifest.id === 'fedimint' || pkg.manifest.id === 'fedimintd') && hasUI) { return pkg.state === PackageState.Running || pkg.state === PackageState.Starting } - return !!hasUI && pkg.state === 'running' && pkg.health !== 'starting' && pkg.health !== 'unhealthy' + // A static launch URL (e.g. a host-networked companion UI like + // archy-electrs-ui) serves independently of the backend's own sync state, so + // the tile stays launchable while the backend is still 'starting' (ElectrumX + // indexes for 10m+ on first run). A genuinely 'unhealthy' backend still + // blocks. Apps that rely on a runtime interface-address keep the strict gate. + const blockedByHealth = + pkg.health === 'unhealthy' || (pkg.health === 'starting' && !hasKnownLaunchUrl) + return !!hasUI && pkg.state === 'running' && !blockedByHealth } export function launchBlockedReason(id: string, pkg?: PackageDataEntry | null): string { diff --git a/scripts/create-release.sh b/scripts/create-release.sh index 6c90f02d..da012f39 100755 --- a/scripts/create-release.sh +++ b/scripts/create-release.sh @@ -75,6 +75,28 @@ if ! git -C "$PROJECT_ROOT" diff --quiet HEAD; then exit 1 fi +# ── Pre-flight test gate ────────────────────────────────────────────── +# A release must not ship if the static/frontend/backend checks fail. This +# runs the release gate harness (cargo fmt/check, catalog drift, vitest, and +# the focused cargo suites — incl. the receive/port-drift/secret regressions). +# Skipped on --dry-run, or set SKIP_RELEASE_TESTS=1 to bypass in an emergency. +# The lifecycle bats harness (tests/lifecycle/run-20x.sh) still runs separately +# against live nodes — see tests/lifecycle/TESTING.md. +if ! $DRY_RUN; then + if [ "${SKIP_RELEASE_TESTS:-0}" = "1" ]; then + echo "WARNING: SKIP_RELEASE_TESTS=1 — bypassing the pre-flight test gate" + elif [ -x "$PROJECT_ROOT/tests/release/run.sh" ]; then + echo "[0/7] Running release gate (tests/release/run.sh)..." + if ! "$PROJECT_ROOT/tests/release/run.sh"; then + echo "Error: release gate failed — aborting release. Fix the failing" + echo " stage, or re-run with SKIP_RELEASE_TESTS=1 to override." + exit 1 + fi + else + echo "WARNING: tests/release/run.sh not found/executable — skipping test gate" + fi +fi + # Check tag doesn't already exist if git -C "$PROJECT_ROOT" tag -l "v$VERSION" | grep -q "v$VERSION"; then echo "Error: Tag v$VERSION already exists" @@ -94,6 +116,7 @@ echo "" if $DRY_RUN; then echo "[DRY RUN] Would perform the following:" + echo " 0. Run pre-flight test gate (tests/release/run.sh) — aborts on failure" echo " 1. Update core/archipelago/Cargo.toml version to $VERSION" echo " 2. Update neode-ui/package.json version to $VERSION" echo " 3. Build backend (cargo build --release -p archipelago)" diff --git a/tests/lifecycle/TESTING.md b/tests/lifecycle/TESTING.md index 6a668acd..dcb28ea8 100644 --- a/tests/lifecycle/TESTING.md +++ b/tests/lifecycle/TESTING.md @@ -58,10 +58,29 @@ v1.7.52 tags. | L1 RPC | 70 | bitcoin-knots, lnd, electrumx, btcpay, mempool, fedimint, required-stack, package-update-smoke | ● for the 6 core apps | | L2 UI | 9 | ui-coverage | ● for dashboard + 7 proxy paths + bitcoin-ui:8334 | | L3 lifecycle survival | 14 | companion-survives-archipelago-restart, backend-survives-archipelago-restart, required-stack-destructive, use-quadlet-backends-install | ◐ companions ● ; backends ◐ regression-gate (will fail until Phase 3 Quadlet ships); quadlet post-condition gate ✅ skip-clean today, hard gate when flag flipped | +| L1 wallet-receive / drift / secrets | 5 | bitcoin-receive, port-drift, secret-completeness | ● guards the v1.7.9x wallet fleet failures | | L4 browser journey | 0 | none | ○ not started | | L5 chaos | 0 | none | ○ not started | | L6 performance | 0 | none | ○ not started | +### Wallet / Bitcoin fleet-failure regression suites (added after v1.7.90-alpha) + +Three production failures shipped on v1.7.90-alpha despite the existing harness, +because nothing exercised the receive path, port-mapping drift, or secret +completeness on a live node. New suites close those gaps (all run on the archy +host, read-only, so they join `run.sh`/`run-20x.sh` automatically): + +| Suite | Failure it guards | Asserts | +|---|---|---| +| `bitcoin-receive.bats` | .116 ("Operation failed" on receive) and .228 (false "wallet is locked") | LND REST reachable on the **manifest** host port; `lnd.newaddress` returns a `bc1…` address on a running node; receive errors are specific, never the generic catch-all | +| `port-drift.bats` | .116 (lnd REST stuck on host 8080 vs manifest 18080) | every installed backend's live `podman inspect` PortBindings match its manifest `ports:` (the external mirror of the orchestrator's `host_port_bindings_drifted`) | +| `secret-completeness.bats` | .198 (bitcoin-knots needs `bitcoin-rpc-txrelay-rpcauth`, never generated → stack cascade) | every `secret_file` referenced by an installed backend manifest exists in the secrets dir | + +Backed by L0 unit tests (`cargo test … drift missing_secret lnd`) and a vitest +for the frontend reason-code mapping (`bitcoinReceive.test.ts`). The release +gate `scripts/create-release.sh` now runs `tests/release/run.sh` (which includes +these) and **aborts the release on failure** — previously it ran no tests at all. + ## Run commands ```bash diff --git a/tests/lifecycle/bats/bitcoin-receive.bats b/tests/lifecycle/bats/bitcoin-receive.bats new file mode 100644 index 00000000..278cbb73 --- /dev/null +++ b/tests/lifecycle/bats/bitcoin-receive.bats @@ -0,0 +1,104 @@ +#!/usr/bin/env bats +# tests/lifecycle/bats/bitcoin-receive.bats +# +# Regression coverage for the Bitcoin "Receive" flow. Receive addresses come +# from LND's hot wallet via the `lnd.newaddress` RPC, so this exercises the +# exact path that broke on the fleet: +# - .116: LND REST published on the wrong host port (8080 vs the manifest's +# 18080) -> connection refused -> receive failed with the generic +# "Operation failed. Check server logs." message. +# - .228: the same family surfaced to the UI as a *false* "wallet is locked". +# +# These tests run on the archy host (they shell into podman / curl localhost). +# +# Tiers: read-only only — generating a receive address is non-destructive. + +load '../lib/rpc.bash' + +setup_file() { + : "${ARCHY_PASSWORD:?Set ARCHY_PASSWORD env var to the UI password}" + export ARCHY_FORCE_LOGIN=1 + rpc_login + unset ARCHY_FORCE_LOGIN +} + +teardown_file() { + rpc_logout_local +} + +# Resolve the LND REST host port from the manifest (single source of truth) so +# this test follows the manifest rather than hard-coding 18080. +_lnd_rest_host_port() { + local mf + for mf in \ + "${ARCHIPELAGO_APPS_DIR:-/opt/archipelago/apps}/lnd/manifest.yml" \ + "${ARCHIPELAGO_APPS_DIR:-/opt/archipelago/apps}/lnd/manifest.yaml" \ + "$BATS_TEST_DIRNAME/../../../apps/lnd/manifest.yml"; do + [[ -r "$mf" ]] || continue + # The REST mapping is the `- host: ` whose following `container:` is 8080. + awk ' + /- host:/ { host=$3 } + /container:/ { if ($2 == 8080 && host != "") { print host; exit } } + ' "$mf" + return 0 + done +} + +_lnd_running() { + rpc_result container-list 2>/dev/null \ + | jq -e '.[] | select(.name == "lnd" and .state == "running")' >/dev/null 2>&1 +} + +# ──────────────────────────────────────────────────────────────────── +# Read-only tier +# ──────────────────────────────────────────────────────────────────── + +@test "LND REST is reachable on the manifest host port (catches port drift)" { + _lnd_running || skip "lnd not running" + local port + port=$(_lnd_rest_host_port) + [[ -n "$port" ]] || skip "could not resolve LND REST host port from manifest" + + # A TCP connect is enough: drift (container published on a different host + # port) shows up as connection-refused here, exactly as on .116. + run curl -sk -o /dev/null --max-time 8 "https://127.0.0.1:${port}/v1/getinfo" + if [ "$status" -ne 0 ]; then + echo "LND REST not reachable on host port ${port} (curl exit $status) — likely published-port drift" >&2 + return 1 + fi +} + +@test "lnd.newaddress returns a bech32 address when lnd is running" { + _lnd_running || skip "lnd not running" + + run rpc_call lnd.newaddress + [ "$status" -eq 0 ] + + local err addr + err=$(echo "$output" | jq -r '.error.message // .error // empty') + addr=$(echo "$output" | jq -r '.result.address // empty') + + # The whole point of the fix: a running lnd must hand back a real address. + if [[ -n "$err" ]]; then + echo "lnd.newaddress errored on a running node: $err" >&2 + return 1 + fi + if [[ "$addr" != bc1* ]]; then + echo "expected a bech32 (bc1…) address, got: '$addr'" >&2 + return 1 + fi +} + +@test "receive errors are specific, never the generic catch-all" { + # Even when receive legitimately can't produce an address, the message must be + # actionable (start with 'Bitcoin address' and/or carry a [CODE] token) — the + # generic 'Operation failed' is what hid the real cause on .116. + run rpc_call lnd.newaddress + [ "$status" -eq 0 ] + local err + err=$(echo "$output" | jq -r '.error.message // .error // empty') + if [[ "$err" == "Operation failed. Check server logs for details." ]]; then + echo "receive returned the generic catch-all instead of a specific reason" >&2 + return 1 + fi +} diff --git a/tests/lifecycle/bats/port-drift.bats b/tests/lifecycle/bats/port-drift.bats new file mode 100644 index 00000000..73bf2362 --- /dev/null +++ b/tests/lifecycle/bats/port-drift.bats @@ -0,0 +1,78 @@ +#!/usr/bin/env bats +# tests/lifecycle/bats/port-drift.bats +# +# Regression guard for the .116 failure class: a backend container that is +# "Up" but publishes its ports to the WRONG host ports because the manifest +# changed after the container was created (e.g. lnd REST stuck on host 8080 +# while the manifest — and every in-process client — expects 18080). +# +# This mirrors the orchestrator's `host_port_bindings_drifted` check, but from +# the outside: it compares the live `podman inspect` PortBindings against the +# manifest `ports:` for each installed backend. Runs on the archy host. +# +# Tiers: read-only. + +_apps_dir() { + local d + for d in "${ARCHIPELAGO_APPS_DIR:-}" /opt/archipelago/apps \ + "$BATS_TEST_DIRNAME/../../../apps"; do + [[ -n "$d" && -d "$d" ]] && { echo "$d"; return 0; } + done + return 1 +} + +_manifest_for() { + local app="$1" dir + dir=$(_apps_dir) || return 1 + local mf + for mf in "$dir/$app/manifest.yml" "$dir/$app/manifest.yaml"; do + [[ -r "$mf" ]] && { echo "$mf"; return 0; } + done + return 1 +} + +# Emit "host container" pairs from a manifest's ports: block. +_manifest_ports() { + awk ' + /^[[:space:]]*ports:/ { inports=1; next } + inports && /^[[:space:]]*[a-z_]+:[[:space:]]*$/ && !/protocol:|host:|container:/ { inports=0 } + inports && /- host:/ { host=$3 } + inports && /container:/ { print host, $2 } + ' "$1" +} + +# For a given container + (host,container) port, emit a "DRIFT: …" line on +# mismatch (and nothing otherwise). Stays silent for unpublished / host-net +# ports — those are handled elsewhere and must never be treated as drift. +_drift_line() { + local cname="$1" want_host="$2" cport="$3" + local bindings actual + bindings=$(podman inspect "$cname" --format '{{json .HostConfig.PortBindings}}' 2>/dev/null) || return 0 + actual=$(echo "$bindings" | jq -r --arg k "${cport}/tcp" '.[$k][]?.HostPort // empty' 2>/dev/null) + [[ -n "$actual" ]] || return 0 + echo "$actual" | grep -qx "$want_host" && return 0 + echo "DRIFT: $cname container-port $cport published on host [$actual] but manifest wants $want_host" +} + +@test "backend containers publish ports that match their manifest" { + command -v podman >/dev/null 2>&1 || skip "podman not available" + local checked=0 violations="" app cname mf line + # container-name : manifest-app-id + for pair in "lnd:lnd" "bitcoin-knots:bitcoin-knots" "electrumx:electrumx"; do + cname="${pair%%:*}"; app="${pair##*:}" + podman container exists "$cname" 2>/dev/null || continue + mf=$(_manifest_for "$app") || continue + while read -r host cport; do + [[ -n "$host" && -n "$cport" ]] || continue + checked=$((checked + 1)) + line=$(_drift_line "$cname" "$host" "$cport") + [[ -n "$line" ]] && violations+="${line}"$'\n' + done < <(_manifest_ports "$mf") + done + [[ "$checked" -gt 0 ]] || skip "no installed backend containers with published ports to check" + if [[ -n "$violations" ]]; then + echo "published-port drift detected:" >&2 + echo "$violations" >&2 + return 1 + fi +} diff --git a/tests/lifecycle/bats/secret-completeness.bats b/tests/lifecycle/bats/secret-completeness.bats new file mode 100644 index 00000000..cb0303ea --- /dev/null +++ b/tests/lifecycle/bats/secret-completeness.bats @@ -0,0 +1,73 @@ +#!/usr/bin/env bats +# tests/lifecycle/bats/secret-completeness.bats +# +# Regression guard for the .198 failure class: a manifest references a +# `secret_env.secret_file` that was never generated on the node, so secret +# resolution hard-fails and the container won't start — cascading the whole +# Bitcoin stack. (bitcoin-knots gained `bitcoin-rpc-txrelay-rpcauth`, which old +# nodes lacked, so bitcoind never came up and reinstall "just stopped".) +# +# For every installed backend, assert every secret_file it references exists in +# the secrets dir. Runs on the archy host. +# +# Tiers: read-only. + +SECRETS_DIR="${ARCHY_SECRETS_DIR:-/var/lib/archipelago/secrets}" + +_apps_dir() { + local d + for d in "${ARCHIPELAGO_APPS_DIR:-}" /opt/archipelago/apps \ + "$BATS_TEST_DIRNAME/../../../apps"; do + [[ -n "$d" && -d "$d" ]] && { echo "$d"; return 0; } + done + return 1 +} + +_manifest_for() { + local app="$1" dir mf + dir=$(_apps_dir) || return 1 + for mf in "$dir/$app/manifest.yml" "$dir/$app/manifest.yaml"; do + [[ -r "$mf" ]] && { echo "$mf"; return 0; } + done + return 1 +} + +_secret_files_in() { + # Emit each `secret_file:` value referenced by the manifest. + grep -E '^[[:space:]]*secret_file:' "$1" 2>/dev/null | awk '{print $2}' +} + +_secret_exists() { + local f="$SECRETS_DIR/$1" + [[ -e "$f" ]] && return 0 + sudo -n test -f "$f" 2>/dev/null +} + +@test "every installed backend's referenced secrets exist on disk" { + command -v podman >/dev/null 2>&1 || skip "podman not available" + [[ -d "$SECRETS_DIR" ]] || sudo -n test -d "$SECRETS_DIR" 2>/dev/null || skip "secrets dir not present" + + local checked=0 missing="" app cname mf sf + # container-name : manifest-app-id (the bitcoin stack that cascades) + for pair in \ + "bitcoin-knots:bitcoin-knots" "lnd:lnd" "electrumx:electrumx" \ + "mempool-api:mempool-api" "btcpay-server:btcpay-server" \ + "archy-nbxplorer:archy-nbxplorer" "fedimint:fedimint" \ + "fedimint-gateway:fedimint-gateway"; do + cname="${pair%%:*}"; app="${pair##*:}" + podman container exists "$cname" 2>/dev/null || continue + mf=$(_manifest_for "$app") || continue + while read -r sf; do + [[ -n "$sf" ]] || continue + checked=$((checked + 1)) + _secret_exists "$sf" || missing+="${app} -> ${sf}\n" + done < <(_secret_files_in "$mf") + done + + [[ "$checked" -gt 0 ]] || skip "no installed backends with secret references to check" + if [[ -n "$missing" ]]; then + echo "installed apps reference missing secrets:" >&2 + echo -e "$missing" >&2 + return 1 + fi +} diff --git a/tests/release/run.sh b/tests/release/run.sh index 66495b26..5716857b 100755 --- a/tests/release/run.sh +++ b/tests/release/run.sh @@ -85,14 +85,17 @@ fi stage "cargo-check" timeout 580 cargo check --manifest-path core/Cargo.toml -p archipelago # Focused suites for the subsystems this release train touched: # update:: — OTA download/apply/rollback/probe (v1.7.89 hardening) -# lnd — receive address + wallet readiness (v1.7.85–.89) +# lnd — receive address + wallet readiness (v1.7.85–.89), incl. the +# structured receive-error reason-code classifier # container::image_versions — image pinning / false-update detection # scanner — RAII in-flight guard (v1.7.84) +# drift — published-port drift detection (the .116 self-heal) +# missing_secret — secret-resolution names the missing file (the .198 fix) # 1500s: the non-incremental test-profile compile alone takes ~9 min on the # .116 ThinkPad; 580s expires mid-compile (exit 124) before a single test runs. stage "cargo-test-weekly" timeout 1500 env CARGO_INCREMENTAL=0 \ cargo test --manifest-path core/Cargo.toml -p archipelago -- \ - update:: lnd container::image_versions scanner + update:: lnd container::image_versions scanner drift missing_secret # ── Stage 4: live node smoke ───────────────────────────────────────── if [[ $LIVE -eq 1 ]]; then