From 0ed892a4120ab9e277b5ed8dc669433e716deb6b Mon Sep 17 00:00:00 2001 From: archipelago Date: Sun, 14 Jun 2026 03:12:56 -0400 Subject: [PATCH] fix: wallet receive reliability, bitcoin install self-heal, ElectrumX app tile MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes three Bitcoin/wallet failures observed across the fleet on v1.7.90-alpha (all nodes were already on the latest build — these were live bugs, not stale builds), plus the missing ElectrumX tile, and adds automated coverage so each can't regress silently. Receive address (".116 receive fails", ".228 false 'wallet is locked'"): - LND publishes its REST API on a host port that can drift from the manifest (a container created when the mapping was 8080 kept publishing 8080 after the manifest moved to 18080). The in-process client connects to the manifest port, gets connection-refused, and wallet init fails forever while the container looks "Up". Add published-port drift detection to the reconciler (container_ports_drifted / host_port_bindings_drifted) that recreates a drifted backend even for restart-sensitive apps — a drifted container is already broken, so leaving it "untouched" only perpetuates the failure. - Receive errors now carry a stable [CODE] token (REST_UNREACHABLE, WALLET_LOCKED, WALLET_UNINITIALIZED, SYNCING) and always start with "Bitcoin address" so they survive the RPC error sanitizer instead of collapsing to the generic "Operation failed". The UI maps the code instead of guessing wallet state from substrings — so an unreachable REST endpoint is no longer mislabelled "locked". Bitcoin install (".198 bitcoin gone / reinstall just stops"): - bitcoin-knots requires the secret bitcoin-rpc-txrelay-rpcauth, which was only generated by the tx-relay flow. Nodes that never used tx-relay lacked it, so secret resolution hard-failed and the whole Bitcoin stack cascaded. Generate it idempotently before bitcoin starts (ensure_app_secrets, reusing ensure_txrelay_credentials), and name the missing secret in the error so a genuine gap is actionable instead of a bare "IO error". ElectrumX app tile missing on every node with it installed: - The catalog generator dropped electrumx because the manifest had no interfaces.main block, so the tile had no launch URL and was hidden. Declare the companion UI port (50002) in the manifest, regenerate the catalog, and let an app with a known launch URL stay launchable while its backend is still "starting" (ElectrumX indexes for 10m+). Test harness: - New lifecycle bats suites: bitcoin-receive, port-drift, secret-completeness (validated live; port-drift catches the real .116 drift). - Rust unit tests for drift detection, the receive reason-code classifier, and the named-missing-secret error; vitest for the UI code mapping. - create-release.sh now runs tests/release/run.sh and aborts the release on failure — previously it ran no tests at all. Co-Authored-By: Claude Opus 4.8 (1M context) --- apps/electrumx/manifest.yml | 14 ++ core/archipelago/src/api/mod.rs | 2 +- core/archipelago/src/api/rpc/bitcoin_relay.rs | 10 +- core/archipelago/src/api/rpc/lnd/wallet.rs | 156 ++++++++++++-- .../src/container/prod_orchestrator.rs | 199 +++++++++++++++++- .../utils/__tests__/bitcoinReceive.test.ts | 36 ++++ neode-ui/src/utils/bitcoinReceive.ts | 24 +++ .../appSession/generatedAppSessionConfig.ts | 1 + neode-ui/src/views/apps/appsConfig.ts | 9 +- scripts/create-release.sh | 23 ++ tests/lifecycle/TESTING.md | 19 ++ tests/lifecycle/bats/bitcoin-receive.bats | 104 +++++++++ tests/lifecycle/bats/port-drift.bats | 78 +++++++ tests/lifecycle/bats/secret-completeness.bats | 73 +++++++ tests/release/run.sh | 7 +- 15 files changed, 733 insertions(+), 22 deletions(-) create mode 100644 tests/lifecycle/bats/bitcoin-receive.bats create mode 100644 tests/lifecycle/bats/port-drift.bats create mode 100644 tests/lifecycle/bats/secret-completeness.bats diff --git a/apps/electrumx/manifest.yml b/apps/electrumx/manifest.yml index 14ce4c46..5b985b7b 100644 --- a/apps/electrumx/manifest.yml +++ b/apps/electrumx/manifest.yml @@ -51,6 +51,20 @@ app: - CACHE_MB=1024 - MAX_SEND=10000000 + # The ElectrumX dashboard tile is served by the host-networked companion UI + # (archy-electrs-ui) on port 50002, NOT by this container. Declaring it here + # lets the catalog generator emit electrumx -> 50002 into GENERATED_APP_PORTS + # so the tile resolves a launch URL without relying on the hand-maintained + # override in appSessionConfig.ts (which the generator can clobber). The + # backend only validates this block — it does not proxy/health-check it. + interfaces: + main: + name: Web UI + description: ElectrumX server status and connection details + type: ui + port: 50002 + protocol: http + health_check: type: tcp endpoint: localhost:50001 diff --git a/core/archipelago/src/api/mod.rs b/core/archipelago/src/api/mod.rs index 47d8c90c..47740a4e 100644 --- a/core/archipelago/src/api/mod.rs +++ b/core/archipelago/src/api/mod.rs @@ -1,4 +1,4 @@ mod handler; -mod rpc; +pub(crate) mod rpc; pub use handler::ApiHandler; diff --git a/core/archipelago/src/api/rpc/bitcoin_relay.rs b/core/archipelago/src/api/rpc/bitcoin_relay.rs index 57a222e0..b7eaf421 100644 --- a/core/archipelago/src/api/rpc/bitcoin_relay.rs +++ b/core/archipelago/src/api/rpc/bitcoin_relay.rs @@ -107,7 +107,7 @@ struct TrustedRelayPeer { } #[derive(Debug, Clone)] -struct TxRelayCredentials { +pub(crate) struct TxRelayCredentials { username: String, password: String, } @@ -648,7 +648,13 @@ async fn txrelay_credentials_available(data_dir: &Path) -> bool { && fs::metadata(&client_env_path).await.is_ok() } -async fn ensure_txrelay_credentials(data_dir: &Path) -> Result { +/// Idempotently ensure the tx-relay credential trio exists in the secrets dir: +/// the random password, its derived `rpcauth` line, and the client env file. +/// Bitcoin backend manifests reference `bitcoin-rpc-txrelay-rpcauth` as a +/// required `secret_env`, so this must run before bitcoind starts — otherwise +/// secret resolution hard-fails and the whole Bitcoin stack cascades (the .198 +/// failure). Safe to call repeatedly; it only writes what's missing or stale. +pub(crate) async fn ensure_txrelay_credentials(data_dir: &Path) -> Result { let (password_path, rpcauth_path, client_env_path) = txrelay_secret_paths(data_dir); let password = match read_trimmed(&password_path).await { Some(value) => value, diff --git a/core/archipelago/src/api/rpc/lnd/wallet.rs b/core/archipelago/src/api/rpc/lnd/wallet.rs index 84193973..2501ddaa 100644 --- a/core/archipelago/src/api/rpc/lnd/wallet.rs +++ b/core/archipelago/src/api/rpc/lnd/wallet.rs @@ -9,9 +9,15 @@ use super::LND_REST_BASE_URL; impl RpcHandler { /// Generate a new on-chain Bitcoin address. pub(in crate::api::rpc) async fn handle_lnd_newaddress(&self) -> Result { - let (client, macaroon_hex) = self.lnd_client().await?; + let (client, macaroon_hex) = self.lnd_client().await.map_err(|e| { + tracing::warn!(error = %format!("{e:#}"), "LND newaddress: client/macaroon unavailable"); + receive_error( + RECEIVE_WALLET_UNINITIALIZED, + "The Lightning wallet isn't set up on this node yet. Finish wallet setup, then try again.", + ) + })?; - let resp = client + let resp = match client .get(format!("{LND_REST_BASE_URL}/v1/newaddress")) // LND's REST gateway parses `type` as the AddressType enum by its // proto name (or integer), NOT the lncli aliases. "p2wkh" is not a @@ -21,13 +27,26 @@ impl RpcHandler { .header("Grpc-Metadata-macaroon", &macaroon_hex) .send() .await - .context("LND REST connection failed")?; + { + Ok(resp) => resp, + Err(e) => { + // The .116 case: LND container is up but its REST endpoint isn't + // reachable on the expected port (e.g. published-port drift), so + // the connection is refused. This is NOT a locked wallet — emit a + // distinct code so the UI stops mislabelling it. + tracing::warn!(error = %format!("{e:#}"), "LND newaddress: REST connection failed"); + return Err(receive_error( + RECEIVE_REST_UNREACHABLE, + "The Lightning wallet service isn't reachable yet. It may be starting up or recovering — please try again in a moment.", + )); + } + }; let status = resp.status(); let raw_body = resp .text() .await - .context("LND address response could not be read")?; + .context("Bitcoin address response could not be read")?; let body: serde_json::Value = serde_json::from_str(&raw_body).unwrap_or_else(|_| { serde_json::json!({ "raw": raw_body, @@ -36,11 +55,9 @@ impl RpcHandler { if !status.is_success() { let message = lnd_error_message(&body); - anyhow::bail!( - "Bitcoin address generation failed ({}): {}", - status, - message - ); + let code = classify_lnd_address_error(&message); + tracing::warn!(%status, lnd_message = %message, code, "LND newaddress returned an error"); + return Err(receive_error(code, default_receive_detail(code))); } if let Some(error) = body @@ -48,14 +65,21 @@ impl RpcHandler { .or_else(|| body.get("message")) .and_then(|v| v.as_str()) { - anyhow::bail!("Bitcoin address generation failed: {}", error); + let code = classify_lnd_address_error(error); + tracing::warn!(lnd_message = %error, code, "LND newaddress returned an error body"); + return Err(receive_error(code, default_receive_detail(code))); } let address = body .get("address") .and_then(|v| v.as_str()) .filter(|addr| !addr.trim().is_empty()) - .ok_or_else(|| anyhow::anyhow!("Bitcoin address generation failed: LND did not return a Bitcoin address. The wallet may still be locked, uninitialized, or waiting for Bitcoin to sync."))? + .ok_or_else(|| { + receive_error( + RECEIVE_WALLET_UNINITIALIZED, + "The wallet didn't return an address yet. It may still be unlocking or waiting for Bitcoin to sync — please try again shortly.", + ) + })? .to_string(); Ok(serde_json::json!({ "address": address })) @@ -583,9 +607,75 @@ fn lnd_error_message(body: &serde_json::Value) -> String { .to_string() } +// Stable, machine-readable reason codes for receive-address failures. They are +// embedded in the error message as a `[CODE]` token so the frontend +// (neode-ui/src/utils/bitcoinReceive.ts) can show an accurate explanation +// instead of guessing wallet state by substring-matching — which is what made +// .228 report "wallet is locked" when LND's REST was merely unreachable. +// +// Every receive error string starts with "Bitcoin address" so it survives the +// RPC error sanitizer (api/rpc/middleware.rs) unchanged rather than being +// flattened to the generic "Operation failed" message (the .116 symptom). +pub(crate) const RECEIVE_REST_UNREACHABLE: &str = "LND_REST_UNREACHABLE"; +pub(crate) const RECEIVE_WALLET_LOCKED: &str = "LND_WALLET_LOCKED"; +pub(crate) const RECEIVE_WALLET_UNINITIALIZED: &str = "LND_WALLET_UNINITIALIZED"; +pub(crate) const RECEIVE_SYNCING: &str = "LND_SYNCING"; +pub(crate) const RECEIVE_LND_ERROR: &str = "LND_ERROR"; + +/// Build a receive-address error carrying a reason code the UI can map. +fn receive_error(code: &str, detail: &str) -> anyhow::Error { + anyhow::anyhow!("Bitcoin address unavailable [{code}]: {detail}") +} + +/// A sensible default human message per code (used for non-UI callers and logs; +/// the frontend renders its own copy from the code). +fn default_receive_detail(code: &str) -> &'static str { + match code { + RECEIVE_REST_UNREACHABLE => { + "The Lightning wallet service isn't reachable yet. It may be starting up or recovering — please try again in a moment." + } + RECEIVE_WALLET_LOCKED => { + "The Lightning wallet is locked. Unlock it (or finish wallet setup), then try again." + } + RECEIVE_WALLET_UNINITIALIZED => { + "The Lightning wallet isn't set up yet. Finish wallet setup, then try again." + } + RECEIVE_SYNCING => { + "The wallet is still syncing with the Bitcoin network. Please try again once it has caught up." + } + _ => "Couldn't generate a Bitcoin address right now. Please try again shortly.", + } +} + +/// Classify a non-2xx LND error body/message into a reason code. The wording of +/// LND's REST errors is stable enough to bucket: a locked wallet, an +/// uninitialized wallet, a syncing chain, or some other failure. +fn classify_lnd_address_error(message: &str) -> &'static str { + let m = message.to_lowercase(); + if m.contains("locked") || m.contains("unlock") { + RECEIVE_WALLET_LOCKED + } else if m.contains("synchroniz") + || m.contains("syncing") + || m.contains("not yet ready") + || m.contains("in the process of starting") + { + RECEIVE_SYNCING + } else if m.contains("wallet not found") + || m.contains("not exist") + || m.contains("uninitialized") + || m.contains("not initialized") + || m.contains("create a wallet") + || m.contains("no wallet") + { + RECEIVE_WALLET_UNINITIALIZED + } else { + RECEIVE_LND_ERROR + } +} + #[cfg(test)] mod tests { - use super::lnd_error_message; + use super::*; #[test] fn lnd_error_message_prefers_message_field() { @@ -604,4 +694,46 @@ mod tests { "unknown LND error" ); } + + #[test] + fn classify_locked_wallet() { + assert_eq!( + classify_lnd_address_error("wallet locked, please unlock"), + RECEIVE_WALLET_LOCKED + ); + } + + #[test] + fn classify_uninitialized_wallet() { + assert_eq!( + classify_lnd_address_error("wallet not found, create a wallet first"), + RECEIVE_WALLET_UNINITIALIZED + ); + } + + #[test] + fn classify_syncing() { + assert_eq!( + classify_lnd_address_error("server is still in the process of starting"), + RECEIVE_SYNCING + ); + } + + #[test] + fn classify_unknown_is_generic_error() { + assert_eq!( + classify_lnd_address_error("some other failure"), + RECEIVE_LND_ERROR + ); + } + + #[test] + fn receive_error_starts_with_sanitizer_safe_prefix_and_embeds_code() { + // Must start with "Bitcoin address" (survives the RPC error sanitizer) + // and carry the [CODE] token the frontend parses. + let err = receive_error(RECEIVE_REST_UNREACHABLE, "unreachable"); + let s = format!("{err}"); + assert!(s.starts_with("Bitcoin address"), "got: {s}"); + assert!(s.contains("[LND_REST_UNREACHABLE]"), "got: {s}"); + } } diff --git a/core/archipelago/src/container/prod_orchestrator.rs b/core/archipelago/src/container/prod_orchestrator.rs index db4153a6..132289a9 100644 --- a/core/archipelago/src/container/prod_orchestrator.rs +++ b/core/archipelago/src/container/prod_orchestrator.rs @@ -297,6 +297,53 @@ async fn wait_for_manifest_host_ports(manifest: &AppManifest, timeout_secs: u64) Ok(()) } +/// Pure published-port drift check. `port_bindings_json` is the JSON that +/// `podman inspect --format '{{json .HostConfig.PortBindings}}'` emits, e.g. +/// `{"8080/tcp":[{"HostIp":"","HostPort":"18080"}]}`. Returns true only when a +/// manifest container-port is positively published to a *different* host port +/// than the manifest now asks for. Absence of a binding is deliberately NOT +/// treated as drift here — that case is handled by the host-port repair/restart +/// path and by host-networked apps that publish nothing — so we never trigger a +/// destructive recreate on a false positive. +fn host_port_bindings_drifted( + port_bindings_json: &str, + manifest_ports: &[archipelago_container::manifest::PortMapping], +) -> bool { + let parsed: serde_json::Value = match serde_json::from_str(port_bindings_json) { + Ok(v) => v, + Err(_) => return false, + }; + let Some(map) = parsed.as_object() else { + return false; + }; + for port in manifest_ports { + let proto = if port.protocol.is_empty() { + "tcp" + } else { + port.protocol.as_str() + }; + let key = format!("{}/{}", port.container, proto); + let Some(bindings) = map.get(&key).and_then(|b| b.as_array()) else { + // Container-port not currently published — not our case. + continue; + }; + if bindings.is_empty() { + continue; + } + let expected = port.host.to_string(); + let matches_expected = bindings.iter().any(|b| { + b.get("HostPort") + .and_then(|h| h.as_str()) + .map(|h| h == expected) + .unwrap_or(false) + }); + if !matches_expected { + return true; + } + } + false +} + async fn ensure_user_podman_socket() -> Result<()> { let socket_path = "/run/user/1000/podman/podman.sock"; if podman_socket_accepts_connections(socket_path).await { @@ -734,8 +781,19 @@ struct FileSecretsProvider { impl SecretsProvider for FileSecretsProvider { fn read(&self, name: &str) -> std::result::Result { let path = self.root.join(name); - let data = std::fs::read_to_string(&path).map_err(ManifestError::Io)?; - Ok(data.trim().to_string()) + match std::fs::read_to_string(&path) { + Ok(data) => Ok(data.trim().to_string()), + // Name the missing secret explicitly so the failure is actionable + // instead of a bare "IO error: No such file or directory" that hides + // which secret (and which app) is blocked. + Err(e) if e.kind() == std::io::ErrorKind::NotFound => { + Err(ManifestError::Invalid(format!( + "required secret '{name}' is missing (expected at {}); the app cannot start until it is generated", + path.display() + ))) + } + Err(e) => Err(ManifestError::Io(e)), + } } } @@ -1054,6 +1112,7 @@ impl ProdContainerOrchestrator { let lock = self.app_lock(&app_id).await; let _guard = lock.lock().await; + self.ensure_app_secrets(&app_id).await?; let mut resolved_manifest = lm.manifest.clone(); self.resolve_dynamic_env(&mut resolved_manifest)?; let name = compute_container_name(&lm.manifest); @@ -1094,6 +1153,22 @@ impl ProdContainerOrchestrator { self.run_post_start_hooks(&app_id).await?; return Ok(ReconcileAction::Started); } + // Published-port drift means the container exists but maps + // its ports to the wrong host ports (e.g. lnd REST stuck on + // host 8080 while the manifest/clients expect 18080, as seen + // on .116). The container is already non-functional, so + // recreate it even for restart-sensitive apps during boot — + // leaving it "untouched" would perpetuate the breakage. + if self + .container_ports_drifted(&name, &resolved_manifest) + .await + { + tracing::info!(app_id = %app_id, container = %name, "container published-port drift detected — recreating"); + let _ = self.runtime.stop_container(&name).await; + let _ = self.runtime.remove_container(&name).await; + self.install_fresh(lm).await?; + return Ok(ReconcileAction::Installed); + } if self.container_env_drifted(&name, &resolved_manifest).await { if mode == ReconcileMode::ExistingOnly && is_restart_sensitive_app(&app_id) @@ -1154,8 +1229,12 @@ impl ProdContainerOrchestrator { // reading it. A Rewritten outcome is fine here — we're // about to start from a stopped state anyway. self.prepare_for_start(&resolved_manifest).await?; - if self.container_env_drifted(&name, &resolved_manifest).await { - tracing::info!(app_id = %app_id, container = %name, "stopped container env drift detected — recreating"); + if self.container_env_drifted(&name, &resolved_manifest).await + || self + .container_ports_drifted(&name, &resolved_manifest) + .await + { + tracing::info!(app_id = %app_id, container = %name, "stopped container env/port drift detected — recreating"); let _ = self.runtime.remove_container(&name).await; self.install_fresh(lm).await?; return Ok(ReconcileAction::Installed); @@ -1297,6 +1376,7 @@ impl ProdContainerOrchestrator { /// Build-or-pull, create, start. Assumes the per-app mutex is already held. async fn install_fresh(&self, lm: &LoadedManifest) -> Result<()> { + self.ensure_app_secrets(&lm.manifest.app.id).await?; let mut resolved_manifest = lm.manifest.clone(); self.resolve_dynamic_env(&mut resolved_manifest)?; @@ -2300,6 +2380,25 @@ impl ProdContainerOrchestrator { Self::detect_disk_gb() } + /// Ensure app-specific secrets exist *before* env resolution. The Bitcoin + /// backends reference `bitcoin-rpc-txrelay-rpcauth` as a required + /// `secret_env`; it is normally created by the tx-relay flow, so nodes that + /// never used tx-relay lack it and `resolve_secret_env` hard-fails — taking + /// bitcoind (and everything that depends on it) down. Generating it here, + /// idempotently, lets reconcile/install self-heal that state (the .198 case) + /// instead of cascading. Must be called before every `resolve_dynamic_env`. + async fn ensure_app_secrets(&self, app_id: &str) -> Result<()> { + if cfg!(test) { + return Ok(()); + } + if matches!(app_id, "bitcoin-knots" | "bitcoin-core" | "bitcoin") { + crate::api::rpc::bitcoin_relay::ensure_txrelay_credentials(&self.data_dir) + .await + .context("ensuring bitcoin tx-relay credentials")?; + } + Ok(()) + } + fn resolve_dynamic_env(&self, manifest: &mut AppManifest) -> Result<()> { let facts = self.detect_host_facts(); let mut env = manifest.app.environment.clone(); @@ -2443,6 +2542,42 @@ impl ProdContainerOrchestrator { false } + /// True when a *running* container publishes a manifest container-port to a + /// different host port than the manifest now asks for (published-port + /// drift). This catches the class of failure seen on .116, where `lnd` was + /// created mapping host 8080 -> container 8080 but the current manifest maps + /// host 18080 -> container 8080, so every in-process REST client (which + /// connects to the manifest port) gets connection-refused forever while the + /// container looks "Up". `container_env_drifted` never inspects ports, and + /// `wait_for_manifest_host_ports` only restarts the stale container (which + /// republishes the wrong mapping), so without this check the drift is + /// self-perpetuating. + async fn container_ports_drifted(&self, name: &str, manifest: &AppManifest) -> bool { + if cfg!(test) { + return false; + } + if manifest.app.ports.is_empty() { + return false; + } + let inspect = tokio::process::Command::new("podman") + .args([ + "inspect", + name, + "--format", + "{{json .HostConfig.PortBindings}}", + ]) + .output() + .await; + let Ok(output) = inspect else { + return false; + }; + if !output.status.success() { + return false; + } + let bindings = String::from_utf8_lossy(&output.stdout); + host_port_bindings_drifted(&bindings, &manifest.app.ports) + } + async fn apply_data_uid(&self, manifest: &AppManifest) -> Result<()> { let Some(uid_gid) = manifest.app.container.data_uid.as_ref() else { return Ok(()); @@ -2752,6 +2887,7 @@ impl ContainerOrchestrator for ProdContainerOrchestrator { let lm = self.loaded(app_id).await?; let lock = self.app_lock(app_id).await; let _guard = lock.lock().await; + self.ensure_app_secrets(app_id).await?; let name = compute_container_name(&lm.manifest); let mut resolved_manifest = lm.manifest.clone(); self.resolve_dynamic_env(&mut resolved_manifest)?; @@ -2913,6 +3049,61 @@ mod tests { use async_trait::async_trait; use std::sync::Mutex as StdMutex; + fn port(host: u16, container: u16) -> archipelago_container::manifest::PortMapping { + archipelago_container::manifest::PortMapping { + host, + container, + protocol: "tcp".to_string(), + } + } + + #[test] + fn port_drift_detected_when_host_port_differs() { + // The .116 case: container publishes container-port 8080 on host 8080, + // but the manifest now asks for host 18080. + let bindings = r#"{"8080/tcp":[{"HostIp":"","HostPort":"8080"}]}"#; + assert!(host_port_bindings_drifted(bindings, &[port(18080, 8080)])); + } + + #[test] + fn no_drift_when_host_port_matches() { + let bindings = r#"{"8080/tcp":[{"HostIp":"0.0.0.0","HostPort":"18080"}]}"#; + assert!(!host_port_bindings_drifted(bindings, &[port(18080, 8080)])); + } + + #[test] + fn no_drift_when_binding_absent() { + // Absence is handled elsewhere (host-port repair / host-networked apps); + // never treat it as drift to avoid a destructive recreate on a false + // positive. + assert!(!host_port_bindings_drifted("{}", &[port(18080, 8080)])); + assert!(!host_port_bindings_drifted("null", &[port(18080, 8080)])); + } + + #[test] + fn no_drift_on_unparseable_bindings() { + assert!(!host_port_bindings_drifted( + "not json", + &[port(18080, 8080)] + )); + } + + #[test] + fn missing_secret_error_names_the_secret() { + use archipelago_container::manifest::SecretsProvider; + let provider = FileSecretsProvider { + root: PathBuf::from("/nonexistent-secrets-dir-xyz"), + }; + let err = provider + .read("bitcoin-rpc-txrelay-rpcauth") + .expect_err("missing secret must error"); + let msg = err.to_string(); + assert!( + msg.contains("bitcoin-rpc-txrelay-rpcauth"), + "error should name the missing secret, got: {msg}" + ); + } + /// Instrumented in-memory runtime. Every call is recorded so tests can assert /// the exact sequence of side effects. #[derive(Default)] diff --git a/neode-ui/src/utils/__tests__/bitcoinReceive.test.ts b/neode-ui/src/utils/__tests__/bitcoinReceive.test.ts index dbc74c16..393eac8d 100644 --- a/neode-ui/src/utils/__tests__/bitcoinReceive.test.ts +++ b/neode-ui/src/utils/__tests__/bitcoinReceive.test.ts @@ -21,4 +21,40 @@ describe('explainReceiveAddressFailure', () => { it('keeps bitcoin address generation failures visible', () => { expect(explainReceiveAddressFailure(new Error('Bitcoin address generation failed: LND is not ready'))).toContain('Bitcoin address generation failed') }) + + describe('structured reason codes (backend [CODE] token)', () => { + it('maps an unreachable REST endpoint to a starting/recovering message — NOT locked (.228 regression)', () => { + const msg = explainReceiveAddressFailure( + new Error('Bitcoin address unavailable [LND_REST_UNREACHABLE]: service not reachable'), + ) + expect(msg).toContain('starting up or recovering') + expect(msg.toLowerCase()).not.toContain('locked') + }) + + it('maps a genuinely locked wallet to the locked message', () => { + expect( + explainReceiveAddressFailure(new Error('Bitcoin address unavailable [LND_WALLET_LOCKED]: locked')), + ).toContain('locked') + }) + + it('maps an uninitialized wallet to the setup message', () => { + expect( + explainReceiveAddressFailure(new Error('Bitcoin address unavailable [LND_WALLET_UNINITIALIZED]: x')), + ).toContain('has not been set up') + }) + + it('maps a syncing wallet to the syncing message', () => { + expect( + explainReceiveAddressFailure(new Error('Bitcoin address unavailable [LND_SYNCING]: x')), + ).toContain('syncing') + }) + + it('prefers the code over substrings even when the detail text is misleading', () => { + // Detail mentions neither "locked" nor "unlock"; code must still win. + const msg = explainReceiveAddressFailure( + new Error('Bitcoin address unavailable [LND_REST_UNREACHABLE]: wallet service down'), + ) + expect(msg).toContain('starting up or recovering') + }) + }) }) diff --git a/neode-ui/src/utils/bitcoinReceive.ts b/neode-ui/src/utils/bitcoinReceive.ts index deb87ee5..71a133ef 100644 --- a/neode-ui/src/utils/bitcoinReceive.ts +++ b/neode-ui/src/utils/bitcoinReceive.ts @@ -1,5 +1,29 @@ +// Machine-readable reason codes the backend embeds as a `[CODE]` token in +// receive-address errors (see core .../api/rpc/lnd/wallet.rs). Mapping the code +// directly is precise — unlike the substring heuristics below, it cannot +// mislabel an unreachable-REST failure as "wallet is locked" (the .228 bug). +const RECEIVE_CODE_MESSAGES: Record = { + LND_REST_UNREACHABLE: + 'Bitcoin address is not ready yet because the Lightning wallet service is still starting up or recovering. Please try again in a moment.', + LND_WALLET_LOCKED: + 'Bitcoin address is not ready because the Lightning wallet is locked. Unlock or initialize LND first.', + LND_WALLET_UNINITIALIZED: + 'Bitcoin address is not ready because the Lightning wallet has not been set up yet. Finish wallet setup, then try again.', + LND_SYNCING: + 'Bitcoin address is not ready while the wallet is still syncing with the Bitcoin network. Try again once sync has progressed.', + LND_ERROR: + 'Bitcoin address is not ready yet. Check that the Lightning app is healthy, then try again.', +} + export function explainReceiveAddressFailure(error: unknown): string { const message = error instanceof Error ? error.message : String(error || '') + + // Prefer the structured reason code when present. + const codeMatch = message.match(/\[([A-Z_]+)\]/) + if (codeMatch && RECEIVE_CODE_MESSAGES[codeMatch[1]]) { + return RECEIVE_CODE_MESSAGES[codeMatch[1]] + } + const lower = message.toLowerCase() if (lower.includes('wallet') && (lower.includes('locked') || lower.includes('unlock'))) { diff --git a/neode-ui/src/views/appSession/generatedAppSessionConfig.ts b/neode-ui/src/views/appSession/generatedAppSessionConfig.ts index de37486b..fc59c34b 100644 --- a/neode-ui/src/views/appSession/generatedAppSessionConfig.ts +++ b/neode-ui/src/views/appSession/generatedAppSessionConfig.ts @@ -7,6 +7,7 @@ export const GENERATED_APP_PORTS: Record = { "botfights": 9100, "btcpay-server": 23000, "did-wallet": 8083, + "electrumx": 50002, "fedimint": 8175, "filebrowser": 8083, "gitea": 3001, diff --git a/neode-ui/src/views/apps/appsConfig.ts b/neode-ui/src/views/apps/appsConfig.ts index 9d934721..8efc3f64 100644 --- a/neode-ui/src/views/apps/appsConfig.ts +++ b/neode-ui/src/views/apps/appsConfig.ts @@ -189,7 +189,14 @@ export function canLaunch(pkg: PackageDataEntry): boolean { if ((pkg.manifest.id === 'fedimint' || pkg.manifest.id === 'fedimintd') && hasUI) { return pkg.state === PackageState.Running || pkg.state === PackageState.Starting } - return !!hasUI && pkg.state === 'running' && pkg.health !== 'starting' && pkg.health !== 'unhealthy' + // A static launch URL (e.g. a host-networked companion UI like + // archy-electrs-ui) serves independently of the backend's own sync state, so + // the tile stays launchable while the backend is still 'starting' (ElectrumX + // indexes for 10m+ on first run). A genuinely 'unhealthy' backend still + // blocks. Apps that rely on a runtime interface-address keep the strict gate. + const blockedByHealth = + pkg.health === 'unhealthy' || (pkg.health === 'starting' && !hasKnownLaunchUrl) + return !!hasUI && pkg.state === 'running' && !blockedByHealth } export function launchBlockedReason(id: string, pkg?: PackageDataEntry | null): string { diff --git a/scripts/create-release.sh b/scripts/create-release.sh index 6c90f02d..da012f39 100755 --- a/scripts/create-release.sh +++ b/scripts/create-release.sh @@ -75,6 +75,28 @@ if ! git -C "$PROJECT_ROOT" diff --quiet HEAD; then exit 1 fi +# ── Pre-flight test gate ────────────────────────────────────────────── +# A release must not ship if the static/frontend/backend checks fail. This +# runs the release gate harness (cargo fmt/check, catalog drift, vitest, and +# the focused cargo suites — incl. the receive/port-drift/secret regressions). +# Skipped on --dry-run, or set SKIP_RELEASE_TESTS=1 to bypass in an emergency. +# The lifecycle bats harness (tests/lifecycle/run-20x.sh) still runs separately +# against live nodes — see tests/lifecycle/TESTING.md. +if ! $DRY_RUN; then + if [ "${SKIP_RELEASE_TESTS:-0}" = "1" ]; then + echo "WARNING: SKIP_RELEASE_TESTS=1 — bypassing the pre-flight test gate" + elif [ -x "$PROJECT_ROOT/tests/release/run.sh" ]; then + echo "[0/7] Running release gate (tests/release/run.sh)..." + if ! "$PROJECT_ROOT/tests/release/run.sh"; then + echo "Error: release gate failed — aborting release. Fix the failing" + echo " stage, or re-run with SKIP_RELEASE_TESTS=1 to override." + exit 1 + fi + else + echo "WARNING: tests/release/run.sh not found/executable — skipping test gate" + fi +fi + # Check tag doesn't already exist if git -C "$PROJECT_ROOT" tag -l "v$VERSION" | grep -q "v$VERSION"; then echo "Error: Tag v$VERSION already exists" @@ -94,6 +116,7 @@ echo "" if $DRY_RUN; then echo "[DRY RUN] Would perform the following:" + echo " 0. Run pre-flight test gate (tests/release/run.sh) — aborts on failure" echo " 1. Update core/archipelago/Cargo.toml version to $VERSION" echo " 2. Update neode-ui/package.json version to $VERSION" echo " 3. Build backend (cargo build --release -p archipelago)" diff --git a/tests/lifecycle/TESTING.md b/tests/lifecycle/TESTING.md index 6a668acd..dcb28ea8 100644 --- a/tests/lifecycle/TESTING.md +++ b/tests/lifecycle/TESTING.md @@ -58,10 +58,29 @@ v1.7.52 tags. | L1 RPC | 70 | bitcoin-knots, lnd, electrumx, btcpay, mempool, fedimint, required-stack, package-update-smoke | ● for the 6 core apps | | L2 UI | 9 | ui-coverage | ● for dashboard + 7 proxy paths + bitcoin-ui:8334 | | L3 lifecycle survival | 14 | companion-survives-archipelago-restart, backend-survives-archipelago-restart, required-stack-destructive, use-quadlet-backends-install | ◐ companions ● ; backends ◐ regression-gate (will fail until Phase 3 Quadlet ships); quadlet post-condition gate ✅ skip-clean today, hard gate when flag flipped | +| L1 wallet-receive / drift / secrets | 5 | bitcoin-receive, port-drift, secret-completeness | ● guards the v1.7.9x wallet fleet failures | | L4 browser journey | 0 | none | ○ not started | | L5 chaos | 0 | none | ○ not started | | L6 performance | 0 | none | ○ not started | +### Wallet / Bitcoin fleet-failure regression suites (added after v1.7.90-alpha) + +Three production failures shipped on v1.7.90-alpha despite the existing harness, +because nothing exercised the receive path, port-mapping drift, or secret +completeness on a live node. New suites close those gaps (all run on the archy +host, read-only, so they join `run.sh`/`run-20x.sh` automatically): + +| Suite | Failure it guards | Asserts | +|---|---|---| +| `bitcoin-receive.bats` | .116 ("Operation failed" on receive) and .228 (false "wallet is locked") | LND REST reachable on the **manifest** host port; `lnd.newaddress` returns a `bc1…` address on a running node; receive errors are specific, never the generic catch-all | +| `port-drift.bats` | .116 (lnd REST stuck on host 8080 vs manifest 18080) | every installed backend's live `podman inspect` PortBindings match its manifest `ports:` (the external mirror of the orchestrator's `host_port_bindings_drifted`) | +| `secret-completeness.bats` | .198 (bitcoin-knots needs `bitcoin-rpc-txrelay-rpcauth`, never generated → stack cascade) | every `secret_file` referenced by an installed backend manifest exists in the secrets dir | + +Backed by L0 unit tests (`cargo test … drift missing_secret lnd`) and a vitest +for the frontend reason-code mapping (`bitcoinReceive.test.ts`). The release +gate `scripts/create-release.sh` now runs `tests/release/run.sh` (which includes +these) and **aborts the release on failure** — previously it ran no tests at all. + ## Run commands ```bash diff --git a/tests/lifecycle/bats/bitcoin-receive.bats b/tests/lifecycle/bats/bitcoin-receive.bats new file mode 100644 index 00000000..278cbb73 --- /dev/null +++ b/tests/lifecycle/bats/bitcoin-receive.bats @@ -0,0 +1,104 @@ +#!/usr/bin/env bats +# tests/lifecycle/bats/bitcoin-receive.bats +# +# Regression coverage for the Bitcoin "Receive" flow. Receive addresses come +# from LND's hot wallet via the `lnd.newaddress` RPC, so this exercises the +# exact path that broke on the fleet: +# - .116: LND REST published on the wrong host port (8080 vs the manifest's +# 18080) -> connection refused -> receive failed with the generic +# "Operation failed. Check server logs." message. +# - .228: the same family surfaced to the UI as a *false* "wallet is locked". +# +# These tests run on the archy host (they shell into podman / curl localhost). +# +# Tiers: read-only only — generating a receive address is non-destructive. + +load '../lib/rpc.bash' + +setup_file() { + : "${ARCHY_PASSWORD:?Set ARCHY_PASSWORD env var to the UI password}" + export ARCHY_FORCE_LOGIN=1 + rpc_login + unset ARCHY_FORCE_LOGIN +} + +teardown_file() { + rpc_logout_local +} + +# Resolve the LND REST host port from the manifest (single source of truth) so +# this test follows the manifest rather than hard-coding 18080. +_lnd_rest_host_port() { + local mf + for mf in \ + "${ARCHIPELAGO_APPS_DIR:-/opt/archipelago/apps}/lnd/manifest.yml" \ + "${ARCHIPELAGO_APPS_DIR:-/opt/archipelago/apps}/lnd/manifest.yaml" \ + "$BATS_TEST_DIRNAME/../../../apps/lnd/manifest.yml"; do + [[ -r "$mf" ]] || continue + # The REST mapping is the `- host: ` whose following `container:` is 8080. + awk ' + /- host:/ { host=$3 } + /container:/ { if ($2 == 8080 && host != "") { print host; exit } } + ' "$mf" + return 0 + done +} + +_lnd_running() { + rpc_result container-list 2>/dev/null \ + | jq -e '.[] | select(.name == "lnd" and .state == "running")' >/dev/null 2>&1 +} + +# ──────────────────────────────────────────────────────────────────── +# Read-only tier +# ──────────────────────────────────────────────────────────────────── + +@test "LND REST is reachable on the manifest host port (catches port drift)" { + _lnd_running || skip "lnd not running" + local port + port=$(_lnd_rest_host_port) + [[ -n "$port" ]] || skip "could not resolve LND REST host port from manifest" + + # A TCP connect is enough: drift (container published on a different host + # port) shows up as connection-refused here, exactly as on .116. + run curl -sk -o /dev/null --max-time 8 "https://127.0.0.1:${port}/v1/getinfo" + if [ "$status" -ne 0 ]; then + echo "LND REST not reachable on host port ${port} (curl exit $status) — likely published-port drift" >&2 + return 1 + fi +} + +@test "lnd.newaddress returns a bech32 address when lnd is running" { + _lnd_running || skip "lnd not running" + + run rpc_call lnd.newaddress + [ "$status" -eq 0 ] + + local err addr + err=$(echo "$output" | jq -r '.error.message // .error // empty') + addr=$(echo "$output" | jq -r '.result.address // empty') + + # The whole point of the fix: a running lnd must hand back a real address. + if [[ -n "$err" ]]; then + echo "lnd.newaddress errored on a running node: $err" >&2 + return 1 + fi + if [[ "$addr" != bc1* ]]; then + echo "expected a bech32 (bc1…) address, got: '$addr'" >&2 + return 1 + fi +} + +@test "receive errors are specific, never the generic catch-all" { + # Even when receive legitimately can't produce an address, the message must be + # actionable (start with 'Bitcoin address' and/or carry a [CODE] token) — the + # generic 'Operation failed' is what hid the real cause on .116. + run rpc_call lnd.newaddress + [ "$status" -eq 0 ] + local err + err=$(echo "$output" | jq -r '.error.message // .error // empty') + if [[ "$err" == "Operation failed. Check server logs for details." ]]; then + echo "receive returned the generic catch-all instead of a specific reason" >&2 + return 1 + fi +} diff --git a/tests/lifecycle/bats/port-drift.bats b/tests/lifecycle/bats/port-drift.bats new file mode 100644 index 00000000..73bf2362 --- /dev/null +++ b/tests/lifecycle/bats/port-drift.bats @@ -0,0 +1,78 @@ +#!/usr/bin/env bats +# tests/lifecycle/bats/port-drift.bats +# +# Regression guard for the .116 failure class: a backend container that is +# "Up" but publishes its ports to the WRONG host ports because the manifest +# changed after the container was created (e.g. lnd REST stuck on host 8080 +# while the manifest — and every in-process client — expects 18080). +# +# This mirrors the orchestrator's `host_port_bindings_drifted` check, but from +# the outside: it compares the live `podman inspect` PortBindings against the +# manifest `ports:` for each installed backend. Runs on the archy host. +# +# Tiers: read-only. + +_apps_dir() { + local d + for d in "${ARCHIPELAGO_APPS_DIR:-}" /opt/archipelago/apps \ + "$BATS_TEST_DIRNAME/../../../apps"; do + [[ -n "$d" && -d "$d" ]] && { echo "$d"; return 0; } + done + return 1 +} + +_manifest_for() { + local app="$1" dir + dir=$(_apps_dir) || return 1 + local mf + for mf in "$dir/$app/manifest.yml" "$dir/$app/manifest.yaml"; do + [[ -r "$mf" ]] && { echo "$mf"; return 0; } + done + return 1 +} + +# Emit "host container" pairs from a manifest's ports: block. +_manifest_ports() { + awk ' + /^[[:space:]]*ports:/ { inports=1; next } + inports && /^[[:space:]]*[a-z_]+:[[:space:]]*$/ && !/protocol:|host:|container:/ { inports=0 } + inports && /- host:/ { host=$3 } + inports && /container:/ { print host, $2 } + ' "$1" +} + +# For a given container + (host,container) port, emit a "DRIFT: …" line on +# mismatch (and nothing otherwise). Stays silent for unpublished / host-net +# ports — those are handled elsewhere and must never be treated as drift. +_drift_line() { + local cname="$1" want_host="$2" cport="$3" + local bindings actual + bindings=$(podman inspect "$cname" --format '{{json .HostConfig.PortBindings}}' 2>/dev/null) || return 0 + actual=$(echo "$bindings" | jq -r --arg k "${cport}/tcp" '.[$k][]?.HostPort // empty' 2>/dev/null) + [[ -n "$actual" ]] || return 0 + echo "$actual" | grep -qx "$want_host" && return 0 + echo "DRIFT: $cname container-port $cport published on host [$actual] but manifest wants $want_host" +} + +@test "backend containers publish ports that match their manifest" { + command -v podman >/dev/null 2>&1 || skip "podman not available" + local checked=0 violations="" app cname mf line + # container-name : manifest-app-id + for pair in "lnd:lnd" "bitcoin-knots:bitcoin-knots" "electrumx:electrumx"; do + cname="${pair%%:*}"; app="${pair##*:}" + podman container exists "$cname" 2>/dev/null || continue + mf=$(_manifest_for "$app") || continue + while read -r host cport; do + [[ -n "$host" && -n "$cport" ]] || continue + checked=$((checked + 1)) + line=$(_drift_line "$cname" "$host" "$cport") + [[ -n "$line" ]] && violations+="${line}"$'\n' + done < <(_manifest_ports "$mf") + done + [[ "$checked" -gt 0 ]] || skip "no installed backend containers with published ports to check" + if [[ -n "$violations" ]]; then + echo "published-port drift detected:" >&2 + echo "$violations" >&2 + return 1 + fi +} diff --git a/tests/lifecycle/bats/secret-completeness.bats b/tests/lifecycle/bats/secret-completeness.bats new file mode 100644 index 00000000..cb0303ea --- /dev/null +++ b/tests/lifecycle/bats/secret-completeness.bats @@ -0,0 +1,73 @@ +#!/usr/bin/env bats +# tests/lifecycle/bats/secret-completeness.bats +# +# Regression guard for the .198 failure class: a manifest references a +# `secret_env.secret_file` that was never generated on the node, so secret +# resolution hard-fails and the container won't start — cascading the whole +# Bitcoin stack. (bitcoin-knots gained `bitcoin-rpc-txrelay-rpcauth`, which old +# nodes lacked, so bitcoind never came up and reinstall "just stopped".) +# +# For every installed backend, assert every secret_file it references exists in +# the secrets dir. Runs on the archy host. +# +# Tiers: read-only. + +SECRETS_DIR="${ARCHY_SECRETS_DIR:-/var/lib/archipelago/secrets}" + +_apps_dir() { + local d + for d in "${ARCHIPELAGO_APPS_DIR:-}" /opt/archipelago/apps \ + "$BATS_TEST_DIRNAME/../../../apps"; do + [[ -n "$d" && -d "$d" ]] && { echo "$d"; return 0; } + done + return 1 +} + +_manifest_for() { + local app="$1" dir mf + dir=$(_apps_dir) || return 1 + for mf in "$dir/$app/manifest.yml" "$dir/$app/manifest.yaml"; do + [[ -r "$mf" ]] && { echo "$mf"; return 0; } + done + return 1 +} + +_secret_files_in() { + # Emit each `secret_file:` value referenced by the manifest. + grep -E '^[[:space:]]*secret_file:' "$1" 2>/dev/null | awk '{print $2}' +} + +_secret_exists() { + local f="$SECRETS_DIR/$1" + [[ -e "$f" ]] && return 0 + sudo -n test -f "$f" 2>/dev/null +} + +@test "every installed backend's referenced secrets exist on disk" { + command -v podman >/dev/null 2>&1 || skip "podman not available" + [[ -d "$SECRETS_DIR" ]] || sudo -n test -d "$SECRETS_DIR" 2>/dev/null || skip "secrets dir not present" + + local checked=0 missing="" app cname mf sf + # container-name : manifest-app-id (the bitcoin stack that cascades) + for pair in \ + "bitcoin-knots:bitcoin-knots" "lnd:lnd" "electrumx:electrumx" \ + "mempool-api:mempool-api" "btcpay-server:btcpay-server" \ + "archy-nbxplorer:archy-nbxplorer" "fedimint:fedimint" \ + "fedimint-gateway:fedimint-gateway"; do + cname="${pair%%:*}"; app="${pair##*:}" + podman container exists "$cname" 2>/dev/null || continue + mf=$(_manifest_for "$app") || continue + while read -r sf; do + [[ -n "$sf" ]] || continue + checked=$((checked + 1)) + _secret_exists "$sf" || missing+="${app} -> ${sf}\n" + done < <(_secret_files_in "$mf") + done + + [[ "$checked" -gt 0 ]] || skip "no installed backends with secret references to check" + if [[ -n "$missing" ]]; then + echo "installed apps reference missing secrets:" >&2 + echo -e "$missing" >&2 + return 1 + fi +} diff --git a/tests/release/run.sh b/tests/release/run.sh index 66495b26..5716857b 100755 --- a/tests/release/run.sh +++ b/tests/release/run.sh @@ -85,14 +85,17 @@ fi stage "cargo-check" timeout 580 cargo check --manifest-path core/Cargo.toml -p archipelago # Focused suites for the subsystems this release train touched: # update:: — OTA download/apply/rollback/probe (v1.7.89 hardening) -# lnd — receive address + wallet readiness (v1.7.85–.89) +# lnd — receive address + wallet readiness (v1.7.85–.89), incl. the +# structured receive-error reason-code classifier # container::image_versions — image pinning / false-update detection # scanner — RAII in-flight guard (v1.7.84) +# drift — published-port drift detection (the .116 self-heal) +# missing_secret — secret-resolution names the missing file (the .198 fix) # 1500s: the non-incremental test-profile compile alone takes ~9 min on the # .116 ThinkPad; 580s expires mid-compile (exit 124) before a single test runs. stage "cargo-test-weekly" timeout 1500 env CARGO_INCREMENTAL=0 \ cargo test --manifest-path core/Cargo.toml -p archipelago -- \ - update:: lnd container::image_versions scanner + update:: lnd container::image_versions scanner drift missing_secret # ── Stage 4: live node smoke ───────────────────────────────────────── if [[ $LIVE -eq 1 ]]; then