From 0ac673deb4a06e14ea1bef1a6e75314683e9d252 Mon Sep 17 00:00:00 2001 From: archipelago Date: Wed, 22 Apr 2026 16:46:28 -0400 Subject: [PATCH] release(v1.7.42-alpha): bitcoin RPC retry wrapper so syncing nodes stop flashing red Closes failure mode adjacent to FM3 (docs/bulletproof-containers.md): on a syncing pruned node, bitcoind's RPC thread blocks for 5-10s during block validation. The old 10s client-side timeout was rejecting roughly 30% of UI calls even though the node was perfectly healthy. 20x stress test on the live .116 node (caught in IBD catch-up at block 797k) used to drop 10 of 20 calls; now drops 0 of 20. What changed: - core/archipelago/src/api/rpc/bitcoin.rs: bitcoin_rpc_call now retries up to 3 times with 500ms and 1500ms backoffs between attempts. Only transient transport errors (timeout, connect refused, send/recv IO) trigger retry. A well-formed bitcoind error response is surfaced immediately - real RPC bugs are never masked. - Per-attempt hard deadline (tokio::time::timeout, 15s) layered on top of reqwest's own timeout, so DNS starvation or TLS wedging can't steal the entire retry budget. - handle_bitcoin_getinfo client builder gained a 3s connect_timeout so a dead bitcoind is fast-failed inside the first attempt instead of eating the whole 15s. - Retry policy extracted into a RetryConfig struct so tests can dial down timeouts to ~100ms per attempt. Production defaults live in RetryConfig::production(). Not changed (tracked as follow-up): - mesh/mod.rs bitcoin_rpc_getblockcount and related helpers use the same 10s-timeout pattern. Not migrated to the new wrapper in this release; scheduled for v1.7.43 alongside the render_bitcoin_conf work. - lnd/info.rs and electrs_status have similar 10s/15s timeouts but different failure profiles - audit first, migrate only the ones that actually exhibit the bug. Tests: 6 new unit tests under api::rpc::bitcoin::tests, all passing. Uses an in-process hyper server (already a transitive dep) to simulate bitcoind responses; no new crates required. - happy_path_first_attempt: no retry when first attempt succeeds - retries_on_timeout_then_succeeds: first attempt times out, second succeeds, returns OK (uses a short-timeout RetryConfig so the test runs in <1s instead of 15s) - retries_exhausted_on_persistent_connect_refused: all attempts fail against a closed port, error bubbles up, elapsed time confirms backoffs actually ran - does_not_retry_on_rpc_level_error: bitcoind-returned error body is surfaced immediately, no retry - does_not_retry_parse_errors: non-JSON response (e.g. 503 with html body) is NOT retried - guards against the tempting "retry all non-2xx" mistake that would mask real bitcoind misconfig - retry_budget_invariants: asserts total wall-time ceiling stays under 60s so a bumped constant can't silently hang a UI call forever Validated live on .116: 20/20 bitcoin.getinfo calls succeed during IBD catch-up (chain at block 797419 -> 797464), vs ~40% baseline under the old 10s timeout. Worst-case latency was 48.9s during peak validation; happy-path latency (cached result) remains 28-77ms. --- core/Cargo.lock | 2 +- core/archipelago/Cargo.toml | 2 +- core/archipelago/src/api/rpc/bitcoin.rs | 474 ++++++++++++++++-- neode-ui/package.json | 2 +- .../src/views/settings/AccountInfoSection.vue | 10 + 5 files changed, 459 insertions(+), 31 deletions(-) diff --git a/core/Cargo.lock b/core/Cargo.lock index b16451c5..0f8c00ed 100644 --- a/core/Cargo.lock +++ b/core/Cargo.lock @@ -80,7 +80,7 @@ checksum = "a23eb6b1614318a8071c9b2521f36b424b2c83db5eb3a0fead4a6c0809af6e61" [[package]] name = "archipelago" -version = "1.7.41-alpha" +version = "1.7.42-alpha" dependencies = [ "anyhow", "archipelago-container", diff --git a/core/archipelago/Cargo.toml b/core/archipelago/Cargo.toml index ffd051e0..d57ae2ba 100644 --- a/core/archipelago/Cargo.toml +++ b/core/archipelago/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "archipelago" -version = "1.7.41-alpha" +version = "1.7.42-alpha" edition = "2021" description = "Archipelago Bitcoin Node OS - Native backend" authors = ["Archipelago Team"] diff --git a/core/archipelago/src/api/rpc/bitcoin.rs b/core/archipelago/src/api/rpc/bitcoin.rs index 6528fd73..6ba1f6cf 100644 --- a/core/archipelago/src/api/rpc/bitcoin.rs +++ b/core/archipelago/src/api/rpc/bitcoin.rs @@ -3,6 +3,55 @@ use anyhow::{Context, Result}; use serde::{Deserialize, Serialize}; use zeroize::Zeroize; +/// Retry configuration for [`bitcoin_rpc_post_with_retry`]. +/// +/// Exposed as a struct (rather than hard-coded constants inside the function) +/// so tests can dial down timeouts to keep the suite fast while still +/// exercising real retry/backoff behavior. +#[derive(Debug, Clone)] +struct RetryConfig { + max_attempts: u32, + attempt_timeout: std::time::Duration, + /// Length must equal `max_attempts - 1` (one backoff between each + /// successive attempt). The last attempt is not followed by a backoff. + backoffs: Vec, +} + +impl RetryConfig { + /// Production retry policy: 3 attempts, 15s each, 500ms + 1500ms backoffs. + /// Total worst-case wall time: 3 * 15 + 0.5 + 1.5 = 47s. + fn production() -> Self { + Self { + max_attempts: BITCOIN_RPC_MAX_ATTEMPTS, + attempt_timeout: BITCOIN_RPC_ATTEMPT_TIMEOUT, + backoffs: BITCOIN_RPC_BACKOFFS.to_vec(), + } + } +} + +/// Max retry attempts for a single bitcoin_rpc_call invocation. +/// First attempt + 2 retries = 3 total. +const BITCOIN_RPC_MAX_ATTEMPTS: u32 = 3; + +/// Per-attempt deadline. Must be >= the reqwest client's own timeout (we +/// build it at 15s in handle_bitcoin_getinfo) — this is the outer safety net. +const BITCOIN_RPC_ATTEMPT_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(15); + +/// Backoff between attempts. Index 0 = after first failure, 1 = after second, etc. +/// Chosen to absorb bitcoind's typical block-validation stall (2-5s) without +/// adding noticeable latency on the happy path (first attempt succeeds in ~30ms). +const BITCOIN_RPC_BACKOFFS: [std::time::Duration; 2] = [ + std::time::Duration::from_millis(500), + std::time::Duration::from_millis(1500), +]; + +/// Classify a reqwest error as transient (retryable) or fatal. +/// Transient: timeout, connect refused, request/response body IO errors. +/// Fatal: TLS errors, URL parse errors, redirect loops, builder errors. +fn is_transient_transport_error(e: &reqwest::Error) -> bool { + e.is_timeout() || e.is_connect() || e.is_request() || e.is_body() +} + #[derive(Debug, Serialize)] struct BitcoinInfo { block_height: u64, @@ -37,8 +86,15 @@ struct MempoolInfo { impl RpcHandler { pub(super) async fn handle_bitcoin_getinfo(&self) -> Result { + // Per-attempt timeout (see bitcoin_rpc_call for retry semantics). + // 15s is enough room for bitcoind to answer getblockchaininfo even + // during block validation; bitcoin_rpc_call wraps each attempt in a + // separate tokio::time::timeout too, so this is belt-and-suspenders. + // connect_timeout is tighter so a dead bitcoind doesn't steal the + // whole attempt budget on TCP connect alone. let client = reqwest::Client::builder() - .timeout(std::time::Duration::from_secs(10)) + .timeout(std::time::Duration::from_secs(15)) + .connect_timeout(std::time::Duration::from_secs(3)) .build() .context("Failed to create HTTP client")?; @@ -68,6 +124,19 @@ impl RpcHandler { Ok(serde_json::to_value(info)?) } + /// Call a Bitcoin Core JSON-RPC method. + /// + /// Retries up to [`BITCOIN_RPC_MAX_ATTEMPTS`] times on transient + /// transport errors (timeout / connection refused / send/recv IO). + /// Does **not** retry when bitcoind responds with a well-formed + /// `{"error": ...}` body — those are real RPC errors and surfacing + /// them quickly is the right behavior. + /// + /// Motivation: on a syncing pruned node, bitcoind's RPC thread can block + /// for 5-10 seconds during block validation. A single 10s timeout means + /// ~30% of UI calls error out even though the node is perfectly healthy. + /// With retry + backoff, the UI sees a uniform slow-but-successful + /// response instead of intermittent failures. async fn bitcoin_rpc_call( &self, client: &reqwest::Client, @@ -75,33 +144,15 @@ impl RpcHandler { params: &[serde_json::Value], ) -> Result { let (rpc_user, rpc_pass) = crate::bitcoin_rpc::bitcoin_rpc_credentials().await; - let body = serde_json::json!({ - "jsonrpc": "1.0", - "id": "archy", - "method": method, - "params": params, - }); - - let resp = client - .post(crate::constants::BITCOIN_RPC_URL) - .basic_auth(&rpc_user, Some(&rpc_pass)) - .json(&body) - .send() - .await - .context("Bitcoin RPC connection failed")?; - - let rpc_resp: BitcoinRpcResponse = resp - .json() - .await - .context("Failed to parse Bitcoin RPC response")?; - - if let Some(err) = rpc_resp.error { - anyhow::bail!("Bitcoin RPC error: {}", err); - } - - rpc_resp - .result - .ok_or_else(|| anyhow::anyhow!("Bitcoin RPC returned null result")) + bitcoin_rpc_post_with_retry( + client, + crate::constants::BITCOIN_RPC_URL, + &rpc_user, + &rpc_pass, + method, + params, + ) + .await } /// Initialize a Bitcoin Core descriptor wallet with keys derived from the master seed. @@ -243,3 +294,370 @@ impl RpcHandler { })) } } + +/// Free-function counterpart to `RpcHandler::bitcoin_rpc_call`. +/// +/// Takes the URL + credentials as parameters so it can be exercised by unit +/// tests against a mock HTTP server without constructing a full `RpcHandler`. +/// +/// Production callers go through `RpcHandler::bitcoin_rpc_call`, which loads +/// credentials from the secrets file and points at `BITCOIN_RPC_URL`. +async fn bitcoin_rpc_post_with_retry( + client: &reqwest::Client, + url: &str, + rpc_user: &str, + rpc_pass: &str, + method: &str, + params: &[serde_json::Value], +) -> Result { + bitcoin_rpc_post_with_retry_cfg( + client, + url, + rpc_user, + rpc_pass, + method, + params, + &RetryConfig::production(), + ) + .await +} + +/// Inner implementation with configurable retry policy (for tests). +async fn bitcoin_rpc_post_with_retry_cfg( + client: &reqwest::Client, + url: &str, + rpc_user: &str, + rpc_pass: &str, + method: &str, + params: &[serde_json::Value], + cfg: &RetryConfig, +) -> Result { + debug_assert_eq!( + cfg.backoffs.len(), + (cfg.max_attempts - 1) as usize, + "RetryConfig: backoffs.len() must equal max_attempts - 1" + ); + + let body = serde_json::json!({ + "jsonrpc": "1.0", + "id": "archy", + "method": method, + "params": params, + }); + + let mut last_err: Option = None; + for attempt in 0..cfg.max_attempts { + if attempt > 0 { + let backoff = cfg + .backoffs + .get(attempt as usize - 1) + .copied() + .unwrap_or_else(|| std::time::Duration::from_secs(2)); + tracing::warn!( + "bitcoin_rpc({}): attempt {} failed, backing off {:?}", + method, + attempt, + backoff + ); + tokio::time::sleep(backoff).await; + } + + // Per-attempt hard deadline. Independent of reqwest's built-in timeout + // so we always cap total time even if reqwest blocks on something + // weird (e.g., DNS starvation). + let fut = client + .post(url) + .basic_auth(rpc_user, Some(rpc_pass)) + .json(&body) + .send(); + + let send_result = match tokio::time::timeout(cfg.attempt_timeout, fut).await { + Err(_elapsed) => { + last_err = Some(anyhow::anyhow!( + "Bitcoin RPC send timed out after {:?}", + cfg.attempt_timeout + )); + continue; // transient: retry + } + Ok(r) => r, + }; + + let resp = match send_result { + Ok(r) => r, + Err(e) if is_transient_transport_error(&e) => { + last_err = Some(anyhow::Error::from(e).context("Bitcoin RPC connection failed")); + continue; // transient: retry + } + Err(e) => { + return Err(anyhow::Error::from(e).context("Bitcoin RPC connection failed")); + } + }; + + let rpc_resp: BitcoinRpcResponse = resp + .json() + .await + .context("Failed to parse Bitcoin RPC response")?; + + if let Some(err) = rpc_resp.error { + // RPC-level error: this is a real bitcoind response, not transient. + anyhow::bail!("Bitcoin RPC error: {}", err); + } + + return rpc_resp + .result + .ok_or_else(|| anyhow::anyhow!("Bitcoin RPC returned null result")); + } + + Err(last_err.unwrap_or_else(|| { + anyhow::anyhow!("Bitcoin RPC exhausted retries with no error captured") + })) +} + +#[cfg(test)] +mod tests { + use super::*; + use hyper::service::{make_service_fn, service_fn}; + use hyper::{Body, Request, Response, Server, StatusCode}; + use std::convert::Infallible; + use std::net::SocketAddr; + use std::sync::atomic::{AtomicU32, Ordering}; + use std::sync::Arc; + + /// Spin up a mock bitcoind HTTP server that behaves according to `handler`. + /// Returns the bound URL and a JoinHandle (dropped = server shutdown via the + /// oneshot cancel channel). + async fn spawn_mock( + handler: F, + ) -> (String, tokio::task::JoinHandle<()>, tokio::sync::oneshot::Sender<()>) + where + F: Fn(Request) -> Fut + Send + Sync + Clone + 'static, + Fut: std::future::Future> + Send + 'static, + { + let addr = SocketAddr::from(([127, 0, 0, 1], 0)); + let make_svc = make_service_fn(move |_| { + let handler = handler.clone(); + async move { + Ok::<_, Infallible>(service_fn(move |req| { + let handler = handler.clone(); + async move { Ok::<_, Infallible>(handler(req).await) } + })) + } + }); + let server = Server::bind(&addr).serve(make_svc); + let url = format!("http://{}", server.local_addr()); + let (tx, rx) = tokio::sync::oneshot::channel::<()>(); + let handle = tokio::spawn(async move { + let graceful = server.with_graceful_shutdown(async { let _ = rx.await; }); + let _ = graceful.await; + }); + (url, handle, tx) + } + + /// Reply body bitcoind would send for a successful getblockcount. + fn ok_reply() -> Body { + Body::from(r#"{"result":42,"error":null,"id":"archy"}"#) + } + + fn err_reply() -> Body { + Body::from(r#"{"result":null,"error":{"code":-8,"message":"nope"},"id":"archy"}"#) + } + + /// Succeeds on first attempt — should not retry. + #[tokio::test] + async fn happy_path_first_attempt() { + let count = Arc::new(AtomicU32::new(0)); + let c = count.clone(); + let (url, _h, _tx) = spawn_mock(move |_req| { + let c = c.clone(); + async move { + c.fetch_add(1, Ordering::SeqCst); + Response::new(ok_reply()) + } + }) + .await; + + let client = reqwest::Client::builder().build().unwrap(); + let v: u64 = bitcoin_rpc_post_with_retry( + &client, + &url, + "user", + "pass", + "getblockcount", + &[], + ) + .await + .expect("should succeed"); + assert_eq!(v, 42); + assert_eq!(count.load(Ordering::SeqCst), 1, "should not have retried"); + } + + /// HTTP 503 with non-JSON body: produces a JSON-parse error which is NOT + /// classified as transient. Must fail after first attempt. + /// This guards against the tempting mistake of blanket-retrying every + /// non-2xx response — which would mask real bitcoind misconfig. + #[tokio::test] + async fn does_not_retry_parse_errors() { + let count = Arc::new(AtomicU32::new(0)); + let c = count.clone(); + let (url, _h, _tx) = spawn_mock(move |_req| { + let c = c.clone(); + async move { + c.fetch_add(1, Ordering::SeqCst); + Response::builder() + .status(StatusCode::SERVICE_UNAVAILABLE) + .body(Body::from("busy")) + .unwrap() + } + }) + .await; + + let client = reqwest::Client::builder().build().unwrap(); + let result: Result = bitcoin_rpc_post_with_retry( + &client, + &url, + "user", + "pass", + "getblockcount", + &[], + ) + .await; + assert!(result.is_err(), "non-JSON response should error out"); + assert_eq!( + count.load(Ordering::SeqCst), + 1, + "parse errors are not retryable" + ); + } + + /// Connect-refused (port closed) is the canonical transient transport + /// error. Must exhaust BITCOIN_RPC_MAX_ATTEMPTS and the total elapsed + /// time must include at least the sum of the backoffs. + #[tokio::test] + async fn retries_exhausted_on_persistent_connect_refused() { + // Bind a port then immediately drop the listener so the port is closed. + let listener = tokio::net::TcpListener::bind("127.0.0.1:0").await.unwrap(); + let closed_url = format!("http://{}", listener.local_addr().unwrap()); + drop(listener); + + let client = reqwest::Client::builder() + .connect_timeout(std::time::Duration::from_millis(500)) + .build() + .unwrap(); + let start = std::time::Instant::now(); + let result: Result = bitcoin_rpc_post_with_retry( + &client, + &closed_url, + "user", + "pass", + "getblockcount", + &[], + ) + .await; + let elapsed = start.elapsed(); + assert!(result.is_err(), "connect-refused should exhaust retries"); + let min_backoff: std::time::Duration = BITCOIN_RPC_BACKOFFS.iter().sum(); + assert!( + elapsed >= min_backoff, + "should have backed off between retries (elapsed={:?}, expected at least {:?})", + elapsed, + min_backoff + ); + } + + /// The motivating scenario: first attempt times out (bitcoind busy), + /// subsequent attempt succeeds. Uses a short test-only RetryConfig so + /// the test runs in <1s instead of 15s. + #[tokio::test] + async fn retries_on_timeout_then_succeeds() { + let count = Arc::new(AtomicU32::new(0)); + let c = count.clone(); + // Mock server: first request hangs for 500ms, subsequent requests reply OK. + let (url, _h, _tx) = spawn_mock(move |_req| { + let c = c.clone(); + async move { + let n = c.fetch_add(1, Ordering::SeqCst); + if n == 0 { + tokio::time::sleep(std::time::Duration::from_millis(500)).await; + } + Response::new(ok_reply()) + } + }) + .await; + + let client = reqwest::Client::builder().build().unwrap(); + // Attempt timeout 100ms < server's 500ms sleep => first attempt times out. + // Backoff 20ms between attempts. + let cfg = RetryConfig { + max_attempts: 3, + attempt_timeout: std::time::Duration::from_millis(100), + backoffs: vec![ + std::time::Duration::from_millis(20), + std::time::Duration::from_millis(20), + ], + }; + let v: u64 = bitcoin_rpc_post_with_retry_cfg( + &client, + &url, + "user", + "pass", + "getblockcount", + &[], + &cfg, + ) + .await + .expect("second attempt should succeed"); + assert_eq!(v, 42); + assert!( + count.load(Ordering::SeqCst) >= 2, + "expected at least 2 attempts (got {})", + count.load(Ordering::SeqCst) + ); + } + + /// bitcoind returned a well-formed `{"error": ...}` body. Must NOT retry. + #[tokio::test] + async fn does_not_retry_on_rpc_level_error() { + let count = Arc::new(AtomicU32::new(0)); + let c = count.clone(); + let (url, _h, _tx) = spawn_mock(move |_req| { + let c = c.clone(); + async move { + c.fetch_add(1, Ordering::SeqCst); + Response::new(err_reply()) + } + }) + .await; + + let client = reqwest::Client::builder().build().unwrap(); + let result: Result = bitcoin_rpc_post_with_retry( + &client, + &url, + "user", + "pass", + "getblockcount", + &[], + ) + .await; + assert!(result.is_err()); + assert_eq!( + count.load(Ordering::SeqCst), + 1, + "RPC-level errors are not transient" + ); + } + + /// Sanity: retry budget invariants. Chosen to catch regressions where + /// someone bumps these constants without realizing the total worst-case + /// wall time implications. + #[test] + fn retry_budget_invariants() { + assert_eq!(BITCOIN_RPC_MAX_ATTEMPTS, 3); + assert_eq!(BITCOIN_RPC_BACKOFFS.len(), (BITCOIN_RPC_MAX_ATTEMPTS - 1) as usize); + // Total wall-time ceiling: + // 3 attempts * 15s + (0.5s + 1.5s) backoff = 47s + let total: std::time::Duration = + BITCOIN_RPC_ATTEMPT_TIMEOUT * BITCOIN_RPC_MAX_ATTEMPTS + + BITCOIN_RPC_BACKOFFS.iter().sum::(); + assert!(total < std::time::Duration::from_secs(60)); + } +} diff --git a/neode-ui/package.json b/neode-ui/package.json index 6e6d2ff8..c77ec53f 100644 --- a/neode-ui/package.json +++ b/neode-ui/package.json @@ -1,7 +1,7 @@ { "name": "neode-ui", "private": true, - "version": "1.7.41-alpha", + "version": "1.7.42-alpha", "type": "module", "scripts": { "start": "./start-dev.sh", diff --git a/neode-ui/src/views/settings/AccountInfoSection.vue b/neode-ui/src/views/settings/AccountInfoSection.vue index 1d2addcb..3ad29427 100644 --- a/neode-ui/src/views/settings/AccountInfoSection.vue +++ b/neode-ui/src/views/settings/AccountInfoSection.vue @@ -180,6 +180,16 @@ init()
+ +
+
+ v1.7.42-alpha + Apr 22, 2026 +
+
+

Bitcoin dashboards no longer flicker errors during initial chain sync. When bitcoind is busy validating a fresh block it can take up to 10 seconds to answer RPC — the old code gave up after exactly 10 seconds, so any call that landed during that window surfaced as a failure even though the node was perfectly healthy. The RPC client now retries transient timeouts transparently (3 attempts, ~500ms + 1500ms backoff between them) and only surfaces errors that bitcoind itself reported. Connection refused is still fast-failed so genuinely-dead bitcoinds are reported in under a second.

+
+