From cc2e055e09bb4558e94a1155ed9fe34d6a25e7f3 Mon Sep 17 00:00:00 2001 From: archipelago Date: Thu, 18 Jun 2026 09:14:47 -0400 Subject: [PATCH] fix(bitcoin,ui): RAM-aware dbcache to stop swap-thrash 502s + snappier status + icon placeholder MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sizes bitcoind -dbcache to host RAM (~1/16, floor 300MB, cap 4096) instead of a fixed 2048/4096. A multi-GB UTXO cache on an 8GB node running the full app stack pushed memory past physical RAM and triggered system-wide swap thrash: the disk saturated, bitcoind could not answer its own RPC, and the dashboard backend's sqlite reads stalled — surfacing as fleet-wide /rpc/v1 502s and a blank Bitcoin UI. Applied in scripts/container-specs.sh (reconciler path) and the config.rs bitcoin-core path. Bitcoin status cache now polls every 5s (was 10/15) with an 8s timeout (was 20s) and fetches the four RPCs concurrently, so the cached snapshot tracks bitcoind's responsive windows during IBD and the UI stops dwelling on "reconnecting...". Unifies the divergent discover AppGrid/FeaturedApps image-error handlers onto the canonical placeholder fallback so missing app icons render the placeholder. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../archipelago/src/api/rpc/package/config.rs | 41 ++++++++++++++---- core/archipelago/src/bitcoin_status.rs | 43 ++++++++++--------- neode-ui/src/views/discover/AppGrid.vue | 6 +-- neode-ui/src/views/discover/FeaturedApps.vue | 6 +-- scripts/container-specs.sh | 17 ++++++-- 5 files changed, 70 insertions(+), 43 deletions(-) diff --git a/core/archipelago/src/api/rpc/package/config.rs b/core/archipelago/src/api/rpc/package/config.rs index 139585f5..2b6240b1 100644 --- a/core/archipelago/src/api/rpc/package/config.rs +++ b/core/archipelago/src/api/rpc/package/config.rs @@ -349,13 +349,37 @@ fn http_probe_cmd(url: &'static str) -> &'static str { } } +/// Bitcoin UTXO cache (`-dbcache`) in MB, sized to host RAM. 
+/// +/// A fixed large dbcache on a small box pushes bitcoind + the ~20 app +/// containers past physical RAM and triggers system-wide swap thrash: the +/// disk saturates, bitcoind can't answer its own RPC, and the dashboard +/// backend's sqlite reads stall — surfacing as /rpc/v1 502s and a blank +/// Bitcoin UI. Budget ~1/16 of RAM for the cache (floor 300 MB — bitcoind's +/// own default is 450 — cap 4096 MB), mirroring scripts/container-specs.sh. +pub(super) fn bitcoin_dbcache_mb() -> u64 { + let total_mb = std::fs::read_to_string("/proc/meminfo") + .ok() + .and_then(|c| { + c.lines() + .find_map(|l| l.strip_prefix("MemTotal:")) + .and_then(|v| v.split_whitespace().next()) + .and_then(|kb| kb.parse::<u64>().ok()) + }) + .map(|kb| kb / 1024) + .unwrap_or(16000); // assume a comfortable host if /proc/meminfo is unreadable + (total_mb / 16).clamp(300, 4096) +} + /// Get per-app memory limit. pub(super) fn get_memory_limit(app_id: &str) -> &'static str { match app_id { - // Heavy apps. Bitcoin: dbcache uses ~4GB; the daemon also needs - // headroom for mempool + connection buffers + script-verifier - // memory + I/O. 4g caused OOM-cascades during IBD. 8g is the - // floor; ideally this would be host-RAM aware (next pass). + // Heavy apps. Bitcoin: dbcache is now host-RAM-aware (see + // bitcoin_dbcache_mb), so the daemon's footprint scales with the box. + // This cgroup cap is an upper bound for mempool + connection buffers + + // script-verifier memory + I/O; a tight cap (4g) previously caused + // OOM-cascades during IBD, so keep 8g as a generous ceiling rather + // than a tight limit — swap thrash is prevented at the dbcache layer. "bitcoin" | "bitcoin-core" | "bitcoin-knots" => "8g", // ElectrumX indexing spikes above its cache size due Python, // RocksDB, socket buffers, and reorg/history work. Keep cache @@ -674,9 +698,10 @@ pub(super) async fn get_app_config( // RPC is reachable from the bitcoin-ui companion container. 
// // Sync-speed flags: - // -dbcache=4096 — UTXO set cache; 4GB is the sweet spot before - // diminishing returns. Container has --memory=8g now so - // there's headroom for mempool + connections. + // -dbcache — UTXO set cache, sized to host RAM via + // bitcoin_dbcache_mb() (see there). A fixed 4GB cache swap- + // thrashed small nodes into fleet-wide 502s; ~1/16 of RAM + // keeps headroom for mempool + connections + the app stack. // -par=0 — use all available cores for script // verification (defaults to NCPU-1 capped at 16). Was // effectively pinned at 2 by --cpus=2 (now removed). @@ -689,7 +714,7 @@ pub(super) async fn get_app_config( "-rpcport=8332".to_string(), "-printtoconsole=1".to_string(), "-datadir=/home/bitcoin/.bitcoin".to_string(), - "-dbcache=4096".to_string(), + format!("-dbcache={}", bitcoin_dbcache_mb()), "-par=0".to_string(), "-maxconnections=125".to_string(), ]), diff --git a/core/archipelago/src/bitcoin_status.rs b/core/archipelago/src/bitcoin_status.rs index 1a67cdad..dd3c1d70 100644 --- a/core/archipelago/src/bitcoin_status.rs +++ b/core/archipelago/src/bitcoin_status.rs @@ -13,8 +13,14 @@ use std::time::{Duration, SystemTime, UNIX_EPOCH}; use tokio::sync::RwLock; use tracing::{debug, warn}; -const CACHE_REFRESH_SECS: u64 = 10; -const CACHE_ERROR_BACKOFF_SECS: u64 = 15; +// Poll frequently and recover fast so the cached snapshot tracks bitcoind's +// responsive windows during IBD. During heavy block-connection, getblockchaininfo +// can block briefly; a slow 10s/15s/20s cadence let one missed poll age the +// snapshot past the UI's 30s "stale" threshold, so the UI dwelled on +// "reconnecting…" long after bitcoind was answering again. Tight cadence + short +// timeout keeps last-known state fresh and clears the stale banner promptly. 
+const CACHE_REFRESH_SECS: u64 = 5; +const CACHE_ERROR_BACKOFF_SECS: u64 = 5; #[derive(Debug, Clone, Serialize)] pub struct BitcoinNodeStatus { @@ -147,25 +153,20 @@ pub async fn get_bitcoin_status() -> BitcoinNodeStatus { async fn fetch_bitcoin_status() -> Result<BitcoinNodeStatus> { let client = reqwest::Client::builder() - .timeout(Duration::from_secs(20)) + .timeout(Duration::from_secs(8)) .build() .context("build Bitcoin status HTTP client")?; - let blockchain_info = bitcoin_rpc_call(&client, "getblockchaininfo", serde_json::json!([])) - .await - .context("getblockchaininfo")?; - let network_info = bitcoin_rpc_call(&client, "getnetworkinfo", serde_json::json!([])) - .await - .context("getnetworkinfo") - .ok(); - let index_info = bitcoin_rpc_call(&client, "getindexinfo", serde_json::json!([])) - .await - .context("getindexinfo") - .ok(); - let zmq_notifications = bitcoin_rpc_call(&client, "getzmqnotifications", serde_json::json!([])) - .await - .context("getzmqnotifications") - .ok(); + // Fetch all four calls concurrently: getblockchaininfo gates freshness, so a + // slow auxiliary call (network/index/zmq) must not delay the snapshot or block + // the next refresh. Only getblockchaininfo failing marks the status stale. 
+ let (blockchain_info, network_info, index_info, zmq_notifications) = tokio::join!( + bitcoin_rpc_call(&client, "getblockchaininfo", serde_json::json!([])), + bitcoin_rpc_call(&client, "getnetworkinfo", serde_json::json!([])), + bitcoin_rpc_call(&client, "getindexinfo", serde_json::json!([])), + bitcoin_rpc_call(&client, "getzmqnotifications", serde_json::json!([])), + ); + let blockchain_info = blockchain_info.context("getblockchaininfo")?; Ok(BitcoinNodeStatus { ok: true, @@ -173,9 +174,9 @@ async fn fetch_bitcoin_status() -> Result<BitcoinNodeStatus> { updated_at_ms: now_ms(), error: None, blockchain_info: Some(blockchain_info), - network_info, - index_info, - zmq_notifications, + network_info: network_info.ok(), + index_info: index_info.ok(), + zmq_notifications: zmq_notifications.ok(), }) } diff --git a/neode-ui/src/views/discover/AppGrid.vue b/neode-ui/src/views/discover/AppGrid.vue index 749dcca8..b9beca9a 100644 --- a/neode-ui/src/views/discover/AppGrid.vue +++ b/neode-ui/src/views/discover/AppGrid.vue @@ -157,6 +157,7 @@