fix(orchestrator): durable uninstall marker for baseline apps + archival-bitcoin/version-report gaps
- mempool-api now declares dependencies:[bitcoin:archival] directly, closing a
gap where installing it standalone (a legitimate direct orchestrator-install
target) bypassed the mempool umbrella's pruning gate entirely.
- New durable user-uninstalled marker (crash_recovery.rs, mirrors user_stopped)
fixes required-baseline-app self-heal (bitcoin-knots/electrumx/lnd/mempool/
etc.) resurrecting itself after an explicit uninstall survives a restart or
reboot, since the in-memory disabled set is wiped by every load_manifests().
- installed_version() (set_config.rs) no longer trusts a floating image tag
("latest") as the reported running version -- a stale local :latest cache
reported "latest" forever regardless of what latest had moved on to. Now
falls back to asking the Bitcoin backend directly via `bitcoind --version`
when the tag is floating.
Co-Authored-By: Claude Sonnet 5 <noreply@anthropic.com>
This commit is contained in:
parent
de8b2bb812
commit
5b7cd5d5d0
@ -27,6 +27,7 @@ app:
|
|||||||
version: ">=1.18.0"
|
version: ">=1.18.0"
|
||||||
- app_id: archy-mempool-db
|
- app_id: archy-mempool-db
|
||||||
version: ">=11.4.10"
|
version: ">=11.4.10"
|
||||||
|
- bitcoin:archival
|
||||||
|
|
||||||
resources:
|
resources:
|
||||||
memory_limit: 2Gi
|
memory_limit: 2Gi
|
||||||
|
|||||||
@ -624,4 +624,20 @@ mod tests {
|
|||||||
// An id with no manifest on disk at all.
|
// An id with no manifest on disk at all.
|
||||||
assert!(!manifest_declares_archival_bitcoin("does-not-exist"));
|
assert!(!manifest_declares_archival_bitcoin("does-not-exist"));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn mempool_api_is_directly_installable_and_covered_by_the_archival_gate() {
|
||||||
|
// `mempool-api` is a legitimate direct `package.install` target
|
||||||
|
// (`uses_orchestrator_install_flow` in install.rs), reachable without
|
||||||
|
// going through the `mempool`/`mempool-web` umbrella id that the old
|
||||||
|
// hardcoded fallback list only recognized. It was missing from that
|
||||||
|
// list, so installing/repairing it directly skipped the archival
|
||||||
|
// Bitcoin gate entirely. Its manifest now declares `bitcoin:archival`
|
||||||
|
// directly, closing the gap the manifest-driven path exists for.
|
||||||
|
assert!(requires_unpruned_bitcoin("mempool-api"));
|
||||||
|
assert!(manifest_declares_archival_bitcoin("mempool-api"));
|
||||||
|
// `archy-mempool-web` has no direct Bitcoin RPC access
|
||||||
|
// (bitcoin_integration.rpc_access: none) and correctly stays excluded.
|
||||||
|
assert!(!requires_unpruned_bitcoin("archy-mempool-web"));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -59,7 +59,54 @@ async fn installed_version(app_id: &str) -> Option<String> {
|
|||||||
return None;
|
return None;
|
||||||
}
|
}
|
||||||
let image = String::from_utf8_lossy(&out.stdout).trim().to_string();
|
let image = String::from_utf8_lossy(&out.stdout).trim().to_string();
|
||||||
image_tag(&image)
|
let tag = image_tag(&image)?;
|
||||||
|
// A floating tag (latest/stable/...) names the reference used to CREATE the
|
||||||
|
// container, not what's actually running — podman never re-resolves it once
|
||||||
|
// cached, so a stale local `:latest` reports "latest" even when the real
|
||||||
|
// `latest` moved on months ago (.228, 2026-07-01: ran a 4-month-old cached
|
||||||
|
// image while a newer one already sat locally, unused). Ask the Bitcoin
|
||||||
|
// backends directly instead of trusting the tag literal in that case.
|
||||||
|
if is_floating_tag(&tag) {
|
||||||
|
if let Some(real) = bitcoind_reported_version(app_id, name).await {
|
||||||
|
return Some(real);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Some(tag)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn is_floating_tag(tag: &str) -> bool {
|
||||||
|
matches!(tag, "latest" | "stable" | "release" | "main")
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Best-effort: ask the running bitcoind binary for its own version, trimmed to
|
||||||
|
/// the catalog's version-tag format (e.g. `29.3.knots20260210`, `29.2`). `None`
|
||||||
|
/// for apps other than the Bitcoin backends (no generic way to introspect a
|
||||||
|
/// third-party image's content version this way) or if the exec fails.
|
||||||
|
async fn bitcoind_reported_version(app_id: &str, container_name: &str) -> Option<String> {
|
||||||
|
if !matches!(app_id, "bitcoin-core" | "bitcoin-knots") {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
let out = tokio::process::Command::new("podman")
|
||||||
|
.args(["exec", container_name, "bitcoind", "--version"])
|
||||||
|
.output()
|
||||||
|
.await
|
||||||
|
.ok()?;
|
||||||
|
if !out.status.success() {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
parse_bitcoind_version_output(&String::from_utf8_lossy(&out.stdout))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Parses e.g. "Bitcoin Knots daemon version v29.3.knots20260210\n..." or
|
||||||
|
/// "Bitcoin Core version v29.2.0\n..." down to the version tag after `version v`.
|
||||||
|
fn parse_bitcoind_version_output(output: &str) -> Option<String> {
|
||||||
|
let first_line = output.lines().next()?;
|
||||||
|
let (_, version) = first_line.rsplit_once("version v")?;
|
||||||
|
let version = version.trim();
|
||||||
|
if version.is_empty() {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
Some(version.to_string())
|
||||||
}
|
}
|
||||||
|
|
||||||
impl RpcHandler {
|
impl RpcHandler {
|
||||||
@ -248,7 +295,42 @@ impl RpcHandler {
|
|||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
use super::image_tag;
|
use super::{image_tag, is_floating_tag, parse_bitcoind_version_output};
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn floating_tag_detects_generic_channel_names() {
|
||||||
|
for tag in ["latest", "stable", "release", "main"] {
|
||||||
|
assert!(is_floating_tag(tag), "{tag}");
|
||||||
|
}
|
||||||
|
for tag in ["29.3.knots20260508", "28.4", "v29.2.0"] {
|
||||||
|
assert!(!is_floating_tag(tag), "{tag}");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn parses_knots_version_line() {
|
||||||
|
assert_eq!(
|
||||||
|
parse_bitcoind_version_output(
|
||||||
|
"Bitcoin Knots daemon version v29.3.knots20260210\nCopyright...\n"
|
||||||
|
)
|
||||||
|
.as_deref(),
|
||||||
|
Some("29.3.knots20260210")
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn parses_core_version_line() {
|
||||||
|
assert_eq!(
|
||||||
|
parse_bitcoind_version_output("Bitcoin Core version v29.2.0\n").as_deref(),
|
||||||
|
Some("29.2.0")
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn parse_returns_none_when_output_has_no_version_marker() {
|
||||||
|
assert_eq!(parse_bitcoind_version_output("garbage output\n"), None);
|
||||||
|
assert_eq!(parse_bitcoind_version_output(""), None);
|
||||||
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn image_tag_keeps_registry_port_colon() {
|
fn image_tag_keeps_registry_port_colon() {
|
||||||
|
|||||||
@ -1567,6 +1567,25 @@ impl ProdContainerOrchestrator {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Same durability problem as user-stopped above, but for uninstall:
|
||||||
|
// `is_required_baseline_app` below otherwise self-heals bitcoin-knots,
|
||||||
|
// electrumx, lnd, mempool, etc. the moment their container is missing —
|
||||||
|
// including right after an explicit uninstall, since the in-memory
|
||||||
|
// `disabled` set doesn't survive a `load_manifests()` reload (every
|
||||||
|
// archipelago restart/reboot runs one before the first reconcile).
|
||||||
|
{
|
||||||
|
let user_uninstalled =
|
||||||
|
crate::crash_recovery::load_user_uninstalled(&self.data_dir).await;
|
||||||
|
if user_uninstalled.contains(&app_id) || user_uninstalled.contains(&name) {
|
||||||
|
tracing::debug!(
|
||||||
|
app_id = %app_id,
|
||||||
|
container = %name,
|
||||||
|
"reconcile skipped — app is user-uninstalled (must stay removed)"
|
||||||
|
);
|
||||||
|
return Ok(ReconcileAction::Left("user-uninstalled".into()));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
match self.runtime.get_container_status(&name).await {
|
match self.runtime.get_container_status(&name).await {
|
||||||
Ok(status) => {
|
Ok(status) => {
|
||||||
// Phase 3.3: migrate pre-Phase-3 containers in place, but only
|
// Phase 3.3: migrate pre-Phase-3 containers in place, but only
|
||||||
@ -3317,6 +3336,10 @@ impl ContainerOrchestrator for ProdContainerOrchestrator {
|
|||||||
// `ensure_running_with_mode` doesn't skip the very container we're
|
// `ensure_running_with_mode` doesn't skip the very container we're
|
||||||
// installing. (start/restart RPC handlers clear it on their side too.)
|
// installing. (start/restart RPC handlers clear it on their side too.)
|
||||||
crate::crash_recovery::clear_user_stopped(&self.data_dir, app_id).await;
|
crate::crash_recovery::clear_user_stopped(&self.data_dir, app_id).await;
|
||||||
|
// Same for the user-uninstalled marker — otherwise re-installing a
|
||||||
|
// baseline app the user had previously uninstalled would hit the
|
||||||
|
// same reconcile guard and silently no-op.
|
||||||
|
crate::crash_recovery::clear_user_uninstalled(&self.data_dir, app_id).await;
|
||||||
// Idempotent: if the container is already up and healthy, just
|
// Idempotent: if the container is already up and healthy, just
|
||||||
// refresh hooks and return. If it's stopped, start it. If it's
|
// refresh hooks and return. If it's stopped, start it. If it's
|
||||||
// missing or in a wedged state, install fresh.
|
// missing or in a wedged state, install fresh.
|
||||||
@ -3364,6 +3387,7 @@ impl ContainerOrchestrator for ProdContainerOrchestrator {
|
|||||||
// `ensure_running_with_mode` doesn't skip this container (symmetric with
|
// `ensure_running_with_mode` doesn't skip this container (symmetric with
|
||||||
// install; the start/restart RPC handlers also clear it).
|
// install; the start/restart RPC handlers also clear it).
|
||||||
crate::crash_recovery::clear_user_stopped(&self.data_dir, app_id).await;
|
crate::crash_recovery::clear_user_stopped(&self.data_dir, app_id).await;
|
||||||
|
crate::crash_recovery::clear_user_uninstalled(&self.data_dir, app_id).await;
|
||||||
let lm = self.loaded(app_id).await?;
|
let lm = self.loaded(app_id).await?;
|
||||||
let action = self.ensure_running(&lm).await?;
|
let action = self.ensure_running(&lm).await?;
|
||||||
match action {
|
match action {
|
||||||
@ -3532,11 +3556,20 @@ impl ContainerOrchestrator for ProdContainerOrchestrator {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
let mut state = self.state.write().await;
|
{
|
||||||
state.disabled.insert(app_id.to_string());
|
let mut state = self.state.write().await;
|
||||||
|
state.disabled.insert(app_id.to_string());
|
||||||
|
}
|
||||||
if let Some(e) = remove_err {
|
if let Some(e) = remove_err {
|
||||||
return Err(e);
|
return Err(e);
|
||||||
}
|
}
|
||||||
|
// Durable, unlike `state.disabled` above (wiped by every
|
||||||
|
// `load_manifests()`, which runs on every archipelago restart/reboot
|
||||||
|
// before the first reconcile) — without this, `is_required_baseline_app`
|
||||||
|
// self-heals bitcoin-knots/electrumx/lnd/mempool/etc. right back after
|
||||||
|
// an explicit uninstall survives to the next restart. Only mark on the
|
||||||
|
// success path above — a failed removal means the app isn't actually gone.
|
||||||
|
crate::crash_recovery::mark_user_uninstalled(&self.data_dir, app_id).await;
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -4611,6 +4644,38 @@ app:
|
|||||||
assert!(calls.iter().any(|c| c == "start_container:filebrowser"));
|
assert!(calls.iter().any(|c| c == "start_container:filebrowser"));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn reconcile_existing_respects_durable_user_uninstalled_marker_for_baseline_apps() {
|
||||||
|
let rt = Arc::new(MockRuntime::default());
|
||||||
|
let mut orch = orch_with(rt.clone()).await;
|
||||||
|
orch.set_disk_gb_for_test(500);
|
||||||
|
orch.insert_manifest_for_test(
|
||||||
|
pull_manifest("filebrowser", "docker.io/filebrowser/filebrowser:latest"),
|
||||||
|
PathBuf::from("/tmp/filebrowser"),
|
||||||
|
)
|
||||||
|
.await;
|
||||||
|
// Simulates the real-world sequence: user uninstalls a required
|
||||||
|
// baseline app, then archipelago restarts (which wipes the in-memory
|
||||||
|
// `disabled` set via `load_manifests()` before this harness even
|
||||||
|
// gets involved) — the durable marker is all that's left standing
|
||||||
|
// between that and the baseline self-heal below reinstalling it.
|
||||||
|
crate::crash_recovery::mark_user_uninstalled(&orch.data_dir, "filebrowser").await;
|
||||||
|
|
||||||
|
let report = orch.reconcile_existing().await;
|
||||||
|
|
||||||
|
assert_eq!(
|
||||||
|
report.actions,
|
||||||
|
vec![(
|
||||||
|
"filebrowser".to_string(),
|
||||||
|
ReconcileAction::Left("user-uninstalled".to_string())
|
||||||
|
)]
|
||||||
|
);
|
||||||
|
assert!(report.failures.is_empty());
|
||||||
|
let calls = rt.calls();
|
||||||
|
assert!(!calls.iter().any(|c| c.starts_with("pull_image:")));
|
||||||
|
assert!(!calls.iter().any(|c| c.starts_with("create_container:")));
|
||||||
|
}
|
||||||
|
|
||||||
#[tokio::test]
|
#[tokio::test]
|
||||||
async fn reconcile_existing_skips_archival_baseline_apps_on_pruned_hosts() {
|
async fn reconcile_existing_skips_archival_baseline_apps_on_pruned_hosts() {
|
||||||
let rt = Arc::new(MockRuntime::default());
|
let rt = Arc::new(MockRuntime::default());
|
||||||
|
|||||||
@ -20,6 +20,7 @@ use tracing::{info, warn};
|
|||||||
const PID_FILE: &str = "archipelago.pid";
|
const PID_FILE: &str = "archipelago.pid";
|
||||||
const CONTAINER_STATE_FILE: &str = "running-containers.json";
|
const CONTAINER_STATE_FILE: &str = "running-containers.json";
|
||||||
const USER_STOPPED_FILE: &str = "user-stopped.json";
|
const USER_STOPPED_FILE: &str = "user-stopped.json";
|
||||||
|
const USER_UNINSTALLED_FILE: &str = "user-uninstalled.json";
|
||||||
|
|
||||||
/// Shared flag: true once boot recovery is complete. Health monitor should wait for this.
|
/// Shared flag: true once boot recovery is complete. Health monitor should wait for this.
|
||||||
pub static RECOVERY_COMPLETE: AtomicBool = AtomicBool::new(false);
|
pub static RECOVERY_COMPLETE: AtomicBool = AtomicBool::new(false);
|
||||||
@ -100,6 +101,51 @@ pub async fn clear_user_stopped(data_dir: &Path, name: &str) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ── User-uninstalled tracking ───────────────────────────────────────────
|
||||||
|
// Baseline apps (bitcoin-knots, electrumx, lnd, mempool, ...) self-heal when
|
||||||
|
// their container is missing — see `is_required_baseline_app` in
|
||||||
|
// prod_orchestrator.rs — because they're expected to exist from first boot.
|
||||||
|
// That self-heal has no way to distinguish "container vanished after a
|
||||||
|
// crash" from "user explicitly uninstalled this," and the in-memory
|
||||||
|
// `disabled` set the orchestrator otherwise uses is wiped by every
|
||||||
|
// `load_manifests()` call (once per archipelago startup). Without a durable
|
||||||
|
// marker, uninstalling a baseline app only "sticks" until the next reboot or
|
||||||
|
// archipelago restart, at which point the boot reconciler resurrects it.
|
||||||
|
// This mirrors `user_stopped` exactly, just for uninstall instead of stop.
|
||||||
|
|
||||||
|
/// Load the set of explicitly user-uninstalled app/container names from disk.
|
||||||
|
pub async fn load_user_uninstalled(data_dir: &Path) -> std::collections::HashSet<String> {
|
||||||
|
let path = data_dir.join(USER_UNINSTALLED_FILE);
|
||||||
|
match fs::read_to_string(&path).await {
|
||||||
|
Ok(content) => serde_json::from_str(&content).unwrap_or_default(),
|
||||||
|
Err(_) => std::collections::HashSet::new(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Save the set of user-uninstalled app/container names to disk.
|
||||||
|
pub async fn save_user_uninstalled(data_dir: &Path, uninstalled: &std::collections::HashSet<String>) {
|
||||||
|
let path = data_dir.join(USER_UNINSTALLED_FILE);
|
||||||
|
if let Ok(json) = serde_json::to_string_pretty(uninstalled) {
|
||||||
|
let _ = fs::write(&path, json).await;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Mark a name as user-uninstalled (won't be self-healed by the baseline-app
|
||||||
|
/// reconciler across restarts/reboots).
|
||||||
|
pub async fn mark_user_uninstalled(data_dir: &Path, name: &str) {
|
||||||
|
let mut uninstalled = load_user_uninstalled(data_dir).await;
|
||||||
|
uninstalled.insert(name.to_string());
|
||||||
|
save_user_uninstalled(data_dir, &uninstalled).await;
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Clear the user-uninstalled flag (app was explicitly (re)installed/started).
|
||||||
|
pub async fn clear_user_uninstalled(data_dir: &Path, name: &str) {
|
||||||
|
let mut uninstalled = load_user_uninstalled(data_dir).await;
|
||||||
|
if uninstalled.remove(name) {
|
||||||
|
save_user_uninstalled(data_dir, &uninstalled).await;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
pub struct RunningContainerRecord {
|
pub struct RunningContainerRecord {
|
||||||
pub name: String,
|
pub name: String,
|
||||||
|
|||||||
@ -1010,3 +1010,79 @@ this match.
|
|||||||
- Definition of done: a new text sent from stock Meshtastic `3ccc` appears in `.116`
|
- Definition of done: a new text sent from stock Meshtastic `3ccc` appears in `.116`
|
||||||
`mesh.messages` as an incoming LoRa message without a browser refresh, and `.116` -> `3ccc`
|
`mesh.messages` as an incoming LoRa message without a browser refresh, and `.116` -> `3ccc`
|
||||||
visibly arrives in the Meshtastic app.
|
visibly arrives in the Meshtastic app.
|
||||||
|
|
||||||
|
## 11. Arch Issues (reported 2026-07-01, untriaged)
|
||||||
|
|
||||||
|
User-reported, raw, not yet root-caused. Split by owner — **do not fix the mesh items from the
|
||||||
|
non-mesh thread**; they route to the mesh/Reticulum agent (§10d owner).
|
||||||
|
|
||||||
|
- **[MESH — routes to §10d owner]** Transport-type label on mesh is delayed / requires a browser
|
||||||
|
refresh to show. Note: §10d (2026-06-30) already claims this was fixed ("Post-send message
|
||||||
|
refresh now polls briefly so FIPS/Tor/LoRa pills do not require a manual browser refresh") — this
|
||||||
|
report means it has regressed or the fix didn't fully land/deploy. Needs re-verification by the
|
||||||
|
mesh owner, not a re-fix from scratch. (The "mesh"-tag-should-read-"LoRa" report that used to be
|
||||||
|
listed alongside this was dropped 2026-07-01 — user is OK with current behavior there.)
|
||||||
|
- **[NON-MESH]** Indeedhub won't install on Arch Dev (node identity TBD — likely `.116`; confirm).
|
||||||
|
Untriaged.
|
||||||
|
- **[NON-MESH, touches bitcoin lifecycle] ROOT-CAUSED + FIX WRITTEN 2026-07-01** — Uninstalling
|
||||||
|
Bitcoin didn't stick: the container came back in My Apps and restarted IBD. Root cause:
|
||||||
|
`is_required_baseline_app` in `prod_orchestrator.rs` (bitcoin-knots, electrumx, lnd, mempool,
|
||||||
|
mempool-api, archy-mempool-db, filebrowser, fedimint-clientd) self-heals when its container is
|
||||||
|
missing — including right after an explicit uninstall — because the in-memory `disabled` set used
|
||||||
|
to suppress that is unconditionally wiped by `load_manifests()`, which runs once per archipelago
|
||||||
|
startup/reboot, immediately before the boot reconciler's first pass. Fix: a durable
|
||||||
|
`user-uninstalled.json` marker (mirrors the existing `user_stopped` mechanism in
|
||||||
|
`crash_recovery.rs`) checked at the same single reconcile choke point in
|
||||||
|
`ensure_running_with_mode`, set on successful `remove()`, cleared on `install()`/`start()`.
|
||||||
|
Test `reconcile_existing_respects_durable_user_uninstalled_marker_for_baseline_apps` passes;
|
||||||
|
`cargo test --workspace` green (873 tests). Low collision risk confirmed — the mechanism is
|
||||||
|
generic (applies to all baseline apps, not bitcoin-multi-version-specific) and the
|
||||||
|
`bitcoin-version-bulletproof` branch/worktree had no uncommitted changes in these files at the
|
||||||
|
time this was written. Not yet committed/pushed — pending user go-ahead.
|
||||||
|
- **[NON-MESH, touches bitcoin lifecycle]** Manually stopping Bitcoin causes it to auto-restart — a
|
||||||
|
user-initiated `package.stop` should NOT be treated as a crash by the auto-restart/health-monitor
|
||||||
|
logic. Investigated 2026-07-01: both live restart paths (`prod_orchestrator.rs`
|
||||||
|
`ensure_running_with_mode` and the legacy `health_monitor.rs` loop) already check the durable
|
||||||
|
`user_stopped` marker before restarting and look correctly wired on current `main` — no live
|
||||||
|
repro path found in code. Likely the reporting node's deployed binary predates a fix already on
|
||||||
|
`main`; needs the node identity + build/commit to confirm before further action.
|
||||||
|
- **[NON-MESH] FIXED 2026-07-01, LIVE ON `.228`** — `.228` Bitcoin RPC was connection-refused
|
||||||
|
("waiting for the Bitcoin RPC listener"). Root cause: the queued `bitcoin-knots-reindex` swap from
|
||||||
|
the bitcoin-rollout handover (`project_bitcoin_rollout_handover.md`) was never finished — the
|
||||||
|
detached reindex container (RPC intentionally off) had been fully synced and idling for 2 days
|
||||||
|
(height 956191, `progress=1.000000`). Executed the queued swap: stopped+removed
|
||||||
|
`bitcoin-knots-reindex`, started the managed `bitcoin-knots` service via RPC. Confirmed healthy:
|
||||||
|
v29.3.knots20260210, connected to peers, tip advanced to 956193, RPC listening on 8332.
|
||||||
|
**Follow-up same day:** user asked to confirm the version, since the UI/catalog said "latest" —
|
||||||
|
turned out the container was running a **4-month-old cached `:latest` image**
|
||||||
|
(`v29.3.knots20260210`) while the actual newest release (`29.3.knots20260508`) was already pulled
|
||||||
|
locally 2 days earlier but never applied. Root-caused why: `installed_version()` in
|
||||||
|
`set_config.rs` (`package.versions`/`package.set-config`) reported the literal image **tag string**
|
||||||
|
used to create the container (`"latest"`), not the content actually running — a stale local
|
||||||
|
`:latest` cache reports "latest" forever regardless of what `latest` has since moved to. **FIXED**:
|
||||||
|
when the resolved tag is a floating one (`latest`/`stable`/`release`/`main`), `installed_version()`
|
||||||
|
now asks the Bitcoin backend directly (`podman exec <name> bitcoind --version`, parsed via new
|
||||||
|
`parse_bitcoind_version_output`) instead of trusting the tag literal. 5 new tests in
|
||||||
|
`set_config.rs` (`floating_tag_detects_generic_channel_names`, `parses_knots_version_line`,
|
||||||
|
`parses_core_version_line`, `parse_returns_none_when_output_has_no_version_marker`,
|
||||||
|
`image_tag_keeps_registry_port_colon`) all pass. No frontend change needed — `AppSidebar.vue`
|
||||||
|
("Running Version" in the Version & Updates card) already renders `versionInfo.installedVersion`
|
||||||
|
verbatim, so it will show the real version once this backend fix ships. Then used the existing
|
||||||
|
bulletproof switch mechanism itself — `package.set-config {id: "bitcoin-knots", version:
|
||||||
|
"29.3.knots20260508"}` (an upgrade, so no downgrade-confirm gate) — to move `.228` onto the real
|
||||||
|
latest image. Confirmed: `bitcoind --version` now reports `v29.3.knots20260508`, no reindex
|
||||||
|
triggered, tip advancing normally. Not yet committed/pushed — pending user go-ahead, same batch as
|
||||||
|
the uninstall-durability fix above.
|
||||||
|
- **[NON-MESH, untriaged]** `.198` — `bitcoin-knots` RPC is saturated: logs flooded with "Request
|
||||||
|
rejected because http work queue depth exceeded" despite `-rpcworkqueue=256` already applied
|
||||||
|
(confirmed via `podman inspect`/entrypoint). This cascades into fedimint: `fedimint` /
|
||||||
|
`fedimint-gateway` / `fedimint-clientd` have been stuck in `(starting)` for 36–46h because their
|
||||||
|
RPC calls to `bitcoin-knots` time out (45s) — this is almost certainly what the user meant by
|
||||||
|
"fedimint guardian keeps going down" (not `.228`, whose fedimint stack looks healthy). Root cause
|
||||||
|
of the saturation itself not yet found — suspect a multi-service retry storm (health_monitor +
|
||||||
|
fedimint x2 + electrumx + mempool + UI all polling without backoff) compounding under any bitcoind
|
||||||
|
slowdown, but not confirmed.
|
||||||
|
- **[NON-MESH, untriaged]** `.198` — portainer is completely absent from `podman ps -a` (not just
|
||||||
|
crashed/stopped — no container record at all). `.228`'s portainer is healthy for comparison. No
|
||||||
|
`/var/lib/archipelago/install.log` found on `.198` to check install history; needs a
|
||||||
|
package_data/state check via RPC or `journalctl` for the archipelago service.
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user