fix(orchestrator): durable uninstall marker for baseline apps + archival-bitcoin/version-report gaps
- mempool-api now declares dependencies:[bitcoin:archival] directly, closing a
gap where installing it standalone (a legitimate direct orchestrator-install
target) bypassed the mempool umbrella's pruning gate entirely.
- New durable user-uninstalled marker (crash_recovery.rs, mirrors user_stopped)
fixes required-baseline-app self-heal (bitcoin-knots/electrumx/lnd/mempool/
etc.) resurrecting itself after an explicit uninstall survives a restart or
reboot, since the in-memory disabled set is wiped by every load_manifests().
- installed_version() (set_config.rs) no longer trusts a floating image tag
("latest") as the reported running version -- a stale local :latest cache
reported "latest" forever regardless of what latest had moved on to. Now
falls back to asking the Bitcoin backend directly via `bitcoind --version`
when the tag is floating.
Co-Authored-By: Claude Sonnet 5 <noreply@anthropic.com>
This commit is contained in:
parent
de8b2bb812
commit
5b7cd5d5d0
@ -27,6 +27,7 @@ app:
|
||||
version: ">=1.18.0"
|
||||
- app_id: archy-mempool-db
|
||||
version: ">=11.4.10"
|
||||
- bitcoin:archival
|
||||
|
||||
resources:
|
||||
memory_limit: 2Gi
|
||||
|
||||
@ -624,4 +624,20 @@ mod tests {
|
||||
// An id with no manifest on disk at all.
|
||||
assert!(!manifest_declares_archival_bitcoin("does-not-exist"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn mempool_api_is_directly_installable_and_covered_by_the_archival_gate() {
|
||||
// `mempool-api` is a legitimate direct `package.install` target
|
||||
// (`uses_orchestrator_install_flow` in install.rs), reachable without
|
||||
// going through the `mempool`/`mempool-web` umbrella id that the old
|
||||
// hardcoded fallback list only recognized. It was missing from that
|
||||
// list, so installing/repairing it directly skipped the archival
|
||||
// Bitcoin gate entirely. Its manifest now declares `bitcoin:archival`
|
||||
// directly, closing the gap the manifest-driven path exists for.
|
||||
assert!(requires_unpruned_bitcoin("mempool-api"));
|
||||
assert!(manifest_declares_archival_bitcoin("mempool-api"));
|
||||
// `archy-mempool-web` has no direct Bitcoin RPC access
|
||||
// (bitcoin_integration.rpc_access: none) and correctly stays excluded.
|
||||
assert!(!requires_unpruned_bitcoin("archy-mempool-web"));
|
||||
}
|
||||
}
|
||||
|
||||
@ -59,7 +59,54 @@ async fn installed_version(app_id: &str) -> Option<String> {
|
||||
return None;
|
||||
}
|
||||
let image = String::from_utf8_lossy(&out.stdout).trim().to_string();
|
||||
image_tag(&image)
|
||||
let tag = image_tag(&image)?;
|
||||
// A floating tag (latest/stable/...) names the reference used to CREATE the
|
||||
// container, not what's actually running — podman never re-resolves it once
|
||||
// cached, so a stale local `:latest` reports "latest" even when the real
|
||||
// `latest` moved on months ago (.228, 2026-07-01: ran a 4-month-old cached
|
||||
// image while a newer one already sat locally, unused). Ask the Bitcoin
|
||||
// backends directly instead of trusting the tag literal in that case.
|
||||
if is_floating_tag(&tag) {
|
||||
if let Some(real) = bitcoind_reported_version(app_id, name).await {
|
||||
return Some(real);
|
||||
}
|
||||
}
|
||||
Some(tag)
|
||||
}
|
||||
|
||||
fn is_floating_tag(tag: &str) -> bool {
|
||||
matches!(tag, "latest" | "stable" | "release" | "main")
|
||||
}
|
||||
|
||||
/// Best-effort: ask the running bitcoind binary for its own version, trimmed to
|
||||
/// the catalog's version-tag format (e.g. `29.3.knots20260210`, `29.2`). `None`
|
||||
/// for apps other than the Bitcoin backends (no generic way to introspect a
|
||||
/// third-party image's content version this way) or if the exec fails.
|
||||
async fn bitcoind_reported_version(app_id: &str, container_name: &str) -> Option<String> {
|
||||
if !matches!(app_id, "bitcoin-core" | "bitcoin-knots") {
|
||||
return None;
|
||||
}
|
||||
let out = tokio::process::Command::new("podman")
|
||||
.args(["exec", container_name, "bitcoind", "--version"])
|
||||
.output()
|
||||
.await
|
||||
.ok()?;
|
||||
if !out.status.success() {
|
||||
return None;
|
||||
}
|
||||
parse_bitcoind_version_output(&String::from_utf8_lossy(&out.stdout))
|
||||
}
|
||||
|
||||
/// Parses e.g. "Bitcoin Knots daemon version v29.3.knots20260210\n..." or
|
||||
/// "Bitcoin Core version v29.2.0\n..." down to the version tag after `version v`.
|
||||
fn parse_bitcoind_version_output(output: &str) -> Option<String> {
|
||||
let first_line = output.lines().next()?;
|
||||
let (_, version) = first_line.rsplit_once("version v")?;
|
||||
let version = version.trim();
|
||||
if version.is_empty() {
|
||||
return None;
|
||||
}
|
||||
Some(version.to_string())
|
||||
}
|
||||
|
||||
impl RpcHandler {
|
||||
@ -248,7 +295,42 @@ impl RpcHandler {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::image_tag;
|
||||
use super::{image_tag, is_floating_tag, parse_bitcoind_version_output};
|
||||
|
||||
#[test]
|
||||
fn floating_tag_detects_generic_channel_names() {
|
||||
for tag in ["latest", "stable", "release", "main"] {
|
||||
assert!(is_floating_tag(tag), "{tag}");
|
||||
}
|
||||
for tag in ["29.3.knots20260508", "28.4", "v29.2.0"] {
|
||||
assert!(!is_floating_tag(tag), "{tag}");
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn parses_knots_version_line() {
|
||||
assert_eq!(
|
||||
parse_bitcoind_version_output(
|
||||
"Bitcoin Knots daemon version v29.3.knots20260210\nCopyright...\n"
|
||||
)
|
||||
.as_deref(),
|
||||
Some("29.3.knots20260210")
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn parses_core_version_line() {
|
||||
assert_eq!(
|
||||
parse_bitcoind_version_output("Bitcoin Core version v29.2.0\n").as_deref(),
|
||||
Some("29.2.0")
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn parse_returns_none_when_output_has_no_version_marker() {
|
||||
assert_eq!(parse_bitcoind_version_output("garbage output\n"), None);
|
||||
assert_eq!(parse_bitcoind_version_output(""), None);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn image_tag_keeps_registry_port_colon() {
|
||||
|
||||
@ -1567,6 +1567,25 @@ impl ProdContainerOrchestrator {
|
||||
}
|
||||
}
|
||||
|
||||
// Same durability problem as user-stopped above, but for uninstall:
|
||||
// `is_required_baseline_app` below otherwise self-heals bitcoin-knots,
|
||||
// electrumx, lnd, mempool, etc. the moment their container is missing —
|
||||
// including right after an explicit uninstall, since the in-memory
|
||||
// `disabled` set doesn't survive a `load_manifests()` reload (every
|
||||
// archipelago restart/reboot runs one before the first reconcile).
|
||||
{
|
||||
let user_uninstalled =
|
||||
crate::crash_recovery::load_user_uninstalled(&self.data_dir).await;
|
||||
if user_uninstalled.contains(&app_id) || user_uninstalled.contains(&name) {
|
||||
tracing::debug!(
|
||||
app_id = %app_id,
|
||||
container = %name,
|
||||
"reconcile skipped — app is user-uninstalled (must stay removed)"
|
||||
);
|
||||
return Ok(ReconcileAction::Left("user-uninstalled".into()));
|
||||
}
|
||||
}
|
||||
|
||||
match self.runtime.get_container_status(&name).await {
|
||||
Ok(status) => {
|
||||
// Phase 3.3: migrate pre-Phase-3 containers in place, but only
|
||||
@ -3317,6 +3336,10 @@ impl ContainerOrchestrator for ProdContainerOrchestrator {
|
||||
// `ensure_running_with_mode` doesn't skip the very container we're
|
||||
// installing. (start/restart RPC handlers clear it on their side too.)
|
||||
crate::crash_recovery::clear_user_stopped(&self.data_dir, app_id).await;
|
||||
// Same for the user-uninstalled marker — otherwise re-installing a
|
||||
// baseline app the user had previously uninstalled would hit the
|
||||
// same reconcile guard and silently no-op.
|
||||
crate::crash_recovery::clear_user_uninstalled(&self.data_dir, app_id).await;
|
||||
// Idempotent: if the container is already up and healthy, just
|
||||
// refresh hooks and return. If it's stopped, start it. If it's
|
||||
// missing or in a wedged state, install fresh.
|
||||
@ -3364,6 +3387,7 @@ impl ContainerOrchestrator for ProdContainerOrchestrator {
|
||||
// `ensure_running_with_mode` doesn't skip this container (symmetric with
|
||||
// install; the start/restart RPC handlers also clear it).
|
||||
crate::crash_recovery::clear_user_stopped(&self.data_dir, app_id).await;
|
||||
crate::crash_recovery::clear_user_uninstalled(&self.data_dir, app_id).await;
|
||||
let lm = self.loaded(app_id).await?;
|
||||
let action = self.ensure_running(&lm).await?;
|
||||
match action {
|
||||
@ -3532,11 +3556,20 @@ impl ContainerOrchestrator for ProdContainerOrchestrator {
|
||||
}
|
||||
}
|
||||
|
||||
let mut state = self.state.write().await;
|
||||
state.disabled.insert(app_id.to_string());
|
||||
{
|
||||
let mut state = self.state.write().await;
|
||||
state.disabled.insert(app_id.to_string());
|
||||
}
|
||||
if let Some(e) = remove_err {
|
||||
return Err(e);
|
||||
}
|
||||
// Durable, unlike `state.disabled` above (wiped by every
|
||||
// `load_manifests()`, which runs on every archipelago restart/reboot
|
||||
// before the first reconcile) — without this, `is_required_baseline_app`
|
||||
// self-heals bitcoin-knots/electrumx/lnd/mempool/etc. right back after
|
||||
// an explicit uninstall survives to the next restart. Only mark on the
|
||||
// success path above — a failed removal means the app isn't actually gone.
|
||||
crate::crash_recovery::mark_user_uninstalled(&self.data_dir, app_id).await;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@ -4611,6 +4644,38 @@ app:
|
||||
assert!(calls.iter().any(|c| c == "start_container:filebrowser"));
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn reconcile_existing_respects_durable_user_uninstalled_marker_for_baseline_apps() {
|
||||
let rt = Arc::new(MockRuntime::default());
|
||||
let mut orch = orch_with(rt.clone()).await;
|
||||
orch.set_disk_gb_for_test(500);
|
||||
orch.insert_manifest_for_test(
|
||||
pull_manifest("filebrowser", "docker.io/filebrowser/filebrowser:latest"),
|
||||
PathBuf::from("/tmp/filebrowser"),
|
||||
)
|
||||
.await;
|
||||
// Simulates the real-world sequence: user uninstalls a required
|
||||
// baseline app, then archipelago restarts (which wipes the in-memory
|
||||
// `disabled` set via `load_manifests()` before this harness even
|
||||
// gets involved) — the durable marker is all that's left standing
|
||||
// between that and the baseline self-heal below reinstalling it.
|
||||
crate::crash_recovery::mark_user_uninstalled(&orch.data_dir, "filebrowser").await;
|
||||
|
||||
let report = orch.reconcile_existing().await;
|
||||
|
||||
assert_eq!(
|
||||
report.actions,
|
||||
vec![(
|
||||
"filebrowser".to_string(),
|
||||
ReconcileAction::Left("user-uninstalled".to_string())
|
||||
)]
|
||||
);
|
||||
assert!(report.failures.is_empty());
|
||||
let calls = rt.calls();
|
||||
assert!(!calls.iter().any(|c| c.starts_with("pull_image:")));
|
||||
assert!(!calls.iter().any(|c| c.starts_with("create_container:")));
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn reconcile_existing_skips_archival_baseline_apps_on_pruned_hosts() {
|
||||
let rt = Arc::new(MockRuntime::default());
|
||||
|
||||
@ -20,6 +20,7 @@ use tracing::{info, warn};
|
||||
const PID_FILE: &str = "archipelago.pid";
|
||||
const CONTAINER_STATE_FILE: &str = "running-containers.json";
|
||||
const USER_STOPPED_FILE: &str = "user-stopped.json";
|
||||
const USER_UNINSTALLED_FILE: &str = "user-uninstalled.json";
|
||||
|
||||
/// Shared flag: true once boot recovery is complete. Health monitor should wait for this.
|
||||
pub static RECOVERY_COMPLETE: AtomicBool = AtomicBool::new(false);
|
||||
@ -100,6 +101,51 @@ pub async fn clear_user_stopped(data_dir: &Path, name: &str) {
|
||||
}
|
||||
}
|
||||
|
||||
// ── User-uninstalled tracking ───────────────────────────────────────────
|
||||
// Baseline apps (bitcoin-knots, electrumx, lnd, mempool, ...) self-heal when
|
||||
// their container is missing — see `is_required_baseline_app` in
|
||||
// prod_orchestrator.rs — because they're expected to exist from first boot.
|
||||
// That self-heal has no way to distinguish "container vanished after a
|
||||
// crash" from "user explicitly uninstalled this," and the in-memory
|
||||
// `disabled` set the orchestrator otherwise uses is wiped by every
|
||||
// `load_manifests()` call (once per archipelago startup). Without a durable
|
||||
// marker, uninstalling a baseline app only "sticks" until the next reboot or
|
||||
// archipelago restart, at which point the boot reconciler resurrects it.
|
||||
// This mirrors `user_stopped` exactly, just for uninstall instead of stop.
|
||||
|
||||
/// Load the set of explicitly user-uninstalled app/container names from disk.
|
||||
pub async fn load_user_uninstalled(data_dir: &Path) -> std::collections::HashSet<String> {
|
||||
let path = data_dir.join(USER_UNINSTALLED_FILE);
|
||||
match fs::read_to_string(&path).await {
|
||||
Ok(content) => serde_json::from_str(&content).unwrap_or_default(),
|
||||
Err(_) => std::collections::HashSet::new(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Save the set of user-uninstalled app/container names to disk.
|
||||
pub async fn save_user_uninstalled(data_dir: &Path, uninstalled: &std::collections::HashSet<String>) {
|
||||
let path = data_dir.join(USER_UNINSTALLED_FILE);
|
||||
if let Ok(json) = serde_json::to_string_pretty(uninstalled) {
|
||||
let _ = fs::write(&path, json).await;
|
||||
}
|
||||
}
|
||||
|
||||
/// Mark a name as user-uninstalled (won't be self-healed by the baseline-app
|
||||
/// reconciler across restarts/reboots).
|
||||
pub async fn mark_user_uninstalled(data_dir: &Path, name: &str) {
|
||||
let mut uninstalled = load_user_uninstalled(data_dir).await;
|
||||
uninstalled.insert(name.to_string());
|
||||
save_user_uninstalled(data_dir, &uninstalled).await;
|
||||
}
|
||||
|
||||
/// Clear the user-uninstalled flag (app was explicitly (re)installed/started).
|
||||
pub async fn clear_user_uninstalled(data_dir: &Path, name: &str) {
|
||||
let mut uninstalled = load_user_uninstalled(data_dir).await;
|
||||
if uninstalled.remove(name) {
|
||||
save_user_uninstalled(data_dir, &uninstalled).await;
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct RunningContainerRecord {
|
||||
pub name: String,
|
||||
|
||||
@ -1010,3 +1010,79 @@ this match.
|
||||
- Definition of done: a new text sent from stock Meshtastic `3ccc` appears in `.116`
|
||||
`mesh.messages` as an incoming LoRa message without a browser refresh, and `.116` -> `3ccc`
|
||||
visibly arrives in the Meshtastic app.
|
||||
|
||||
## 11. Arch Issues (reported 2026-07-01, untriaged)
|
||||
|
||||
User-reported, raw, not yet root-caused. Split by owner — **do not fix the mesh items from the
|
||||
non-mesh thread**; they route to the mesh/Reticulum agent (§10d owner).
|
||||
|
||||
- **[MESH — routes to §10d owner]** Transport-type label on mesh is delayed / requires a browser
|
||||
refresh to show. Note: §10d (2026-06-30) already claims this was fixed ("Post-send message
|
||||
refresh now polls briefly so FIPS/Tor/LoRa pills do not require a manual browser refresh") — this
|
||||
report means it has regressed or the fix didn't fully land/deploy. Needs re-verification by the
|
||||
mesh owner, not a re-fix from scratch. (The "mesh"-tag-should-read-"LoRa" report that used to be
|
||||
listed alongside this was dropped 2026-07-01 — user is OK with current behavior there.)
|
||||
- **[NON-MESH]** Indeedhub won't install on Arch Dev (node identity TBD — likely `.116`; confirm).
|
||||
Untriaged.
|
||||
- **[NON-MESH, touches bitcoin lifecycle] ROOT-CAUSED + FIX WRITTEN 2026-07-01** — Uninstalling
|
||||
Bitcoin didn't stick: the container came back in My Apps and restarted IBD. Root cause:
|
||||
`is_required_baseline_app` in `prod_orchestrator.rs` (bitcoin-knots, electrumx, lnd, mempool,
|
||||
mempool-api, archy-mempool-db, filebrowser, fedimint-clientd) self-heals when its container is
|
||||
missing — including right after an explicit uninstall — because the in-memory `disabled` set used
|
||||
to suppress that is unconditionally wiped by `load_manifests()`, which runs once per archipelago
|
||||
startup/reboot, immediately before the boot reconciler's first pass. Fix: a durable
|
||||
`user-uninstalled.json` marker (mirrors the existing `user_stopped` mechanism in
|
||||
`crash_recovery.rs`) checked at the same single reconcile choke point in
|
||||
`ensure_running_with_mode`, set on successful `remove()`, cleared on `install()`/`start()`.
|
||||
Test `reconcile_existing_respects_durable_user_uninstalled_marker_for_baseline_apps` passes;
|
||||
`cargo test --workspace` green (873 tests). Low collision risk confirmed — the mechanism is
|
||||
generic (applies to all baseline apps, not bitcoin-multi-version-specific) and the
|
||||
`bitcoin-version-bulletproof` branch/worktree had no uncommitted changes in these files at the
|
||||
time this was written. Not yet committed/pushed — pending user go-ahead.
|
||||
- **[NON-MESH, touches bitcoin lifecycle]** Manually stopping Bitcoin causes it to auto-restart — a
|
||||
user-initiated `package.stop` should NOT be treated as a crash by the auto-restart/health-monitor
|
||||
logic. Investigated 2026-07-01: both live restart paths (`prod_orchestrator.rs`
|
||||
`ensure_running_with_mode` and the legacy `health_monitor.rs` loop) already check the durable
|
||||
`user_stopped` marker before restarting and look correctly wired on current `main` — no live
|
||||
repro path found in code. Likely the reporting node's deployed binary predates a fix already on
|
||||
`main`; needs the node identity + build/commit to confirm before further action.
|
||||
- **[NON-MESH] FIXED 2026-07-01, LIVE ON `.228`** — `.228` Bitcoin RPC was connection-refused
|
||||
("waiting for the Bitcoin RPC listener"). Root cause: the queued `bitcoin-knots-reindex` swap from
|
||||
the bitcoin-rollout handover (`project_bitcoin_rollout_handover.md`) was never finished — the
|
||||
detached reindex container (RPC intentionally off) had been fully synced and idling for 2 days
|
||||
(height 956191, `progress=1.000000`). Executed the queued swap: stopped+removed
|
||||
`bitcoin-knots-reindex`, started the managed `bitcoin-knots` service via RPC. Confirmed healthy:
|
||||
v29.3.knots20260210, connected to peers, tip advanced to 956193, RPC listening on 8332.
|
||||
**Follow-up same day:** user asked to confirm the version, since the UI/catalog said "latest" —
|
||||
turned out the container was running a **4-month-old cached `:latest` image**
|
||||
(`v29.3.knots20260210`) while the actual newest release (`29.3.knots20260508`) was already pulled
|
||||
locally 2 days earlier but never applied. Root-caused why: `installed_version()` in
|
||||
`set_config.rs` (`package.versions`/`package.set-config`) reported the literal image **tag string**
|
||||
used to create the container (`"latest"`), not the content actually running — a stale local
|
||||
`:latest` cache reports "latest" forever regardless of what `latest` has since moved to. **FIXED**:
|
||||
when the resolved tag is a floating one (`latest`/`stable`/`release`/`main`), `installed_version()`
|
||||
now asks the Bitcoin backend directly (`podman exec <name> bitcoind --version`, parsed via new
|
||||
`parse_bitcoind_version_output`) instead of trusting the tag literal. 5 new tests in
|
||||
`set_config.rs` (`floating_tag_detects_generic_channel_names`, `parses_knots_version_line`,
|
||||
`parses_core_version_line`, `parse_returns_none_when_output_has_no_version_marker`,
|
||||
`image_tag_keeps_registry_port_colon`) all pass. No frontend change needed — `AppSidebar.vue`
|
||||
("Running Version" in the Version & Updates card) already renders `versionInfo.installedVersion`
|
||||
verbatim, so it will show the real version once this backend fix ships. Then used the existing
|
||||
bulletproof switch mechanism itself — `package.set-config {id: "bitcoin-knots", version:
|
||||
"29.3.knots20260508"}` (an upgrade, so no downgrade-confirm gate) — to move `.228` onto the real
|
||||
latest image. Confirmed: `bitcoind --version` now reports `v29.3.knots20260508`, no reindex
|
||||
triggered, tip advancing normally. Not yet committed/pushed — pending user go-ahead, same batch as
|
||||
the uninstall-durability fix above.
|
||||
- **[NON-MESH, untriaged]** `.198` — `bitcoin-knots` RPC is saturated: logs flooded with "Request
|
||||
rejected because http work queue depth exceeded" despite `-rpcworkqueue=256` already applied
|
||||
(confirmed via `podman inspect`/entrypoint). This cascades into fedimint: `fedimint` /
|
||||
`fedimint-gateway` / `fedimint-clientd` have been stuck in `(starting)` for 36–46h because their
|
||||
RPC calls to `bitcoin-knots` time out (45s) — this is almost certainly what the user meant by
|
||||
"fedimint guardian keeps going down" (not `.228`, whose fedimint stack looks healthy). Root cause
|
||||
of the saturation itself not yet found — suspect a multi-service retry storm (health_monitor +
|
||||
fedimint x2 + electrumx + mempool + UI all polling without backoff) compounding under any bitcoind
|
||||
slowdown, but not confirmed.
|
||||
- **[NON-MESH, untriaged]** `.198` — portainer is completely absent from `podman ps -a` (not just
|
||||
crashed/stopped — no container record at all). `.228`'s portainer is healthy for comparison. No
|
||||
`/var/lib/archipelago/install.log` found on `.198` to check install history; needs a
|
||||
package_data/state check via RPC or `journalctl` for the archipelago service.
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user