diff --git a/CHANGELOG.md b/CHANGELOG.md index e2f887c2..0acc0a83 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,13 @@ # Changelog +## v1.7.46-alpha (2026-04-29) + +- Health monitor no longer pages "Auto-restart failed" for orphaned containers. After a variant switch (bitcoin-core ↔ bitcoin-knots) the previous variant's container could survive uninstall and the health monitor would try restarting it forever. Now skipped silently with a debug log. +- Apps no longer disappear from My Apps when an install fails. The card stays visible with state=Stopped so the user can retry or uninstall, with the failure reason surfaced via the new install_progress.message field. +- "Downloading…" progress now actually advances during multi-image stack pulls. Was sticking at 20% until all pulls finished; now interpolates 20%→70% based on which image of N has landed. +- Pulled four docker.io images (bitcoin, gitea, nextcloud, valkey) into the lfg2025 registries on OVH and tx1138. Removes a docker.io dependency from first-boot installs. +- Resilience harness improvements: install-fail entries no longer vanish, install/uninstall/probe cells are timing-tolerant (60s retry on ui_probe and auth_probe), dep snapshots no longer leak companion containers into the dependent app's "new containers" set. + ## v1.7.45-alpha (2026-04-29) - Bitcoin RPC auth is durable. The dashboard reliably connects across container restart, image update, and reboot. Was failing on registry-pulled images that shipped a stale baked-in password. 
diff --git a/app-catalog/catalog.json b/app-catalog/catalog.json index 99428db1..5fee9801 100644 --- a/app-catalog/catalog.json +++ b/app-catalog/catalog.json @@ -31,7 +31,7 @@ "author": "Bitcoin Core contributors", "category": "money", "tier": "optional", - "dockerImage": "docker.io/bitcoin/bitcoin:28.4", + "dockerImage": "146.59.87.168:3000/lfg2025/bitcoin:28.4", "repoUrl": "https://github.com/bitcoin/bitcoin" }, { @@ -125,7 +125,7 @@ "icon": "/assets/img/app-icons/gitea.svg", "author": "Gitea", "category": "development", - "dockerImage": "docker.io/gitea/gitea:1.23", + "dockerImage": "146.59.87.168:3000/lfg2025/gitea:1.23", "repoUrl": "https://gitea.com" }, { @@ -263,7 +263,7 @@ "icon": "/assets/img/app-icons/nextcloud.webp", "author": "Nextcloud", "category": "data", - "dockerImage": "docker.io/nextcloud:28", + "dockerImage": "146.59.87.168:3000/lfg2025/nextcloud:28", "repoUrl": "https://github.com/nextcloud/server" } ] diff --git a/core/Cargo.lock b/core/Cargo.lock index c7b02f5b..fe457273 100644 --- a/core/Cargo.lock +++ b/core/Cargo.lock @@ -80,7 +80,7 @@ checksum = "a23eb6b1614318a8071c9b2521f36b424b2c83db5eb3a0fead4a6c0809af6e61" [[package]] name = "archipelago" -version = "1.7.45-alpha" +version = "1.7.46-alpha" dependencies = [ "anyhow", "archipelago-container", diff --git a/core/archipelago/Cargo.toml b/core/archipelago/Cargo.toml index 1e2cbe0f..e73ebde9 100644 --- a/core/archipelago/Cargo.toml +++ b/core/archipelago/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "archipelago" -version = "1.7.45-alpha" +version = "1.7.46-alpha" edition = "2021" description = "Archipelago Bitcoin Node OS - Native backend" authors = ["Archipelago Team"] diff --git a/core/archipelago/src/api/rpc/package/async_lifecycle.rs b/core/archipelago/src/api/rpc/package/async_lifecycle.rs index 20fa5c1f..cac8be94 100644 --- a/core/archipelago/src/api/rpc/package/async_lifecycle.rs +++ b/core/archipelago/src/api/rpc/package/async_lifecycle.rs @@ -113,11 +113,26 @@ impl RpcHandler { 
Err(e) => { error!("package.install {} failed: {:#}", package_id_spawn, e); install_log(&format!("INSTALL FAIL: {} — {:#}", package_id_spawn, e)).await; - // No pre-state to revert to — remove the entry entirely so - // the UI shows the app as not installed. The next package - // scan will re-create it only if podman actually has a - // container for it (partial install recovery). - remove_package_entry(&handler.state_manager, &package_id_spawn).await; + // Don't remove the entry — that's what made the card + // vanish from My Apps mid-install / between retry-loop + // attempts (e.g. tailscale's entrypoint failure). Leave + // the entry visible with state=Stopped + the install + // error in install_progress.message so the user can see + // what went wrong and decide whether to retry or + // uninstall. clear_install_progress would erase the + // message, so we set it explicitly here instead. + let err_msg = format!("Install failed: {:#}", e); + let (mut data, _) = handler.state_manager.get_snapshot().await; + if let Some(entry) = data.package_data.get_mut(&package_id_spawn) { + entry.state = PackageState::Stopped; + entry.install_progress = Some(crate::data_model::InstallProgress { + size: 0, + downloaded: 0, + phase: None, + message: Some(err_msg), + }); + handler.state_manager.update_data(data).await; + } } } }); diff --git a/core/archipelago/src/api/rpc/package/progress.rs b/core/archipelago/src/api/rpc/package/progress.rs index 3ef0e89e..671651f3 100644 --- a/core/archipelago/src/api/rpc/package/progress.rs +++ b/core/archipelago/src/api/rpc/package/progress.rs @@ -25,6 +25,7 @@ impl RpcHandler { size, downloaded, phase: existing_phase, + message: None, }); self.state_manager.update_data(data).await; } @@ -55,6 +56,7 @@ impl RpcHandler { size, downloaded, phase: Some(phase), + message: None, }); self.state_manager.update_data(data).await; } @@ -97,6 +99,7 @@ impl RpcHandler { size: total, downloaded, phase: existing_phase, + message: None, }); 
state_manager.update_data(data).await; } diff --git a/core/archipelago/src/api/rpc/package/stacks.rs b/core/archipelago/src/api/rpc/package/stacks.rs index 10f5d668..5c4b9400 100644 --- a/core/archipelago/src/api/rpc/package/stacks.rs +++ b/core/archipelago/src/api/rpc/package/stacks.rs @@ -201,7 +201,7 @@ impl RpcHandler { let images = [ "146.59.87.168:3000/lfg2025/immich-postgres:14-vectorchord0.4.3-pgvectors0.2.0", - "docker.io/valkey/valkey:7-alpine", + "146.59.87.168:3000/lfg2025/valkey:7-alpine", "146.59.87.168:3000/lfg2025/immich-server:release", ]; self.set_install_phase("immich", InstallPhase::PullingImage) @@ -300,7 +300,7 @@ impl RpcHandler { "--health-cmd=valkey-cli ping || exit 1", "--health-interval=30s", "--health-retries=3", - "docker.io/valkey/valkey:7-alpine", + "146.59.87.168:3000/lfg2025/valkey:7-alpine", ]) .output() .await; diff --git a/core/archipelago/src/data_model.rs b/core/archipelago/src/data_model.rs index 7f3c4444..2eadd12d 100644 --- a/core/archipelago/src/data_model.rs +++ b/core/archipelago/src/data_model.rs @@ -255,6 +255,12 @@ pub struct InstallProgress { /// a fixed UI percentage and a descriptive label. #[serde(default, skip_serializing_if = "Option::is_none")] pub phase: Option<InstallPhase>, + /// Optional explicit message — used to surface install failures so + /// the UI can keep the app card visible with an error description + /// instead of silently removing the entry on fail. When set, the UI + /// shows this message instead of the PHASE_INFO label. 
+ #[serde(default, skip_serializing_if = "Option::is_none")] + pub message: Option<String>, } /// Phases of the install / update pipeline, surfaced to the UI so users diff --git a/core/archipelago/src/health_monitor.rs b/core/archipelago/src/health_monitor.rs index d506c05b..1aadbf7c 100644 --- a/core/archipelago/src/health_monitor.rs +++ b/core/archipelago/src/health_monitor.rs @@ -539,6 +539,20 @@ pub fn spawn_health_monitor(state: Arc, data_dir: PathBuf) { debug!("Skipping uninstalled container: {}", container.name); continue; } + } else { + // Orphan: container exists in podman but archipelago has + // no package_data entry for it. Common after a variant + // switch (bitcoin-core ↔ bitcoin-knots) where the + // uninstall removed the package entry but the prior + // variant's container survived in stopped state. Without + // this guard the health monitor pages every minute with + // "Auto-restart failed (attempt N/10)" for an app the + // user can no longer see in the dashboard. + debug!( + "Skipping orphan container (not in package_data): {}", + container.name + ); + continue; } if container.healthy { diff --git a/neode-ui/package.json b/neode-ui/package.json index 84aed1d8..7b1b7f04 100644 --- a/neode-ui/package.json +++ b/neode-ui/package.json @@ -1,7 +1,7 @@ { "name": "neode-ui", "private": true, - "version": "1.7.45-alpha", + "version": "1.7.46-alpha", "type": "module", "scripts": { "start": "./start-dev.sh", diff --git a/neode-ui/public/catalog.json b/neode-ui/public/catalog.json index 99428db1..5fee9801 100644 --- a/neode-ui/public/catalog.json +++ b/neode-ui/public/catalog.json @@ -31,7 +31,7 @@ "author": "Bitcoin Core contributors", "category": "money", "tier": "optional", - "dockerImage": "docker.io/bitcoin/bitcoin:28.4", + "dockerImage": "146.59.87.168:3000/lfg2025/bitcoin:28.4", "repoUrl": "https://github.com/bitcoin/bitcoin" }, { @@ -125,7 +125,7 @@ "icon": "/assets/img/app-icons/gitea.svg", "author": "Gitea", "category": "development", - "dockerImage": 
"docker.io/gitea/gitea:1.23", + "dockerImage": "146.59.87.168:3000/lfg2025/gitea:1.23", "repoUrl": "https://gitea.com" }, { @@ -263,7 +263,7 @@ "icon": "/assets/img/app-icons/nextcloud.webp", "author": "Nextcloud", "category": "data", - "dockerImage": "docker.io/nextcloud:28", + "dockerImage": "146.59.87.168:3000/lfg2025/nextcloud:28", "repoUrl": "https://github.com/nextcloud/server" } ] diff --git a/neode-ui/src/stores/server.ts b/neode-ui/src/stores/server.ts index 78a9c29f..3b794665 100644 --- a/neode-ui/src/stores/server.ts +++ b/neode-ui/src/stores/server.ts @@ -63,18 +63,44 @@ export const useServerStore = defineStore('server', () => { if (progress.phase) { const info = PHASE_INFO[progress.phase] if (info) { + // Within the PullingImage band (20→70%), interpolate the + // bar based on how many images / bytes have landed so far. + // Without this, multi-container stacks (indeedhub: 7, + // mempool: 3, btcpay: 4) just sit at 20% for the entire + // pull duration — exactly what the user reported as + // "Downloading sticks at 20% mostly". X-of-N progress + // comes from set_install_progress(i, n) in stacks.rs. + let bandProgress = info.progress + if (progress.phase === 'pulling-image' && progress.size > 0) { + const fraction = Math.min(progress.downloaded / progress.size, 1) + // PullingImage band: 20% → 70%, so 50pp to interpolate over. + bandProgress = 20 + Math.round(fraction * 50) + } // Only advance forward — never let the bar step backward // between patches (can happen briefly during scan merges). - const nextProgress = Math.max(current.progress, info.progress) + const nextProgress = Math.max(current.progress, bandProgress) + // Show explicit message when set (e.g. install-fail descriptions + // surfaced via install_progress.message) — otherwise PHASE_INFO label. 
+ const label = progress.message || info.message installingApps.value.set(appId, { ...current, status: info.status, progress: nextProgress, - message: info.message, + message: label, }) continue } } + // No phase but message is set (install-fail path) — show the message + // even if PHASE_INFO doesn't apply. Status stays whatever the watcher + // currently has. + if (progress.message) { + installingApps.value.set(appId, { + ...current, + message: progress.message, + }) + continue + } // Fallback: byte counters (rare — podman usually doesn't // emit parseable progress on a piped stderr). const pct = progress.size > 0 ? Math.round((progress.downloaded / progress.size) * 100) : 0 diff --git a/neode-ui/src/types/api.ts b/neode-ui/src/types/api.ts index 8f787164..3f0773ff 100644 --- a/neode-ui/src/types/api.ts +++ b/neode-ui/src/types/api.ts @@ -166,6 +166,9 @@ export interface InstallProgress { * counters — podman pull doesn't emit parseable progress when * stderr is piped, so byte counters are usually (0,0). */ phase?: InstallPhase + /** Optional explicit message — surfaced on install failures so the + * UI can show what went wrong instead of silently removing the card. */ + message?: string } // RPC Request/Response types diff --git a/releases/manifest.json b/releases/manifest.json index 68553b79..bc24e78d 100644 --- a/releases/manifest.json +++ b/releases/manifest.json @@ -1,32 +1,14 @@ { - "version": "1.7.45-alpha", + "version": "1.7.46-alpha", "release_date": "2026-04-29", "changelog": [ - "Bitcoin RPC authentication is now bulletproof. The credential is rendered to a host file and bind-mounted into bitcoin-ui, so it stays correct across container restart, image update, reboot, or service restart. Replaces the previous fragile post-start patch that failed on tightly-confined containers.", - "Install progress bar now advances through real phases for multi-container apps too. 
IndeedHub's seven containers, BTCPay's four, Mempool's three, and Immich's three all show Preparing → Pulling image (X of N) → Creating container → Waiting for health → Done — no more sitting at 0% until the very end.", - "Apps no longer disappear from the dashboard mid-install. The container scanner now respects in-flight installs, updates, and removals, and won't evict an app whose containers haven't finished launching.", - "IndeedHub fresh installs no longer crashloop. Five missing environment variables (DATABASE_PORT, QUEUE_HOST, QUEUE_PORT, S3_PRIVATE_BUCKET_NAME, AES_MASTER_SECRET) are now set so the API boots. The node's Nostr signer integration works on fresh installs.", - "Tailscale install no longer fails with 'executable not found'. Container command was a malformed shell string; now a proper command array.", - "Removed three broken catalog entries that hung installs for 10 minutes (dwn, endurain, ollama — no source images in our registries). Nextcloud restored, sourced from docker.io.", - "Bitcoin Core update path uses the correct image name (was looking for nonexistent lfg2025/bitcoin:28.4).", - "New ISO installs now allocate swap (sized to RAM, capped at 8GB, on the encrypted data partition). Without swap, container builds and memory spikes were hitting OOM under load." + "Health monitor no longer pages \"Auto-restart failed\" for orphaned containers. After a variant switch (bitcoin-core ↔ bitcoin-knots) the previous variant's container could survive uninstall and the health monitor would try restarting it forever.", + "Apps no longer disappear from My Apps when an install fails. The card stays visible with an explicit failure reason so the user can retry or uninstall instead of guessing what happened.", + "Multi-image stack pulls now actually advance the progress bar. 
Was sticking at 20% until all pulls finished; now interpolates between 20% and 70% based on which image of N has landed.", + "Pulled four docker.io images (bitcoin, gitea, nextcloud, valkey) into the lfg2025 registries on OVH and tx1138. Removes a docker.io dependency from first-boot installs." ], "components": [ - { - "name": "archipelago", - "current_version": "1.7.45-alpha", - "new_version": "1.7.45-alpha", - "download_url": "https://git.tx1138.com/lfg2025/archy/raw/branch/main/releases/v1.7.45-alpha/archipelago", - "sha256": "ca1958b0f420cc6e73aa4bc161e20ebe7750e933888368394ad17a3f3a36cfad", - "size_bytes": 41618344 - }, - { - "name": "archipelago-frontend-1.7.45-alpha.tar.gz", - "current_version": "1.7.45-alpha", - "new_version": "1.7.45-alpha", - "download_url": "https://git.tx1138.com/lfg2025/archy/raw/branch/main/releases/v1.7.45-alpha/archipelago-frontend-1.7.45-alpha.tar.gz", - "sha256": "59d538768e92a1cd726afd272838dbdd581c87780140792b2818434ef2ae7b81", - "size_bytes": 77025110 - } + { "name": "archipelago", "current_version": "1.7.46-alpha", "new_version": "1.7.46-alpha", "download_url": "https://git.tx1138.com/lfg2025/archy/raw/branch/main/releases/v1.7.46-alpha/archipelago", "sha256": "0478b03ab2860a8f10650b06fbce5d4e01de3b71fbf26bebf726d688c33cea59", "size_bytes": 41628088 }, + { "name": "archipelago-frontend-1.7.46-alpha.tar.gz", "current_version": "1.7.46-alpha", "new_version": "1.7.46-alpha", "download_url": "https://git.tx1138.com/lfg2025/archy/raw/branch/main/releases/v1.7.46-alpha/archipelago-frontend-1.7.46-alpha.tar.gz", "sha256": "594177f1943e8abafc812f3853bd6ce31e5c4df6cd90aca02d3aa408fbd4e669", "size_bytes": 77026680 } ] } diff --git a/releases/v1.7.46-alpha/archipelago b/releases/v1.7.46-alpha/archipelago new file mode 100755 index 00000000..76b7f9b0 Binary files /dev/null and b/releases/v1.7.46-alpha/archipelago differ diff --git a/releases/v1.7.46-alpha/archipelago-frontend-1.7.46-alpha.tar.gz 
b/releases/v1.7.46-alpha/archipelago-frontend-1.7.46-alpha.tar.gz new file mode 100644 index 00000000..2300e642 Binary files /dev/null and b/releases/v1.7.46-alpha/archipelago-frontend-1.7.46-alpha.tar.gz differ diff --git a/scripts/resilience/resilience.sh b/scripts/resilience/resilience.sh index 9ca86638..a9780ae5 100755 --- a/scripts/resilience/resilience.sh +++ b/scripts/resilience/resilience.sh @@ -129,19 +129,34 @@ snapshot_containers() { ssh_run "podman ps -a --format '{{.Names}}' | sort" } -# Whether $app currently has any of its expected containers running. Uses +# Whether $app currently has ALL of its expected containers present (in any state — the snapshot is `podman ps -a`). Uses # the per-app metadata table in lib.sh (expected_containers_for) so variant # apps (bitcoin-knots/bitcoin-core sharing slots) and stacks are detected # correctly. Falls back to name-prefix match for apps the table doesn't know. +# +# Returns true when every expected container is present; if any is missing, the name-prefix fallback decides instead. Earlier +# versions returned true on ANY match — that caused dep installs (e.g. +# bitcoin-knots required by btcpay) to be declared "installed" as soon as +# the backend container appeared, before the UI companion (archy-bitcoin-ui) +# was up. The before-snapshot then missed the companion, the after-snapshot +# caught it, and it leaked into the dependent app's "new containers" set, +# false-positive-FAILing stop/uninstall when the companion (correctly) did +# not respond to the dependent app's package.stop. 
app_already_installed() { local app="$1" local snap; snap=$(snapshot_containers) local expected expected=$(expected_containers_for "$app") - local c - for c in $expected; do - echo "$snap" | grep -qxF "$c" && return 0 - done + if [ -n "$expected" ] && [ "$expected" != "$app" ]; then + local c missing=0 + for c in $expected; do + echo "$snap" | grep -qxF "$c" || missing=1 + done + [ "$missing" -eq 0 ] && return 0 + # Fall through to prefix match if the expected_containers list has + # gaps; a partial install still counts as "installed enough" for + # preclean purposes. + fi # Generic prefix fallback for apps not in the expected_containers_for table. echo "$snap" | grep -qE "^(${app}|${app}-|archy-${app}|archy-${app}-)" } @@ -291,8 +306,18 @@ run_app_matrix() { fi # ── 02 ui_probe ────────────────────────────────────────────── + # Retry with backoff — install just finished, but the app's backend + # (fedimint, immich, mempool stack) may take 30+s to be ready to serve + # HTTP. Probing immediately false-positive-FAILed those apps; pass on + # first 2xx/3xx within 60s. local code - code=$(probe_app_proxy "$app") + local ui_deadline=$(($(date +%s) + 60)) + while :; do + code=$(probe_app_proxy "$app") + [[ "$code" =~ ^(2[0-9][0-9]|3[0-9][0-9])$ ]] && break + [ "$(date +%s)" -ge "$ui_deadline" ] && break + sleep 5 + done # Accept all 2xx/3xx — proxy reaches backend, app may redirect to login, # serve OAuth flow (307), or use 308 permanent. 
401/403 still fail because # those mean "backend reached, app rejected request" which is the @@ -300,17 +325,27 @@ run_app_matrix() { if [[ "$code" =~ ^(2[0-9][0-9]|3[0-9][0-9])$ ]]; then record "$app" ui_probe PASS "HTTP $code" else - record "$app" ui_probe FAIL "HTTP $code (expected 2xx/3xx)" + record "$app" ui_probe FAIL "HTTP $code (expected 2xx/3xx, retried 60s)" fi # ── 03 auth_probe (only for apps with a credentialed/data endpoint) ── + # Same backoff treatment: bitcoin-ui's nginx config bind-mount is + # picked up at start, but the bitcoin-core backend may not have + # accepted RPC connections yet on a fresh install. local probe_code; local pass_codes + pass_codes=$(auth_probe_pass_codes "$app") if probe_code=$(auth_probe_for "$app" 2>/dev/null) && [ -n "$probe_code" ]; then - pass_codes=$(auth_probe_pass_codes "$app") + local auth_deadline=$(($(date +%s) + 60)) + while :; do + echo " $pass_codes " | grep -qF " $probe_code " && break + [ "$(date +%s)" -ge "$auth_deadline" ] && break + sleep 5 + probe_code=$(auth_probe_for "$app" 2>/dev/null) || break + done if echo " $pass_codes " | grep -qF " $probe_code "; then record "$app" auth_probe PASS "HTTP $probe_code" else - record "$app" auth_probe FAIL "HTTP $probe_code (expected one of: $pass_codes — credential plumbing broken)" + record "$app" auth_probe FAIL "HTTP $probe_code (expected one of: $pass_codes; retried 60s — credential plumbing broken)" fi else record "$app" auth_probe SKIP "no authenticated probe defined"