1947 lines
78 KiB
Rust
Raw Normal View History

//! Update system: check for updates, download deltas, apply with rollback.
use anyhow::{Context, Result};
use chrono::Timelike;
use serde::{Deserialize, Serialize};
use std::path::Path;
use std::sync::atomic::{AtomicBool, AtomicU64, Ordering};
use tokio::fs;
2026-06-12 03:00:15 -04:00
use tracing::{debug, info, warn};
release(v1.7.15-alpha): bulletproof downloads — resume, retry, real progress download_update Each component download is now resumable via HTTP Range requests (Range: bytes=N-) and retried up to 6 times with exponential backoff (5/15/30/60/120/180s). On a dropped connection the next attempt picks up at the last written byte offset instead of restarting at zero. Streams via reqwest::Response::chunk() to the staging file so a 160 MB frontend tarball doesn't sit in RAM. SHA is verified over the complete file at the end of each component; mismatch nukes the staged file and restarts from scratch. Real download progress counters New AtomicU64 globals DOWNLOAD_BYTES/DOWNLOAD_TOTAL are updated from the chunk loop. update.status exposes them as download_progress.{bytes_downloaded, total_bytes, active}. The SystemUpdate.vue progress bar now polls update.status every second instead of incrementing a fake random counter — and crucially, if the user navigates away and back, the component picks up the in-progress download from the backend atomics immediately. Update-check retries handle_update_check now retries the manifest fetch up to 3 times with a 5s gap if the first try hits a transport error, so a momentary gitea hiccup doesn't make a node report "up to date" when there actually is a new release. Tight 10s connect timeout per attempt keeps the total bounded. Artefacts: archipelago 1070c87f…c081c162b 40584792 archipelago-frontend-1.7.15-alpha.tar.gz 8e630eba…63fd43f 162078068 Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-20 17:17:58 -04:00
/// Live download progress counters. Updated by download_component_resumable
/// as bytes arrive and read by the update.status RPC so the UI can show
/// a real progress bar instead of a fake creep. Global because the
/// download runs in one place at a time; no need for per-handler state.
///
/// Bytes received so far (presumably for the download currently in
/// flight — the writers are outside this view).
pub static DOWNLOAD_BYTES: AtomicU64 = AtomicU64::new(0);
/// Expected total size in bytes; starts at 0 until a writer sets it.
pub static DOWNLOAD_TOTAL: AtomicU64 = AtomicU64::new(0);
/// Set true to ask the in-flight download loop to bail out at the next
/// chunk boundary. Read via `is_canceled`; reset at the start of every
/// `download_update` run. Also flipped by the `cancel_download` RPC.
pub static DOWNLOAD_CANCEL: AtomicBool = AtomicBool::new(false);
/// Millisecond timestamp of the last time DOWNLOAD_BYTES advanced.
/// Lets `update.status` flag a download as "stalled" when no bytes have
/// arrived for a while, so the UI can offer a Cancel button with more
/// confidence than "looks stuck at 0%".
///
/// NOTE(review): this was described as a monotonic timestamp, but the
/// `now_ms` helper in this file reads `SystemTime` (wall clock, ms since
/// the Unix epoch), which can jump when the system clock is adjusted.
/// Confirm which clock the writers use before relying on monotonicity.
pub static DOWNLOAD_PROGRESS_AT: AtomicU64 = AtomicU64::new(0);
/// Current wall-clock time in whole milliseconds since the Unix epoch.
///
/// Yields `0` when the system clock reports a time before the epoch.
/// Because this reads `SystemTime`, successive values are not guaranteed
/// to be monotonic if the clock is stepped.
fn now_ms() -> u64 {
    use std::time::{SystemTime, UNIX_EPOCH};

    match SystemTime::now().duration_since(UNIX_EPOCH) {
        Ok(since_epoch) => since_epoch.as_millis() as u64,
        // Clock is set before 1970: report the epoch itself.
        Err(_) => 0,
    }
}
/// Returns `true` when `DOWNLOAD_CANCEL` is currently set, i.e. a
/// cancellation of the in-flight download has been requested.
///
/// The flag is read with `Ordering::Relaxed`, so this check implies no
/// ordering with respect to other memory operations.
fn is_canceled() -> bool {
DOWNLOAD_CANCEL.load(Ordering::Relaxed)
}
release(v1.7.15-alpha): bulletproof downloads — resume, retry, real progress download_update Each component download is now resumable via HTTP Range requests (Range: bytes=N-) and retried up to 6 times with exponential backoff (5/15/30/60/120/180s). On a dropped connection the next attempt picks up at the last written byte offset instead of restarting at zero. Streams via reqwest::Response::chunk() to the staging file so a 160 MB frontend tarball doesn't sit in RAM. SHA is verified over the complete file at the end of each component; mismatch nukes the staged file and restarts from scratch. Real download progress counters New AtomicU64 globals DOWNLOAD_BYTES/DOWNLOAD_TOTAL are updated from the chunk loop. update.status exposes them as download_progress.{bytes_downloaded, total_bytes, active}. The SystemUpdate.vue progress bar now polls update.status every second instead of incrementing a fake random counter — and crucially, if the user navigates away and back, the component picks up the in-progress download from the backend atomics immediately. Update-check retries handle_update_check now retries the manifest fetch up to 3 times with a 5s gap if the first try hits a transport error, so a momentary gitea hiccup doesn't make a node report "up to date" when there actually is a new release. Tight 10s connect timeout per attempt keeps the total bounded. Artefacts: archipelago 1070c87f…c081c162b 40584792 archipelago-frontend-1.7.15-alpha.tar.gz 8e630eba…63fd43f 162078068 Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-20 17:17:58 -04:00
/// Parse "[v]MAJOR.MINOR.PATCH[-suffix]" into a `(major, minor, patch)` tuple.
///
/// Tolerated noise:
/// - surrounding whitespace,
/// - a single leading `v`/`V`, as used in release tags such as
///   "v1.7.15-alpha".
///
/// Everything from the first `-` onward is ignored, as are any further
/// dot-separated components after PATCH.
///
/// Returns `None` if the numeric portion can't be parsed — callers should
/// fall back to string comparison in that case so we don't silently
/// mis-rank versions we don't understand.
fn parse_version_triple(v: &str) -> Option<(u32, u32, u32)> {
    let trimmed = v.trim();
    // Accept tag-style versions ("v1.2.3"); without this a tag-formatted
    // string would fail to parse and be ranked by string inequality.
    let unprefixed = trimmed
        .strip_prefix('v')
        .or_else(|| trimmed.strip_prefix('V'))
        .unwrap_or(trimmed);
    // `split` always yields at least one item, so the fallback is never hit.
    let core = unprefixed.split('-').next().unwrap_or(unprefixed);
    let mut numbers = core.split('.').map(|part| part.parse::<u32>().ok());
    // Outer `?`: component missing. Inner `?`: component not a valid u32.
    let major = numbers.next()??;
    let minor = numbers.next()??;
    let patch = numbers.next()??;
    Some((major, minor, patch))
}
/// Reports whether `candidate` is strictly newer than `current`.
///
/// Guards against the manifest offering a version we've already passed
/// (e.g. a stale cached manifest, or a node that sideloaded past the
/// manifest's latest). When both versions parse, their numeric
/// `(major, minor, patch)` triples are compared. If either one doesn't
/// parse, plain string inequality is used instead, preserving the old
/// behaviour for unusual version strings.
fn is_newer(candidate: &str, current: &str) -> bool {
    let offered = parse_version_triple(candidate);
    let installed = parse_version_triple(current);
    if let (Some(theirs), Some(ours)) = (offered, installed) {
        theirs > ours
    } else {
        candidate != current
    }
}
const DEFAULT_UPDATE_MANIFEST_URL: &str =
release(v1.7.31-alpha): idempotent IndeedHub install + auto-merge default mirrors/registries + 3rd OVH update mirror - Backend: install.rs registry reachability probe now strips the `host[:port]/namespace` suffix before appending `/v2/` (the Docker V2 API lives at the host root, not under the namespace) and accepts HTTP 405 in addition to 200/401 as "registry daemon alive". This fixes false "unreachable" reports on the Test button for Gitea and other registries that protect their /v2/ endpoint. - Backend: stacks.rs install_indeedhub_stack now force-removes any leftover indeedhub-* containers and indeedhub-net before creating the stack. A partial install (or the old first-boot stub racing the installer) used to leave containers around that blocked re-install with "name already in use". Re-running the App Store install now self-heals. - Backend: registry.rs load_registries auto-merges any default registry URLs missing from the saved config (appended with priority max+10+i, persisted). Lets new default mirrors (e.g. Server 3 OVH) roll out to existing nodes without manual config edits. Explicit removals still stick — URLs absent from disk AND absent from defaults stay gone. - Backend: update.rs adds DEFAULT_TERTIARY_MIRROR_URL at http://146.59.87.168:3000/ (Server 3 OVH) to default_mirrors, with the same auto-merge-on-load behavior as registries. Test updated for 3-mirror default (.160, tx1138, .168). - Scripts: dropped the first-boot IndeedHub stub (~38 lines in first-boot-containers.sh §8b). It predated the proper stack installer, raced it, and was the main source of the name-conflict mess the stacks.rs cleanup above now also guards against. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-22 03:26:09 -04:00
"http://146.59.87.168:3000/lfg2025/archy/raw/branch/main/releases/manifest.json";
chore: release v1.7.45-alpha Resilience-validated release. Three full sweeps of the new resilience harness against .228 confirm no shipstoppers. Big user-visible: - Bitcoin RPC auth durably correct via host-rendered nginx.conf bind-mount, replaces fragile post-start exec that failed under restricted-cap rootless podman ("crun: write cgroup.procs: Permission denied") - Multi-container stack installs (indeedhub, immich, btcpay, mempool) now emit phase events at every boundary so the progress bar advances - Apps no longer vanish from the dashboard mid-install (absent-scanner skips packages in transitional states) - Indeedhub fresh installs work end-to-end (was 8500+ restart loop): five missing env vars (DATABASE_PORT, QUEUE_HOST, QUEUE_PORT, S3_PRIVATE_BUCKET_NAME, AES_MASTER_SECRET) added to install code - Tailscale install fixed: --entrypoint string was being passed as a single shell-line arg; switched to custom_args array - Catalog cleaned of broken entries (dwn, endurain, ollama removed; nextcloud restored on docker.io) - Bitcoin Core update path uses correct image (was looking for nonexistent lfg2025/bitcoin:28.4) - ISO installs now allocate swap on the encrypted data partition Infra: - New resilience harness (scripts/resilience/) — black-box state-machine tester, every app × every transition. Run before each release. Sweep #3 final: PASS 107 / FAIL 12 / SKIP 14. The 12 fails are 1 cosmetic (homeassistant trusted_hosts), 8 harness/timing false-positives, and 3 non-shipstopper tracked items. Down from 23 in baseline sweep #1. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-29 12:31:45 -04:00
/// Secondary mirror on tx1138 gitea — independent network path so a
/// single-provider outage doesn't knock out both mirrors.
const DEFAULT_SECONDARY_MIRROR_URL: &str =
"https://git.tx1138.com/lfg2025/archy/raw/branch/main/releases/manifest.json";
/// File name of the persisted update state.
/// NOTE(review): its readers/writers are outside this view — presumably it
/// lives in the same data directory as the mirror list; confirm.
const UPDATE_STATE_FILE: &str = "update_state.json";
/// File name of the saved mirror list; `mirrors_path` joins it onto the
/// data directory.
const UPDATE_MIRRORS_FILE: &str = "update-mirrors.json";
release(v1.7.41-alpha): post-OTA auto-rollback so a bad release cannot strand the fleet Closes failure mode FM5 from docs/bulletproof-containers.md: the v1.7.38 + v1.7.39 rollouts left every affected node on an unreachable UI (nginx 500) with no recovery path short of SSH. This release adds a self-check guardrail to the update flow. What changed: - apply_update() writes a pending-verify marker with old+new version and a 150s deadline immediately before scheduling the service restart. - verify_pending_update() runs from main.rs startup. If the marker is present and within its freshness window, the new binary waits 15s for nginx + backend to settle, then probes https://127.0.0.1/ every 5s for up to 90s (self-signed certs accepted). - On any probe success within the window, the marker is cleared and nothing else happens. - On window-exhaust, the new binary: 1. Moves the broken /opt/archipelago/web-ui to web-ui.failed.<ts> (quarantined, not deleted, so we can post-mortem). 2. Restores web-ui.bak on top of web-ui. 3. Calls rollback_update() to restore the previous binary. 4. Updates state.current_version to reflect the rollback. 5. systemctl --no-block restart archipelago so the OLD binary boots. - Markers older than 10 minutes are treated as stale and cleared without probing, so a crashed-during-startup marker from weeks ago cannot spontaneously roll back a healthy node on a later reboot. - rollback_update() binary copy now goes through host_sudo instead of tokio::fs::copy, so it escapes the service's ProtectSystem=strict mount namespace. Without this, the rollback silently failed with EROFS on /usr/local/bin and orphaned the rollback - the exact opposite of what auto-rollback is for. Tests: 4 new unit tests in update::tests covering marker round-trip, absent-marker noop, no-panic on verify_pending_update with nothing to verify, and an invariant assert that the 90s probe window stays below the 600s stale threshold. All passing. 
Side fix: scripts/create-release-manifest.sh was dying with exit 141 (SIGPIPE from tar tvzf pipe head pipe awk) under set -euo pipefail. Replaced with a single awk NR==1 that doesn't short-circuit the upstream pipe, so the release-build flow is idempotent again.
2026-04-22 16:14:35 -04:00
/// Marker written by apply_update() just before the service restart and
/// consumed by verify_pending_update() in the NEW binary's startup path.
/// If present, the new binary probes the frontend; if the probe fails,
/// rollback_update() runs and the service restarts on the old binary.
/// Closes the "OTA broke nginx fleet-wide with no auto-rollback" failure
/// mode from 2026-04-22 (v1.7.38/39 tarball-perms bug).
const PENDING_VERIFY_FILE: &str = "update-pending-verify.json";
/// Probe timeout for the frontend health check, in seconds (total time
/// including retries). Generous: the new binary has to come fully up,
/// health monitor settles, nginx has to re-read any snippet changes. 90s
/// is comfortably longer than the slowest observed startup.
const PENDING_VERIFY_WINDOW_SECS: u64 = 90;
/// Maximum marker age in seconds (600 = 10 minutes). If the marker is
/// older than this on read, treat it as stale and delete without
/// probing. Guards against a node that somehow failed to run
/// verification at all (e.g. crashed during startup) from spontaneously
/// rolling back days later when the user reboots.
/// NOTE(review): presumably must stay larger than
/// PENDING_VERIFY_WINDOW_SECS so a marker cannot go stale mid-probe —
/// confirm against the verification code.
const PENDING_VERIFY_MAX_AGE_SECS: i64 = 600;
/// One update mirror entry: where to fetch the release manifest from and
/// how to label it in the UI. The saved mirror list is a JSON array of
/// these.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct UpdateMirror {
/// Full URL to `manifest.json`. Download URLs in the fetched
/// manifest are origin-rewritten to match this URL's scheme+host+
/// port, so hitting a mirror pulls its components from the same
/// mirror rather than whatever absolute host the publisher baked in.
pub url: String,
/// Human-readable label for the UI ("Server 1", "Home VPS", …).
/// Optional in the serialized form: a missing field deserializes to
/// the empty string.
#[serde(default)]
pub label: String,
}
/// Location of the saved mirror list: `UPDATE_MIRRORS_FILE` inside
/// `data_dir`.
fn mirrors_path(data_dir: &Path) -> std::path::PathBuf {
    let mut location = data_dir.to_path_buf();
    location.push(UPDATE_MIRRORS_FILE);
    location
}
fn default_mirrors() -> Vec<UpdateMirror> {
vec![
UpdateMirror {
chore: release v1.7.45-alpha Resilience-validated release. Three full sweeps of the new resilience harness against .228 confirm no shipstoppers. Big user-visible: - Bitcoin RPC auth durably correct via host-rendered nginx.conf bind-mount, replaces fragile post-start exec that failed under restricted-cap rootless podman ("crun: write cgroup.procs: Permission denied") - Multi-container stack installs (indeedhub, immich, btcpay, mempool) now emit phase events at every boundary so the progress bar advances - Apps no longer vanish from the dashboard mid-install (absent-scanner skips packages in transitional states) - Indeedhub fresh installs work end-to-end (was 8500+ restart loop): five missing env vars (DATABASE_PORT, QUEUE_HOST, QUEUE_PORT, S3_PRIVATE_BUCKET_NAME, AES_MASTER_SECRET) added to install code - Tailscale install fixed: --entrypoint string was being passed as a single shell-line arg; switched to custom_args array - Catalog cleaned of broken entries (dwn, endurain, ollama removed; nextcloud restored on docker.io) - Bitcoin Core update path uses correct image (was looking for nonexistent lfg2025/bitcoin:28.4) - ISO installs now allocate swap on the encrypted data partition Infra: - New resilience harness (scripts/resilience/) — black-box state-machine tester, every app × every transition. Run before each release. Sweep #3 final: PASS 107 / FAIL 12 / SKIP 14. The 12 fails are 1 cosmetic (homeassistant trusted_hosts), 8 harness/timing false-positives, and 3 non-shipstopper tracked items. Down from 23 in baseline sweep #1. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-29 12:31:45 -04:00
url: DEFAULT_UPDATE_MANIFEST_URL.to_string(),
label: "Server 1 (OVH)".to_string(),
},
UpdateMirror {
chore: release v1.7.45-alpha Resilience-validated release. Three full sweeps of the new resilience harness against .228 confirm no shipstoppers. Big user-visible: - Bitcoin RPC auth durably correct via host-rendered nginx.conf bind-mount, replaces fragile post-start exec that failed under restricted-cap rootless podman ("crun: write cgroup.procs: Permission denied") - Multi-container stack installs (indeedhub, immich, btcpay, mempool) now emit phase events at every boundary so the progress bar advances - Apps no longer vanish from the dashboard mid-install (absent-scanner skips packages in transitional states) - Indeedhub fresh installs work end-to-end (was 8500+ restart loop): five missing env vars (DATABASE_PORT, QUEUE_HOST, QUEUE_PORT, S3_PRIVATE_BUCKET_NAME, AES_MASTER_SECRET) added to install code - Tailscale install fixed: --entrypoint string was being passed as a single shell-line arg; switched to custom_args array - Catalog cleaned of broken entries (dwn, endurain, ollama removed; nextcloud restored on docker.io) - Bitcoin Core update path uses correct image (was looking for nonexistent lfg2025/bitcoin:28.4) - ISO installs now allocate swap on the encrypted data partition Infra: - New resilience harness (scripts/resilience/) — black-box state-machine tester, every app × every transition. Run before each release. Sweep #3 final: PASS 107 / FAIL 12 / SKIP 14. The 12 fails are 1 cosmetic (homeassistant trusted_hosts), 8 harness/timing false-positives, and 3 non-shipstopper tracked items. Down from 23 in baseline sweep #1. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-29 12:31:45 -04:00
url: DEFAULT_SECONDARY_MIRROR_URL.to_string(),
label: "Server 2 (tx1138)".to_string(),
},
]
}
/// Load the operator-configured mirror list. Returns defaults if the
/// file doesn't exist yet, so a node OTA'd from a pre-mirrors release
release(v1.7.31-alpha): idempotent IndeedHub install + auto-merge default mirrors/registries + 3rd OVH update mirror - Backend: install.rs registry reachability probe now strips the `host[:port]/namespace` suffix before appending `/v2/` (the Docker V2 API lives at the host root, not under the namespace) and accepts HTTP 405 in addition to 200/401 as "registry daemon alive". This fixes false "unreachable" reports on the Test button for Gitea and other registries that protect their /v2/ endpoint. - Backend: stacks.rs install_indeedhub_stack now force-removes any leftover indeedhub-* containers and indeedhub-net before creating the stack. A partial install (or the old first-boot stub racing the installer) used to leave containers around that blocked re-install with "name already in use". Re-running the App Store install now self-heals. - Backend: registry.rs load_registries auto-merges any default registry URLs missing from the saved config (appended with priority max+10+i, persisted). Lets new default mirrors (e.g. Server 3 OVH) roll out to existing nodes without manual config edits. Explicit removals still stick — URLs absent from disk AND absent from defaults stay gone. - Backend: update.rs adds DEFAULT_TERTIARY_MIRROR_URL at http://146.59.87.168:3000/ (Server 3 OVH) to default_mirrors, with the same auto-merge-on-load behavior as registries. Test updated for 3-mirror default (.160, tx1138, .168). - Scripts: dropped the first-boot IndeedHub stub (~38 lines in first-boot-containers.sh §8b). It predated the proper stack installer, raced it, and was the main source of the name-conflict mess the stacks.rs cleanup above now also guards against. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-22 03:26:09 -04:00
/// starts with the current default mirrors available without any
/// manual config.
///
/// Migration: any default mirror URL that isn't already in the saved
/// list gets appended at the end. This lets us add new default mirrors
/// (e.g. a new Server 3) and have them appear on existing nodes after
/// an update, without requiring manual config edits. Explicit removals
/// stick — once an operator removes a URL it stays gone unless it's
/// later re-added to defaults.
pub async fn load_mirrors(data_dir: &Path) -> Result<Vec<UpdateMirror>> {
let path = mirrors_path(data_dir);
if !path.exists() {
return Ok(default_mirrors());
}
let bytes = fs::read(&path)
.await
.with_context(|| format!("read {}", path.display()))?;
release(v1.7.31-alpha): idempotent IndeedHub install + auto-merge default mirrors/registries + 3rd OVH update mirror - Backend: install.rs registry reachability probe now strips the `host[:port]/namespace` suffix before appending `/v2/` (the Docker V2 API lives at the host root, not under the namespace) and accepts HTTP 405 in addition to 200/401 as "registry daemon alive". This fixes false "unreachable" reports on the Test button for Gitea and other registries that protect their /v2/ endpoint. - Backend: stacks.rs install_indeedhub_stack now force-removes any leftover indeedhub-* containers and indeedhub-net before creating the stack. A partial install (or the old first-boot stub racing the installer) used to leave containers around that blocked re-install with "name already in use". Re-running the App Store install now self-heals. - Backend: registry.rs load_registries auto-merges any default registry URLs missing from the saved config (appended with priority max+10+i, persisted). Lets new default mirrors (e.g. Server 3 OVH) roll out to existing nodes without manual config edits. Explicit removals still stick — URLs absent from disk AND absent from defaults stay gone. - Backend: update.rs adds DEFAULT_TERTIARY_MIRROR_URL at http://146.59.87.168:3000/ (Server 3 OVH) to default_mirrors, with the same auto-merge-on-load behavior as registries. Test updated for 3-mirror default (.160, tx1138, .168). - Scripts: dropped the first-boot IndeedHub stub (~38 lines in first-boot-containers.sh §8b). It predated the proper stack installer, raced it, and was the main source of the name-conflict mess the stacks.rs cleanup above now also guards against. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-22 03:26:09 -04:00
let mut list: Vec<UpdateMirror> =
serde_json::from_slice(&bytes).with_context(|| format!("parse {}", path.display()))?;
if list.is_empty() {
release(v1.7.31-alpha): idempotent IndeedHub install + auto-merge default mirrors/registries + 3rd OVH update mirror - Backend: install.rs registry reachability probe now strips the `host[:port]/namespace` suffix before appending `/v2/` (the Docker V2 API lives at the host root, not under the namespace) and accepts HTTP 405 in addition to 200/401 as "registry daemon alive". This fixes false "unreachable" reports on the Test button for Gitea and other registries that protect their /v2/ endpoint. - Backend: stacks.rs install_indeedhub_stack now force-removes any leftover indeedhub-* containers and indeedhub-net before creating the stack. A partial install (or the old first-boot stub racing the installer) used to leave containers around that blocked re-install with "name already in use". Re-running the App Store install now self-heals. - Backend: registry.rs load_registries auto-merges any default registry URLs missing from the saved config (appended with priority max+10+i, persisted). Lets new default mirrors (e.g. Server 3 OVH) roll out to existing nodes without manual config edits. Explicit removals still stick — URLs absent from disk AND absent from defaults stay gone. - Backend: update.rs adds DEFAULT_TERTIARY_MIRROR_URL at http://146.59.87.168:3000/ (Server 3 OVH) to default_mirrors, with the same auto-merge-on-load behavior as registries. Test updated for 3-mirror default (.160, tx1138, .168). - Scripts: dropped the first-boot IndeedHub stub (~38 lines in first-boot-containers.sh §8b). It predated the proper stack installer, raced it, and was the main source of the name-conflict mess the stacks.rs cleanup above now also guards against. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-22 03:26:09 -04:00
return Ok(default_mirrors());
}
// One-time migration: the Hetzner VPS at 23.182.128.160 was
// decommissioned 2026-04-23. Existing nodes have it baked into their
// saved mirror list (was the original Server 1). Strip it on load so
// we don't spend seconds per install timing out against a dead host.
// Exception to the usual "explicit removals stick" rule: the user
// never chose to add this — it was a default.
let before = list.len();
list.retain(|m| !m.url.contains("23.182.128.160"));
let mut changed = list.len() != before;
release(v1.7.31-alpha): idempotent IndeedHub install + auto-merge default mirrors/registries + 3rd OVH update mirror - Backend: install.rs registry reachability probe now strips the `host[:port]/namespace` suffix before appending `/v2/` (the Docker V2 API lives at the host root, not under the namespace) and accepts HTTP 405 in addition to 200/401 as "registry daemon alive". This fixes false "unreachable" reports on the Test button for Gitea and other registries that protect their /v2/ endpoint. - Backend: stacks.rs install_indeedhub_stack now force-removes any leftover indeedhub-* containers and indeedhub-net before creating the stack. A partial install (or the old first-boot stub racing the installer) used to leave containers around that blocked re-install with "name already in use". Re-running the App Store install now self-heals. - Backend: registry.rs load_registries auto-merges any default registry URLs missing from the saved config (appended with priority max+10+i, persisted). Lets new default mirrors (e.g. Server 3 OVH) roll out to existing nodes without manual config edits. Explicit removals still stick — URLs absent from disk AND absent from defaults stay gone. - Backend: update.rs adds DEFAULT_TERTIARY_MIRROR_URL at http://146.59.87.168:3000/ (Server 3 OVH) to default_mirrors, with the same auto-merge-on-load behavior as registries. Test updated for 3-mirror default (.160, tx1138, .168). - Scripts: dropped the first-boot IndeedHub stub (~38 lines in first-boot-containers.sh §8b). It predated the proper stack installer, raced it, and was the main source of the name-conflict mess the stacks.rs cleanup above now also guards against. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-22 03:26:09 -04:00
// Merge in any default URLs the saved config is missing.
let known: std::collections::HashSet<String> = list.iter().map(|m| m.url.clone()).collect();
release(v1.7.31-alpha): idempotent IndeedHub install + auto-merge default mirrors/registries + 3rd OVH update mirror - Backend: install.rs registry reachability probe now strips the `host[:port]/namespace` suffix before appending `/v2/` (the Docker V2 API lives at the host root, not under the namespace) and accepts HTTP 405 in addition to 200/401 as "registry daemon alive". This fixes false "unreachable" reports on the Test button for Gitea and other registries that protect their /v2/ endpoint. - Backend: stacks.rs install_indeedhub_stack now force-removes any leftover indeedhub-* containers and indeedhub-net before creating the stack. A partial install (or the old first-boot stub racing the installer) used to leave containers around that blocked re-install with "name already in use". Re-running the App Store install now self-heals. - Backend: registry.rs load_registries auto-merges any default registry URLs missing from the saved config (appended with priority max+10+i, persisted). Lets new default mirrors (e.g. Server 3 OVH) roll out to existing nodes without manual config edits. Explicit removals still stick — URLs absent from disk AND absent from defaults stay gone. - Backend: update.rs adds DEFAULT_TERTIARY_MIRROR_URL at http://146.59.87.168:3000/ (Server 3 OVH) to default_mirrors, with the same auto-merge-on-load behavior as registries. Test updated for 3-mirror default (.160, tx1138, .168). - Scripts: dropped the first-boot IndeedHub stub (~38 lines in first-boot-containers.sh §8b). It predated the proper stack installer, raced it, and was the main source of the name-conflict mess the stacks.rs cleanup above now also guards against. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-22 03:26:09 -04:00
let defaults = default_mirrors();
for def in &defaults {
if !known.contains(&def.url) {
list.push(def.clone());
changed = true;
release(v1.7.31-alpha): idempotent IndeedHub install + auto-merge default mirrors/registries + 3rd OVH update mirror - Backend: install.rs registry reachability probe now strips the `host[:port]/namespace` suffix before appending `/v2/` (the Docker V2 API lives at the host root, not under the namespace) and accepts HTTP 405 in addition to 200/401 as "registry daemon alive". This fixes false "unreachable" reports on the Test button for Gitea and other registries that protect their /v2/ endpoint. - Backend: stacks.rs install_indeedhub_stack now force-removes any leftover indeedhub-* containers and indeedhub-net before creating the stack. A partial install (or the old first-boot stub racing the installer) used to leave containers around that blocked re-install with "name already in use". Re-running the App Store install now self-heals. - Backend: registry.rs load_registries auto-merges any default registry URLs missing from the saved config (appended with priority max+10+i, persisted). Lets new default mirrors (e.g. Server 3 OVH) roll out to existing nodes without manual config edits. Explicit removals still stick — URLs absent from disk AND absent from defaults stay gone. - Backend: update.rs adds DEFAULT_TERTIARY_MIRROR_URL at http://146.59.87.168:3000/ (Server 3 OVH) to default_mirrors, with the same auto-merge-on-load behavior as registries. Test updated for 3-mirror default (.160, tx1138, .168). - Scripts: dropped the first-boot IndeedHub stub (~38 lines in first-boot-containers.sh §8b). It predated the proper stack installer, raced it, and was the main source of the name-conflict mess the stacks.rs cleanup above now also guards against. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-22 03:26:09 -04:00
}
}
let before_order: Vec<String> = list.iter().map(|m| m.url.clone()).collect();
force_ovh_update_primary(&mut list);
changed = changed || before_order != list.iter().map(|m| m.url.clone()).collect::<Vec<_>>();
if changed {
release(v1.7.31-alpha): idempotent IndeedHub install + auto-merge default mirrors/registries + 3rd OVH update mirror - Backend: install.rs registry reachability probe now strips the `host[:port]/namespace` suffix before appending `/v2/` (the Docker V2 API lives at the host root, not under the namespace) and accepts HTTP 405 in addition to 200/401 as "registry daemon alive". This fixes false "unreachable" reports on the Test button for Gitea and other registries that protect their /v2/ endpoint. - Backend: stacks.rs install_indeedhub_stack now force-removes any leftover indeedhub-* containers and indeedhub-net before creating the stack. A partial install (or the old first-boot stub racing the installer) used to leave containers around that blocked re-install with "name already in use". Re-running the App Store install now self-heals. - Backend: registry.rs load_registries auto-merges any default registry URLs missing from the saved config (appended with priority max+10+i, persisted). Lets new default mirrors (e.g. Server 3 OVH) roll out to existing nodes without manual config edits. Explicit removals still stick — URLs absent from disk AND absent from defaults stay gone. - Backend: update.rs adds DEFAULT_TERTIARY_MIRROR_URL at http://146.59.87.168:3000/ (Server 3 OVH) to default_mirrors, with the same auto-merge-on-load behavior as registries. Test updated for 3-mirror default (.160, tx1138, .168). - Scripts: dropped the first-boot IndeedHub stub (~38 lines in first-boot-containers.sh §8b). It predated the proper stack installer, raced it, and was the main source of the name-conflict mess the stacks.rs cleanup above now also guards against. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-22 03:26:09 -04:00
let _ = save_mirrors(data_dir, &list).await;
}
release(v1.7.31-alpha): idempotent IndeedHub install + auto-merge default mirrors/registries + 3rd OVH update mirror - Backend: install.rs registry reachability probe now strips the `host[:port]/namespace` suffix before appending `/v2/` (the Docker V2 API lives at the host root, not under the namespace) and accepts HTTP 405 in addition to 200/401 as "registry daemon alive". This fixes false "unreachable" reports on the Test button for Gitea and other registries that protect their /v2/ endpoint. - Backend: stacks.rs install_indeedhub_stack now force-removes any leftover indeedhub-* containers and indeedhub-net before creating the stack. A partial install (or the old first-boot stub racing the installer) used to leave containers around that blocked re-install with "name already in use". Re-running the App Store install now self-heals. - Backend: registry.rs load_registries auto-merges any default registry URLs missing from the saved config (appended with priority max+10+i, persisted). Lets new default mirrors (e.g. Server 3 OVH) roll out to existing nodes without manual config edits. Explicit removals still stick — URLs absent from disk AND absent from defaults stay gone. - Backend: update.rs adds DEFAULT_TERTIARY_MIRROR_URL at http://146.59.87.168:3000/ (Server 3 OVH) to default_mirrors, with the same auto-merge-on-load behavior as registries. Test updated for 3-mirror default (.160, tx1138, .168). - Scripts: dropped the first-boot IndeedHub stub (~38 lines in first-boot-containers.sh §8b). It predated the proper stack installer, raced it, and was the main source of the name-conflict mess the stacks.rs cleanup above now also guards against. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-22 03:26:09 -04:00
Ok(list)
}
/// Normalise a mirror list in place so the built-in servers always lead.
///
/// Three things happen, in order:
/// 1. every entry of `default_mirrors()` whose URL is not yet listed is
///    appended;
/// 2. the primary and secondary default servers get their canonical labels
///    (overwriting whatever label was stored);
/// 3. the list is stable-sorted so the primary comes first, the secondary
///    second, and all other mirrors keep their existing relative order.
fn force_ovh_update_primary(list: &mut Vec<UpdateMirror>) {
    /// Sort tier for a mirror URL: 0 = primary, 1 = secondary, 2 = anything else.
    fn tier(url: &str) -> u8 {
        if url == DEFAULT_UPDATE_MANIFEST_URL {
            0
        } else if url == DEFAULT_SECONDARY_MIRROR_URL {
            1
        } else {
            2
        }
    }

    // Append any default mirror the caller's list is missing.
    for candidate in default_mirrors() {
        let already_listed = list.iter().any(|known| known.url == candidate.url);
        if !already_listed {
            list.push(candidate);
        }
    }

    // Re-apply the canonical labels for the two well-known servers.
    for mirror in list.iter_mut() {
        match tier(&mirror.url) {
            0 => mirror.label = "Server 1 (OVH)".to_string(),
            1 => mirror.label = "Server 2 (tx1138)".to_string(),
            _ => {}
        }
    }

    // `sort_by_key` is stable, so mirrors within the same tier keep their order.
    list.sort_by_key(|mirror| tier(&mirror.url));
}
/// Persist the mirror list as pretty-printed JSON under `data_dir`.
///
/// The data directory is created if missing. The JSON is first written to a
/// `.json.tmp` sibling of the final file and then renamed over it, so a
/// reader never observes a half-written mirrors file.
///
/// # Errors
/// Returns an error (with the offending path in the context) if the
/// directory cannot be created, serialization fails, or the write/rename
/// fails.
pub async fn save_mirrors(data_dir: &Path, mirrors: &[UpdateMirror]) -> Result<()> {
    fs::create_dir_all(data_dir)
        .await
        .with_context(|| format!("mkdir {}", data_dir.display()))?;

    let target = mirrors_path(data_dir);
    let staging = target.with_extension("json.tmp");

    let payload = serde_json::to_vec_pretty(mirrors).context("serialize mirrors")?;
    fs::write(&staging, payload)
        .await
        .with_context(|| format!("write {}", staging.display()))?;

    // Swap the staged file into place as the final step.
    fs::rename(&staging, &target)
        .await
        .with_context(|| format!("rename {} -> {}", staging.display(), target.display()))
}
/// Parse a manifest URL and return its `scheme://host[:port]` prefix.
/// Used by `rewrite_manifest_origins` so a manifest fetched from a
/// mirror points component downloads back at the same mirror rather
/// than whatever absolute URL the publisher baked in.
///
/// Only `http://` and `https://` URLs are recognised (the scheme match is
/// case-sensitive). The authority ends at the first `/`, `?` or `#`, or at
/// the end of the string, so a URL without a path but with a query or
/// fragment (`https://host?x=1`) still yields just `https://host`.
///
/// Returns `None` for any other scheme or when the authority is empty.
fn manifest_origin(manifest_url: &str) -> Option<String> {
    let (scheme, after_scheme) = match manifest_url.strip_prefix("https://") {
        Some(rest) => ("https", rest),
        None => ("http", manifest_url.strip_prefix("http://")?),
    };

    // RFC 3986 §3.2: the authority is terminated by '/', '?', '#' or the end
    // of the URL. `split` always yields at least one item, so the fallback
    // is never taken in practice.
    let authority = after_scheme
        .split(|c: char| matches!(c, '/' | '?' | '#'))
        .next()
        .unwrap_or(after_scheme);

    if authority.is_empty() {
        return None;
    }
    Some(format!("{}://{}", scheme, authority))
}
/// Rewrite every component `download_url` so its origin matches the
/// manifest URL we just fetched. Preserves everything after the origin
/// verbatim (the path is consistent across mirrors — every gitea serves
/// `/lfg2025/archy/raw/…`).
///
/// What is left untouched:
/// - the whole manifest, when `manifest_url` has no recognisable
///   `http(s)://host` origin;
/// - any component whose `download_url` has no recognisable origin;
/// - any component already pointing at the manifest's origin.
///
/// Note that no check is made on the *shape* of the path: any component
/// with a different http(s) origin is redirected to the manifest's origin.
fn rewrite_manifest_origins(manifest: &mut UpdateManifest, manifest_url: &str) {
    let Some(new_origin) = manifest_origin(manifest_url) else {
        return;
    };
    for component in manifest.components.iter_mut() {
        let Some(old_origin) = manifest_origin(&component.download_url) else {
            continue;
        };
        if old_origin == new_origin {
            continue;
        }
        // Strip the old origin exactly once. `strip_prefix` states that
        // intent directly (unlike `trim_start_matches`, which would keep
        // stripping repeated matches) and borrows instead of allocating.
        let Some(path) = component.download_url.strip_prefix(old_origin.as_str()) else {
            continue;
        };
        let rewritten = format!("{}{}", new_origin, path);
        component.download_url = rewritten;
    }
}
/// Which manifest URL to try FIRST — operator override via env wins,
/// otherwise the first entry in the mirrors list, otherwise the hard
/// default. Callers that need the full mirror walk should use
/// `load_mirrors` directly.
fn update_manifest_url() -> String {
chore(ci): rustfmt + clippy clean-up to unblock the Rust CI job The .github/workflows/ci.yml Rust job runs cargo fmt --check, clippy with -D warnings, and tests. All three were failing. This commit: - Applies rustfmt across the tree (the bulk of the diff — untouched since the last toolchain bump, so a wide sweep was unavoidable). - Fixes the correctness-level clippy errors: container/bitcoin_simulator.rs wildcard-in-or-pattern container/manifest.rs from_str rename to parse (reserved name) container/podman_client.rs .get(0) -> .first() container/runtime.rs manual += collapse archipelago/src/constants.rs doc-comment → module-doc api/rpc/package/install.rs stray /// comment above a non-item container/docker_packages.rs redundant field init streaming/advertisement.rs missing Metric import in tests tests/orchestration_tests.rs `vec!` in non-Vec contexts mesh/listener/dispatch.rs unused store_plain_message import api/rpc/tor/mod.rs and mesh/steganography.rs: push-after-new → vec! - Quiets wide legacy surfaces with crate-level allows in main.rs for stylistic lints (too_many_arguments, type_complexity, doc indent, enum variant prefix, wildcard-in-or, assertions-on-constants, drop_non_drop, unused_io_amount, ptr_arg) — these fired in dozens of places with no correctness payoff and have been churning every toolchain bump. - Tags intentional-dead-code helpers: wallet/ and streaming/ modules are WIP, mesh::send_chunked_payload and DM_V1_MARKER are kept for rollback compatibility, vpn::get_nostr_vpn_status is surface-area for a not-yet-landed RPC. cargo fmt --check, cargo clippy --all-targets --all-features -- -D warnings, and cargo test --all-features now all pass locally. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-18 17:23:46 -04:00
std::env::var("ARCHIPELAGO_UPDATE_URL")
.unwrap_or_else(|_| DEFAULT_UPDATE_MANIFEST_URL.to_string())
}
/// Description of one published release, (de)serialized with serde.
///
/// An instance is stored in `UpdateState::available_update` when an update
/// is on offer.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct UpdateManifest {
    /// Version string of the release (presumably semver-like, e.g.
    /// `1.7.41-alpha` — confirm against the publisher script).
    pub version: String,
    /// Release date as a string; the exact format is not enforced here.
    pub release_date: String,
    /// Changelog entries, one string per entry.
    pub changelog: Vec<String>,
    /// Per-component download entries. `rewrite_manifest_origins` may
    /// rewrite each entry's `download_url` to point at the mirror the
    /// manifest was fetched from.
    pub components: Vec<ComponentUpdate>,
}
/// One downloadable component listed in an `UpdateManifest`,
/// (de)serialized with serde.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ComponentUpdate {
    /// Component name.
    pub name: String,
    /// Version string the component is being updated from.
    pub current_version: String,
    /// Version string the component is being updated to.
    pub new_version: String,
    /// Absolute URL of the artefact. `rewrite_manifest_origins` swaps its
    /// `scheme://host[:port]` prefix for the origin of the manifest URL.
    pub download_url: String,
    /// Expected SHA-256 of the artefact (presumably hex-encoded — confirm
    /// against the download/verify code).
    pub sha256: String,
    /// Artefact size in bytes, as declared by the manifest.
    pub size_bytes: u64,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
chore(ci): rustfmt + clippy clean-up to unblock the Rust CI job The .github/workflows/ci.yml Rust job runs cargo fmt --check, clippy with -D warnings, and tests. All three were failing. This commit: - Applies rustfmt across the tree (the bulk of the diff — untouched since the last toolchain bump, so a wide sweep was unavoidable). - Fixes the correctness-level clippy errors: container/bitcoin_simulator.rs wildcard-in-or-pattern container/manifest.rs from_str rename to parse (reserved name) container/podman_client.rs .get(0) -> .first() container/runtime.rs manual += collapse archipelago/src/constants.rs doc-comment → module-doc api/rpc/package/install.rs stray /// comment above a non-item container/docker_packages.rs redundant field init streaming/advertisement.rs missing Metric import in tests tests/orchestration_tests.rs `vec!` in non-Vec contexts mesh/listener/dispatch.rs unused store_plain_message import api/rpc/tor/mod.rs and mesh/steganography.rs: push-after-new → vec! - Quiets wide legacy surfaces with crate-level allows in main.rs for stylistic lints (too_many_arguments, type_complexity, doc indent, enum variant prefix, wildcard-in-or, assertions-on-constants, drop_non_drop, unused_io_amount, ptr_arg) — these fired in dozens of places with no correctness payoff and have been churning every toolchain bump. - Tags intentional-dead-code helpers: wallet/ and streaming/ modules are WIP, mesh::send_chunked_payload and DM_V1_MARKER are kept for rollback compatibility, vpn::get_nostr_vpn_status is surface-area for a not-yet-landed RPC. cargo fmt --check, cargo clippy --all-targets --all-features -- -D warnings, and cargo test --all-features now all pass locally. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-18 17:23:46 -04:00
#[derive(Default)]
pub enum UpdateSchedule {
Manual,
chore(ci): rustfmt + clippy clean-up to unblock the Rust CI job The .github/workflows/ci.yml Rust job runs cargo fmt --check, clippy with -D warnings, and tests. All three were failing. This commit: - Applies rustfmt across the tree (the bulk of the diff — untouched since the last toolchain bump, so a wide sweep was unavoidable). - Fixes the correctness-level clippy errors: container/bitcoin_simulator.rs wildcard-in-or-pattern container/manifest.rs from_str rename to parse (reserved name) container/podman_client.rs .get(0) -> .first() container/runtime.rs manual += collapse archipelago/src/constants.rs doc-comment → module-doc api/rpc/package/install.rs stray /// comment above a non-item container/docker_packages.rs redundant field init streaming/advertisement.rs missing Metric import in tests tests/orchestration_tests.rs `vec!` in non-Vec contexts mesh/listener/dispatch.rs unused store_plain_message import api/rpc/tor/mod.rs and mesh/steganography.rs: push-after-new → vec! - Quiets wide legacy surfaces with crate-level allows in main.rs for stylistic lints (too_many_arguments, type_complexity, doc indent, enum variant prefix, wildcard-in-or, assertions-on-constants, drop_non_drop, unused_io_amount, ptr_arg) — these fired in dozens of places with no correctness payoff and have been churning every toolchain bump. - Tags intentional-dead-code helpers: wallet/ and streaming/ modules are WIP, mesh::send_chunked_payload and DM_V1_MARKER are kept for rollback compatibility, vpn::get_nostr_vpn_status is surface-area for a not-yet-landed RPC. cargo fmt --check, cargo clippy --all-targets --all-features -- -D warnings, and cargo test --all-features now all pass locally. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-18 17:23:46 -04:00
#[default]
DailyCheck,
AutoApply,
}
/// Update bookkeeping for this node, (de)serialized with serde.
///
/// `load_state` reads it from `UPDATE_STATE_FILE` under the data directory
/// (creating a default one when the file is missing).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct UpdateState {
    /// Version the node records as running. Defaults to
    /// `CARGO_PKG_VERSION`; `verify_pending_update` resets it to the
    /// previous version after an automatic rollback.
    pub current_version: String,
    /// When the last update check happened, if ever (presumably a
    /// timestamp string — confirm the format against the check handler).
    pub last_check: Option<String>,
    /// Manifest of the update currently on offer, if any.
    pub available_update: Option<UpdateManifest>,
    /// Whether an update is currently in progress.
    pub update_in_progress: bool,
    /// Whether a rollback is available.
    pub rollback_available: bool,
    /// Update policy. `#[serde(default)]` makes state files that lack this
    /// field deserialize to `UpdateSchedule::default()`.
    #[serde(default)]
    pub schedule: UpdateSchedule,
    /// URL of the mirror whose manifest populated `available_update`.
    /// Surfaces in the UI so operators can tell at a glance which mirror
    /// their node actually hit (vs. just which is configured primary).
    /// Absent in a stored state file it deserializes to `None`.
    #[serde(default)]
    pub manifest_mirror: Option<String>,
}
impl Default for UpdateState {
    /// Pristine state: running the compiled-in crate version, never
    /// checked, nothing on offer, nothing in progress, no rollback
    /// available, daily-check schedule, no mirror recorded.
    fn default() -> Self {
        // The crate version is baked in at compile time.
        let current_version = String::from(env!("CARGO_PKG_VERSION"));
        Self {
            current_version,
            schedule: UpdateSchedule::DailyCheck,
            update_in_progress: false,
            rollback_available: false,
            last_check: None,
            available_update: None,
            manifest_mirror: None,
        }
    }
}
release(v1.7.41-alpha): post-OTA auto-rollback so a bad release cannot strand the fleet Closes failure mode FM5 from docs/bulletproof-containers.md: the v1.7.38 + v1.7.39 rollouts left every affected node on an unreachable UI (nginx 500) with no recovery path short of SSH. This release adds a self-check guardrail to the update flow. What changed: - apply_update() writes a pending-verify marker with old+new version and a 150s deadline immediately before scheduling the service restart. - verify_pending_update() runs from main.rs startup. If the marker is present and within its freshness window, the new binary waits 15s for nginx + backend to settle, then probes https://127.0.0.1/ every 5s for up to 90s (self-signed certs accepted). - On any probe success within the window, the marker is cleared and nothing else happens. - On window-exhaust, the new binary: 1. Moves the broken /opt/archipelago/web-ui to web-ui.failed.<ts> (quarantined, not deleted, so we can post-mortem). 2. Restores web-ui.bak on top of web-ui. 3. Calls rollback_update() to restore the previous binary. 4. Updates state.current_version to reflect the rollback. 5. systemctl --no-block restart archipelago so the OLD binary boots. - Markers older than 10 minutes are treated as stale and cleared without probing, so a crashed-during-startup marker from weeks ago cannot spontaneously roll back a healthy node on a later reboot. - rollback_update() binary copy now goes through host_sudo instead of tokio::fs::copy, so it escapes the service's ProtectSystem=strict mount namespace. Without this, the rollback silently failed with EROFS on /usr/local/bin and orphaned the rollback - the exact opposite of what auto-rollback is for. Tests: 4 new unit tests in update::tests covering marker round-trip, absent-marker noop, no-panic on verify_pending_update with nothing to verify, and an invariant assert that the 90s probe window stays below the 600s stale threshold. All passing. 
Side fix: scripts/create-release-manifest.sh was dying with exit 141 (SIGPIPE from tar tvzf pipe head pipe awk) under set -euo pipefail. Replaced with a single awk NR==1 that doesn't short-circuit the upstream pipe, so the release-build flow is idempotent again.
2026-04-22 16:14:35 -04:00
/// Marker written by apply_update() just before the service restart and
/// consumed by verify_pending_update() in the NEW binary's startup path.
/// See PENDING_VERIFY_FILE for the full rationale — this is the hook
/// that turns "nginx 500 on every page after OTA" from an unrecoverable
/// field incident into an automatic rollback.
///
/// Stored as JSON in `PENDING_VERIFY_FILE` under the data directory.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PendingVerification {
    /// RFC3339 timestamp of the apply that wrote this marker.
    /// `verify_pending_update` parses it to discard markers older than
    /// `PENDING_VERIFY_MAX_AGE_SECS`.
    pub applied_at: String,
    /// Version we just applied (what the NEW binary should be running).
    pub new_version: String,
    /// Version the outgoing binary was running (what we roll back to).
    pub previous_version: String,
    /// Unix epoch seconds after which the probe should give up and
    /// trigger rollback. Prevents a probe from retrying forever if e.g.
    /// nginx is totally wedged.
    ///
    /// NOTE(review): `verify_pending_update` bounds its probe loop with
    /// `PENDING_VERIFY_WINDOW_SECS` and does not read this field —
    /// confirm whether it is still consumed anywhere.
    pub deadline_ts: i64,
}
async fn write_pending_verification(data_dir: &Path, marker: &PendingVerification) -> Result<()> {
release(v1.7.41-alpha): post-OTA auto-rollback so a bad release cannot strand the fleet Closes failure mode FM5 from docs/bulletproof-containers.md: the v1.7.38 + v1.7.39 rollouts left every affected node on an unreachable UI (nginx 500) with no recovery path short of SSH. This release adds a self-check guardrail to the update flow. What changed: - apply_update() writes a pending-verify marker with old+new version and a 150s deadline immediately before scheduling the service restart. - verify_pending_update() runs from main.rs startup. If the marker is present and within its freshness window, the new binary waits 15s for nginx + backend to settle, then probes https://127.0.0.1/ every 5s for up to 90s (self-signed certs accepted). - On any probe success within the window, the marker is cleared and nothing else happens. - On window-exhaust, the new binary: 1. Moves the broken /opt/archipelago/web-ui to web-ui.failed.<ts> (quarantined, not deleted, so we can post-mortem). 2. Restores web-ui.bak on top of web-ui. 3. Calls rollback_update() to restore the previous binary. 4. Updates state.current_version to reflect the rollback. 5. systemctl --no-block restart archipelago so the OLD binary boots. - Markers older than 10 minutes are treated as stale and cleared without probing, so a crashed-during-startup marker from weeks ago cannot spontaneously roll back a healthy node on a later reboot. - rollback_update() binary copy now goes through host_sudo instead of tokio::fs::copy, so it escapes the service's ProtectSystem=strict mount namespace. Without this, the rollback silently failed with EROFS on /usr/local/bin and orphaned the rollback - the exact opposite of what auto-rollback is for. Tests: 4 new unit tests in update::tests covering marker round-trip, absent-marker noop, no-panic on verify_pending_update with nothing to verify, and an invariant assert that the 90s probe window stays below the 600s stale threshold. All passing. 
Side fix: scripts/create-release-manifest.sh was dying with exit 141 (SIGPIPE from tar tvzf pipe head pipe awk) under set -euo pipefail. Replaced with a single awk NR==1 that doesn't short-circuit the upstream pipe, so the release-build flow is idempotent again.
2026-04-22 16:14:35 -04:00
let path = data_dir.join(PENDING_VERIFY_FILE);
let data = serde_json::to_string_pretty(marker).context("serialize pending-verify marker")?;
release(v1.7.41-alpha): post-OTA auto-rollback so a bad release cannot strand the fleet Closes failure mode FM5 from docs/bulletproof-containers.md: the v1.7.38 + v1.7.39 rollouts left every affected node on an unreachable UI (nginx 500) with no recovery path short of SSH. This release adds a self-check guardrail to the update flow. What changed: - apply_update() writes a pending-verify marker with old+new version and a 150s deadline immediately before scheduling the service restart. - verify_pending_update() runs from main.rs startup. If the marker is present and within its freshness window, the new binary waits 15s for nginx + backend to settle, then probes https://127.0.0.1/ every 5s for up to 90s (self-signed certs accepted). - On any probe success within the window, the marker is cleared and nothing else happens. - On window-exhaust, the new binary: 1. Moves the broken /opt/archipelago/web-ui to web-ui.failed.<ts> (quarantined, not deleted, so we can post-mortem). 2. Restores web-ui.bak on top of web-ui. 3. Calls rollback_update() to restore the previous binary. 4. Updates state.current_version to reflect the rollback. 5. systemctl --no-block restart archipelago so the OLD binary boots. - Markers older than 10 minutes are treated as stale and cleared without probing, so a crashed-during-startup marker from weeks ago cannot spontaneously roll back a healthy node on a later reboot. - rollback_update() binary copy now goes through host_sudo instead of tokio::fs::copy, so it escapes the service's ProtectSystem=strict mount namespace. Without this, the rollback silently failed with EROFS on /usr/local/bin and orphaned the rollback - the exact opposite of what auto-rollback is for. Tests: 4 new unit tests in update::tests covering marker round-trip, absent-marker noop, no-panic on verify_pending_update with nothing to verify, and an invariant assert that the 90s probe window stays below the 600s stale threshold. All passing. 
Side fix: scripts/create-release-manifest.sh was dying with exit 141 (SIGPIPE from tar tvzf pipe head pipe awk) under set -euo pipefail. Replaced with a single awk NR==1 that doesn't short-circuit the upstream pipe, so the release-build flow is idempotent again.
2026-04-22 16:14:35 -04:00
fs::write(&path, data)
.await
.with_context(|| format!("write pending-verify marker to {}", path.display()))?;
Ok(())
}
/// Load the pending-verify marker from `PENDING_VERIFY_FILE` under
/// `data_dir`.
///
/// Returns `None` both when the file cannot be read (including when it does
/// not exist) and when its contents do not parse as a `PendingVerification`.
async fn read_pending_verification(data_dir: &Path) -> Option<PendingVerification> {
    match fs::read_to_string(data_dir.join(PENDING_VERIFY_FILE)).await {
        Ok(raw) => serde_json::from_str(&raw).ok(),
        Err(_) => None,
    }
}
/// Delete the pending-verify marker under `data_dir`.
///
/// Best-effort: any error from the removal (including the file not
/// existing) is deliberately ignored.
async fn clear_pending_verification(data_dir: &Path) {
    let _ = fs::remove_file(data_dir.join(PENDING_VERIFY_FILE)).await;
}
/// Issue a single GET against the local frontend at `https://127.0.0.1/`.
///
/// Succeeds when the response status is 2xx or 3xx. Fails when the client
/// cannot be built, when the request errors out (the client uses a
/// 5-second timeout), or when any other status comes back.
///
/// Certificate validation is disabled for this probe because nodes use a
/// self-signed cert that reqwest's default root set doesn't trust.
async fn probe_frontend_once() -> Result<()> {
    use std::time::Duration;

    let client = reqwest::Client::builder()
        .danger_accept_invalid_certs(true)
        .timeout(Duration::from_secs(5))
        .build()
        .context("build probe client")?;

    // HTTPS on purpose: the failure being caught is nginx answering 500 on
    // the PWA, and plain HTTP usually just redirects to HTTPS, which would
    // mask the bug.
    let status = client
        .get("https://127.0.0.1/")
        .send()
        .await
        .context("probe GET https://127.0.0.1/")?
        .status();

    if status.is_success() || status.is_redirection() {
        Ok(())
    } else {
        Err(anyhow::anyhow!("frontend probe returned HTTP {}", status))
    }
}
/// Called from main.rs startup. If a pending-verification marker is
/// present, probe the frontend; on failure, trigger rollback and
/// restart the service so the OLD binary boots.
///
/// This is the "post-OTA auto-rollback" guardrail. If ANY problem in
/// the new version takes down the PWA (bad tarball perms as in v1.7.38,
/// a broken service worker, a missing asset, a backend panic on first
/// boot), the node self-heals back to the previous working state
/// without SSH intervention.
pub async fn verify_pending_update(data_dir: &Path) {
let marker = match read_pending_verification(data_dir).await {
Some(m) => m,
None => return, // No update pending; nothing to verify.
};
// Guard against a marker left behind by some earlier crash path —
// don't want a user who reboots days later to suddenly get
// rolled back because the marker was never cleared.
let applied_at = chrono::DateTime::parse_from_rfc3339(&marker.applied_at);
if let Ok(ts) = applied_at {
let age = chrono::Utc::now() - ts.with_timezone(&chrono::Utc);
if age.num_seconds() > PENDING_VERIFY_MAX_AGE_SECS {
tracing::warn!(
age_secs = age.num_seconds(),
"pending-verify marker is stale, clearing without probing"
);
clear_pending_verification(data_dir).await;
return;
}
}
info!(
new_version = %marker.new_version,
previous_version = %marker.previous_version,
"Post-OTA verification: probing frontend at https://127.0.0.1/"
);
// Give the new service time to bind its listeners + nginx to
// pick up any config changes. 15s matches what we observed on
// .116 during the v1.7.40 rollout recovery.
tokio::time::sleep(std::time::Duration::from_secs(15)).await;
let deadline =
std::time::Instant::now() + std::time::Duration::from_secs(PENDING_VERIFY_WINDOW_SECS);
let mut attempt = 0u32;
let mut last_err: Option<String> = None;
while std::time::Instant::now() < deadline {
attempt += 1;
match probe_frontend_once().await {
Ok(()) => {
info!(attempt, "Post-OTA verification succeeded — clearing marker");
release(v1.7.41-alpha): post-OTA auto-rollback so a bad release cannot strand the fleet Closes failure mode FM5 from docs/bulletproof-containers.md: the v1.7.38 + v1.7.39 rollouts left every affected node on an unreachable UI (nginx 500) with no recovery path short of SSH. This release adds a self-check guardrail to the update flow. What changed: - apply_update() writes a pending-verify marker with old+new version and a 150s deadline immediately before scheduling the service restart. - verify_pending_update() runs from main.rs startup. If the marker is present and within its freshness window, the new binary waits 15s for nginx + backend to settle, then probes https://127.0.0.1/ every 5s for up to 90s (self-signed certs accepted). - On any probe success within the window, the marker is cleared and nothing else happens. - On window-exhaust, the new binary: 1. Moves the broken /opt/archipelago/web-ui to web-ui.failed.<ts> (quarantined, not deleted, so we can post-mortem). 2. Restores web-ui.bak on top of web-ui. 3. Calls rollback_update() to restore the previous binary. 4. Updates state.current_version to reflect the rollback. 5. systemctl --no-block restart archipelago so the OLD binary boots. - Markers older than 10 minutes are treated as stale and cleared without probing, so a crashed-during-startup marker from weeks ago cannot spontaneously roll back a healthy node on a later reboot. - rollback_update() binary copy now goes through host_sudo instead of tokio::fs::copy, so it escapes the service's ProtectSystem=strict mount namespace. Without this, the rollback silently failed with EROFS on /usr/local/bin and orphaned the rollback - the exact opposite of what auto-rollback is for. Tests: 4 new unit tests in update::tests covering marker round-trip, absent-marker noop, no-panic on verify_pending_update with nothing to verify, and an invariant assert that the 90s probe window stays below the 600s stale threshold. All passing. 
Side fix: scripts/create-release-manifest.sh was dying with exit 141 (SIGPIPE from tar tvzf pipe head pipe awk) under set -euo pipefail. Replaced with a single awk NR==1 that doesn't short-circuit the upstream pipe, so the release-build flow is idempotent again.
2026-04-22 16:14:35 -04:00
clear_pending_verification(data_dir).await;
return;
}
Err(e) => {
let msg = e.to_string();
tracing::warn!(attempt, error = %msg, "Post-OTA probe failed, retrying");
last_err = Some(msg);
}
}
tokio::time::sleep(std::time::Duration::from_secs(5)).await;
}
tracing::error!(
attempts = attempt,
window_secs = PENDING_VERIFY_WINDOW_SECS,
last_error = last_err.as_deref().unwrap_or(""),
new_version = %marker.new_version,
previous_version = %marker.previous_version,
"Post-OTA verification FAILED — rolling back"
);
// Restore web-ui.bak on top of web-ui. update.rs keeps web-ui.bak
// from the previous apply; moving it back is the frontend half of
// the rollback. The binary half is handled by rollback_update().
let web_ui_bak = Path::new("/opt/archipelago/web-ui.bak");
let web_ui = "/opt/archipelago/web-ui";
if web_ui_bak.exists() {
let ts = chrono::Utc::now().timestamp_millis();
let quarantine = format!("/opt/archipelago/web-ui.failed.{}", ts);
let _ = host_sudo(&["mv", web_ui, &quarantine]).await;
let _ = host_sudo(&["mv", web_ui_bak.to_str().unwrap_or(""), web_ui]).await;
tracing::info!(quarantined = %quarantine, "Restored web-ui from web-ui.bak");
} else {
tracing::warn!("web-ui.bak not present — frontend cannot be rolled back, only binary");
release(v1.7.41-alpha): post-OTA auto-rollback so a bad release cannot strand the fleet Closes failure mode FM5 from docs/bulletproof-containers.md: the v1.7.38 + v1.7.39 rollouts left every affected node on an unreachable UI (nginx 500) with no recovery path short of SSH. This release adds a self-check guardrail to the update flow. What changed: - apply_update() writes a pending-verify marker with old+new version and a 150s deadline immediately before scheduling the service restart. - verify_pending_update() runs from main.rs startup. If the marker is present and within its freshness window, the new binary waits 15s for nginx + backend to settle, then probes https://127.0.0.1/ every 5s for up to 90s (self-signed certs accepted). - On any probe success within the window, the marker is cleared and nothing else happens. - On window-exhaust, the new binary: 1. Moves the broken /opt/archipelago/web-ui to web-ui.failed.<ts> (quarantined, not deleted, so we can post-mortem). 2. Restores web-ui.bak on top of web-ui. 3. Calls rollback_update() to restore the previous binary. 4. Updates state.current_version to reflect the rollback. 5. systemctl --no-block restart archipelago so the OLD binary boots. - Markers older than 10 minutes are treated as stale and cleared without probing, so a crashed-during-startup marker from weeks ago cannot spontaneously roll back a healthy node on a later reboot. - rollback_update() binary copy now goes through host_sudo instead of tokio::fs::copy, so it escapes the service's ProtectSystem=strict mount namespace. Without this, the rollback silently failed with EROFS on /usr/local/bin and orphaned the rollback - the exact opposite of what auto-rollback is for. Tests: 4 new unit tests in update::tests covering marker round-trip, absent-marker noop, no-panic on verify_pending_update with nothing to verify, and an invariant assert that the 90s probe window stays below the 600s stale threshold. All passing. 
Side fix: scripts/create-release-manifest.sh was dying with exit 141 (SIGPIPE from tar tvzf pipe head pipe awk) under set -euo pipefail. Replaced with a single awk NR==1 that doesn't short-circuit the upstream pipe, so the release-build flow is idempotent again.
2026-04-22 16:14:35 -04:00
}
if let Err(e) = rollback_update(data_dir).await {
tracing::error!(error = %e, "rollback_update() failed during post-OTA verification");
// Leave the marker in place so a future boot gets another shot.
return;
}
clear_pending_verification(data_dir).await;
// Record why we rolled back so the UI can show it on the next boot.
if let Ok(mut state) = load_state(data_dir).await {
state.current_version = marker.previous_version.clone();
if let Err(e) = save_state(data_dir, &state).await {
tracing::warn!(error = %e, "Failed to update state after rollback");
}
}
// Restart so the old binary takes over. --no-block because we're
// the service; systemd can't wait for us to exit before starting
// the old process.
let _ = host_sudo(&["systemctl", "--no-block", "restart", "archipelago"]).await;
}
pub async fn load_state(data_dir: &Path) -> Result<UpdateState> {
let path = data_dir.join(UPDATE_STATE_FILE);
if !path.exists() {
let state = UpdateState::default();
save_state(data_dir, &state).await?;
return Ok(state);
}
let data = fs::read_to_string(&path)
.await
.context("Reading update state")?;
let mut state: UpdateState = serde_json::from_str(&data).context("Parsing update state")?;
feat(identity,update): default avatars, public blobs, long-running downloads Follow-up to 1fb71b4b on the same v1.7.0-alpha line. Identity avatars • New module `avatar.rs` generates two deterministic SVG styles keyed off the pubkey: a 5×5 mirrored identicon for sub-identities and a hexagonal-network motif for the master (seed index 0) identity. Both returned as base64 data URLs, so a fresh identity has a recognisable picture before the user uploads anything. • `IdentityManager::create()` and `create_from_seed()` populate `profile.picture` on creation. Index 0 gets the node SVG; all other seed-derived + ad-hoc identities get the identicon. Blob store — public flag for profile assets • `BlobMeta.public` (default false) added; `BlobStore::put()` takes a `public: bool`. Missing in legacy meta files = false. • `POST /api/blob` now stores uploads with public=true and returns `public_url` alongside `self_test_url`. public_url is `http://<node-onion>/blob/<cid>` (no cap) if Tor has published the archipelago hidden service, else falls back to the local path. • `GET /blob/<cid>` bypasses the HMAC capability check when the requested blob is flagged public — external Nostr clients fetching a kind-0 `picture` URL can't hold a cap. • Mesh callers (content_ref attachments, dispatch rehydration) pin public=false explicitly so nothing leaks out of the mesh path. Profile editor UX • Collapsed Save + Save & Publish into one button — the Save action now persists locally AND publishes the kind-0 metadata event in one step. Uploads store `public_url` into `profile.picture` / `profile.banner` so the published URL is reachable by external clients. Update client — the 15-second cliff • Frontend `rpcClient.call` for `update.download` now has an explicit 30-minute timeout (was falling back to the default 15 s). `update.apply` gets 5 min, `update.git-apply` gets 15 min. Matches what the backend is actually willing to wait for. 
• Backend `load_state()` reconciles `state.current_version` with `CARGO_PKG_VERSION` on every start. Sideloaded or reflashed nodes were stuck advertising the old version even with a new binary in place, which kept re-offering the same release as an update. Manifest changelog rewritten for fleet readers per the saved feedback (no function names, no file paths). Artefacts refreshed: binary 12f838c5…5ba82d 40381864 frontend dc3b63af…e9a8370 76984288 Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-20 10:03:38 -04:00
2026-06-12 03:00:15 -04:00
let mut changed = false;
feat(identity,update): default avatars, public blobs, long-running downloads Follow-up to 1fb71b4b on the same v1.7.0-alpha line. Identity avatars • New module `avatar.rs` generates two deterministic SVG styles keyed off the pubkey: a 5×5 mirrored identicon for sub-identities and a hexagonal-network motif for the master (seed index 0) identity. Both returned as base64 data URLs, so a fresh identity has a recognisable picture before the user uploads anything. • `IdentityManager::create()` and `create_from_seed()` populate `profile.picture` on creation. Index 0 gets the node SVG; all other seed-derived + ad-hoc identities get the identicon. Blob store — public flag for profile assets • `BlobMeta.public` (default false) added; `BlobStore::put()` takes a `public: bool`. Missing in legacy meta files = false. • `POST /api/blob` now stores uploads with public=true and returns `public_url` alongside `self_test_url`. public_url is `http://<node-onion>/blob/<cid>` (no cap) if Tor has published the archipelago hidden service, else falls back to the local path. • `GET /blob/<cid>` bypasses the HMAC capability check when the requested blob is flagged public — external Nostr clients fetching a kind-0 `picture` URL can't hold a cap. • Mesh callers (content_ref attachments, dispatch rehydration) pin public=false explicitly so nothing leaks out of the mesh path. Profile editor UX • Collapsed Save + Save & Publish into one button — the Save action now persists locally AND publishes the kind-0 metadata event in one step. Uploads store `public_url` into `profile.picture` / `profile.banner` so the published URL is reachable by external clients. Update client — the 15-second cliff • Frontend `rpcClient.call` for `update.download` now has an explicit 30-minute timeout (was falling back to the default 15 s). `update.apply` gets 5 min, `update.git-apply` gets 15 min. Matches what the backend is actually willing to wait for. 
• Backend `load_state()` reconciles `state.current_version` with `CARGO_PKG_VERSION` on every start. Sideloaded or reflashed nodes were stuck advertising the old version even with a new binary in place, which kept re-offering the same release as an update. Manifest changelog rewritten for fleet readers per the saved feedback (no function names, no file paths). Artefacts refreshed: binary 12f838c5…5ba82d 40381864 frontend dc3b63af…e9a8370 76984288 Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-20 10:03:38 -04:00
// Keep current_version in sync with the binary. Sideloaded nodes
// (ssh + cp /usr/local/bin/archipelago) don't touch the state file,
// so without this the running 1.7.0-alpha binary would keep seeing
// `current_version: "1.6.0-alpha"` and re-offer itself as an update.
let running = env!("CARGO_PKG_VERSION");
if state.current_version != running {
state.current_version = running.to_string();
// Binary version changed (sideload or apply). Any stored
// `available_update` is either redundant (points at the running
// version) or stale (points at a version we've already passed —
// which would surface as a "downgrade" offer in the UI). Clear
// it unconditionally; the next check_for_updates will repopulate
// if there's genuinely something newer.
state.available_update = None;
state.manifest_mirror = None;
2026-06-12 03:00:15 -04:00
changed = true;
}
// `update_in_progress` means a manifest OTA is downloaded and staged,
// ready for apply. Older git/self-build update paths could leave this
// flag stuck true without a staging directory, which traps the UI in an
// unrecoverable state. Heal that on every state load.
if state.update_in_progress && !has_staged_update(data_dir).await {
warn!(
staging = %data_dir.join("update-staging").display(),
"Clearing stale update_in_progress without staged OTA files"
);
state.update_in_progress = false;
changed = true;
}
if changed {
feat(identity,update): default avatars, public blobs, long-running downloads Follow-up to 1fb71b4b on the same v1.7.0-alpha line. Identity avatars • New module `avatar.rs` generates two deterministic SVG styles keyed off the pubkey: a 5×5 mirrored identicon for sub-identities and a hexagonal-network motif for the master (seed index 0) identity. Both returned as base64 data URLs, so a fresh identity has a recognisable picture before the user uploads anything. • `IdentityManager::create()` and `create_from_seed()` populate `profile.picture` on creation. Index 0 gets the node SVG; all other seed-derived + ad-hoc identities get the identicon. Blob store — public flag for profile assets • `BlobMeta.public` (default false) added; `BlobStore::put()` takes a `public: bool`. Missing in legacy meta files = false. • `POST /api/blob` now stores uploads with public=true and returns `public_url` alongside `self_test_url`. public_url is `http://<node-onion>/blob/<cid>` (no cap) if Tor has published the archipelago hidden service, else falls back to the local path. • `GET /blob/<cid>` bypasses the HMAC capability check when the requested blob is flagged public — external Nostr clients fetching a kind-0 `picture` URL can't hold a cap. • Mesh callers (content_ref attachments, dispatch rehydration) pin public=false explicitly so nothing leaks out of the mesh path. Profile editor UX • Collapsed Save + Save & Publish into one button — the Save action now persists locally AND publishes the kind-0 metadata event in one step. Uploads store `public_url` into `profile.picture` / `profile.banner` so the published URL is reachable by external clients. Update client — the 15-second cliff • Frontend `rpcClient.call` for `update.download` now has an explicit 30-minute timeout (was falling back to the default 15 s). `update.apply` gets 5 min, `update.git-apply` gets 15 min. Matches what the backend is actually willing to wait for. 
• Backend `load_state()` reconciles `state.current_version` with `CARGO_PKG_VERSION` on every start. Sideloaded or reflashed nodes were stuck advertising the old version even with a new binary in place, which kept re-offering the same release as an update. Manifest changelog rewritten for fleet readers per the saved feedback (no function names, no file paths). Artefacts refreshed: binary 12f838c5…5ba82d 40381864 frontend dc3b63af…e9a8370 76984288 Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-20 10:03:38 -04:00
save_state(data_dir, &state).await?;
}
Ok(state)
}
2026-06-12 03:00:15 -04:00
/// Report whether `<data_dir>/update-staging` currently holds at least one
/// entry.
///
/// Any failure — the directory cannot be opened, or reading its first entry
/// errors — is treated the same as "nothing staged" and yields `false`.
async fn has_staged_update(data_dir: &Path) -> bool {
    match fs::read_dir(data_dir.join("update-staging")).await {
        // Only the first entry matters: one is enough to count as staged.
        Ok(mut dir) => dir.next_entry().await.ok().flatten().is_some(),
        Err(_) => false,
    }
}
pub async fn save_state(data_dir: &Path, state: &UpdateState) -> Result<()> {
let path = data_dir.join(UPDATE_STATE_FILE);
let data = serde_json::to_string_pretty(state)?;
chore(ci): rustfmt + clippy clean-up to unblock the Rust CI job The .github/workflows/ci.yml Rust job runs cargo fmt --check, clippy with -D warnings, and tests. All three were failing. This commit: - Applies rustfmt across the tree (the bulk of the diff — untouched since the last toolchain bump, so a wide sweep was unavoidable). - Fixes the correctness-level clippy errors: container/bitcoin_simulator.rs wildcard-in-or-pattern container/manifest.rs from_str rename to parse (reserved name) container/podman_client.rs .get(0) -> .first() container/runtime.rs manual += collapse archipelago/src/constants.rs doc-comment → module-doc api/rpc/package/install.rs stray /// comment above a non-item container/docker_packages.rs redundant field init streaming/advertisement.rs missing Metric import in tests tests/orchestration_tests.rs `vec!` in non-Vec contexts mesh/listener/dispatch.rs unused store_plain_message import api/rpc/tor/mod.rs and mesh/steganography.rs: push-after-new → vec! - Quiets wide legacy surfaces with crate-level allows in main.rs for stylistic lints (too_many_arguments, type_complexity, doc indent, enum variant prefix, wildcard-in-or, assertions-on-constants, drop_non_drop, unused_io_amount, ptr_arg) — these fired in dozens of places with no correctness payoff and have been churning every toolchain bump. - Tags intentional-dead-code helpers: wallet/ and streaming/ modules are WIP, mesh::send_chunked_payload and DM_V1_MARKER are kept for rollback compatibility, vpn::get_nostr_vpn_status is surface-area for a not-yet-landed RPC. cargo fmt --check, cargo clippy --all-targets --all-features -- -D warnings, and cargo test --all-features now all pass locally. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-18 17:23:46 -04:00
fs::write(&path, data).await.context("Writing update state")
}
/// Check for available updates by walking the mirror list. The first
/// mirror that returns a parseable manifest with a strictly-newer
/// version wins; if no mirror offers a newer version, the node is
/// reported as up-to-date. Per-mirror we retry up to 3 times on
/// transient failures.
///
/// Manifest `download_url`s are origin-rewritten to match the mirror
/// we fetched from, so switching mirrors in the UI also switches where
/// component downloads come from — even if the publisher baked an
/// absolute URL pointing at a different server into the manifest.
pub async fn check_for_updates(data_dir: &Path) -> Result<UpdateState> {
let mut state = load_state(data_dir).await?;
info!("Checking for updates...");
let client = reqwest::Client::builder()
// Short per-attempt HTTP timeout so a wedged mirror doesn't
// delay the whole check — we'd rather move on to the next
// mirror quickly than sit waiting on a slow one. 15s covers
// slow but alive mirrors.
.timeout(std::time::Duration::from_secs(15))
release(v1.7.15-alpha): bulletproof downloads — resume, retry, real progress download_update Each component download is now resumable via HTTP Range requests (Range: bytes=N-) and retried up to 6 times with exponential backoff (5/15/30/60/120/180s). On a dropped connection the next attempt picks up at the last written byte offset instead of restarting at zero. Streams via reqwest::Response::chunk() to the staging file so a 160 MB frontend tarball doesn't sit in RAM. SHA is verified over the complete file at the end of each component; mismatch nukes the staged file and restarts from scratch. Real download progress counters New AtomicU64 globals DOWNLOAD_BYTES/DOWNLOAD_TOTAL are updated from the chunk loop. update.status exposes them as download_progress.{bytes_downloaded, total_bytes, active}. The SystemUpdate.vue progress bar now polls update.status every second instead of incrementing a fake random counter — and crucially, if the user navigates away and back, the component picks up the in-progress download from the backend atomics immediately. Update-check retries handle_update_check now retries the manifest fetch up to 3 times with a 5s gap if the first try hits a transport error, so a momentary gitea hiccup doesn't make a node report "up to date" when there actually is a new release. Tight 10s connect timeout per attempt keeps the total bounded. Artefacts: archipelago 1070c87f…c081c162b 40584792 archipelago-frontend-1.7.15-alpha.tar.gz 8e630eba…63fd43f 162078068 Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-20 17:17:58 -04:00
.connect_timeout(std::time::Duration::from_secs(10))
.build()
.context("Failed to create HTTP client")?;
// Env override (ARCHIPELAGO_UPDATE_URL) short-circuits the mirror
// list — used on dev boxes that point at a local gitea. Otherwise
// walk the operator-configured list and fall through on failure.
let mirrors: Vec<String> = if std::env::var("ARCHIPELAGO_UPDATE_URL").is_ok() {
vec![update_manifest_url()]
} else {
load_mirrors(data_dir)
.await
.unwrap_or_else(|_| default_mirrors())
.into_iter()
.map(|m| m.url)
.collect()
};
release(v1.7.15-alpha): bulletproof downloads — resume, retry, real progress download_update Each component download is now resumable via HTTP Range requests (Range: bytes=N-) and retried up to 6 times with exponential backoff (5/15/30/60/120/180s). On a dropped connection the next attempt picks up at the last written byte offset instead of restarting at zero. Streams via reqwest::Response::chunk() to the staging file so a 160 MB frontend tarball doesn't sit in RAM. SHA is verified over the complete file at the end of each component; mismatch nukes the staged file and restarts from scratch. Real download progress counters New AtomicU64 globals DOWNLOAD_BYTES/DOWNLOAD_TOTAL are updated from the chunk loop. update.status exposes them as download_progress.{bytes_downloaded, total_bytes, active}. The SystemUpdate.vue progress bar now polls update.status every second instead of incrementing a fake random counter — and crucially, if the user navigates away and back, the component picks up the in-progress download from the backend atomics immediately. Update-check retries handle_update_check now retries the manifest fetch up to 3 times with a 5s gap if the first try hits a transport error, so a momentary gitea hiccup doesn't make a node report "up to date" when there actually is a new release. Tight 10s connect timeout per attempt keeps the total bounded. Artefacts: archipelago 1070c87f…c081c162b 40584792 archipelago-frontend-1.7.15-alpha.tar.gz 8e630eba…63fd43f 162078068 Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-20 17:17:58 -04:00
let mut last_err: Option<String> = None;
let mut handled = false;
'mirrors: for manifest_url in mirrors.iter() {
for attempt in 1..=3u8 {
if attempt > 1 {
tokio::time::sleep(std::time::Duration::from_secs(2)).await;
}
match client.get(manifest_url).send().await {
Ok(resp) if resp.status().is_success() => match resp.json::<UpdateManifest>().await
{
Ok(mut manifest) => {
rewrite_manifest_origins(&mut manifest, manifest_url);
if is_newer(&manifest.version, &state.current_version) {
release(v1.7.15-alpha): bulletproof downloads — resume, retry, real progress download_update Each component download is now resumable via HTTP Range requests (Range: bytes=N-) and retried up to 6 times with exponential backoff (5/15/30/60/120/180s). On a dropped connection the next attempt picks up at the last written byte offset instead of restarting at zero. Streams via reqwest::Response::chunk() to the staging file so a 160 MB frontend tarball doesn't sit in RAM. SHA is verified over the complete file at the end of each component; mismatch nukes the staged file and restarts from scratch. Real download progress counters New AtomicU64 globals DOWNLOAD_BYTES/DOWNLOAD_TOTAL are updated from the chunk loop. update.status exposes them as download_progress.{bytes_downloaded, total_bytes, active}. The SystemUpdate.vue progress bar now polls update.status every second instead of incrementing a fake random counter — and crucially, if the user navigates away and back, the component picks up the in-progress download from the backend atomics immediately. Update-check retries handle_update_check now retries the manifest fetch up to 3 times with a 5s gap if the first try hits a transport error, so a momentary gitea hiccup doesn't make a node report "up to date" when there actually is a new release. Tight 10s connect timeout per attempt keeps the total bounded. Artefacts: archipelago 1070c87f…c081c162b 40584792 archipelago-frontend-1.7.15-alpha.tar.gz 8e630eba…63fd43f 162078068 Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-20 17:17:58 -04:00
info!(
current = %state.current_version,
available = %manifest.version,
mirror = %manifest_url,
release(v1.7.15-alpha): bulletproof downloads — resume, retry, real progress download_update Each component download is now resumable via HTTP Range requests (Range: bytes=N-) and retried up to 6 times with exponential backoff (5/15/30/60/120/180s). On a dropped connection the next attempt picks up at the last written byte offset instead of restarting at zero. Streams via reqwest::Response::chunk() to the staging file so a 160 MB frontend tarball doesn't sit in RAM. SHA is verified over the complete file at the end of each component; mismatch nukes the staged file and restarts from scratch. Real download progress counters New AtomicU64 globals DOWNLOAD_BYTES/DOWNLOAD_TOTAL are updated from the chunk loop. update.status exposes them as download_progress.{bytes_downloaded, total_bytes, active}. The SystemUpdate.vue progress bar now polls update.status every second instead of incrementing a fake random counter — and crucially, if the user navigates away and back, the component picks up the in-progress download from the backend atomics immediately. Update-check retries handle_update_check now retries the manifest fetch up to 3 times with a 5s gap if the first try hits a transport error, so a momentary gitea hiccup doesn't make a node report "up to date" when there actually is a new release. Tight 10s connect timeout per attempt keeps the total bounded. Artefacts: archipelago 1070c87f…c081c162b 40584792 archipelago-frontend-1.7.15-alpha.tar.gz 8e630eba…63fd43f 162078068 Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-20 17:17:58 -04:00
"Update available"
);
state.available_update = Some(manifest);
state.manifest_mirror = Some(manifest_url.clone());
release(v1.7.15-alpha): bulletproof downloads — resume, retry, real progress download_update Each component download is now resumable via HTTP Range requests (Range: bytes=N-) and retried up to 6 times with exponential backoff (5/15/30/60/120/180s). On a dropped connection the next attempt picks up at the last written byte offset instead of restarting at zero. Streams via reqwest::Response::chunk() to the staging file so a 160 MB frontend tarball doesn't sit in RAM. SHA is verified over the complete file at the end of each component; mismatch nukes the staged file and restarts from scratch. Real download progress counters New AtomicU64 globals DOWNLOAD_BYTES/DOWNLOAD_TOTAL are updated from the chunk loop. update.status exposes them as download_progress.{bytes_downloaded, total_bytes, active}. The SystemUpdate.vue progress bar now polls update.status every second instead of incrementing a fake random counter — and crucially, if the user navigates away and back, the component picks up the in-progress download from the backend atomics immediately. Update-check retries handle_update_check now retries the manifest fetch up to 3 times with a 5s gap if the first try hits a transport error, so a momentary gitea hiccup doesn't make a node report "up to date" when there actually is a new release. Tight 10s connect timeout per attempt keeps the total bounded. Artefacts: archipelago 1070c87f…c081c162b 40584792 archipelago-frontend-1.7.15-alpha.tar.gz 8e630eba…63fd43f 162078068 Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-20 17:17:58 -04:00
} else {
// Manifest version matches us or is behind
// us — either we're current, or this mirror
// is stale. Try the next mirror; if all are
// stale or at our version we'll fall through
// to "up to date".
debug!(
current = %state.current_version,
manifest = %manifest.version,
mirror = %manifest_url,
"No newer version in manifest"
);
state.manifest_mirror = None;
release(v1.7.15-alpha): bulletproof downloads — resume, retry, real progress download_update Each component download is now resumable via HTTP Range requests (Range: bytes=N-) and retried up to 6 times with exponential backoff (5/15/30/60/120/180s). On a dropped connection the next attempt picks up at the last written byte offset instead of restarting at zero. Streams via reqwest::Response::chunk() to the staging file so a 160 MB frontend tarball doesn't sit in RAM. SHA is verified over the complete file at the end of each component; mismatch nukes the staged file and restarts from scratch. Real download progress counters New AtomicU64 globals DOWNLOAD_BYTES/DOWNLOAD_TOTAL are updated from the chunk loop. update.status exposes them as download_progress.{bytes_downloaded, total_bytes, active}. The SystemUpdate.vue progress bar now polls update.status every second instead of incrementing a fake random counter — and crucially, if the user navigates away and back, the component picks up the in-progress download from the backend atomics immediately. Update-check retries handle_update_check now retries the manifest fetch up to 3 times with a 5s gap if the first try hits a transport error, so a momentary gitea hiccup doesn't make a node report "up to date" when there actually is a new release. Tight 10s connect timeout per attempt keeps the total bounded. Artefacts: archipelago 1070c87f…c081c162b 40584792 archipelago-frontend-1.7.15-alpha.tar.gz 8e630eba…63fd43f 162078068 Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-20 17:17:58 -04:00
state.available_update = None;
handled = true;
continue 'mirrors;
release(v1.7.15-alpha): bulletproof downloads — resume, retry, real progress download_update Each component download is now resumable via HTTP Range requests (Range: bytes=N-) and retried up to 6 times with exponential backoff (5/15/30/60/120/180s). On a dropped connection the next attempt picks up at the last written byte offset instead of restarting at zero. Streams via reqwest::Response::chunk() to the staging file so a 160 MB frontend tarball doesn't sit in RAM. SHA is verified over the complete file at the end of each component; mismatch nukes the staged file and restarts from scratch. Real download progress counters New AtomicU64 globals DOWNLOAD_BYTES/DOWNLOAD_TOTAL are updated from the chunk loop. update.status exposes them as download_progress.{bytes_downloaded, total_bytes, active}. The SystemUpdate.vue progress bar now polls update.status every second instead of incrementing a fake random counter — and crucially, if the user navigates away and back, the component picks up the in-progress download from the backend atomics immediately. Update-check retries handle_update_check now retries the manifest fetch up to 3 times with a 5s gap if the first try hits a transport error, so a momentary gitea hiccup doesn't make a node report "up to date" when there actually is a new release. Tight 10s connect timeout per attempt keeps the total bounded. Artefacts: archipelago 1070c87f…c081c162b 40584792 archipelago-frontend-1.7.15-alpha.tar.gz 8e630eba…63fd43f 162078068 Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-20 17:17:58 -04:00
}
handled = true;
break 'mirrors;
release(v1.7.15-alpha): bulletproof downloads — resume, retry, real progress download_update Each component download is now resumable via HTTP Range requests (Range: bytes=N-) and retried up to 6 times with exponential backoff (5/15/30/60/120/180s). On a dropped connection the next attempt picks up at the last written byte offset instead of restarting at zero. Streams via reqwest::Response::chunk() to the staging file so a 160 MB frontend tarball doesn't sit in RAM. SHA is verified over the complete file at the end of each component; mismatch nukes the staged file and restarts from scratch. Real download progress counters New AtomicU64 globals DOWNLOAD_BYTES/DOWNLOAD_TOTAL are updated from the chunk loop. update.status exposes them as download_progress.{bytes_downloaded, total_bytes, active}. The SystemUpdate.vue progress bar now polls update.status every second instead of incrementing a fake random counter — and crucially, if the user navigates away and back, the component picks up the in-progress download from the backend atomics immediately. Update-check retries handle_update_check now retries the manifest fetch up to 3 times with a 5s gap if the first try hits a transport error, so a momentary gitea hiccup doesn't make a node report "up to date" when there actually is a new release. Tight 10s connect timeout per attempt keeps the total bounded. Artefacts: archipelago 1070c87f…c081c162b 40584792 archipelago-frontend-1.7.15-alpha.tar.gz 8e630eba…63fd43f 162078068 Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-20 17:17:58 -04:00
}
Err(e) => last_err = Some(format!("{}: parse: {}", manifest_url, e)),
},
Ok(resp) => {
last_err = Some(format!("{}: HTTP {}", manifest_url, resp.status()));
}
Err(e) => {
last_err = Some(format!("{}: {}", manifest_url, e));
release(v1.7.15-alpha): bulletproof downloads — resume, retry, real progress download_update Each component download is now resumable via HTTP Range requests (Range: bytes=N-) and retried up to 6 times with exponential backoff (5/15/30/60/120/180s). On a dropped connection the next attempt picks up at the last written byte offset instead of restarting at zero. Streams via reqwest::Response::chunk() to the staging file so a 160 MB frontend tarball doesn't sit in RAM. SHA is verified over the complete file at the end of each component; mismatch nukes the staged file and restarts from scratch. Real download progress counters New AtomicU64 globals DOWNLOAD_BYTES/DOWNLOAD_TOTAL are updated from the chunk loop. update.status exposes them as download_progress.{bytes_downloaded, total_bytes, active}. The SystemUpdate.vue progress bar now polls update.status every second instead of incrementing a fake random counter — and crucially, if the user navigates away and back, the component picks up the in-progress download from the backend atomics immediately. Update-check retries handle_update_check now retries the manifest fetch up to 3 times with a 5s gap if the first try hits a transport error, so a momentary gitea hiccup doesn't make a node report "up to date" when there actually is a new release. Tight 10s connect timeout per attempt keeps the total bounded. Artefacts: archipelago 1070c87f…c081c162b 40584792 archipelago-frontend-1.7.15-alpha.tar.gz 8e630eba…63fd43f 162078068 Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-20 17:17:58 -04:00
}
}
}
tracing::debug!(mirror = %manifest_url, "Mirror exhausted, trying next");
release(v1.7.15-alpha): bulletproof downloads — resume, retry, real progress download_update Each component download is now resumable via HTTP Range requests (Range: bytes=N-) and retried up to 6 times with exponential backoff (5/15/30/60/120/180s). On a dropped connection the next attempt picks up at the last written byte offset instead of restarting at zero. Streams via reqwest::Response::chunk() to the staging file so a 160 MB frontend tarball doesn't sit in RAM. SHA is verified over the complete file at the end of each component; mismatch nukes the staged file and restarts from scratch. Real download progress counters New AtomicU64 globals DOWNLOAD_BYTES/DOWNLOAD_TOTAL are updated from the chunk loop. update.status exposes them as download_progress.{bytes_downloaded, total_bytes, active}. The SystemUpdate.vue progress bar now polls update.status every second instead of incrementing a fake random counter — and crucially, if the user navigates away and back, the component picks up the in-progress download from the backend atomics immediately. Update-check retries handle_update_check now retries the manifest fetch up to 3 times with a 5s gap if the first try hits a transport error, so a momentary gitea hiccup doesn't make a node report "up to date" when there actually is a new release. Tight 10s connect timeout per attempt keeps the total bounded. Artefacts: archipelago 1070c87f…c081c162b 40584792 archipelago-frontend-1.7.15-alpha.tar.gz 8e630eba…63fd43f 162078068 Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-20 17:17:58 -04:00
}
if !handled {
if let Some(e) = last_err {
debug!("Update check failed across all mirrors: {}", e);
}
}
state.last_check = Some(chrono::Utc::now().to_rfc3339());
save_state(data_dir, &state).await?;
Ok(state)
}
/// Outcome of a single reachability probe against a mirror URL, as produced
/// by `test_mirror`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MirrorTestResult {
    /// `true` only when a response arrived and its status was a success code.
    pub reachable: bool,
    /// Milliseconds elapsed between sending the request and getting a
    /// response or an error; `0` when the HTTP client could not be built.
    pub latency_ms: u64,
    /// Status code of the response, if one was received at all.
    pub http_status: Option<u16>,
    /// Failure description; `None` when the probe succeeded.
    pub error: Option<String>,
}
/// Probe `url` with a single GET and report reachability and wall-clock
/// latency.
///
/// Backs the "Test mirror" button so an operator can sanity-check a mirror
/// without running a full update check. The probe uses a 10s overall timeout
/// and a 5s connect timeout. This function never returns an error: every
/// failure is reported through the returned `MirrorTestResult`.
pub async fn test_mirror(url: &str) -> MirrorTestResult {
    let built = reqwest::Client::builder()
        .timeout(std::time::Duration::from_secs(10))
        .connect_timeout(std::time::Duration::from_secs(5))
        .build();
    let client = match built {
        Ok(client) => client,
        Err(e) => {
            // No request was sent, so there is no latency to report.
            return MirrorTestResult {
                reachable: false,
                latency_ms: 0,
                http_status: None,
                error: Some(format!("client build failed: {}", e)),
            };
        }
    };

    let started = std::time::Instant::now();
    let outcome = client.get(url).send().await;
    let latency_ms = started.elapsed().as_millis() as u64;

    match outcome {
        Ok(resp) => {
            let status = resp.status();
            let ok = status.is_success();
            MirrorTestResult {
                reachable: ok,
                latency_ms,
                http_status: Some(status.as_u16()),
                // A non-success response is recorded as unreachable, with the
                // status code as the error text.
                error: (!ok).then(|| format!("HTTP {}", status.as_u16())),
            }
        }
        Err(e) => MirrorTestResult {
            reachable: false,
            latency_ms,
            http_status: None,
            error: Some(e.to_string()),
        },
    }
}
/// Return the locally stored update state; no mirror is contacted.
///
/// The state is obtained through `load_state`, so whatever that function
/// does on load (including re-saving a corrected state file) applies here.
///
/// # Errors
/// Propagates any error from `load_state`.
pub async fn get_status(data_dir: &Path) -> Result<UpdateState> {
    let state = load_state(data_dir).await?;
    Ok(state)
}
/// Dismiss the available update notification.
///
/// Loads the stored state, drops the recorded update offer together with the
/// mirror it was fetched from, and saves the result.
///
/// # Errors
/// Fails if the state cannot be loaded or saved.
pub async fn dismiss_update(data_dir: &Path) -> Result<()> {
    let mut state = load_state(data_dir).await?;
    state.available_update = None;
    // `manifest_mirror` names the mirror that supplied `available_update`;
    // the other places that clear the offer clear it too. Leaving it set
    // here would keep a mirror on record for an offer that no longer exists.
    state.manifest_mirror = None;
    save_state(data_dir, &state).await
}
/// Download update components to a staging directory.
/// Verifies SHA256 hash for each component.
release(v1.7.15-alpha): bulletproof downloads — resume, retry, real progress download_update Each component download is now resumable via HTTP Range requests (Range: bytes=N-) and retried up to 6 times with exponential backoff (5/15/30/60/120/180s). On a dropped connection the next attempt picks up at the last written byte offset instead of restarting at zero. Streams via reqwest::Response::chunk() to the staging file so a 160 MB frontend tarball doesn't sit in RAM. SHA is verified over the complete file at the end of each component; mismatch nukes the staged file and restarts from scratch. Real download progress counters New AtomicU64 globals DOWNLOAD_BYTES/DOWNLOAD_TOTAL are updated from the chunk loop. update.status exposes them as download_progress.{bytes_downloaded, total_bytes, active}. The SystemUpdate.vue progress bar now polls update.status every second instead of incrementing a fake random counter — and crucially, if the user navigates away and back, the component picks up the in-progress download from the backend atomics immediately. Update-check retries handle_update_check now retries the manifest fetch up to 3 times with a 5s gap if the first try hits a transport error, so a momentary gitea hiccup doesn't make a node report "up to date" when there actually is a new release. Tight 10s connect timeout per attempt keeps the total bounded. Artefacts: archipelago 1070c87f…c081c162b 40584792 archipelago-frontend-1.7.15-alpha.tar.gz 8e630eba…63fd43f 162078068 Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-20 17:17:58 -04:00
///
/// Robustness: each component download is **resumable** via HTTP Range
/// requests and retried up to 6 times with exponential backoff. When
/// gitea drops the connection mid-stream (happens regularly at slow
/// raw-file throughput), the next attempt picks up where the previous
/// one left off instead of restarting from byte zero. SHA256 is
/// verified over the complete file at the end of each component, so a
/// partially-corrupt resume still fails cleanly.
pub async fn download_update(data_dir: &Path) -> Result<DownloadProgress> {
2026-05-13 15:09:22 -04:00
let mut state = load_state(data_dir).await?;
if state.available_update.is_none() {
state = check_for_updates(data_dir).await?;
}
let manifest = state
.available_update
.as_ref()
2026-05-13 15:09:22 -04:00
.ok_or_else(|| anyhow::anyhow!("No update is available to download"))?;
let staging_dir = data_dir.join("update-staging");
fs::create_dir_all(&staging_dir)
.await
.context("Failed to create staging dir")?;
let client = reqwest::Client::builder()
release(v1.7.15-alpha): bulletproof downloads — resume, retry, real progress download_update Each component download is now resumable via HTTP Range requests (Range: bytes=N-) and retried up to 6 times with exponential backoff (5/15/30/60/120/180s). On a dropped connection the next attempt picks up at the last written byte offset instead of restarting at zero. Streams via reqwest::Response::chunk() to the staging file so a 160 MB frontend tarball doesn't sit in RAM. SHA is verified over the complete file at the end of each component; mismatch nukes the staged file and restarts from scratch. Real download progress counters New AtomicU64 globals DOWNLOAD_BYTES/DOWNLOAD_TOTAL are updated from the chunk loop. update.status exposes them as download_progress.{bytes_downloaded, total_bytes, active}. The SystemUpdate.vue progress bar now polls update.status every second instead of incrementing a fake random counter — and crucially, if the user navigates away and back, the component picks up the in-progress download from the backend atomics immediately. Update-check retries handle_update_check now retries the manifest fetch up to 3 times with a 5s gap if the first try hits a transport error, so a momentary gitea hiccup doesn't make a node report "up to date" when there actually is a new release. Tight 10s connect timeout per attempt keeps the total bounded. Artefacts: archipelago 1070c87f…c081c162b 40584792 archipelago-frontend-1.7.15-alpha.tar.gz 8e630eba…63fd43f 162078068 Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-20 17:17:58 -04:00
// Per-request budget; each attempt gets the full hour. A retry
// restarts the budget cleanly.
release(v1.7.14-alpha): install overlay + FIPS real fix + AIUI restore Install UX SystemUpdate.vue now shows a full-screen overlay after apply: the BitcoinFaceAscii logo, a target-version label, an indeterminate progress stripe (solid orange; solid green on ready), and an elapsed-time readout. Polls /health every 1.5s and auto-reloads once the backend reports the new version. 3-min stall → "Reload now" button. Download UI also shows a spinner + "Finishing download — verifying checksum…" while the fake bar sits at 95%. FIPS reconnect — for real this time New fips.reconnect RPC does stop → start → wait 20s → re-poll → classify. Classification buckets: connected / daemon_down / no_seed_key / no_outbound_udp_or_anchor_down / peers_but_no_anchor, each with a plain-language hint surfaced verbatim by the Reconnect button. The real reason nodes like .198/.253 couldn't reach the anchor: identity::write_fips_key_from_seed was writing fips_key.pub as a bech32 npub TEXT file, but upstream fips expects 32 raw bytes. The daemon silently authenticated with garbage. Fix: PublicKey::to_bytes() → raw 32 bytes, and new fips::config::normalize_pub_file migrates legacy files by decoding the npub and rewriting in place. fips.reconnect also re-installs the config + healed keys to /etc/fips before restarting. AIUI preservation + restore apply_update was wiping /opt/archipelago/web-ui/aiui because the Vue build doesn't include it — every OTA lost the Claude sidebar. The preserve block now copies aiui/ + archipelago-companion.apk from the old web-ui into the staging dir before the swap, and prefers new-tar versions if present. To restore it on the three nodes that already lost it (.116/.198/.253), this release bundles the 85 MB aiui build into the frontend tarball. Frontend component size is now ~155 MB. Download / install timeouts Backend download client timeout 1800s → 3600s (1 h). Larger tarball + slow gitea raw throughput put us above the old cap. 
Frontend update.download rpc timeout 30 min → 65 min to match. package.install rpc timeout 15 min → 45 min — IndeedHub pulls 6 images and was timing out mid-install. UI nit "Rollback to Previous" → "Rollback Available". App-catalog proxy already landed in v1.7.13. Artefacts: archipelago 725e18e6…3c525e6 40462288 archipelago-frontend-1.7.14-alpha.tar.gz c35284be…ff2c16 162077052 (+aiui) Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-20 16:40:25 -04:00
.timeout(std::time::Duration::from_secs(3600))
.connect_timeout(std::time::Duration::from_secs(30))
.build()
.context("Failed to create HTTP client")?;
let mut downloaded = 0u64;
let total_bytes: u64 = manifest.components.iter().map(|c| c.size_bytes).sum();
info!(
version = %manifest.version,
components = manifest.components.len(),
total_bytes,
staging = %staging_dir.display(),
"Starting update download"
);
// Clear any stale cancel flag from a prior aborted run, then seed
// the live counters so polls during the handshake show the right
// denominator immediately instead of 0/0 → NaN%.
DOWNLOAD_CANCEL.store(false, Ordering::Relaxed);
release(v1.7.15-alpha): bulletproof downloads — resume, retry, real progress download_update Each component download is now resumable via HTTP Range requests (Range: bytes=N-) and retried up to 6 times with exponential backoff (5/15/30/60/120/180s). On a dropped connection the next attempt picks up at the last written byte offset instead of restarting at zero. Streams via reqwest::Response::chunk() to the staging file so a 160 MB frontend tarball doesn't sit in RAM. SHA is verified over the complete file at the end of each component; mismatch nukes the staged file and restarts from scratch. Real download progress counters New AtomicU64 globals DOWNLOAD_BYTES/DOWNLOAD_TOTAL are updated from the chunk loop. update.status exposes them as download_progress.{bytes_downloaded, total_bytes, active}. The SystemUpdate.vue progress bar now polls update.status every second instead of incrementing a fake random counter — and crucially, if the user navigates away and back, the component picks up the in-progress download from the backend atomics immediately. Update-check retries handle_update_check now retries the manifest fetch up to 3 times with a 5s gap if the first try hits a transport error, so a momentary gitea hiccup doesn't make a node report "up to date" when there actually is a new release. Tight 10s connect timeout per attempt keeps the total bounded. Artefacts: archipelago 1070c87f…c081c162b 40584792 archipelago-frontend-1.7.15-alpha.tar.gz 8e630eba…63fd43f 162078068 Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-20 17:17:58 -04:00
DOWNLOAD_TOTAL.store(total_bytes, Ordering::Relaxed);
DOWNLOAD_BYTES.store(0, Ordering::Relaxed);
DOWNLOAD_PROGRESS_AT.store(now_ms(), Ordering::Relaxed);
release(v1.7.15-alpha): bulletproof downloads — resume, retry, real progress download_update Each component download is now resumable via HTTP Range requests (Range: bytes=N-) and retried up to 6 times with exponential backoff (5/15/30/60/120/180s). On a dropped connection the next attempt picks up at the last written byte offset instead of restarting at zero. Streams via reqwest::Response::chunk() to the staging file so a 160 MB frontend tarball doesn't sit in RAM. SHA is verified over the complete file at the end of each component; mismatch nukes the staged file and restarts from scratch. Real download progress counters New AtomicU64 globals DOWNLOAD_BYTES/DOWNLOAD_TOTAL are updated from the chunk loop. update.status exposes them as download_progress.{bytes_downloaded, total_bytes, active}. The SystemUpdate.vue progress bar now polls update.status every second instead of incrementing a fake random counter — and crucially, if the user navigates away and back, the component picks up the in-progress download from the backend atomics immediately. Update-check retries handle_update_check now retries the manifest fetch up to 3 times with a 5s gap if the first try hits a transport error, so a momentary gitea hiccup doesn't make a node report "up to date" when there actually is a new release. Tight 10s connect timeout per attempt keeps the total bounded. Artefacts: archipelago 1070c87f…c081c162b 40584792 archipelago-frontend-1.7.15-alpha.tar.gz 8e630eba…63fd43f 162078068 Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-20 17:17:58 -04:00
for component in &manifest.components {
if is_canceled() {
DOWNLOAD_TOTAL.store(0, Ordering::Relaxed);
DOWNLOAD_BYTES.store(0, Ordering::Relaxed);
anyhow::bail!("Download canceled");
}
info!(name = %component.name, url = %component.download_url, "Downloading component");
let dest = staging_dir.join(&component.name);
release(v1.7.15-alpha): bulletproof downloads — resume, retry, real progress download_update Each component download is now resumable via HTTP Range requests (Range: bytes=N-) and retried up to 6 times with exponential backoff (5/15/30/60/120/180s). On a dropped connection the next attempt picks up at the last written byte offset instead of restarting at zero. Streams via reqwest::Response::chunk() to the staging file so a 160 MB frontend tarball doesn't sit in RAM. SHA is verified over the complete file at the end of each component; mismatch nukes the staged file and restarts from scratch. Real download progress counters New AtomicU64 globals DOWNLOAD_BYTES/DOWNLOAD_TOTAL are updated from the chunk loop. update.status exposes them as download_progress.{bytes_downloaded, total_bytes, active}. The SystemUpdate.vue progress bar now polls update.status every second instead of incrementing a fake random counter — and crucially, if the user navigates away and back, the component picks up the in-progress download from the backend atomics immediately. Update-check retries handle_update_check now retries the manifest fetch up to 3 times with a 5s gap if the first try hits a transport error, so a momentary gitea hiccup doesn't make a node report "up to date" when there actually is a new release. Tight 10s connect timeout per attempt keeps the total bounded. Artefacts: archipelago 1070c87f…c081c162b 40584792 archipelago-frontend-1.7.15-alpha.tar.gz 8e630eba…63fd43f 162078068 Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-20 17:17:58 -04:00
download_component_resumable(&client, component, &dest, downloaded).await?;
downloaded += component.size_bytes;
release(v1.7.15-alpha): bulletproof downloads — resume, retry, real progress download_update Each component download is now resumable via HTTP Range requests (Range: bytes=N-) and retried up to 6 times with exponential backoff (5/15/30/60/120/180s). On a dropped connection the next attempt picks up at the last written byte offset instead of restarting at zero. Streams via reqwest::Response::chunk() to the staging file so a 160 MB frontend tarball doesn't sit in RAM. SHA is verified over the complete file at the end of each component; mismatch nukes the staged file and restarts from scratch. Real download progress counters New AtomicU64 globals DOWNLOAD_BYTES/DOWNLOAD_TOTAL are updated from the chunk loop. update.status exposes them as download_progress.{bytes_downloaded, total_bytes, active}. The SystemUpdate.vue progress bar now polls update.status every second instead of incrementing a fake random counter — and crucially, if the user navigates away and back, the component picks up the in-progress download from the backend atomics immediately. Update-check retries handle_update_check now retries the manifest fetch up to 3 times with a 5s gap if the first try hits a transport error, so a momentary gitea hiccup doesn't make a node report "up to date" when there actually is a new release. Tight 10s connect timeout per attempt keeps the total bounded. Artefacts: archipelago 1070c87f…c081c162b 40584792 archipelago-frontend-1.7.15-alpha.tar.gz 8e630eba…63fd43f 162078068 Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-20 17:17:58 -04:00
DOWNLOAD_BYTES.store(downloaded, Ordering::Relaxed);
info!(
name = %component.name,
release(v1.7.15-alpha): bulletproof downloads — resume, retry, real progress download_update Each component download is now resumable via HTTP Range requests (Range: bytes=N-) and retried up to 6 times with exponential backoff (5/15/30/60/120/180s). On a dropped connection the next attempt picks up at the last written byte offset instead of restarting at zero. Streams via reqwest::Response::chunk() to the staging file so a 160 MB frontend tarball doesn't sit in RAM. SHA is verified over the complete file at the end of each component; mismatch nukes the staged file and restarts from scratch. Real download progress counters New AtomicU64 globals DOWNLOAD_BYTES/DOWNLOAD_TOTAL are updated from the chunk loop. update.status exposes them as download_progress.{bytes_downloaded, total_bytes, active}. The SystemUpdate.vue progress bar now polls update.status every second instead of incrementing a fake random counter — and crucially, if the user navigates away and back, the component picks up the in-progress download from the backend atomics immediately. Update-check retries handle_update_check now retries the manifest fetch up to 3 times with a 5s gap if the first try hits a transport error, so a momentary gitea hiccup doesn't make a node report "up to date" when there actually is a new release. Tight 10s connect timeout per attempt keeps the total bounded. Artefacts: archipelago 1070c87f…c081c162b 40584792 archipelago-frontend-1.7.15-alpha.tar.gz 8e630eba…63fd43f 162078068 Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-20 17:17:58 -04:00
bytes = component.size_bytes,
"Component downloaded and verified"
);
}
// Mark update as downloaded
let mut state = load_state(data_dir).await?;
state.update_in_progress = true;
save_state(data_dir, &state).await?;
Ok(DownloadProgress {
total_bytes,
downloaded_bytes: downloaded,
components_downloaded: manifest.components.len(),
staging_dir: staging_dir.to_string_lossy().to_string(),
})
}
release(v1.7.15-alpha): bulletproof downloads — resume, retry, real progress download_update Each component download is now resumable via HTTP Range requests (Range: bytes=N-) and retried up to 6 times with exponential backoff (5/15/30/60/120/180s). On a dropped connection the next attempt picks up at the last written byte offset instead of restarting at zero. Streams via reqwest::Response::chunk() to the staging file so a 160 MB frontend tarball doesn't sit in RAM. SHA is verified over the complete file at the end of each component; mismatch nukes the staged file and restarts from scratch. Real download progress counters New AtomicU64 globals DOWNLOAD_BYTES/DOWNLOAD_TOTAL are updated from the chunk loop. update.status exposes them as download_progress.{bytes_downloaded, total_bytes, active}. The SystemUpdate.vue progress bar now polls update.status every second instead of incrementing a fake random counter — and crucially, if the user navigates away and back, the component picks up the in-progress download from the backend atomics immediately. Update-check retries handle_update_check now retries the manifest fetch up to 3 times with a 5s gap if the first try hits a transport error, so a momentary gitea hiccup doesn't make a node report "up to date" when there actually is a new release. Tight 10s connect timeout per attempt keeps the total bounded. Artefacts: archipelago 1070c87f…c081c162b 40584792 archipelago-frontend-1.7.15-alpha.tar.gz 8e630eba…63fd43f 162078068 Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-20 17:17:58 -04:00
/// Download a single component to `dest`, resuming from the end of
/// any existing partial file via a Range request. Retries up to 6
/// times with exponential backoff (5s, 15s, 30s, 60s, 120s, 180s).
/// Verifies the SHA256 over the full file at the end.
async fn download_component_resumable(
client: &reqwest::Client,
component: &ComponentUpdate,
dest: &Path,
prior_total: u64,
) -> Result<()> {
use sha2::{Digest, Sha256};
use tokio::io::AsyncWriteExt;
const MAX_ATTEMPTS: u32 = 6;
const BACKOFFS: [u64; 5] = [5, 15, 30, 60, 120];
let mut last_err: Option<anyhow::Error> = None;
for attempt in 1..=MAX_ATTEMPTS {
let existing_len = match tokio::fs::metadata(dest).await {
Ok(m) => m.len(),
Err(_) => 0,
};
if existing_len >= component.size_bytes {
// File is already complete — break out and go verify.
break;
}
if attempt > 1 {
let delay = BACKOFFS[(attempt as usize - 2).min(BACKOFFS.len() - 1)];
tracing::warn!(
name = %component.name,
attempt,
resume_at = existing_len,
"Retrying download in {}s (previous error: {})",
delay,
last_err.as_ref().map(|e| e.to_string()).unwrap_or_default()
);
// Sleep in 500ms slices so a Cancel during backoff wakes
// promptly instead of waiting out the full exponential window.
let slices = delay * 2;
for _ in 0..slices {
if is_canceled() {
anyhow::bail!("Download canceled");
}
tokio::time::sleep(std::time::Duration::from_millis(500)).await;
}
}
if is_canceled() {
anyhow::bail!("Download canceled");
release(v1.7.15-alpha): bulletproof downloads — resume, retry, real progress download_update Each component download is now resumable via HTTP Range requests (Range: bytes=N-) and retried up to 6 times with exponential backoff (5/15/30/60/120/180s). On a dropped connection the next attempt picks up at the last written byte offset instead of restarting at zero. Streams via reqwest::Response::chunk() to the staging file so a 160 MB frontend tarball doesn't sit in RAM. SHA is verified over the complete file at the end of each component; mismatch nukes the staged file and restarts from scratch. Real download progress counters New AtomicU64 globals DOWNLOAD_BYTES/DOWNLOAD_TOTAL are updated from the chunk loop. update.status exposes them as download_progress.{bytes_downloaded, total_bytes, active}. The SystemUpdate.vue progress bar now polls update.status every second instead of incrementing a fake random counter — and crucially, if the user navigates away and back, the component picks up the in-progress download from the backend atomics immediately. Update-check retries handle_update_check now retries the manifest fetch up to 3 times with a 5s gap if the first try hits a transport error, so a momentary gitea hiccup doesn't make a node report "up to date" when there actually is a new release. Tight 10s connect timeout per attempt keeps the total bounded. Artefacts: archipelago 1070c87f…c081c162b 40584792 archipelago-frontend-1.7.15-alpha.tar.gz 8e630eba…63fd43f 162078068 Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-20 17:17:58 -04:00
}
let mut req = client.get(&component.download_url);
if existing_len > 0 {
req = req.header("Range", format!("bytes={}-", existing_len));
}
let resp = match req.send().await {
Ok(r) => r,
Err(e) => {
last_err = Some(anyhow::anyhow!(e));
continue;
}
};
let status = resp.status();
// 200 OK on a fresh start, 206 Partial Content on a resume
// that the server honoured. Anything else is a problem.
let is_resume = existing_len > 0 && status == reqwest::StatusCode::PARTIAL_CONTENT;
let is_fresh = existing_len == 0 && status.is_success();
let server_ignored_range = existing_len > 0 && status == reqwest::StatusCode::OK;
if !is_resume && !is_fresh && !server_ignored_range {
last_err = Some(anyhow::anyhow!(
"HTTP {} for {} (resume offset {})",
status,
component.name,
existing_len
));
continue;
}
// If the server ignored Range (returned 200 with the full
// body), wipe the partial file and start over.
let mut file = if server_ignored_range {
let _ = tokio::fs::remove_file(dest).await;
tokio::fs::OpenOptions::new()
.create(true)
.write(true)
.truncate(true)
.open(dest)
.await
.context("open staging file")?
} else if is_resume {
tokio::fs::OpenOptions::new()
.append(true)
.open(dest)
.await
.context("open staging file for append")?
} else {
tokio::fs::OpenOptions::new()
.create(true)
.write(true)
.truncate(true)
.open(dest)
.await
.context("open staging file")?
};
let mut resp = resp;
let mut stream_err = false;
let mut on_disk = existing_len;
let mut canceled = false;
release(v1.7.15-alpha): bulletproof downloads — resume, retry, real progress download_update Each component download is now resumable via HTTP Range requests (Range: bytes=N-) and retried up to 6 times with exponential backoff (5/15/30/60/120/180s). On a dropped connection the next attempt picks up at the last written byte offset instead of restarting at zero. Streams via reqwest::Response::chunk() to the staging file so a 160 MB frontend tarball doesn't sit in RAM. SHA is verified over the complete file at the end of each component; mismatch nukes the staged file and restarts from scratch. Real download progress counters New AtomicU64 globals DOWNLOAD_BYTES/DOWNLOAD_TOTAL are updated from the chunk loop. update.status exposes them as download_progress.{bytes_downloaded, total_bytes, active}. The SystemUpdate.vue progress bar now polls update.status every second instead of incrementing a fake random counter — and crucially, if the user navigates away and back, the component picks up the in-progress download from the backend atomics immediately. Update-check retries handle_update_check now retries the manifest fetch up to 3 times with a 5s gap if the first try hits a transport error, so a momentary gitea hiccup doesn't make a node report "up to date" when there actually is a new release. Tight 10s connect timeout per attempt keeps the total bounded. Artefacts: archipelago 1070c87f…c081c162b 40584792 archipelago-frontend-1.7.15-alpha.tar.gz 8e630eba…63fd43f 162078068 Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-20 17:17:58 -04:00
loop {
if is_canceled() {
canceled = true;
break;
}
release(v1.7.15-alpha): bulletproof downloads — resume, retry, real progress download_update Each component download is now resumable via HTTP Range requests (Range: bytes=N-) and retried up to 6 times with exponential backoff (5/15/30/60/120/180s). On a dropped connection the next attempt picks up at the last written byte offset instead of restarting at zero. Streams via reqwest::Response::chunk() to the staging file so a 160 MB frontend tarball doesn't sit in RAM. SHA is verified over the complete file at the end of each component; mismatch nukes the staged file and restarts from scratch. Real download progress counters New AtomicU64 globals DOWNLOAD_BYTES/DOWNLOAD_TOTAL are updated from the chunk loop. update.status exposes them as download_progress.{bytes_downloaded, total_bytes, active}. The SystemUpdate.vue progress bar now polls update.status every second instead of incrementing a fake random counter — and crucially, if the user navigates away and back, the component picks up the in-progress download from the backend atomics immediately. Update-check retries handle_update_check now retries the manifest fetch up to 3 times with a 5s gap if the first try hits a transport error, so a momentary gitea hiccup doesn't make a node report "up to date" when there actually is a new release. Tight 10s connect timeout per attempt keeps the total bounded. Artefacts: archipelago 1070c87f…c081c162b 40584792 archipelago-frontend-1.7.15-alpha.tar.gz 8e630eba…63fd43f 162078068 Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-20 17:17:58 -04:00
match resp.chunk().await {
Ok(Some(bytes)) => {
if let Err(e) = file.write_all(&bytes).await {
last_err = Some(anyhow::anyhow!(e).context("writing chunk"));
stream_err = true;
break;
}
on_disk += bytes.len() as u64;
DOWNLOAD_BYTES.store(
prior_total + on_disk.min(component.size_bytes),
Ordering::Relaxed,
);
DOWNLOAD_PROGRESS_AT.store(now_ms(), Ordering::Relaxed);
release(v1.7.15-alpha): bulletproof downloads — resume, retry, real progress download_update Each component download is now resumable via HTTP Range requests (Range: bytes=N-) and retried up to 6 times with exponential backoff (5/15/30/60/120/180s). On a dropped connection the next attempt picks up at the last written byte offset instead of restarting at zero. Streams via reqwest::Response::chunk() to the staging file so a 160 MB frontend tarball doesn't sit in RAM. SHA is verified over the complete file at the end of each component; mismatch nukes the staged file and restarts from scratch. Real download progress counters New AtomicU64 globals DOWNLOAD_BYTES/DOWNLOAD_TOTAL are updated from the chunk loop. update.status exposes them as download_progress.{bytes_downloaded, total_bytes, active}. The SystemUpdate.vue progress bar now polls update.status every second instead of incrementing a fake random counter — and crucially, if the user navigates away and back, the component picks up the in-progress download from the backend atomics immediately. Update-check retries handle_update_check now retries the manifest fetch up to 3 times with a 5s gap if the first try hits a transport error, so a momentary gitea hiccup doesn't make a node report "up to date" when there actually is a new release. Tight 10s connect timeout per attempt keeps the total bounded. Artefacts: archipelago 1070c87f…c081c162b 40584792 archipelago-frontend-1.7.15-alpha.tar.gz 8e630eba…63fd43f 162078068 Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-20 17:17:58 -04:00
}
Ok(None) => break, // stream ended cleanly
Err(e) => {
last_err = Some(anyhow::anyhow!(e).context("reading chunk"));
stream_err = true;
break;
}
}
}
if canceled {
let _ = file.flush().await;
drop(file);
DOWNLOAD_TOTAL.store(0, Ordering::Relaxed);
DOWNLOAD_BYTES.store(0, Ordering::Relaxed);
anyhow::bail!("Download canceled");
}
release(v1.7.15-alpha): bulletproof downloads — resume, retry, real progress download_update Each component download is now resumable via HTTP Range requests (Range: bytes=N-) and retried up to 6 times with exponential backoff (5/15/30/60/120/180s). On a dropped connection the next attempt picks up at the last written byte offset instead of restarting at zero. Streams via reqwest::Response::chunk() to the staging file so a 160 MB frontend tarball doesn't sit in RAM. SHA is verified over the complete file at the end of each component; mismatch nukes the staged file and restarts from scratch. Real download progress counters New AtomicU64 globals DOWNLOAD_BYTES/DOWNLOAD_TOTAL are updated from the chunk loop. update.status exposes them as download_progress.{bytes_downloaded, total_bytes, active}. The SystemUpdate.vue progress bar now polls update.status every second instead of incrementing a fake random counter — and crucially, if the user navigates away and back, the component picks up the in-progress download from the backend atomics immediately. Update-check retries handle_update_check now retries the manifest fetch up to 3 times with a 5s gap if the first try hits a transport error, so a momentary gitea hiccup doesn't make a node report "up to date" when there actually is a new release. Tight 10s connect timeout per attempt keeps the total bounded. Artefacts: archipelago 1070c87f…c081c162b 40584792 archipelago-frontend-1.7.15-alpha.tar.gz 8e630eba…63fd43f 162078068 Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-20 17:17:58 -04:00
let _ = file.flush().await;
let _ = file.sync_all().await;
drop(file);
if stream_err {
continue;
}
// Stream ended cleanly. If we've got the expected size, verify
// the SHA and succeed. Otherwise loop to resume from the new
// offset on the next attempt.
let final_len = tokio::fs::metadata(dest)
.await
.map(|m| m.len())
.unwrap_or(0);
if final_len < component.size_bytes {
last_err = Some(anyhow::anyhow!(
"download truncated: got {} of {} bytes",
final_len,
component.size_bytes
));
continue;
}
// Full file — verify hash.
let bytes = tokio::fs::read(dest)
.await
.context("read staging file for hash check")?;
let hash = hex::encode(Sha256::digest(&bytes));
if hash == component.sha256 {
return Ok(());
}
// SHA mismatch — the file on disk is garbage. Nuke it and
// start over from scratch on the next attempt.
let _ = tokio::fs::remove_file(dest).await;
last_err = Some(anyhow::anyhow!(
"SHA256 mismatch for {}: expected {}, got {}",
component.name,
component.sha256,
hash
));
}
Err(last_err.unwrap_or_else(|| anyhow::anyhow!("download failed without a captured error")))
}
/// Cancel an in-flight download. Sets the cancellation flag so the
/// download loop bails out at the next chunk or backoff boundary, then
/// zeros the live counters and wipes the staging directory so the UI
/// sees "no active download" immediately and the next attempt starts
/// clean. Safe to call even when no download is running.
pub async fn cancel_download(data_dir: &Path) -> Result<()> {
DOWNLOAD_CANCEL.store(true, Ordering::Relaxed);
DOWNLOAD_BYTES.store(0, Ordering::Relaxed);
DOWNLOAD_TOTAL.store(0, Ordering::Relaxed);
let staging = data_dir.join("update-staging");
let wiped = if staging.exists() {
tokio::fs::remove_dir_all(&staging).await.is_ok()
} else {
false
};
// Clear the "downloaded, ready to apply" marker too — a canceled
// download is not a staged update.
let mut cleared_marker = false;
if let Ok(mut state) = load_state(data_dir).await {
if state.update_in_progress {
state.update_in_progress = false;
let _ = save_state(data_dir, &state).await;
cleared_marker = true;
}
}
info!(
staging = %staging.display(),
wiped,
cleared_marker,
"Update download canceled"
);
Ok(())
}
release(v1.7.10-alpha): apply namespace fix + FIPS cascade + profile polish THE apply fix archipelago.service uses ProtectSystem=strict, so /opt and /usr are read-only inside the service's mount namespace. sudo inherits that namespace — every sudo mkdir/mv/chown from apply_update was hitting EROFS even as root. Every prior "Failed to apply update" was a symptom of this. New `host_sudo()` helper wraps every filesystem call in `sudo systemd-run --wait --collect --pipe -- <cmd>`, which spawns a transient unit with systemd's default (no ProtectSystem) protections — the command runs in the host namespace and can touch /opt/archipelago + /usr/local/bin normally. FIPS cascade (#2) Home.vue and Server.vue both carry a FIPS row that previously only looked at {installed, service_active, key_present}. Now they also read anchor_connected + authenticated_peer_count and mirror the full FIPS card: green "Active · N peers" when healthy, orange "No anchor" when the DHT bootstrap has failed. Profile paste URL fallback (#4) Web5Identities.vue list + editor previously had `@error="display:none"` on the <img>, which hid the tag without re-rendering the fallback — a broken pasted URL showed up blank. Replaced with reactive pictureLoadFailed / listPictureFailed flags plus a watcher that resets on URL change. Broken URL now falls back to the initial (or identicon for seed-derived identities). Small-upload data URL (#3) Uploaded profile pictures ≤ 64 KB are now inlined as `data:image/png;base64,...` into profile.picture on the client before calling update-profile. That kind-0 event is fetchable by any Nostr client — no Tor needed. Larger uploads fall back to the onion-rooted public_url with a hint telling the user to paste a public https:// URL for broader visibility. Deferred: #1 FIPS Reconnect "actually fixes" — the current Reconnect calls fips.restart which clears the daemon state, but when the anchor is truly unreachable (UDP 8668 blocked by network/ISP), no amount of restart can help. 
A richer diagnostic is out of scope for this bundle. Artefacts: archipelago 4a77c704…82aa6f8 40379696 archipelago-frontend-1.7.10-alpha.tar.gz 0644a436…54f58 76983846 Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-20 13:46:03 -04:00
/// Run a command as root, but *outside* the archipelago service's
/// restricted mount namespace.
///
/// archipelago.service uses `ProtectSystem=strict`, which makes `/opt`
/// and `/usr` read-only inside the service — and sudo inherits the
/// namespace, so `sudo mv /opt/archipelago/...` fails with EROFS even
/// though sudo itself is root. `systemd-run --wait` spawns a transient
/// service unit that inherits systemd's default protections (i.e. none
/// of ours), escaping the namespace.
release(v1.7.35-alpha): rootless-netns self-heal + app update button + bitcoin-core 28.4 + Node DID unification - core/archipelago/src/bootstrap.rs (NEW): embed scripts/container-doctor.sh and image-recipe/configs/archipelago-doctor.{service,timer} via include_str! and sync to disk + enable the timer on every archipelago startup. Idempotent (content-hash compare), dev-box symlink guard keeps the git checkout untouched, best-effort (warn-only on failure) so bootstrap never blocks server readiness. Wired in main.rs as a background tokio task. - scripts/container-doctor.sh: add fix_rootless_netns_egress(). Detects when the rootless-netns has lost its pasta tap (container-to-container still works but outbound DNS/TCP fails) via an nsenter probe into aardvark-dns; with a two-probe 10s debounce to rule out transients and a host-precheck that bails out if the host itself is offline. When the rootless-netns is truly broken, does a graceful podman stop --all / start --all so pasta + aardvark-dns rebuild the netns from scratch. Bitcoin-knots and every other outbound container recover in one cycle. - core/archipelago/src/update.rs: host_sudo → pub(crate) so bootstrap.rs can reuse the existing systemd-run escape hatch. - apps/bitcoin-core/manifest.yml: bump app version 24.0.0 → 28.4.0 and image bitcoin/bitcoin:24.0 → bitcoin/bitcoin:28.4. Resources aligned with the real container-specs.sh large-disk tune (4 GiB memory cap, cpu_limit: 0 so bitcoind can run -par=auto across every core). - neode-ui/src/views/apps/AppCard.vue + Apps.vue: add an Update button + Updating spinner to every app card that has available-update set. Wires through serverStore.updatePackage(id) — the same RPC the detail view already calls. common.update / common.updating i18n keys added in en.json and es.json. - core/archipelago/src/identity_manager.rs: add create_from_signing_key() that mirrors an existing Ed25519 key as a manager-level identity with a deterministic id (`node-<pubkey16>`). 
Idempotent across restarts, gets the hex-SVG master avatar. - core/archipelago/src/server.rs: the auto-create path on first boot now mirrors the node's own signing_key (seed-derived on onboarded installs) as a "Node" identity instead of generating a random "Default" keypair. Once this ships, the DID on the Web5 DID Status card (via node.did RPC), the Node entry on the Identities page (via identity.list), and the DID used for peer-to-peer connects (via server_info.pubkey) all resolve to the same seed-derived pubkey. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-22 08:29:56 -04:00
pub(crate) async fn host_sudo(args: &[&str]) -> Result<std::process::ExitStatus> {
release(v1.7.10-alpha): apply namespace fix + FIPS cascade + profile polish THE apply fix archipelago.service uses ProtectSystem=strict, so /opt and /usr are read-only inside the service's mount namespace. sudo inherits that namespace — every sudo mkdir/mv/chown from apply_update was hitting EROFS even as root. Every prior "Failed to apply update" was a symptom of this. New `host_sudo()` helper wraps every filesystem call in `sudo systemd-run --wait --collect --pipe -- <cmd>`, which spawns a transient unit with systemd's default (no ProtectSystem) protections — the command runs in the host namespace and can touch /opt/archipelago + /usr/local/bin normally. FIPS cascade (#2) Home.vue and Server.vue both carry a FIPS row that previously only looked at {installed, service_active, key_present}. Now they also read anchor_connected + authenticated_peer_count and mirror the full FIPS card: green "Active · N peers" when healthy, orange "No anchor" when the DHT bootstrap has failed. Profile paste URL fallback (#4) Web5Identities.vue list + editor previously had `@error="display:none"` on the <img>, which hid the tag without re-rendering the fallback — a broken pasted URL showed up blank. Replaced with reactive pictureLoadFailed / listPictureFailed flags plus a watcher that resets on URL change. Broken URL now falls back to the initial (or identicon for seed-derived identities). Small-upload data URL (#3) Uploaded profile pictures ≤ 64 KB are now inlined as `data:image/png;base64,...` into profile.picture on the client before calling update-profile. That kind-0 event is fetchable by any Nostr client — no Tor needed. Larger uploads fall back to the onion-rooted public_url with a hint telling the user to paste a public https:// URL for broader visibility. Deferred: #1 FIPS Reconnect "actually fixes" — the current Reconnect calls fips.restart which clears the daemon state, but when the anchor is truly unreachable (UDP 8668 blocked by network/ISP), no amount of restart can help. 
A richer diagnostic is out of scope for this bundle. Artefacts: archipelago 4a77c704…82aa6f8 40379696 archipelago-frontend-1.7.10-alpha.tar.gz 0644a436…54f58 76983846 Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-20 13:46:03 -04:00
let mut full: Vec<&str> = vec![
"systemd-run",
"--wait",
"--quiet",
"--collect",
"--pipe",
"--",
];
full.extend_from_slice(args);
tokio::process::Command::new("sudo")
.args(&full)
.status()
.await
.context("sudo systemd-run spawn failed")
}
/// Apply a downloaded update. Backs up current binaries, replaces with staged versions.
pub async fn apply_update(data_dir: &Path) -> Result<()> {
let staging_dir = data_dir.join("update-staging");
if !staging_dir.exists() {
anyhow::bail!("No staged update found. Download first.");
}
let backup_dir = data_dir.join("update-backup");
fs::create_dir_all(&backup_dir)
.await
.context("Failed to create backup dir")?;
info!(
staging = %staging_dir.display(),
backup = %backup_dir.display(),
"Applying staged update"
);
// Back up current backend binary
let current_binary = Path::new("/usr/local/bin/archipelago");
if current_binary.exists() {
let backup_path = backup_dir.join("archipelago");
fs::copy(current_binary, &backup_path)
.await
.context("Failed to backup current binary")?;
info!("Current binary backed up");
}
// Apply staged components
let mut entries = fs::read_dir(&staging_dir)
.await
.context("Failed to read staging dir")?;
while let Some(entry) = entries.next_entry().await? {
let name = entry.file_name().to_string_lossy().to_string();
let src = entry.path();
match name.as_str() {
"archipelago" => {
release(v1.7.10-alpha): apply namespace fix + FIPS cascade + profile polish THE apply fix archipelago.service uses ProtectSystem=strict, so /opt and /usr are read-only inside the service's mount namespace. sudo inherits that namespace — every sudo mkdir/mv/chown from apply_update was hitting EROFS even as root. Every prior "Failed to apply update" was a symptom of this. New `host_sudo()` helper wraps every filesystem call in `sudo systemd-run --wait --collect --pipe -- <cmd>`, which spawns a transient unit with systemd's default (no ProtectSystem) protections — the command runs in the host namespace and can touch /opt/archipelago + /usr/local/bin normally. FIPS cascade (#2) Home.vue and Server.vue both carry a FIPS row that previously only looked at {installed, service_active, key_present}. Now they also read anchor_connected + authenticated_peer_count and mirror the full FIPS card: green "Active · N peers" when healthy, orange "No anchor" when the DHT bootstrap has failed. Profile paste URL fallback (#4) Web5Identities.vue list + editor previously had `@error="display:none"` on the <img>, which hid the tag without re-rendering the fallback — a broken pasted URL showed up blank. Replaced with reactive pictureLoadFailed / listPictureFailed flags plus a watcher that resets on URL change. Broken URL now falls back to the initial (or identicon for seed-derived identities). Small-upload data URL (#3) Uploaded profile pictures ≤ 64 KB are now inlined as `data:image/png;base64,...` into profile.picture on the client before calling update-profile. That kind-0 event is fetchable by any Nostr client — no Tor needed. Larger uploads fall back to the onion-rooted public_url with a hint telling the user to paste a public https:// URL for broader visibility. Deferred: #1 FIPS Reconnect "actually fixes" — the current Reconnect calls fips.restart which clears the daemon state, but when the anchor is truly unreachable (UDP 8668 blocked by network/ISP), no amount of restart can help. 
A richer diagnostic is out of scope for this bundle. Artefacts: archipelago 4a77c704…82aa6f8 40379696 archipelago-frontend-1.7.10-alpha.tar.gz 0644a436…54f58 76983846 Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-20 13:46:03 -04:00
// Two namespace gotchas this block works around:
// 1. We're running FROM /usr/local/bin/archipelago, so
// `install`/`cp` (O_TRUNC + write) fail with ETXTBSY.
// Use `mv`, which is atomic rename() and tolerates a
// busy destination.
// 2. archipelago.service sets ProtectSystem=strict, so
// even `sudo mv` into /usr/local/bin/ fails EROFS —
// sudo inherits the service's mount namespace. Route
// the rename through systemd-run so it runs in a
// transient unit with default protections.
let staged = src.to_string_lossy().to_string();
release(v1.7.10-alpha): apply namespace fix + FIPS cascade + profile polish THE apply fix archipelago.service uses ProtectSystem=strict, so /opt and /usr are read-only inside the service's mount namespace. sudo inherits that namespace — every sudo mkdir/mv/chown from apply_update was hitting EROFS even as root. Every prior "Failed to apply update" was a symptom of this. New `host_sudo()` helper wraps every filesystem call in `sudo systemd-run --wait --collect --pipe -- <cmd>`, which spawns a transient unit with systemd's default (no ProtectSystem) protections — the command runs in the host namespace and can touch /opt/archipelago + /usr/local/bin normally. FIPS cascade (#2) Home.vue and Server.vue both carry a FIPS row that previously only looked at {installed, service_active, key_present}. Now they also read anchor_connected + authenticated_peer_count and mirror the full FIPS card: green "Active · N peers" when healthy, orange "No anchor" when the DHT bootstrap has failed. Profile paste URL fallback (#4) Web5Identities.vue list + editor previously had `@error="display:none"` on the <img>, which hid the tag without re-rendering the fallback — a broken pasted URL showed up blank. Replaced with reactive pictureLoadFailed / listPictureFailed flags plus a watcher that resets on URL change. Broken URL now falls back to the initial (or identicon for seed-derived identities). Small-upload data URL (#3) Uploaded profile pictures ≤ 64 KB are now inlined as `data:image/png;base64,...` into profile.picture on the client before calling update-profile. That kind-0 event is fetchable by any Nostr client — no Tor needed. Larger uploads fall back to the onion-rooted public_url with a hint telling the user to paste a public https:// URL for broader visibility. Deferred: #1 FIPS Reconnect "actually fixes" — the current Reconnect calls fips.restart which clears the daemon state, but when the anchor is truly unreachable (UDP 8668 blocked by network/ISP), no amount of restart can help. 
A richer diagnostic is out of scope for this bundle. Artefacts: archipelago 4a77c704…82aa6f8 40379696 archipelago-frontend-1.7.10-alpha.tar.gz 0644a436…54f58 76983846 Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-20 13:46:03 -04:00
let _ = host_sudo(&["chmod", "0755", &staged]).await;
let _ = host_sudo(&["chown", "root:root", &staged]).await;
let status = host_sudo(&["mv", &staged, "/usr/local/bin/archipelago"])
.await
.with_context(|| format!("Failed to spawn mv for {}", name))?;
release(v1.7.2-alpha): fix Install Update + identity avatar backfill + label Three user-visible fixes shipped together. 1. update.apply permission-denied apply_update() was doing fs::copy into /usr/local/bin/archipelago and tar xzf into /opt/archipelago as the archipelago user — both root-owned. The backup step succeeded (it wrote to data_dir) but the swap failed with a silent permission denied, wrapped as "Failed to apply archipelago". Now uses `sudo install -m 0755` for the binary and `sudo tar -xzf` for the frontend, plus a post-apply `sudo systemctl --no-block restart archipelago` scheduled 2s after the RPC reply so the UI sees success. 2. Apply → Install label en/es locale strings: applyUpdate / applyTitle / applyNow changed from "Apply" to "Install". Matches the user's mental model and distinguishes the user-facing verb from the internal apply_update() function. 3. Identity avatar backfill Identities created before df83163f had profile=None on disk and so rendered as initials. load_record() now synthesizes an IdentityProfile with a default picture (identicon for regular identities, the hex node SVG for derivation_index=0) when profile is missing. The synthetic profile lives only in the returned record; the file stays untouched so a later explicit Save persists whatever the user actually chose. Artefacts: archipelago 70e5444e…67c589 40381960 archipelago-frontend-1.7.2-alpha.tar.gz 806b027b…358a824 76983699 Changelog rewritten layman-style per saved feedback. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-20 11:25:10 -04:00
if !status.success() {
anyhow::bail!(
release(v1.7.10-alpha): apply namespace fix + FIPS cascade + profile polish THE apply fix archipelago.service uses ProtectSystem=strict, so /opt and /usr are read-only inside the service's mount namespace. sudo inherits that namespace — every sudo mkdir/mv/chown from apply_update was hitting EROFS even as root. Every prior "Failed to apply update" was a symptom of this. New `host_sudo()` helper wraps every filesystem call in `sudo systemd-run --wait --collect --pipe -- <cmd>`, which spawns a transient unit with systemd's default (no ProtectSystem) protections — the command runs in the host namespace and can touch /opt/archipelago + /usr/local/bin normally. FIPS cascade (#2) Home.vue and Server.vue both carry a FIPS row that previously only looked at {installed, service_active, key_present}. Now they also read anchor_connected + authenticated_peer_count and mirror the full FIPS card: green "Active · N peers" when healthy, orange "No anchor" when the DHT bootstrap has failed. Profile paste URL fallback (#4) Web5Identities.vue list + editor previously had `@error="display:none"` on the <img>, which hid the tag without re-rendering the fallback — a broken pasted URL showed up blank. Replaced with reactive pictureLoadFailed / listPictureFailed flags plus a watcher that resets on URL change. Broken URL now falls back to the initial (or identicon for seed-derived identities). Small-upload data URL (#3) Uploaded profile pictures ≤ 64 KB are now inlined as `data:image/png;base64,...` into profile.picture on the client before calling update-profile. That kind-0 event is fetchable by any Nostr client — no Tor needed. Larger uploads fall back to the onion-rooted public_url with a hint telling the user to paste a public https:// URL for broader visibility. Deferred: #1 FIPS Reconnect "actually fixes" — the current Reconnect calls fips.restart which clears the daemon state, but when the anchor is truly unreachable (UDP 8668 blocked by network/ISP), no amount of restart can help. 
A richer diagnostic is out of scope for this bundle. Artefacts: archipelago 4a77c704…82aa6f8 40379696 archipelago-frontend-1.7.10-alpha.tar.gz 0644a436…54f58 76983846 Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-20 13:46:03 -04:00
"mv into /usr/local/bin failed for {} (exit {:?})",
release(v1.7.2-alpha): fix Install Update + identity avatar backfill + label Three user-visible fixes shipped together. 1. update.apply permission-denied apply_update() was doing fs::copy into /usr/local/bin/archipelago and tar xzf into /opt/archipelago as the archipelago user — both root-owned. The backup step succeeded (it wrote to data_dir) but the swap failed with a silent permission denied, wrapped as "Failed to apply archipelago". Now uses `sudo install -m 0755` for the binary and `sudo tar -xzf` for the frontend, plus a post-apply `sudo systemctl --no-block restart archipelago` scheduled 2s after the RPC reply so the UI sees success. 2. Apply → Install label en/es locale strings: applyUpdate / applyTitle / applyNow changed from "Apply" to "Install". Matches the user's mental model and distinguishes the user-facing verb from the internal apply_update() function. 3. Identity avatar backfill Identities created before df83163f had profile=None on disk and so rendered as initials. load_record() now synthesizes an IdentityProfile with a default picture (identicon for regular identities, the hex node SVG for derivation_index=0) when profile is missing. The synthetic profile lives only in the returned record; the file stays untouched so a later explicit Save persists whatever the user actually chose. Artefacts: archipelago 70e5444e…67c589 40381960 archipelago-frontend-1.7.2-alpha.tar.gz 806b027b…358a824 76983699 Changelog rewritten layman-style per saved feedback. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-20 11:25:10 -04:00
name,
status.code()
);
}
info!(name = %name, "Backend binary applied");
}
_ if name.contains("frontend") && name.ends_with(".tar.gz") => {
release(v1.7.6-alpha): robust apply_update + manifest-override env var apply_update frontend swap Transient EROFS on .198 (filesystem hiccup — root FS mounts with errors=remount-ro so a fleeting glitch can bounce /opt to RO for a moment) caught the pre-cleanup `rm -rf web-ui.new web-ui.bak` mid- stride and aborted the apply. Rewrote the swap to use a timestamped staging dir (web-ui.new.<ms>) and a timestamped old-copy path so nothing needs to be rm'd before the extract. After the new tree is mv'd into place, the previous rollback copy is rotated aside with a .<ms> suffix (best-effort) and this apply's old copy becomes the new web-ui.bak. If the final mv fails, the staged old is restored so nginx keeps serving. handle_update_check manifest override handle_update_check takes the git path whenever ~/archy/.git exists. On the dev box (.116) that meant the Pull & Rebuild button was always the only option even though the manifest-path OTA was already wired via ARCHIPELAGO_UPDATE_URL. Now: if that env var is set, we skip the git detection entirely and use the manifest path. The regular fleet (no env var, no repo) hits the manifest branch naturally; beta dev nodes (repo + no env var) still get Pull & Rebuild; dev nodes with the env var explicitly set can finally test the manifest OTA end-to-end. Artefacts: archipelago 356e78cc…91a6dd 40372288 archipelago-frontend-1.7.6-alpha.tar.gz 4fb79664…0172e9 76984615 (reused) Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-20 12:33:10 -04:00
// Tarball contents are the *inside* of web-ui/ (root entries
// `./test-aiui.html`, `./assets/`, ...). Extract into a
// uniquely-named staging dir, then mv into place. No `rm
// -rf` pre-cleanup — that's what hit transient EROFS on
// .198 and aborted the apply mid-flight.
let ts = chrono::Utc::now().timestamp_millis();
let staging_new = format!("/opt/archipelago/web-ui.new.{}", ts);
let staging_old = format!("/opt/archipelago/web-ui.old.{}", ts);
let web_ui = "/opt/archipelago/web-ui";
let backup_path = "/opt/archipelago/web-ui.bak";
release(v1.7.6-alpha): robust apply_update + manifest-override env var apply_update frontend swap Transient EROFS on .198 (filesystem hiccup — root FS mounts with errors=remount-ro so a fleeting glitch can bounce /opt to RO for a moment) caught the pre-cleanup `rm -rf web-ui.new web-ui.bak` mid- stride and aborted the apply. Rewrote the swap to use a timestamped staging dir (web-ui.new.<ms>) and a timestamped old-copy path so nothing needs to be rm'd before the extract. After the new tree is mv'd into place, the previous rollback copy is rotated aside with a .<ms> suffix (best-effort) and this apply's old copy becomes the new web-ui.bak. If the final mv fails, the staged old is restored so nginx keeps serving. handle_update_check manifest override handle_update_check takes the git path whenever ~/archy/.git exists. On the dev box (.116) that meant the Pull & Rebuild button was always the only option even though the manifest-path OTA was already wired via ARCHIPELAGO_UPDATE_URL. Now: if that env var is set, we skip the git detection entirely and use the manifest path. The regular fleet (no env var, no repo) hits the manifest branch naturally; beta dev nodes (repo + no env var) still get Pull & Rebuild; dev nodes with the env var explicitly set can finally test the manifest OTA end-to-end. Artefacts: archipelago 356e78cc…91a6dd 40372288 archipelago-frontend-1.7.6-alpha.tar.gz 4fb79664…0172e9 76984615 (reused) Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-20 12:33:10 -04:00
release(v1.7.10-alpha): apply namespace fix + FIPS cascade + profile polish THE apply fix archipelago.service uses ProtectSystem=strict, so /opt and /usr are read-only inside the service's mount namespace. sudo inherits that namespace — every sudo mkdir/mv/chown from apply_update was hitting EROFS even as root. Every prior "Failed to apply update" was a symptom of this. New `host_sudo()` helper wraps every filesystem call in `sudo systemd-run --wait --collect --pipe -- <cmd>`, which spawns a transient unit with systemd's default (no ProtectSystem) protections — the command runs in the host namespace and can touch /opt/archipelago + /usr/local/bin normally. FIPS cascade (#2) Home.vue and Server.vue both carry a FIPS row that previously only looked at {installed, service_active, key_present}. Now they also read anchor_connected + authenticated_peer_count and mirror the full FIPS card: green "Active · N peers" when healthy, orange "No anchor" when the DHT bootstrap has failed. Profile paste URL fallback (#4) Web5Identities.vue list + editor previously had `@error="display:none"` on the <img>, which hid the tag without re-rendering the fallback — a broken pasted URL showed up blank. Replaced with reactive pictureLoadFailed / listPictureFailed flags plus a watcher that resets on URL change. Broken URL now falls back to the initial (or identicon for seed-derived identities). Small-upload data URL (#3) Uploaded profile pictures ≤ 64 KB are now inlined as `data:image/png;base64,...` into profile.picture on the client before calling update-profile. That kind-0 event is fetchable by any Nostr client — no Tor needed. Larger uploads fall back to the onion-rooted public_url with a hint telling the user to paste a public https:// URL for broader visibility. Deferred: #1 FIPS Reconnect "actually fixes" — the current Reconnect calls fips.restart which clears the daemon state, but when the anchor is truly unreachable (UDP 8668 blocked by network/ISP), no amount of restart can help. 
A richer diagnostic is out of scope for this bundle. Artefacts: archipelago 4a77c704…82aa6f8 40379696 archipelago-frontend-1.7.10-alpha.tar.gz 0644a436…54f58 76983846 Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-20 13:46:03 -04:00
// All sudo calls that touch /opt/archipelago go through
// host_sudo so they see a normal root mount namespace.
let mk = host_sudo(&["mkdir", "-p", &staging_new])
.await
.context("Failed to create frontend staging dir")?;
if !mk.success() {
release(v1.7.6-alpha): robust apply_update + manifest-override env var apply_update frontend swap Transient EROFS on .198 (filesystem hiccup — root FS mounts with errors=remount-ro so a fleeting glitch can bounce /opt to RO for a moment) caught the pre-cleanup `rm -rf web-ui.new web-ui.bak` mid- stride and aborted the apply. Rewrote the swap to use a timestamped staging dir (web-ui.new.<ms>) and a timestamped old-copy path so nothing needs to be rm'd before the extract. After the new tree is mv'd into place, the previous rollback copy is rotated aside with a .<ms> suffix (best-effort) and this apply's old copy becomes the new web-ui.bak. If the final mv fails, the staged old is restored so nginx keeps serving. handle_update_check manifest override handle_update_check takes the git path whenever ~/archy/.git exists. On the dev box (.116) that meant the Pull & Rebuild button was always the only option even though the manifest-path OTA was already wired via ARCHIPELAGO_UPDATE_URL. Now: if that env var is set, we skip the git detection entirely and use the manifest path. The regular fleet (no env var, no repo) hits the manifest branch naturally; beta dev nodes (repo + no env var) still get Pull & Rebuild; dev nodes with the env var explicitly set can finally test the manifest OTA end-to-end. Artefacts: archipelago 356e78cc…91a6dd 40372288 archipelago-frontend-1.7.6-alpha.tar.gz 4fb79664…0172e9 76984615 (reused) Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-20 12:33:10 -04:00
anyhow::bail!("mkdir {} failed", staging_new);
}
let extract =
host_sudo(&["tar", "-xzf", &src.to_string_lossy(), "-C", &staging_new])
.await
.with_context(|| format!("Failed to extract {}", name))?;
release(v1.7.6-alpha): robust apply_update + manifest-override env var apply_update frontend swap Transient EROFS on .198 (filesystem hiccup — root FS mounts with errors=remount-ro so a fleeting glitch can bounce /opt to RO for a moment) caught the pre-cleanup `rm -rf web-ui.new web-ui.bak` mid- stride and aborted the apply. Rewrote the swap to use a timestamped staging dir (web-ui.new.<ms>) and a timestamped old-copy path so nothing needs to be rm'd before the extract. After the new tree is mv'd into place, the previous rollback copy is rotated aside with a .<ms> suffix (best-effort) and this apply's old copy becomes the new web-ui.bak. If the final mv fails, the staged old is restored so nginx keeps serving. handle_update_check manifest override handle_update_check takes the git path whenever ~/archy/.git exists. On the dev box (.116) that meant the Pull & Rebuild button was always the only option even though the manifest-path OTA was already wired via ARCHIPELAGO_UPDATE_URL. Now: if that env var is set, we skip the git detection entirely and use the manifest path. The regular fleet (no env var, no repo) hits the manifest branch naturally; beta dev nodes (repo + no env var) still get Pull & Rebuild; dev nodes with the env var explicitly set can finally test the manifest OTA end-to-end. Artefacts: archipelago 356e78cc…91a6dd 40372288 archipelago-frontend-1.7.6-alpha.tar.gz 4fb79664…0172e9 76984615 (reused) Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-20 12:33:10 -04:00
if !extract.success() {
release(v1.7.10-alpha): apply namespace fix + FIPS cascade + profile polish THE apply fix archipelago.service uses ProtectSystem=strict, so /opt and /usr are read-only inside the service's mount namespace. sudo inherits that namespace — every sudo mkdir/mv/chown from apply_update was hitting EROFS even as root. Every prior "Failed to apply update" was a symptom of this. New `host_sudo()` helper wraps every filesystem call in `sudo systemd-run --wait --collect --pipe -- <cmd>`, which spawns a transient unit with systemd's default (no ProtectSystem) protections — the command runs in the host namespace and can touch /opt/archipelago + /usr/local/bin normally. FIPS cascade (#2) Home.vue and Server.vue both carry a FIPS row that previously only looked at {installed, service_active, key_present}. Now they also read anchor_connected + authenticated_peer_count and mirror the full FIPS card: green "Active · N peers" when healthy, orange "No anchor" when the DHT bootstrap has failed. Profile paste URL fallback (#4) Web5Identities.vue list + editor previously had `@error="display:none"` on the <img>, which hid the tag without re-rendering the fallback — a broken pasted URL showed up blank. Replaced with reactive pictureLoadFailed / listPictureFailed flags plus a watcher that resets on URL change. Broken URL now falls back to the initial (or identicon for seed-derived identities). Small-upload data URL (#3) Uploaded profile pictures ≤ 64 KB are now inlined as `data:image/png;base64,...` into profile.picture on the client before calling update-profile. That kind-0 event is fetchable by any Nostr client — no Tor needed. Larger uploads fall back to the onion-rooted public_url with a hint telling the user to paste a public https:// URL for broader visibility. Deferred: #1 FIPS Reconnect "actually fixes" — the current Reconnect calls fips.restart which clears the daemon state, but when the anchor is truly unreachable (UDP 8668 blocked by network/ISP), no amount of restart can help. 
A richer diagnostic is out of scope for this bundle. Artefacts: archipelago 4a77c704…82aa6f8 40379696 archipelago-frontend-1.7.10-alpha.tar.gz 0644a436…54f58 76983846 Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-20 13:46:03 -04:00
let _ = host_sudo(&["rm", "-rf", &staging_new]).await;
anyhow::bail!("tar extraction failed for {}", name);
}
let _ = host_sudo(&["chown", "-R", "archipelago:archipelago", &staging_new]).await;
release(v1.7.6-alpha): robust apply_update + manifest-override env var apply_update frontend swap Transient EROFS on .198 (filesystem hiccup — root FS mounts with errors=remount-ro so a fleeting glitch can bounce /opt to RO for a moment) caught the pre-cleanup `rm -rf web-ui.new web-ui.bak` mid- stride and aborted the apply. Rewrote the swap to use a timestamped staging dir (web-ui.new.<ms>) and a timestamped old-copy path so nothing needs to be rm'd before the extract. After the new tree is mv'd into place, the previous rollback copy is rotated aside with a .<ms> suffix (best-effort) and this apply's old copy becomes the new web-ui.bak. If the final mv fails, the staged old is restored so nginx keeps serving. handle_update_check manifest override handle_update_check takes the git path whenever ~/archy/.git exists. On the dev box (.116) that meant the Pull & Rebuild button was always the only option even though the manifest-path OTA was already wired via ARCHIPELAGO_UPDATE_URL. Now: if that env var is set, we skip the git detection entirely and use the manifest path. The regular fleet (no env var, no repo) hits the manifest branch naturally; beta dev nodes (repo + no env var) still get Pull & Rebuild; dev nodes with the env var explicitly set can finally test the manifest OTA end-to-end. Artefacts: archipelago 356e78cc…91a6dd 40372288 archipelago-frontend-1.7.6-alpha.tar.gz 4fb79664…0172e9 76984615 (reused) Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-20 12:33:10 -04:00
release(v1.7.39-alpha): hotfix web-ui perms after OTA (nginx 500) + startup self-heal v1.7.38 shipped with an OTA bug: the tar-extracted staging dir inherited 700 perms and nginx (www-data) returned 500/403 on every request after the swap. .116 hit this on rollout; had to chmod by hand to recover. - update.rs: after extraction, explicitly chmod 755 dirs + 644 files on the new staging dir before the mv into place, so nginx can stat/serve them. - main.rs: self-heal on startup — if /opt/archipelago/web-ui is not world-readable, run `sudo chmod -R u=rwX,go=rX` to repair. This is what rescues nodes upgrading from v1.7.37/v1.7.38, since their extractor (running on the old binary) doesn't have the chmod fix yet — the new binary's first boot fixes the mess before nginx serves a single request. Everything v1.7.38 shipped is still in this release: - auth.rs auto-heals is_onboarding_complete() from setup_complete + password_hash so nodes don't bounce back to /onboarding/intro after browser clear / reboot / update - useOnboarding tri-state: backend-unreachable no longer defaults to intro - login sounds gated by isFirstInstallPhase() — silent after onboarding, typing sounds unaffected - FIPS app / Nostr Relay / Nostr VPN / Routstr / Penpot removed from catalog + frontend + Rust + docker + icons; 15 image versions deleted from tx1138, .168, gitea-local - AIUI baked into release tarball via demo/aiui/ - prebuild hook syncs app-catalog/catalog.json → public/catalog.json Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-22 13:26:54 -04:00
// Set world-readable perms so nginx (runs as www-data)
// can stat + serve the files. Without this, the tar
// extraction inherits the staging-dir's 700 mode and
// nginx returns 403/500 for every request after the
// swap — exactly what bit .116 on the v1.7.38 rollout.
let _ = host_sudo(&["chmod", "755", &staging_new]).await;
let _ = host_sudo(&[
"find",
&staging_new,
"-type",
"d",
"-exec",
"chmod",
"755",
"{}",
"+",
release(v1.7.39-alpha): hotfix web-ui perms after OTA (nginx 500) + startup self-heal v1.7.38 shipped with an OTA bug: the tar-extracted staging dir inherited 700 perms and nginx (www-data) returned 500/403 on every request after the swap. .116 hit this on rollout; had to chmod by hand to recover. - update.rs: after extraction, explicitly chmod 755 dirs + 644 files on the new staging dir before the mv into place, so nginx can stat/serve them. - main.rs: self-heal on startup — if /opt/archipelago/web-ui is not world-readable, run `sudo chmod -R u=rwX,go=rX` to repair. This is what rescues nodes upgrading from v1.7.37/v1.7.38, since their extractor (running on the old binary) doesn't have the chmod fix yet — the new binary's first boot fixes the mess before nginx serves a single request. Everything v1.7.38 shipped is still in this release: - auth.rs auto-heals is_onboarding_complete() from setup_complete + password_hash so nodes don't bounce back to /onboarding/intro after browser clear / reboot / update - useOnboarding tri-state: backend-unreachable no longer defaults to intro - login sounds gated by isFirstInstallPhase() — silent after onboarding, typing sounds unaffected - FIPS app / Nostr Relay / Nostr VPN / Routstr / Penpot removed from catalog + frontend + Rust + docker + icons; 15 image versions deleted from tx1138, .168, gitea-local - AIUI baked into release tarball via demo/aiui/ - prebuild hook syncs app-catalog/catalog.json → public/catalog.json Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-22 13:26:54 -04:00
])
.await;
let _ = host_sudo(&[
"find",
&staging_new,
"-type",
"f",
"-exec",
"chmod",
"644",
"{}",
"+",
release(v1.7.39-alpha): hotfix web-ui perms after OTA (nginx 500) + startup self-heal v1.7.38 shipped with an OTA bug: the tar-extracted staging dir inherited 700 perms and nginx (www-data) returned 500/403 on every request after the swap. .116 hit this on rollout; had to chmod by hand to recover. - update.rs: after extraction, explicitly chmod 755 dirs + 644 files on the new staging dir before the mv into place, so nginx can stat/serve them. - main.rs: self-heal on startup — if /opt/archipelago/web-ui is not world-readable, run `sudo chmod -R u=rwX,go=rX` to repair. This is what rescues nodes upgrading from v1.7.37/v1.7.38, since their extractor (running on the old binary) doesn't have the chmod fix yet — the new binary's first boot fixes the mess before nginx serves a single request. Everything v1.7.38 shipped is still in this release: - auth.rs auto-heals is_onboarding_complete() from setup_complete + password_hash so nodes don't bounce back to /onboarding/intro after browser clear / reboot / update - useOnboarding tri-state: backend-unreachable no longer defaults to intro - login sounds gated by isFirstInstallPhase() — silent after onboarding, typing sounds unaffected - FIPS app / Nostr Relay / Nostr VPN / Routstr / Penpot removed from catalog + frontend + Rust + docker + icons; 15 image versions deleted from tx1138, .168, gitea-local - AIUI baked into release tarball via demo/aiui/ - prebuild hook syncs app-catalog/catalog.json → public/catalog.json Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-22 13:26:54 -04:00
])
.await;
release(v1.7.14-alpha): install overlay + FIPS real fix + AIUI restore Install UX SystemUpdate.vue now shows a full-screen overlay after apply: the BitcoinFaceAscii logo, a target-version label, an indeterminate progress stripe (solid orange; solid green on ready), and an elapsed-time readout. Polls /health every 1.5s and auto-reloads once the backend reports the new version. 3-min stall → "Reload now" button. Download UI also shows a spinner + "Finishing download — verifying checksum…" while the fake bar sits at 95%. FIPS reconnect — for real this time New fips.reconnect RPC does stop → start → wait 20s → re-poll → classify. Classification buckets: connected / daemon_down / no_seed_key / no_outbound_udp_or_anchor_down / peers_but_no_anchor, each with a plain-language hint surfaced verbatim by the Reconnect button. The real reason nodes like .198/.253 couldn't reach the anchor: identity::write_fips_key_from_seed was writing fips_key.pub as a bech32 npub TEXT file, but upstream fips expects 32 raw bytes. The daemon silently authenticated with garbage. Fix: PublicKey::to_bytes() → raw 32 bytes, and new fips::config::normalize_pub_file migrates legacy files by decoding the npub and rewriting in place. fips.reconnect also re-installs the config + healed keys to /etc/fips before restarting. AIUI preservation + restore apply_update was wiping /opt/archipelago/web-ui/aiui because the Vue build doesn't include it — every OTA lost the Claude sidebar. The preserve block now copies aiui/ + archipelago-companion.apk from the old web-ui into the staging dir before the swap, and prefers new-tar versions if present. To restore it on the three nodes that already lost it (.116/.198/.253), this release bundles the 85 MB aiui build into the frontend tarball. Frontend component size is now ~155 MB. Download / install timeouts Backend download client timeout 1800s → 3600s (1 h). Larger tarball + slow gitea raw throughput put us above the old cap. 
Frontend update.download rpc timeout 30 min → 65 min to match. package.install rpc timeout 15 min → 45 min — IndeedHub pulls 6 images and was timing out mid-install. UI nit "Rollback to Previous" → "Rollback Available". App-catalog proxy already landed in v1.7.13. Artefacts: archipelago 725e18e6…3c525e6 40462288 archipelago-frontend-1.7.14-alpha.tar.gz c35284be…ff2c16 162077052 (+aiui) Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-20 16:40:25 -04:00
// Preserve paths that are installed outside the Vue build
// (baked in by the ISO or sibling installers) and so
// aren't in the new tarball. Without this copy, every OTA
// wipes them — notably aiui/ (Claude Code sidebar) and
// the companion APK. `cp -a` preserves mode/ownership.
for preserved in ["aiui", "archipelago-companion.apk"] {
let src = format!("{}/{}", web_ui, preserved);
let dst = format!("{}/{}", staging_new, preserved);
// Only preserve the old copy if the new tarball
// doesn't already ship a fresher one.
if Path::new(&src).exists() && !Path::new(&dst).exists() {
let _ = host_sudo(&["cp", "-a", &src, &dst]).await;
}
}
release(v1.7.6-alpha): robust apply_update + manifest-override env var apply_update frontend swap Transient EROFS on .198 (filesystem hiccup — root FS mounts with errors=remount-ro so a fleeting glitch can bounce /opt to RO for a moment) caught the pre-cleanup `rm -rf web-ui.new web-ui.bak` mid-stride and aborted the apply. Rewrote the swap to use a timestamped staging dir (web-ui.new.<ms>) and a timestamped old-copy path so nothing needs to be rm'd before the extract. After the new tree is mv'd into place, the previous rollback copy is rotated aside with a .<ms> suffix (best-effort) and this apply's old copy becomes the new web-ui.bak. If the final mv fails, the staged old is restored so nginx keeps serving. handle_update_check manifest override handle_update_check takes the git path whenever ~/archy/.git exists. On the dev box (.116) that meant the Pull & Rebuild button was always the only option even though the manifest-path OTA was already wired via ARCHIPELAGO_UPDATE_URL. Now: if that env var is set, we skip the git detection entirely and use the manifest path. The regular fleet (no env var, no repo) hits the manifest branch naturally; beta dev nodes (repo + no env var) still get Pull & Rebuild; dev nodes with the env var explicitly set can finally test the manifest OTA end-to-end. Artefacts: archipelago 356e78cc…91a6dd 40372288 archipelago-frontend-1.7.6-alpha.tar.gz 4fb79664…0172e9 76984615 (reused) Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-20 12:33:10 -04:00
// Swap: mv current web-ui aside, then mv new into place.
if Path::new(web_ui).exists() {
release(v1.7.10-alpha): apply namespace fix + FIPS cascade + profile polish THE apply fix archipelago.service uses ProtectSystem=strict, so /opt and /usr are read-only inside the service's mount namespace. sudo inherits that namespace — every sudo mkdir/mv/chown from apply_update was hitting EROFS even as root. Every prior "Failed to apply update" was a symptom of this. New `host_sudo()` helper wraps every filesystem call in `sudo systemd-run --wait --collect --pipe -- <cmd>`, which spawns a transient unit with systemd's default (no ProtectSystem) protections — the command runs in the host namespace and can touch /opt/archipelago + /usr/local/bin normally. FIPS cascade (#2) Home.vue and Server.vue both carry a FIPS row that previously only looked at {installed, service_active, key_present}. Now they also read anchor_connected + authenticated_peer_count and mirror the full FIPS card: green "Active · N peers" when healthy, orange "No anchor" when the DHT bootstrap has failed. Profile paste URL fallback (#4) Web5Identities.vue list + editor previously had `@error="display:none"` on the <img>, which hid the tag without re-rendering the fallback — a broken pasted URL showed up blank. Replaced with reactive pictureLoadFailed / listPictureFailed flags plus a watcher that resets on URL change. Broken URL now falls back to the initial (or identicon for seed-derived identities). Small-upload data URL (#3) Uploaded profile pictures ≤ 64 KB are now inlined as `data:image/png;base64,...` into profile.picture on the client before calling update-profile. That kind-0 event is fetchable by any Nostr client — no Tor needed. Larger uploads fall back to the onion-rooted public_url with a hint telling the user to paste a public https:// URL for broader visibility. Deferred: #1 FIPS Reconnect "actually fixes" — the current Reconnect calls fips.restart which clears the daemon state, but when the anchor is truly unreachable (UDP 8668 blocked by network/ISP), no amount of restart can help. 
A richer diagnostic is out of scope for this bundle. Artefacts: archipelago 4a77c704…82aa6f8 40379696 archipelago-frontend-1.7.10-alpha.tar.gz 0644a436…54f58 76983846 Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-20 13:46:03 -04:00
let mv_old = host_sudo(&["mv", web_ui, &staging_old])
.await
.context("Failed to rotate old web-ui")?;
if !mv_old.success() {
anyhow::bail!("failed to move old web-ui aside");
}
}
release(v1.7.10-alpha): apply namespace fix + FIPS cascade + profile polish THE apply fix archipelago.service uses ProtectSystem=strict, so /opt and /usr are read-only inside the service's mount namespace. sudo inherits that namespace — every sudo mkdir/mv/chown from apply_update was hitting EROFS even as root. Every prior "Failed to apply update" was a symptom of this. New `host_sudo()` helper wraps every filesystem call in `sudo systemd-run --wait --collect --pipe -- <cmd>`, which spawns a transient unit with systemd's default (no ProtectSystem) protections — the command runs in the host namespace and can touch /opt/archipelago + /usr/local/bin normally. FIPS cascade (#2) Home.vue and Server.vue both carry a FIPS row that previously only looked at {installed, service_active, key_present}. Now they also read anchor_connected + authenticated_peer_count and mirror the full FIPS card: green "Active · N peers" when healthy, orange "No anchor" when the DHT bootstrap has failed. Profile paste URL fallback (#4) Web5Identities.vue list + editor previously had `@error="display:none"` on the <img>, which hid the tag without re-rendering the fallback — a broken pasted URL showed up blank. Replaced with reactive pictureLoadFailed / listPictureFailed flags plus a watcher that resets on URL change. Broken URL now falls back to the initial (or identicon for seed-derived identities). Small-upload data URL (#3) Uploaded profile pictures ≤ 64 KB are now inlined as `data:image/png;base64,...` into profile.picture on the client before calling update-profile. That kind-0 event is fetchable by any Nostr client — no Tor needed. Larger uploads fall back to the onion-rooted public_url with a hint telling the user to paste a public https:// URL for broader visibility. Deferred: #1 FIPS Reconnect "actually fixes" — the current Reconnect calls fips.restart which clears the daemon state, but when the anchor is truly unreachable (UDP 8668 blocked by network/ISP), no amount of restart can help. 
A richer diagnostic is out of scope for this bundle. Artefacts: archipelago 4a77c704…82aa6f8 40379696 archipelago-frontend-1.7.10-alpha.tar.gz 0644a436…54f58 76983846 Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-20 13:46:03 -04:00
let mv_new = host_sudo(&["mv", &staging_new, web_ui])
.await
.context("Failed to swap new web-ui into place")?;
if !mv_new.success() {
release(v1.7.6-alpha): robust apply_update + manifest-override env var apply_update frontend swap Transient EROFS on .198 (filesystem hiccup — root FS mounts with errors=remount-ro so a fleeting glitch can bounce /opt to RO for a moment) caught the pre-cleanup `rm -rf web-ui.new web-ui.bak` mid-stride and aborted the apply. Rewrote the swap to use a timestamped staging dir (web-ui.new.<ms>) and a timestamped old-copy path so nothing needs to be rm'd before the extract. After the new tree is mv'd into place, the previous rollback copy is rotated aside with a .<ms> suffix (best-effort) and this apply's old copy becomes the new web-ui.bak. If the final mv fails, the staged old is restored so nginx keeps serving. handle_update_check manifest override handle_update_check takes the git path whenever ~/archy/.git exists. On the dev box (.116) that meant the Pull & Rebuild button was always the only option even though the manifest-path OTA was already wired via ARCHIPELAGO_UPDATE_URL. Now: if that env var is set, we skip the git detection entirely and use the manifest path. The regular fleet (no env var, no repo) hits the manifest branch naturally; beta dev nodes (repo + no env var) still get Pull & Rebuild; dev nodes with the env var explicitly set can finally test the manifest OTA end-to-end. Artefacts: archipelago 356e78cc…91a6dd 40372288 archipelago-frontend-1.7.6-alpha.tar.gz 4fb79664…0172e9 76984615 (reused) Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-20 12:33:10 -04:00
if Path::new(&staging_old).exists() {
release(v1.7.10-alpha): apply namespace fix + FIPS cascade + profile polish THE apply fix archipelago.service uses ProtectSystem=strict, so /opt and /usr are read-only inside the service's mount namespace. sudo inherits that namespace — every sudo mkdir/mv/chown from apply_update was hitting EROFS even as root. Every prior "Failed to apply update" was a symptom of this. New `host_sudo()` helper wraps every filesystem call in `sudo systemd-run --wait --collect --pipe -- <cmd>`, which spawns a transient unit with systemd's default (no ProtectSystem) protections — the command runs in the host namespace and can touch /opt/archipelago + /usr/local/bin normally. FIPS cascade (#2) Home.vue and Server.vue both carry a FIPS row that previously only looked at {installed, service_active, key_present}. Now they also read anchor_connected + authenticated_peer_count and mirror the full FIPS card: green "Active · N peers" when healthy, orange "No anchor" when the DHT bootstrap has failed. Profile paste URL fallback (#4) Web5Identities.vue list + editor previously had `@error="display:none"` on the <img>, which hid the tag without re-rendering the fallback — a broken pasted URL showed up blank. Replaced with reactive pictureLoadFailed / listPictureFailed flags plus a watcher that resets on URL change. Broken URL now falls back to the initial (or identicon for seed-derived identities). Small-upload data URL (#3) Uploaded profile pictures ≤ 64 KB are now inlined as `data:image/png;base64,...` into profile.picture on the client before calling update-profile. That kind-0 event is fetchable by any Nostr client — no Tor needed. Larger uploads fall back to the onion-rooted public_url with a hint telling the user to paste a public https:// URL for broader visibility. Deferred: #1 FIPS Reconnect "actually fixes" — the current Reconnect calls fips.restart which clears the daemon state, but when the anchor is truly unreachable (UDP 8668 blocked by network/ISP), no amount of restart can help. 
A richer diagnostic is out of scope for this bundle. Artefacts: archipelago 4a77c704…82aa6f8 40379696 archipelago-frontend-1.7.10-alpha.tar.gz 0644a436…54f58 76983846 Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-20 13:46:03 -04:00
let _ = host_sudo(&["mv", &staging_old, web_ui]).await;
release(v1.7.6-alpha): robust apply_update + manifest-override env var apply_update frontend swap Transient EROFS on .198 (filesystem hiccup — root FS mounts with errors=remount-ro so a fleeting glitch can bounce /opt to RO for a moment) caught the pre-cleanup `rm -rf web-ui.new web-ui.bak` mid-stride and aborted the apply. Rewrote the swap to use a timestamped staging dir (web-ui.new.<ms>) and a timestamped old-copy path so nothing needs to be rm'd before the extract. After the new tree is mv'd into place, the previous rollback copy is rotated aside with a .<ms> suffix (best-effort) and this apply's old copy becomes the new web-ui.bak. If the final mv fails, the staged old is restored so nginx keeps serving. handle_update_check manifest override handle_update_check takes the git path whenever ~/archy/.git exists. On the dev box (.116) that meant the Pull & Rebuild button was always the only option even though the manifest-path OTA was already wired via ARCHIPELAGO_UPDATE_URL. Now: if that env var is set, we skip the git detection entirely and use the manifest path. The regular fleet (no env var, no repo) hits the manifest branch naturally; beta dev nodes (repo + no env var) still get Pull & Rebuild; dev nodes with the env var explicitly set can finally test the manifest OTA end-to-end. Artefacts: archipelago 356e78cc…91a6dd 40372288 archipelago-frontend-1.7.6-alpha.tar.gz 4fb79664…0172e9 76984615 (reused) Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-20 12:33:10 -04:00
}
anyhow::bail!("failed to move new web-ui into place");
}
release(v1.7.6-alpha): robust apply_update + manifest-override env var apply_update frontend swap Transient EROFS on .198 (filesystem hiccup — root FS mounts with errors=remount-ro so a fleeting glitch can bounce /opt to RO for a moment) caught the pre-cleanup `rm -rf web-ui.new web-ui.bak` mid-stride and aborted the apply. Rewrote the swap to use a timestamped staging dir (web-ui.new.<ms>) and a timestamped old-copy path so nothing needs to be rm'd before the extract. After the new tree is mv'd into place, the previous rollback copy is rotated aside with a .<ms> suffix (best-effort) and this apply's old copy becomes the new web-ui.bak. If the final mv fails, the staged old is restored so nginx keeps serving. handle_update_check manifest override handle_update_check takes the git path whenever ~/archy/.git exists. On the dev box (.116) that meant the Pull & Rebuild button was always the only option even though the manifest-path OTA was already wired via ARCHIPELAGO_UPDATE_URL. Now: if that env var is set, we skip the git detection entirely and use the manifest path. The regular fleet (no env var, no repo) hits the manifest branch naturally; beta dev nodes (repo + no env var) still get Pull & Rebuild; dev nodes with the env var explicitly set can finally test the manifest OTA end-to-end. Artefacts: archipelago 356e78cc…91a6dd 40372288 archipelago-frontend-1.7.6-alpha.tar.gz 4fb79664…0172e9 76984615 (reused) Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-20 12:33:10 -04:00
release(v1.7.10-alpha): apply namespace fix + FIPS cascade + profile polish THE apply fix archipelago.service uses ProtectSystem=strict, so /opt and /usr are read-only inside the service's mount namespace. sudo inherits that namespace — every sudo mkdir/mv/chown from apply_update was hitting EROFS even as root. Every prior "Failed to apply update" was a symptom of this. New `host_sudo()` helper wraps every filesystem call in `sudo systemd-run --wait --collect --pipe -- <cmd>`, which spawns a transient unit with systemd's default (no ProtectSystem) protections — the command runs in the host namespace and can touch /opt/archipelago + /usr/local/bin normally. FIPS cascade (#2) Home.vue and Server.vue both carry a FIPS row that previously only looked at {installed, service_active, key_present}. Now they also read anchor_connected + authenticated_peer_count and mirror the full FIPS card: green "Active · N peers" when healthy, orange "No anchor" when the DHT bootstrap has failed. Profile paste URL fallback (#4) Web5Identities.vue list + editor previously had `@error="display:none"` on the <img>, which hid the tag without re-rendering the fallback — a broken pasted URL showed up blank. Replaced with reactive pictureLoadFailed / listPictureFailed flags plus a watcher that resets on URL change. Broken URL now falls back to the initial (or identicon for seed-derived identities). Small-upload data URL (#3) Uploaded profile pictures ≤ 64 KB are now inlined as `data:image/png;base64,...` into profile.picture on the client before calling update-profile. That kind-0 event is fetchable by any Nostr client — no Tor needed. Larger uploads fall back to the onion-rooted public_url with a hint telling the user to paste a public https:// URL for broader visibility. Deferred: #1 FIPS Reconnect "actually fixes" — the current Reconnect calls fips.restart which clears the daemon state, but when the anchor is truly unreachable (UDP 8668 blocked by network/ISP), no amount of restart can help. 
A richer diagnostic is out of scope for this bundle. Artefacts: archipelago 4a77c704…82aa6f8 40379696 archipelago-frontend-1.7.10-alpha.tar.gz 0644a436…54f58 76983846 Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-20 13:46:03 -04:00
// Rotate previous rollback aside and install this apply's
// old copy as the new rollback.
release(v1.7.6-alpha): robust apply_update + manifest-override env var apply_update frontend swap Transient EROFS on .198 (filesystem hiccup — root FS mounts with errors=remount-ro so a fleeting glitch can bounce /opt to RO for a moment) caught the pre-cleanup `rm -rf web-ui.new web-ui.bak` mid-stride and aborted the apply. Rewrote the swap to use a timestamped staging dir (web-ui.new.<ms>) and a timestamped old-copy path so nothing needs to be rm'd before the extract. After the new tree is mv'd into place, the previous rollback copy is rotated aside with a .<ms> suffix (best-effort) and this apply's old copy becomes the new web-ui.bak. If the final mv fails, the staged old is restored so nginx keeps serving. handle_update_check manifest override handle_update_check takes the git path whenever ~/archy/.git exists. On the dev box (.116) that meant the Pull & Rebuild button was always the only option even though the manifest-path OTA was already wired via ARCHIPELAGO_UPDATE_URL. Now: if that env var is set, we skip the git detection entirely and use the manifest path. The regular fleet (no env var, no repo) hits the manifest branch naturally; beta dev nodes (repo + no env var) still get Pull & Rebuild; dev nodes with the env var explicitly set can finally test the manifest OTA end-to-end. Artefacts: archipelago 356e78cc…91a6dd 40372288 archipelago-frontend-1.7.6-alpha.tar.gz 4fb79664…0172e9 76984615 (reused) Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-20 12:33:10 -04:00
if Path::new(&staging_old).exists() {
if Path::new(backup_path).exists() {
let _ = host_sudo(&["mv", backup_path, &format!("{}.{}", backup_path, ts)])
.await;
release(v1.7.10-alpha): apply namespace fix + FIPS cascade + profile polish THE apply fix archipelago.service uses ProtectSystem=strict, so /opt and /usr are read-only inside the service's mount namespace. sudo inherits that namespace — every sudo mkdir/mv/chown from apply_update was hitting EROFS even as root. Every prior "Failed to apply update" was a symptom of this. New `host_sudo()` helper wraps every filesystem call in `sudo systemd-run --wait --collect --pipe -- <cmd>`, which spawns a transient unit with systemd's default (no ProtectSystem) protections — the command runs in the host namespace and can touch /opt/archipelago + /usr/local/bin normally. FIPS cascade (#2) Home.vue and Server.vue both carry a FIPS row that previously only looked at {installed, service_active, key_present}. Now they also read anchor_connected + authenticated_peer_count and mirror the full FIPS card: green "Active · N peers" when healthy, orange "No anchor" when the DHT bootstrap has failed. Profile paste URL fallback (#4) Web5Identities.vue list + editor previously had `@error="display:none"` on the <img>, which hid the tag without re-rendering the fallback — a broken pasted URL showed up blank. Replaced with reactive pictureLoadFailed / listPictureFailed flags plus a watcher that resets on URL change. Broken URL now falls back to the initial (or identicon for seed-derived identities). Small-upload data URL (#3) Uploaded profile pictures ≤ 64 KB are now inlined as `data:image/png;base64,...` into profile.picture on the client before calling update-profile. That kind-0 event is fetchable by any Nostr client — no Tor needed. Larger uploads fall back to the onion-rooted public_url with a hint telling the user to paste a public https:// URL for broader visibility. Deferred: #1 FIPS Reconnect "actually fixes" — the current Reconnect calls fips.restart which clears the daemon state, but when the anchor is truly unreachable (UDP 8668 blocked by network/ISP), no amount of restart can help. 
A richer diagnostic is out of scope for this bundle. Artefacts: archipelago 4a77c704…82aa6f8 40379696 archipelago-frontend-1.7.10-alpha.tar.gz 0644a436…54f58 76983846 Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-20 13:46:03 -04:00
}
let _ = host_sudo(&["mv", &staging_old, backup_path]).await;
release(v1.7.6-alpha): robust apply_update + manifest-override env var apply_update frontend swap Transient EROFS on .198 (filesystem hiccup — root FS mounts with errors=remount-ro so a fleeting glitch can bounce /opt to RO for a moment) caught the pre-cleanup `rm -rf web-ui.new web-ui.bak` mid-stride and aborted the apply. Rewrote the swap to use a timestamped staging dir (web-ui.new.<ms>) and a timestamped old-copy path so nothing needs to be rm'd before the extract. After the new tree is mv'd into place, the previous rollback copy is rotated aside with a .<ms> suffix (best-effort) and this apply's old copy becomes the new web-ui.bak. If the final mv fails, the staged old is restored so nginx keeps serving. handle_update_check manifest override handle_update_check takes the git path whenever ~/archy/.git exists. On the dev box (.116) that meant the Pull & Rebuild button was always the only option even though the manifest-path OTA was already wired via ARCHIPELAGO_UPDATE_URL. Now: if that env var is set, we skip the git detection entirely and use the manifest path. The regular fleet (no env var, no repo) hits the manifest branch naturally; beta dev nodes (repo + no env var) still get Pull & Rebuild; dev nodes with the env var explicitly set can finally test the manifest OTA end-to-end. Artefacts: archipelago 356e78cc…91a6dd 40372288 archipelago-frontend-1.7.6-alpha.tar.gz 4fb79664…0172e9 76984615 (reused) Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-20 12:33:10 -04:00
}
info!(name = %name, "Frontend archive extracted to /opt/archipelago/web-ui");
}
_ if name.contains("runtime") && name.ends_with(".tar.gz") => {
let ts = chrono::Utc::now().timestamp_millis();
let staging_new = format!("/opt/archipelago/runtime.new.{}", ts);
let archive = src.to_string_lossy().to_string();
let mk = host_sudo(&["mkdir", "-p", &staging_new])
.await
.context("Failed to create runtime staging dir")?;
if !mk.success() {
anyhow::bail!("mkdir {} failed", staging_new);
}
let extract = host_sudo(&["tar", "-xzf", &archive, "-C", &staging_new])
.await
.with_context(|| format!("Failed to extract {}", name))?;
if !extract.success() {
let _ = host_sudo(&["rm", "-rf", &staging_new]).await;
anyhow::bail!("tar extraction failed for {}", name);
}
let runtime_paths = [
("apps", "apps"),
("scripts", "scripts"),
("docker", "docker"),
(
"image-recipe/configs/archipelago-doctor.service",
"archipelago-doctor.service",
),
(
"image-recipe/configs/archipelago-doctor.timer",
"archipelago-doctor.timer",
),
];
for (relative, label) in runtime_paths {
let staged_path = format!("{}/{}", staging_new, relative);
if !Path::new(&staged_path).exists() {
tracing::debug!(path = %relative, "Runtime artifact path absent, skipping");
continue;
}
match label {
"apps" | "scripts" | "docker" => {
let dest = format!("/opt/archipelago/{}", label);
let tmp_dest =
format!("{}.new.{}", dest, chrono::Utc::now().timestamp_millis());
let _ = host_sudo(&["mkdir", "-p", &tmp_dest]).await;
let staged_dot = format!("{}/.", staged_path);
let copy = host_sudo(&["cp", "-a", &staged_dot, &tmp_dest])
.await
.with_context(|| format!("Failed to copy runtime {}", label))?;
if !copy.success() {
let _ = host_sudo(&["rm", "-rf", &tmp_dest]).await;
anyhow::bail!("runtime copy failed for {}", label);
}
let _ = host_sudo(&["mkdir", "-p", &dest]).await;
let clean = host_sudo(&[
"find",
&dest,
"-mindepth",
"1",
"-maxdepth",
"1",
"-exec",
"rm",
"-rf",
"{}",
"+",
])
.await
.with_context(|| format!("Failed to clean runtime {}", label))?;
if !clean.success() {
let _ = host_sudo(&["rm", "-rf", &tmp_dest]).await;
anyhow::bail!("runtime clean failed for {}", label);
}
let tmp_dot = format!("{}/.", tmp_dest);
let promote = host_sudo(&["cp", "-a", &tmp_dot, &dest])
.await
.with_context(|| format!("Failed to promote runtime {}", label))?;
let _ = host_sudo(&["rm", "-rf", &tmp_dest]).await;
if !promote.success() {
anyhow::bail!("runtime promote failed for {}", label);
}
if label == "scripts" {
let _ = host_sudo(&[
"find", &dest, "-type", "f", "-name", "*.sh", "-exec", "chmod",
"755", "{}", "+",
])
.await;
}
}
"archipelago-doctor.service" | "archipelago-doctor.timer" => {
let dest = format!("/etc/systemd/system/{}", label);
let install = host_sudo(&["install", "-m", "644", &staged_path, &dest])
.await
.with_context(|| format!("Failed to install {}", label))?;
if !install.success() {
anyhow::bail!("runtime unit install failed for {}", label);
}
}
_ => {}
}
}
if Path::new(&format!("{}/scripts/image-versions.sh", staging_new)).exists() {
let _ = host_sudo(&[
"cp",
&format!("{}/scripts/image-versions.sh", staging_new),
"/opt/archipelago/image-versions.sh",
])
.await;
}
let _ = host_sudo(&["systemctl", "daemon-reload"]).await;
let _ =
host_sudo(&["systemctl", "enable", "--now", "archipelago-doctor.timer"]).await;
let _ = host_sudo(&["rm", "-rf", &staging_new]).await;
info!(name = %name, "Runtime assets applied to /opt/archipelago");
}
_ => {
debug!(name = %name, "Unknown component, skipping");
}
}
}
// Update state
release(v1.7.41-alpha): post-OTA auto-rollback so a bad release cannot strand the fleet Closes failure mode FM5 from docs/bulletproof-containers.md: the v1.7.38 + v1.7.39 rollouts left every affected node on an unreachable UI (nginx 500) with no recovery path short of SSH. This release adds a self-check guardrail to the update flow. What changed: - apply_update() writes a pending-verify marker with old+new version and a 150s deadline immediately before scheduling the service restart. - verify_pending_update() runs from main.rs startup. If the marker is present and within its freshness window, the new binary waits 15s for nginx + backend to settle, then probes https://127.0.0.1/ every 5s for up to 90s (self-signed certs accepted). - On any probe success within the window, the marker is cleared and nothing else happens. - On window-exhaust, the new binary: 1. Moves the broken /opt/archipelago/web-ui to web-ui.failed.<ts> (quarantined, not deleted, so we can post-mortem). 2. Restores web-ui.bak on top of web-ui. 3. Calls rollback_update() to restore the previous binary. 4. Updates state.current_version to reflect the rollback. 5. systemctl --no-block restart archipelago so the OLD binary boots. - Markers older than 10 minutes are treated as stale and cleared without probing, so a crashed-during-startup marker from weeks ago cannot spontaneously roll back a healthy node on a later reboot. - rollback_update() binary copy now goes through host_sudo instead of tokio::fs::copy, so it escapes the service's ProtectSystem=strict mount namespace. Without this, the rollback silently failed with EROFS on /usr/local/bin and orphaned the rollback - the exact opposite of what auto-rollback is for. Tests: 4 new unit tests in update::tests covering marker round-trip, absent-marker noop, no-panic on verify_pending_update with nothing to verify, and an invariant assert that the 90s probe window stays below the 600s stale threshold. All passing. 
Side fix: scripts/create-release-manifest.sh was dying with exit 141 (SIGPIPE from tar tvzf pipe head pipe awk) under set -euo pipefail. Replaced with a single awk NR==1 that doesn't short-circuit the upstream pipe, so the release-build flow is idempotent again.
2026-04-22 16:14:35 -04:00
let previous_version = {
let state = load_state(data_dir).await?;
state.current_version.clone()
};
let mut state = load_state(data_dir).await?;
release(v1.7.41-alpha): post-OTA auto-rollback so a bad release cannot strand the fleet Closes failure mode FM5 from docs/bulletproof-containers.md: the v1.7.38 + v1.7.39 rollouts left every affected node on an unreachable UI (nginx 500) with no recovery path short of SSH. This release adds a self-check guardrail to the update flow. What changed: - apply_update() writes a pending-verify marker with old+new version and a 150s deadline immediately before scheduling the service restart. - verify_pending_update() runs from main.rs startup. If the marker is present and within its freshness window, the new binary waits 15s for nginx + backend to settle, then probes https://127.0.0.1/ every 5s for up to 90s (self-signed certs accepted). - On any probe success within the window, the marker is cleared and nothing else happens. - On window-exhaust, the new binary: 1. Moves the broken /opt/archipelago/web-ui to web-ui.failed.<ts> (quarantined, not deleted, so we can post-mortem). 2. Restores web-ui.bak on top of web-ui. 3. Calls rollback_update() to restore the previous binary. 4. Updates state.current_version to reflect the rollback. 5. systemctl --no-block restart archipelago so the OLD binary boots. - Markers older than 10 minutes are treated as stale and cleared without probing, so a crashed-during-startup marker from weeks ago cannot spontaneously roll back a healthy node on a later reboot. - rollback_update() binary copy now goes through host_sudo instead of tokio::fs::copy, so it escapes the service's ProtectSystem=strict mount namespace. Without this, the rollback silently failed with EROFS on /usr/local/bin and orphaned the rollback - the exact opposite of what auto-rollback is for. Tests: 4 new unit tests in update::tests covering marker round-trip, absent-marker noop, no-panic on verify_pending_update with nothing to verify, and an invariant assert that the 90s probe window stays below the 600s stale threshold. All passing. 
Side fix: scripts/create-release-manifest.sh was dying with exit 141 (SIGPIPE from tar tvzf pipe head pipe awk) under set -euo pipefail. Replaced with a single awk NR==1 that doesn't short-circuit the upstream pipe, so the release-build flow is idempotent again.
2026-04-22 16:14:35 -04:00
let new_version = if let Some(manifest) = &state.available_update {
state.current_version = manifest.version.clone();
release(v1.7.41-alpha): post-OTA auto-rollback so a bad release cannot strand the fleet Closes failure mode FM5 from docs/bulletproof-containers.md: the v1.7.38 + v1.7.39 rollouts left every affected node on an unreachable UI (nginx 500) with no recovery path short of SSH. This release adds a self-check guardrail to the update flow. What changed: - apply_update() writes a pending-verify marker with old+new version and a 150s deadline immediately before scheduling the service restart. - verify_pending_update() runs from main.rs startup. If the marker is present and within its freshness window, the new binary waits 15s for nginx + backend to settle, then probes https://127.0.0.1/ every 5s for up to 90s (self-signed certs accepted). - On any probe success within the window, the marker is cleared and nothing else happens. - On window-exhaust, the new binary: 1. Moves the broken /opt/archipelago/web-ui to web-ui.failed.<ts> (quarantined, not deleted, so we can post-mortem). 2. Restores web-ui.bak on top of web-ui. 3. Calls rollback_update() to restore the previous binary. 4. Updates state.current_version to reflect the rollback. 5. systemctl --no-block restart archipelago so the OLD binary boots. - Markers older than 10 minutes are treated as stale and cleared without probing, so a crashed-during-startup marker from weeks ago cannot spontaneously roll back a healthy node on a later reboot. - rollback_update() binary copy now goes through host_sudo instead of tokio::fs::copy, so it escapes the service's ProtectSystem=strict mount namespace. Without this, the rollback silently failed with EROFS on /usr/local/bin and orphaned the rollback - the exact opposite of what auto-rollback is for. Tests: 4 new unit tests in update::tests covering marker round-trip, absent-marker noop, no-panic on verify_pending_update with nothing to verify, and an invariant assert that the 90s probe window stays below the 600s stale threshold. All passing. 
Side fix: scripts/create-release-manifest.sh was dying with exit 141 (SIGPIPE from tar tvzf pipe head pipe awk) under set -euo pipefail. Replaced with a single awk NR==1 that doesn't short-circuit the upstream pipe, so the release-build flow is idempotent again.
2026-04-22 16:14:35 -04:00
manifest.version.clone()
} else {
state.current_version.clone()
};
state.available_update = None;
state.update_in_progress = false;
state.rollback_available = true;
save_state(data_dir, &state).await?;
release(v1.7.41-alpha): post-OTA auto-rollback so a bad release cannot strand the fleet Closes failure mode FM5 from docs/bulletproof-containers.md: the v1.7.38 + v1.7.39 rollouts left every affected node on an unreachable UI (nginx 500) with no recovery path short of SSH. This release adds a self-check guardrail to the update flow. What changed: - apply_update() writes a pending-verify marker with old+new version and a 150s deadline immediately before scheduling the service restart. - verify_pending_update() runs from main.rs startup. If the marker is present and within its freshness window, the new binary waits 15s for nginx + backend to settle, then probes https://127.0.0.1/ every 5s for up to 90s (self-signed certs accepted). - On any probe success within the window, the marker is cleared and nothing else happens. - On window-exhaust, the new binary: 1. Moves the broken /opt/archipelago/web-ui to web-ui.failed.<ts> (quarantined, not deleted, so we can post-mortem). 2. Restores web-ui.bak on top of web-ui. 3. Calls rollback_update() to restore the previous binary. 4. Updates state.current_version to reflect the rollback. 5. systemctl --no-block restart archipelago so the OLD binary boots. - Markers older than 10 minutes are treated as stale and cleared without probing, so a crashed-during-startup marker from weeks ago cannot spontaneously roll back a healthy node on a later reboot. - rollback_update() binary copy now goes through host_sudo instead of tokio::fs::copy, so it escapes the service's ProtectSystem=strict mount namespace. Without this, the rollback silently failed with EROFS on /usr/local/bin and orphaned the rollback - the exact opposite of what auto-rollback is for. Tests: 4 new unit tests in update::tests covering marker round-trip, absent-marker noop, no-panic on verify_pending_update with nothing to verify, and an invariant assert that the 90s probe window stays below the 600s stale threshold. All passing. 
Side fix: scripts/create-release-manifest.sh was dying with exit 141 (SIGPIPE from tar tvzf pipe head pipe awk) under set -euo pipefail. Replaced with a single awk NR==1 that doesn't short-circuit the upstream pipe, so the release-build flow is idempotent again.
2026-04-22 16:14:35 -04:00
// Write the post-OTA verification marker BEFORE we schedule the
// restart. The new binary will read it on startup, probe the
// frontend, and auto-rollback if nginx is serving 5xx. Covers the
// class of failure where "apply succeeds, restart succeeds, but
// the UI is dead" (v1.7.38/39 tarball-perms bug). Best-effort —
// a failed marker write shouldn't abort the apply.
let marker = PendingVerification {
applied_at: chrono::Utc::now().to_rfc3339(),
new_version,
previous_version,
deadline_ts: chrono::Utc::now().timestamp() + PENDING_VERIFY_WINDOW_SECS as i64 + 60,
release(v1.7.41-alpha): post-OTA auto-rollback so a bad release cannot strand the fleet Closes failure mode FM5 from docs/bulletproof-containers.md: the v1.7.38 + v1.7.39 rollouts left every affected node on an unreachable UI (nginx 500) with no recovery path short of SSH. This release adds a self-check guardrail to the update flow. What changed: - apply_update() writes a pending-verify marker with old+new version and a 150s deadline immediately before scheduling the service restart. - verify_pending_update() runs from main.rs startup. If the marker is present and within its freshness window, the new binary waits 15s for nginx + backend to settle, then probes https://127.0.0.1/ every 5s for up to 90s (self-signed certs accepted). - On any probe success within the window, the marker is cleared and nothing else happens. - On window-exhaust, the new binary: 1. Moves the broken /opt/archipelago/web-ui to web-ui.failed.<ts> (quarantined, not deleted, so we can post-mortem). 2. Restores web-ui.bak on top of web-ui. 3. Calls rollback_update() to restore the previous binary. 4. Updates state.current_version to reflect the rollback. 5. systemctl --no-block restart archipelago so the OLD binary boots. - Markers older than 10 minutes are treated as stale and cleared without probing, so a crashed-during-startup marker from weeks ago cannot spontaneously roll back a healthy node on a later reboot. - rollback_update() binary copy now goes through host_sudo instead of tokio::fs::copy, so it escapes the service's ProtectSystem=strict mount namespace. Without this, the rollback silently failed with EROFS on /usr/local/bin and orphaned the rollback - the exact opposite of what auto-rollback is for. Tests: 4 new unit tests in update::tests covering marker round-trip, absent-marker noop, no-panic on verify_pending_update with nothing to verify, and an invariant assert that the 90s probe window stays below the 600s stale threshold. All passing. 
Side fix: scripts/create-release-manifest.sh was dying with exit 141 (SIGPIPE from tar tvzf pipe head pipe awk) under set -euo pipefail. Replaced with a single awk NR==1 that doesn't short-circuit the upstream pipe, so the release-build flow is idempotent again.
2026-04-22 16:14:35 -04:00
};
if let Err(e) = write_pending_verification(data_dir, &marker).await {
tracing::warn!(error = %e, "Failed to write post-OTA verify marker — rollback disabled for this OTA");
} else {
info!("Post-OTA verify marker written; new binary will probe on boot");
}
// Clean staging
let _ = fs::remove_dir_all(&staging_dir).await;
release(v1.7.2-alpha): fix Install Update + identity avatar backfill + label Three user-visible fixes shipped together. 1. update.apply permission-denied apply_update() was doing fs::copy into /usr/local/bin/archipelago and tar xzf into /opt/archipelago as the archipelago user — both root-owned. The backup step succeeded (it wrote to data_dir) but the swap failed with a silent permission denied, wrapped as "Failed to apply archipelago". Now uses `sudo install -m 0755` for the binary and `sudo tar -xzf` for the frontend, plus a post-apply `sudo systemctl --no-block restart archipelago` scheduled 2s after the RPC reply so the UI sees success. 2. Apply → Install label en/es locale strings: applyUpdate / applyTitle / applyNow changed from "Apply" to "Install". Matches the user's mental model and distinguishes the user-facing verb from the internal apply_update() function. 3. Identity avatar backfill Identities created before df83163f had profile=None on disk and so rendered as initials. load_record() now synthesizes an IdentityProfile with a default picture (identicon for regular identities, the hex node SVG for derivation_index=0) when profile is missing. The synthetic profile lives only in the returned record; the file stays untouched so a later explicit Save persists whatever the user actually chose. Artefacts: archipelago 70e5444e…67c589 40381960 archipelago-frontend-1.7.2-alpha.tar.gz 806b027b…358a824 76983699 Changelog rewritten layman-style per saved feedback. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-20 11:25:10 -04:00
info!("Update applied — scheduling service restart in 2s so the RPC reply lands first");
// Restart asynchronously so the JSON-RPC response actually reaches the
// UI before systemd kills us. --no-block makes sure systemctl doesn't
// try to wait for the current service (us) to exit cleanly before
// starting the new process — it would deadlock otherwise.
tokio::spawn(async {
tokio::time::sleep(std::time::Duration::from_secs(2)).await;
release(v1.7.10-alpha): apply namespace fix + FIPS cascade + profile polish THE apply fix archipelago.service uses ProtectSystem=strict, so /opt and /usr are read-only inside the service's mount namespace. sudo inherits that namespace — every sudo mkdir/mv/chown from apply_update was hitting EROFS even as root. Every prior "Failed to apply update" was a symptom of this. New `host_sudo()` helper wraps every filesystem call in `sudo systemd-run --wait --collect --pipe -- <cmd>`, which spawns a transient unit with systemd's default (no ProtectSystem) protections — the command runs in the host namespace and can touch /opt/archipelago + /usr/local/bin normally. FIPS cascade (#2) Home.vue and Server.vue both carry a FIPS row that previously only looked at {installed, service_active, key_present}. Now they also read anchor_connected + authenticated_peer_count and mirror the full FIPS card: green "Active · N peers" when healthy, orange "No anchor" when the DHT bootstrap has failed. Profile paste URL fallback (#4) Web5Identities.vue list + editor previously had `@error="display:none"` on the <img>, which hid the tag without re-rendering the fallback — a broken pasted URL showed up blank. Replaced with reactive pictureLoadFailed / listPictureFailed flags plus a watcher that resets on URL change. Broken URL now falls back to the initial (or identicon for seed-derived identities). Small-upload data URL (#3) Uploaded profile pictures ≤ 64 KB are now inlined as `data:image/png;base64,...` into profile.picture on the client before calling update-profile. That kind-0 event is fetchable by any Nostr client — no Tor needed. Larger uploads fall back to the onion-rooted public_url with a hint telling the user to paste a public https:// URL for broader visibility. Deferred: #1 FIPS Reconnect "actually fixes" — the current Reconnect calls fips.restart which clears the daemon state, but when the anchor is truly unreachable (UDP 8668 blocked by network/ISP), no amount of restart can help. 
A richer diagnostic is out of scope for this bundle. Artefacts: archipelago 4a77c704…82aa6f8 40379696 archipelago-frontend-1.7.10-alpha.tar.gz 0644a436…54f58 76983846 Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-20 13:46:03 -04:00
// systemctl talks to PID 1 over D-Bus — doesn't need the host
// mount namespace, but routing through host_sudo keeps the
// apply flow's sudo calls uniform.
let _ = host_sudo(&["systemctl", "--no-block", "restart", "archipelago"]).await;
release(v1.7.2-alpha): fix Install Update + identity avatar backfill + label Three user-visible fixes shipped together. 1. update.apply permission-denied apply_update() was doing fs::copy into /usr/local/bin/archipelago and tar xzf into /opt/archipelago as the archipelago user — both root-owned. The backup step succeeded (it wrote to data_dir) but the swap failed with a silent permission denied, wrapped as "Failed to apply archipelago". Now uses `sudo install -m 0755` for the binary and `sudo tar -xzf` for the frontend, plus a post-apply `sudo systemctl --no-block restart archipelago` scheduled 2s after the RPC reply so the UI sees success. 2. Apply → Install label en/es locale strings: applyUpdate / applyTitle / applyNow changed from "Apply" to "Install". Matches the user's mental model and distinguishes the user-facing verb from the internal apply_update() function. 3. Identity avatar backfill Identities created before df83163f had profile=None on disk and so rendered as initials. load_record() now synthesizes an IdentityProfile with a default picture (identicon for regular identities, the hex node SVG for derivation_index=0) when profile is missing. The synthetic profile lives only in the returned record; the file stays untouched so a later explicit Save persists whatever the user actually chose. Artefacts: archipelago 70e5444e…67c589 40381960 archipelago-frontend-1.7.2-alpha.tar.gz 806b027b…358a824 76983699 Changelog rewritten layman-style per saved feedback. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-20 11:25:10 -04:00
});
Ok(())
}
/// Rollback to the previous version from backup.
pub async fn rollback_update(data_dir: &Path) -> Result<()> {
let backup_dir = data_dir.join("update-backup");
if !backup_dir.exists() {
anyhow::bail!("No rollback backup available");
}
let backup_binary = backup_dir.join("archipelago");
if backup_binary.exists() {
release(v1.7.41-alpha): post-OTA auto-rollback so a bad release cannot strand the fleet Closes failure mode FM5 from docs/bulletproof-containers.md: the v1.7.38 + v1.7.39 rollouts left every affected node on an unreachable UI (nginx 500) with no recovery path short of SSH. This release adds a self-check guardrail to the update flow. What changed: - apply_update() writes a pending-verify marker with old+new version and a 150s deadline immediately before scheduling the service restart. - verify_pending_update() runs from main.rs startup. If the marker is present and within its freshness window, the new binary waits 15s for nginx + backend to settle, then probes https://127.0.0.1/ every 5s for up to 90s (self-signed certs accepted). - On any probe success within the window, the marker is cleared and nothing else happens. - On window-exhaust, the new binary: 1. Moves the broken /opt/archipelago/web-ui to web-ui.failed.<ts> (quarantined, not deleted, so we can post-mortem). 2. Restores web-ui.bak on top of web-ui. 3. Calls rollback_update() to restore the previous binary. 4. Updates state.current_version to reflect the rollback. 5. systemctl --no-block restart archipelago so the OLD binary boots. - Markers older than 10 minutes are treated as stale and cleared without probing, so a crashed-during-startup marker from weeks ago cannot spontaneously roll back a healthy node on a later reboot. - rollback_update() binary copy now goes through host_sudo instead of tokio::fs::copy, so it escapes the service's ProtectSystem=strict mount namespace. Without this, the rollback silently failed with EROFS on /usr/local/bin and orphaned the rollback - the exact opposite of what auto-rollback is for. Tests: 4 new unit tests in update::tests covering marker round-trip, absent-marker noop, no-panic on verify_pending_update with nothing to verify, and an invariant assert that the 90s probe window stays below the 600s stale threshold. All passing. 
Side fix: scripts/create-release-manifest.sh was dying with exit 141 (SIGPIPE from tar tvzf pipe head pipe awk) under set -euo pipefail. Replaced with a single awk NR==1 that doesn't short-circuit the upstream pipe, so the release-build flow is idempotent again.
2026-04-22 16:14:35 -04:00
// Use host_sudo + mv so we escape the archipelago service's
// ProtectSystem=strict mount namespace. A plain fs::copy or
// `sudo cp` from inside the service hits EROFS on /usr/local/bin,
// which would silently orphan the rollback — exactly the
// opposite of what auto-rollback is for. Pattern matches
// apply_update()'s binary swap above.
let backup_str = backup_binary.to_string_lossy().to_string();
let _ = host_sudo(&["chmod", "0755", &backup_str]).await;
let _ = host_sudo(&["chown", "root:root", &backup_str]).await;
let status = host_sudo(&["cp", &backup_str, "/usr/local/bin/archipelago"])
.await
release(v1.7.41-alpha): post-OTA auto-rollback so a bad release cannot strand the fleet Closes failure mode FM5 from docs/bulletproof-containers.md: the v1.7.38 + v1.7.39 rollouts left every affected node on an unreachable UI (nginx 500) with no recovery path short of SSH. This release adds a self-check guardrail to the update flow. What changed: - apply_update() writes a pending-verify marker with old+new version and a 150s deadline immediately before scheduling the service restart. - verify_pending_update() runs from main.rs startup. If the marker is present and within its freshness window, the new binary waits 15s for nginx + backend to settle, then probes https://127.0.0.1/ every 5s for up to 90s (self-signed certs accepted). - On any probe success within the window, the marker is cleared and nothing else happens. - On window-exhaust, the new binary: 1. Moves the broken /opt/archipelago/web-ui to web-ui.failed.<ts> (quarantined, not deleted, so we can post-mortem). 2. Restores web-ui.bak on top of web-ui. 3. Calls rollback_update() to restore the previous binary. 4. Updates state.current_version to reflect the rollback. 5. systemctl --no-block restart archipelago so the OLD binary boots. - Markers older than 10 minutes are treated as stale and cleared without probing, so a crashed-during-startup marker from weeks ago cannot spontaneously roll back a healthy node on a later reboot. - rollback_update() binary copy now goes through host_sudo instead of tokio::fs::copy, so it escapes the service's ProtectSystem=strict mount namespace. Without this, the rollback silently failed with EROFS on /usr/local/bin and orphaned the rollback - the exact opposite of what auto-rollback is for. Tests: 4 new unit tests in update::tests covering marker round-trip, absent-marker noop, no-panic on verify_pending_update with nothing to verify, and an invariant assert that the 90s probe window stays below the 600s stale threshold. All passing. 
Side fix: scripts/create-release-manifest.sh was dying with exit 141 (SIGPIPE from tar tvzf pipe head pipe awk) under set -euo pipefail. Replaced with a single awk NR==1 that doesn't short-circuit the upstream pipe, so the release-build flow is idempotent again.
2026-04-22 16:14:35 -04:00
.context("Failed to restore backup binary via host_sudo")?;
if !status.success() {
anyhow::bail!(
"cp backup binary into /usr/local/bin failed (exit {:?})",
status.code()
);
}
info!("Binary rolled back to previous version");
}
let mut state = load_state(data_dir).await?;
state.rollback_available = false;
save_state(data_dir, &state).await?;
let _ = fs::remove_dir_all(&backup_dir).await;
info!("Rollback complete. Restart service to take effect.");
Ok(())
}
/// Serializable record describing the progress of an update download.
///
/// NOTE(review): nothing in the visible part of this file constructs or
/// reads this struct; the field notes below are inferred from the field
/// names only and should be confirmed against the callers.
#[derive(Debug, Serialize, Deserialize)]
pub struct DownloadProgress {
/// Presumably the combined size, in bytes, of everything to download — TODO confirm.
pub total_bytes: u64,
/// Presumably the number of bytes fetched so far — TODO confirm.
pub downloaded_bytes: u64,
/// Presumably the count of components fully downloaded — TODO confirm.
pub components_downloaded: usize,
/// Presumably the path of the staging directory, as a string — TODO confirm.
pub staging_dir: String,
}
/// Persist a new update schedule preference.
///
/// Loads the stored update state from `data_dir`, replaces its `schedule`
/// with the given value, writes the state back, and logs the change.
///
/// # Errors
///
/// Propagates any failure from loading or saving the update state.
pub async fn set_schedule(data_dir: &Path, schedule: UpdateSchedule) -> Result<()> {
    let mut persisted = load_state(data_dir).await?;
    persisted.schedule = schedule;
    save_state(data_dir, &persisted).await?;

    info!(schedule = ?schedule, "Update schedule changed");
    Ok(())
}
/// Read the update schedule currently stored in the update state.
///
/// # Errors
///
/// Propagates any failure from loading the update state under `data_dir`.
pub async fn get_schedule(data_dir: &Path) -> Result<UpdateSchedule> {
    load_state(data_dir).await.map(|state| state.schedule)
}
/// Background update scheduler. Runs in a loop, checking/applying based on schedule.
/// Call this once at startup via `tokio::spawn`.
pub async fn run_update_scheduler(data_dir: std::path::PathBuf) {
use tokio::time::{interval, Duration};
// Check every hour; act based on schedule setting
let mut tick = interval(Duration::from_secs(3600));
loop {
tick.tick().await;
let state = match load_state(&data_dir).await {
Ok(s) => s,
Err(e) => {
debug!("Update scheduler: failed to load state: {}", e);
continue;
}
};
match state.schedule {
UpdateSchedule::Manual => {
debug!("Update scheduler: manual mode, skipping");
continue;
}
UpdateSchedule::DailyCheck => {
// Only check once per day
if let Some(ref last) = state.last_check {
if let Ok(last_time) = chrono::DateTime::parse_from_rfc3339(last) {
let elapsed = chrono::Utc::now() - last_time.with_timezone(&chrono::Utc);
if elapsed.num_hours() < 24 {
debug!("Update scheduler: checked recently, skipping");
continue;
}
}
}
info!("Update scheduler: running daily check");
if let Err(e) = check_for_updates(&data_dir).await {
debug!("Update scheduler: check failed: {}", e);
}
}
UpdateSchedule::AutoApply => {
// Auto-apply: check, download, and apply during 3 AM window
let hour = chrono::Local::now().hour();
if hour != 3 {
// Still do daily check outside the window
if let Some(ref last) = state.last_check {
if let Ok(last_time) = chrono::DateTime::parse_from_rfc3339(last) {
let elapsed =
chrono::Utc::now() - last_time.with_timezone(&chrono::Utc);
if elapsed.num_hours() < 24 {
continue;
}
}
}
info!("Update scheduler: auto-apply check (outside window)");
if let Err(e) = check_for_updates(&data_dir).await {
debug!("Update scheduler: check failed: {}", e);
}
continue;
}
// 3 AM — check, download, and apply
info!("Update scheduler: 3 AM auto-apply window");
match check_for_updates(&data_dir).await {
Ok(s) if s.available_update.is_some() => {
info!("Update scheduler: downloading update");
if let Err(e) = download_update(&data_dir).await {
debug!("Update scheduler: download failed: {}", e);
continue;
}
info!("Update scheduler: applying update");
if let Err(e) = apply_update(&data_dir).await {
debug!("Update scheduler: apply failed: {}", e);
continue;
}
info!(
"Update scheduler: update applied, restart scheduled by apply_update"
);
// apply_update has already spawned a 2s-delayed
// `systemctl restart archipelago`. Don't call
// std::process::exit here — that kills the runtime
// before the spawned restart task runs, and since
// the unit is Restart=on-failure a clean exit(0)
// leaves the service dead. Fall through; the
// scheduled restart will bring us back cleanly.
return;
}
Ok(_) => {
debug!("Update scheduler: no update available");
}
Err(e) => {
debug!("Update scheduler: check failed: {}", e);
}
}
}
}
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_update_schedule_default_is_daily_check() {
let schedule = UpdateSchedule::default();
assert_eq!(schedule, UpdateSchedule::DailyCheck);
}
#[test]
fn test_manifest_origin_parses_https() {
assert_eq!(
manifest_origin(
"https://git.tx1138.com/lfg2025/archy/raw/branch/main/releases/manifest.json"
),
Some("https://git.tx1138.com".to_string())
);
}
#[test]
fn test_manifest_origin_parses_http_with_port() {
assert_eq!(
manifest_origin(
"http://23.182.128.160:3000/lfg2025/archy/raw/branch/main/releases/manifest.json"
),
Some("http://23.182.128.160:3000".to_string())
);
}
#[test]
fn test_manifest_origin_rejects_garbage() {
assert_eq!(manifest_origin("not a url"), None);
assert_eq!(manifest_origin("ftp://git.tx1138.com/x"), None);
}
#[test]
fn test_rewrite_manifest_origins_swaps_all_components() {
let mut manifest = UpdateManifest {
version: "1.7.26-alpha".into(),
release_date: "2026-04-21".into(),
changelog: vec![],
components: vec![
ComponentUpdate {
name: "archipelago".into(),
current_version: "1.7.25-alpha".into(),
new_version: "1.7.26-alpha".into(),
download_url: "https://git.tx1138.com/lfg2025/archy/raw/branch/main/releases/v1.7.26-alpha/archipelago".into(),
sha256: "x".into(),
size_bytes: 1,
},
ComponentUpdate {
name: "frontend".into(),
current_version: "1.7.25-alpha".into(),
new_version: "1.7.26-alpha".into(),
download_url: "https://git.tx1138.com/lfg2025/archy/raw/branch/main/releases/v1.7.26-alpha/frontend.tar.gz".into(),
sha256: "y".into(),
size_bytes: 2,
},
],
};
rewrite_manifest_origins(
&mut manifest,
"http://23.182.128.160:3000/lfg2025/archy/raw/branch/main/releases/manifest.json",
);
assert_eq!(
manifest.components[0].download_url,
"http://23.182.128.160:3000/lfg2025/archy/raw/branch/main/releases/v1.7.26-alpha/archipelago"
);
assert_eq!(
manifest.components[1].download_url,
"http://23.182.128.160:3000/lfg2025/archy/raw/branch/main/releases/v1.7.26-alpha/frontend.tar.gz"
);
}
#[tokio::test]
async fn test_load_mirrors_returns_defaults_when_absent() {
let dir = tempfile::tempdir().unwrap();
let list = load_mirrors(dir.path()).await.unwrap();
assert_eq!(list.len(), 2);
assert!(list[0].url.contains("146.59.87.168"));
release(v1.7.31-alpha): idempotent IndeedHub install + auto-merge default mirrors/registries + 3rd OVH update mirror - Backend: install.rs registry reachability probe now strips the `host[:port]/namespace` suffix before appending `/v2/` (the Docker V2 API lives at the host root, not under the namespace) and accepts HTTP 405 in addition to 200/401 as "registry daemon alive". This fixes false "unreachable" reports on the Test button for Gitea and other registries that protect their /v2/ endpoint. - Backend: stacks.rs install_indeedhub_stack now force-removes any leftover indeedhub-* containers and indeedhub-net before creating the stack. A partial install (or the old first-boot stub racing the installer) used to leave containers around that blocked re-install with "name already in use". Re-running the App Store install now self-heals. - Backend: registry.rs load_registries auto-merges any default registry URLs missing from the saved config (appended with priority max+10+i, persisted). Lets new default mirrors (e.g. Server 3 OVH) roll out to existing nodes without manual config edits. Explicit removals still stick — URLs absent from disk AND absent from defaults stay gone. - Backend: update.rs adds DEFAULT_TERTIARY_MIRROR_URL at http://146.59.87.168:3000/ (Server 3 OVH) to default_mirrors, with the same auto-merge-on-load behavior as registries. Test updated for 3-mirror default (.160, tx1138, .168). - Scripts: dropped the first-boot IndeedHub stub (~38 lines in first-boot-containers.sh §8b). It predated the proper stack installer, raced it, and was the main source of the name-conflict mess the stacks.rs cleanup above now also guards against. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-22 03:26:09 -04:00
assert!(list[1].url.contains("git.tx1138.com"));
}
/// A saved custom mirror survives a save/load cycle, and every entry from
/// `default_mirrors()` is present in the loaded list as well.
#[tokio::test]
async fn test_save_and_load_mirrors_roundtrip() {
    let custom_url = "https://example.com/m.json";
    let tmp = tempfile::tempdir().unwrap();
    let saved = vec![UpdateMirror {
        url: custom_url.into(),
        label: "Example".into(),
    }];
    save_mirrors(tmp.path(), &saved).await.unwrap();
    let reloaded = load_mirrors(tmp.path()).await.unwrap();

    // load_mirrors merges in any missing default mirrors so a node
    // that explicitly added a single custom mirror still gets the
    // built-in OVH + tx1138 fallbacks. The custom mirror is preserved.
    let has_url = |wanted: &str| reloaded.iter().any(|m| m.url == wanted);
    assert!(
        has_url(custom_url),
        "custom mirror should round-trip; got {:?}",
        reloaded
    );
    for def in default_mirrors() {
        assert!(
            has_url(def.url.as_str()),
            "default mirror {} should be present after load; got {:?}",
            def.url,
            reloaded
        );
    }
}
/// Checks each field of `UpdateState::default()` that the test cares about.
#[test]
fn test_update_state_default_values() {
    let fresh = UpdateState::default();

    // Version is the one baked into the binary at compile time.
    assert_eq!(fresh.current_version, env!("CARGO_PKG_VERSION"));

    // Nothing checked, nothing pending.
    assert!(fresh.last_check.is_none());
    assert!(fresh.available_update.is_none());

    // Both status flags start out false.
    assert!(!fresh.update_in_progress);
    assert!(!fresh.rollback_available);

    assert_eq!(fresh.schedule, UpdateSchedule::DailyCheck);
}
/// An `UpdateState` encoded to JSON and decoded again keeps its version,
/// rollback flag, and schedule.
#[test]
fn test_update_state_serialization_roundtrip() {
    let original = UpdateState {
        current_version: String::from("0.2.0"),
        last_check: Some(String::from("2025-01-01T00:00:00Z")),
        available_update: None,
        update_in_progress: false,
        rollback_available: true,
        schedule: UpdateSchedule::AutoApply,
        manifest_mirror: None,
    };

    let encoded = serde_json::to_string(&original).unwrap();
    let decoded: UpdateState = serde_json::from_str(&encoded).unwrap();

    assert_eq!(decoded.current_version, "0.2.0");
    assert!(decoded.rollback_available);
    assert_eq!(decoded.schedule, UpdateSchedule::AutoApply);
}
/// Each `UpdateSchedule` variant serializes to its snake_case JSON string.
#[test]
fn test_update_schedule_serde_rename() {
    // (variant, expected JSON text including the surrounding quotes)
    let cases = [
        (UpdateSchedule::DailyCheck, "\"daily_check\""),
        (UpdateSchedule::Manual, "\"manual\""),
        (UpdateSchedule::AutoApply, "\"auto_apply\""),
    ];
    for (variant, wire) in &cases {
        let encoded = serde_json::to_string(variant).unwrap();
        assert_eq!(encoded, *wire);
    }
}
/// JSON written without a `schedule` key must still deserialize, with the
/// schedule falling back to `DailyCheck`.
#[test]
fn test_update_state_schedule_defaults_on_missing_field() {
    // Note: no "schedule" key in this document.
    let without_schedule = r#"{
"current_version": "0.1.0",
"last_check": null,
"available_update": null,
"update_in_progress": false,
"rollback_available": false
}"#;
    let parsed: UpdateState = serde_json::from_str(without_schedule).unwrap();
    assert_eq!(parsed.schedule, UpdateSchedule::DailyCheck);
}
/// Table-driven check of `parse_version_triple`: three-part versions parse
/// (with or without a `-alpha` suffix), anything else yields `None`.
#[test]
fn test_parse_version_triple() {
    let cases = [
        ("1.7.18", Some((1, 7, 18))),
        ("1.7.18-alpha", Some((1, 7, 18))),
        ("0.0.1", Some((0, 0, 1))),
        ("garbage", None),
        ("1.2", None),
    ];
    for (input, expected) in cases {
        assert_eq!(parse_version_triple(input), expected);
    }
}
/// `is_newer(candidate, installed)` is true only for a strictly higher
/// version, compared numerically per component.
#[test]
fn test_is_newer() {
    // (candidate, installed) pairs where the candidate is an upgrade.
    let upgrades = [
        ("1.7.19-alpha", "1.7.18-alpha"),
        ("1.8.0-alpha", "1.7.99-alpha"),
        ("1.7.10-alpha", "1.7.9-alpha"), // numeric, not lexical
    ];
    for (candidate, installed) in upgrades {
        assert!(is_newer(candidate, installed));
    }

    // Pairs where the candidate is equal to or older than what is installed.
    let not_upgrades = [
        ("1.7.18-alpha", "1.7.18-alpha"),
        ("1.7.17-alpha", "1.7.18-alpha"), // would-be downgrade
        ("1.7.9-alpha", "1.7.10-alpha"),
    ];
    for (candidate, installed) in not_upgrades {
        assert!(!is_newer(candidate, installed));
    }
}
/// A state file recorded under an older version, with a pending update
/// attached, must come back from `load_state` with the running version and
/// no pending update.
#[tokio::test]
async fn test_load_state_clears_stale_available_on_version_bump() {
    // Simulates a sideload: state file on disk says we're on
    // 1.7.16-alpha with 1.7.17-alpha staged as the pending update,
    // but the running binary is 1.7.18-alpha (skipped a version).
    // load_state must drop the stale available_update so the UI
    // doesn't offer a downgrade.
    let pending = UpdateManifest {
        version: "1.7.17-alpha".to_string(),
        release_date: "2026-04-20".to_string(),
        changelog: vec![],
        components: vec![],
    };
    let on_disk = UpdateState {
        current_version: "1.7.16-alpha".to_string(),
        available_update: Some(pending),
        ..UpdateState::default()
    };

    let tmp = tempfile::tempdir().unwrap();
    save_state(tmp.path(), &on_disk).await.unwrap();
    let reloaded = load_state(tmp.path()).await.unwrap();

    assert_eq!(reloaded.current_version, env!("CARGO_PKG_VERSION"));
    assert!(
        reloaded.available_update.is_none(),
        "stale available_update must be cleared after version bump"
    );
}
/// Loading from an empty directory returns a default-looking state and
/// leaves a state file behind.
#[tokio::test]
async fn test_load_state_creates_default_when_missing() {
    let tmp = tempfile::tempdir().unwrap();
    let state_file = tmp.path().join(UPDATE_STATE_FILE);

    let fresh = load_state(tmp.path()).await.unwrap();
    assert_eq!(fresh.current_version, env!("CARGO_PKG_VERSION"));
    assert!(!fresh.update_in_progress);

    // File should now exist after load created the default
    assert!(state_file.exists());
}
#[tokio::test]
async fn test_save_and_load_state_roundtrip() {
let dir = tempfile::tempdir().unwrap();
2026-06-12 03:00:15 -04:00
let staging = dir.path().join("update-staging");
tokio::fs::create_dir_all(&staging).await.unwrap();
tokio::fs::write(staging.join("archipelago"), b"staged")
.await
.unwrap();
let state = UpdateState {
current_version: "1.0.0".to_string(),
last_check: Some("2025-06-15T12:00:00Z".to_string()),
available_update: Some(UpdateManifest {
version: "1.1.0".to_string(),
release_date: "2025-06-20".to_string(),
changelog: vec!["Fix bugs".to_string(), "New feature".to_string()],
components: vec![ComponentUpdate {
name: "archipelago".to_string(),
current_version: "1.0.0".to_string(),
new_version: "1.1.0".to_string(),
download_url: "https://example.com/binary".to_string(),
sha256: "abc123".to_string(),
size_bytes: 5000,
}],
}),
update_in_progress: true,
rollback_available: false,
schedule: UpdateSchedule::Manual,
manifest_mirror: Some(
"https://git.tx1138.com/lfg2025/archy/raw/branch/main/releases/manifest.json"
.to_string(),
),
};
save_state(dir.path(), &state).await.unwrap();
let loaded = load_state(dir.path()).await.unwrap();
// load_state rewrites current_version to match the running
// binary (sideload self-heal), so don't assert on the saved
// value. The migration also clears available_update when the
// version changes — check the other fields survived.
assert_eq!(loaded.current_version, env!("CARGO_PKG_VERSION"));
assert!(loaded.update_in_progress);
assert_eq!(loaded.schedule, UpdateSchedule::Manual);
assert!(loaded.available_update.is_none());
}
2026-06-12 03:00:15 -04:00
/// A saved `update_in_progress = true` with nothing staged on disk must be
/// reset by `load_state`, and must stay reset on the next load.
#[tokio::test]
async fn test_load_state_clears_stale_in_progress_without_staging() {
    let tmp = tempfile::tempdir().unwrap();
    let stuck = UpdateState {
        update_in_progress: true,
        ..UpdateState::default()
    };
    save_state(tmp.path(), &stuck).await.unwrap();

    // First pass checks the value returned by load_state; the second pass
    // re-reads from disk to check the cleared flag was written back.
    for _ in 0..2 {
        let seen = load_state(tmp.path()).await.unwrap();
        assert!(!seen.update_in_progress);
    }
}
/// After `dismiss_update`, a reloaded state carries no pending update.
#[tokio::test]
async fn test_dismiss_update_clears_available() {
    let pending = UpdateManifest {
        version: "2.0.0".to_string(),
        release_date: "2025-07-01".to_string(),
        changelog: vec![],
        components: vec![],
    };
    let with_pending = UpdateState {
        available_update: Some(pending),
        ..UpdateState::default()
    };

    let tmp = tempfile::tempdir().unwrap();
    save_state(tmp.path(), &with_pending).await.unwrap();
    dismiss_update(tmp.path()).await.unwrap();

    let reloaded = load_state(tmp.path()).await.unwrap();
    assert!(reloaded.available_update.is_none());
}
#[tokio::test]
async fn test_set_and_get_schedule() {
let dir = tempfile::tempdir().unwrap();
// Initialize state
let _ = load_state(dir.path()).await.unwrap();
chore(ci): rustfmt + clippy clean-up to unblock the Rust CI job The .github/workflows/ci.yml Rust job runs cargo fmt --check, clippy with -D warnings, and tests. All three were failing. This commit: - Applies rustfmt across the tree (the bulk of the diff — untouched since the last toolchain bump, so a wide sweep was unavoidable). - Fixes the correctness-level clippy errors: container/bitcoin_simulator.rs wildcard-in-or-pattern container/manifest.rs from_str rename to parse (reserved name) container/podman_client.rs .get(0) -> .first() container/runtime.rs manual += collapse archipelago/src/constants.rs doc-comment → module-doc api/rpc/package/install.rs stray /// comment above a non-item container/docker_packages.rs redundant field init streaming/advertisement.rs missing Metric import in tests tests/orchestration_tests.rs `vec!` in non-Vec contexts mesh/listener/dispatch.rs unused store_plain_message import api/rpc/tor/mod.rs and mesh/steganography.rs: push-after-new → vec! - Quiets wide legacy surfaces with crate-level allows in main.rs for stylistic lints (too_many_arguments, type_complexity, doc indent, enum variant prefix, wildcard-in-or, assertions-on-constants, drop_non_drop, unused_io_amount, ptr_arg) — these fired in dozens of places with no correctness payoff and have been churning every toolchain bump. - Tags intentional-dead-code helpers: wallet/ and streaming/ modules are WIP, mesh::send_chunked_payload and DM_V1_MARKER are kept for rollback compatibility, vpn::get_nostr_vpn_status is surface-area for a not-yet-landed RPC. cargo fmt --check, cargo clippy --all-targets --all-features -- -D warnings, and cargo test --all-features now all pass locally. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-18 17:23:46 -04:00
set_schedule(dir.path(), UpdateSchedule::AutoApply)
.await
.unwrap();
let schedule = get_schedule(dir.path()).await.unwrap();
assert_eq!(schedule, UpdateSchedule::AutoApply);
chore(ci): rustfmt + clippy clean-up to unblock the Rust CI job The .github/workflows/ci.yml Rust job runs cargo fmt --check, clippy with -D warnings, and tests. All three were failing. This commit: - Applies rustfmt across the tree (the bulk of the diff — untouched since the last toolchain bump, so a wide sweep was unavoidable). - Fixes the correctness-level clippy errors: container/bitcoin_simulator.rs wildcard-in-or-pattern container/manifest.rs from_str rename to parse (reserved name) container/podman_client.rs .get(0) -> .first() container/runtime.rs manual += collapse archipelago/src/constants.rs doc-comment → module-doc api/rpc/package/install.rs stray /// comment above a non-item container/docker_packages.rs redundant field init streaming/advertisement.rs missing Metric import in tests tests/orchestration_tests.rs `vec!` in non-Vec contexts mesh/listener/dispatch.rs unused store_plain_message import api/rpc/tor/mod.rs and mesh/steganography.rs: push-after-new → vec! - Quiets wide legacy surfaces with crate-level allows in main.rs for stylistic lints (too_many_arguments, type_complexity, doc indent, enum variant prefix, wildcard-in-or, assertions-on-constants, drop_non_drop, unused_io_amount, ptr_arg) — these fired in dozens of places with no correctness payoff and have been churning every toolchain bump. - Tags intentional-dead-code helpers: wallet/ and streaming/ modules are WIP, mesh::send_chunked_payload and DM_V1_MARKER are kept for rollback compatibility, vpn::get_nostr_vpn_status is surface-area for a not-yet-landed RPC. cargo fmt --check, cargo clippy --all-targets --all-features -- -D warnings, and cargo test --all-features now all pass locally. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-18 17:23:46 -04:00
set_schedule(dir.path(), UpdateSchedule::Manual)
.await
.unwrap();
let schedule = get_schedule(dir.path()).await.unwrap();
assert_eq!(schedule, UpdateSchedule::Manual);
}
/// `get_status` reports the running binary's version and the rollback flag
/// that was saved to disk.
#[tokio::test]
async fn test_get_status_returns_current_state() {
    let seeded = UpdateState {
        rollback_available: true,
        current_version: "3.0.0".to_string(),
        ..UpdateState::default()
    };
    let tmp = tempfile::tempdir().unwrap();
    save_state(tmp.path(), &seeded).await.unwrap();

    let reported = get_status(tmp.path()).await.unwrap();
    // get_status → load_state, which rewrites current_version to
    // match the running binary (see the sideload-self-heal path).
    assert_eq!(reported.current_version, env!("CARGO_PKG_VERSION"));
    assert!(reported.rollback_available);
}
release(v1.7.41-alpha): post-OTA auto-rollback so a bad release cannot strand the fleet Closes failure mode FM5 from docs/bulletproof-containers.md: the v1.7.38 + v1.7.39 rollouts left every affected node on an unreachable UI (nginx 500) with no recovery path short of SSH. This release adds a self-check guardrail to the update flow. What changed: - apply_update() writes a pending-verify marker with old+new version and a 150s deadline immediately before scheduling the service restart. - verify_pending_update() runs from main.rs startup. If the marker is present and within its freshness window, the new binary waits 15s for nginx + backend to settle, then probes https://127.0.0.1/ every 5s for up to 90s (self-signed certs accepted). - On any probe success within the window, the marker is cleared and nothing else happens. - On window-exhaust, the new binary: 1. Moves the broken /opt/archipelago/web-ui to web-ui.failed.<ts> (quarantined, not deleted, so we can post-mortem). 2. Restores web-ui.bak on top of web-ui. 3. Calls rollback_update() to restore the previous binary. 4. Updates state.current_version to reflect the rollback. 5. systemctl --no-block restart archipelago so the OLD binary boots. - Markers older than 10 minutes are treated as stale and cleared without probing, so a crashed-during-startup marker from weeks ago cannot spontaneously roll back a healthy node on a later reboot. - rollback_update() binary copy now goes through host_sudo instead of tokio::fs::copy, so it escapes the service's ProtectSystem=strict mount namespace. Without this, the rollback silently failed with EROFS on /usr/local/bin and orphaned the rollback - the exact opposite of what auto-rollback is for. Tests: 4 new unit tests in update::tests covering marker round-trip, absent-marker noop, no-panic on verify_pending_update with nothing to verify, and an invariant assert that the 90s probe window stays below the 600s stale threshold. All passing. 
Side fix: scripts/create-release-manifest.sh was dying with exit 141 (SIGPIPE from tar tvzf pipe head pipe awk) under set -euo pipefail. Replaced with a single awk NR==1 that doesn't short-circuit the upstream pipe, so the release-build flow is idempotent again.
2026-04-22 16:14:35 -04:00
#[tokio::test]
async fn test_pending_verification_round_trip() {
let dir = tempfile::tempdir().unwrap();
let marker = PendingVerification {
applied_at: chrono::Utc::now().to_rfc3339(),
new_version: "1.7.41-alpha".into(),
previous_version: "1.7.40-alpha".into(),
deadline_ts: chrono::Utc::now().timestamp() + 150,
};
write_pending_verification(dir.path(), &marker)
.await
.unwrap();
release(v1.7.41-alpha): post-OTA auto-rollback so a bad release cannot strand the fleet Closes failure mode FM5 from docs/bulletproof-containers.md: the v1.7.38 + v1.7.39 rollouts left every affected node on an unreachable UI (nginx 500) with no recovery path short of SSH. This release adds a self-check guardrail to the update flow. What changed: - apply_update() writes a pending-verify marker with old+new version and a 150s deadline immediately before scheduling the service restart. - verify_pending_update() runs from main.rs startup. If the marker is present and within its freshness window, the new binary waits 15s for nginx + backend to settle, then probes https://127.0.0.1/ every 5s for up to 90s (self-signed certs accepted). - On any probe success within the window, the marker is cleared and nothing else happens. - On window-exhaust, the new binary: 1. Moves the broken /opt/archipelago/web-ui to web-ui.failed.<ts> (quarantined, not deleted, so we can post-mortem). 2. Restores web-ui.bak on top of web-ui. 3. Calls rollback_update() to restore the previous binary. 4. Updates state.current_version to reflect the rollback. 5. systemctl --no-block restart archipelago so the OLD binary boots. - Markers older than 10 minutes are treated as stale and cleared without probing, so a crashed-during-startup marker from weeks ago cannot spontaneously roll back a healthy node on a later reboot. - rollback_update() binary copy now goes through host_sudo instead of tokio::fs::copy, so it escapes the service's ProtectSystem=strict mount namespace. Without this, the rollback silently failed with EROFS on /usr/local/bin and orphaned the rollback - the exact opposite of what auto-rollback is for. Tests: 4 new unit tests in update::tests covering marker round-trip, absent-marker noop, no-panic on verify_pending_update with nothing to verify, and an invariant assert that the 90s probe window stays below the 600s stale threshold. All passing. 
Side fix: scripts/create-release-manifest.sh was dying with exit 141 (SIGPIPE from tar tvzf pipe head pipe awk) under set -euo pipefail. Replaced with a single awk NR==1 that doesn't short-circuit the upstream pipe, so the release-build flow is idempotent again.
2026-04-22 16:14:35 -04:00
let read = read_pending_verification(dir.path()).await.unwrap();
assert_eq!(read.new_version, "1.7.41-alpha");
assert_eq!(read.previous_version, "1.7.40-alpha");
clear_pending_verification(dir.path()).await;
assert!(read_pending_verification(dir.path()).await.is_none());
}
/// Reading the marker from an empty directory yields `None`.
#[tokio::test]
async fn test_pending_verification_absent_is_none() {
    let tmp = tempfile::tempdir().unwrap();
    let marker = read_pending_verification(tmp.path()).await;
    assert!(marker.is_none());
}
/// Smoke test: `verify_pending_update` on a directory with no marker.
#[tokio::test]
async fn test_verify_pending_update_noop_without_marker() {
    let tmp = tempfile::tempdir().unwrap();
    let data_dir = tmp.path();
    // No marker written -- must return quickly without doing anything
    // risky (network probes, rollback calls). We're just asserting
    // it doesn't panic or hang.
    verify_pending_update(data_dir).await;
}
/// Guards the relationship between the probe window and the stale-marker
/// age limit.
#[test]
fn test_pending_verify_constants_are_sensible() {
    // Window must be generous enough for nginx + backend startup,
    // but less than the stale-marker threshold so a normal cycle
    // can complete without the marker being considered stale.
    let stale_after_secs = PENDING_VERIFY_MAX_AGE_SECS as u64;
    assert!(PENDING_VERIFY_WINDOW_SECS < stale_after_secs);
    assert!(PENDING_VERIFY_WINDOW_SECS >= 60);
}
}