2026-03-09 07:43:12 +00:00
|
|
|
//! Update system: check for updates, download deltas, apply with rollback.
|
|
|
|
|
|
|
|
|
|
use anyhow::{Context, Result};
|
2026-03-11 10:57:33 +00:00
|
|
|
use chrono::Timelike;
|
2026-03-09 07:43:12 +00:00
|
|
|
use serde::{Deserialize, Serialize};
|
|
|
|
|
use std::path::Path;
|
2026-04-20 19:10:34 -04:00
|
|
|
use std::sync::atomic::{AtomicBool, AtomicU64, Ordering};
|
2026-03-09 07:43:12 +00:00
|
|
|
use tokio::fs;
|
2026-03-22 03:30:21 +00:00
|
|
|
use tracing::{debug, info};
|
2026-03-09 07:43:12 +00:00
|
|
|
|
2026-04-20 17:17:58 -04:00
|
|
|
/// Live download progress counters. Updated by download_component_resumable
|
|
|
|
|
/// as bytes arrive and read by the update.status RPC so the UI can show
|
|
|
|
|
/// a real progress bar instead of a fake creep. Global because the
|
|
|
|
|
/// download runs in one place at a time; no need for per-handler state.
|
|
|
|
|
pub static DOWNLOAD_BYTES: AtomicU64 = AtomicU64::new(0);
|
|
|
|
|
pub static DOWNLOAD_TOTAL: AtomicU64 = AtomicU64::new(0);
|
2026-04-20 19:10:34 -04:00
|
|
|
/// Set true to ask the in-flight download loop to bail out at the next
|
|
|
|
|
/// chunk boundary. Read via `is_canceled`; reset at the start of every
|
|
|
|
|
/// `download_update` run. Also flipped by the `cancel_download` RPC.
|
|
|
|
|
pub static DOWNLOAD_CANCEL: AtomicBool = AtomicBool::new(false);
|
|
|
|
|
/// Monotonic ms timestamp of the last time DOWNLOAD_BYTES advanced.
|
|
|
|
|
/// Lets `update.status` flag a download as "stalled" when no bytes have
|
|
|
|
|
/// arrived for a while, so the UI can offer a Cancel button with more
|
|
|
|
|
/// confidence than "looks stuck at 0%".
|
|
|
|
|
pub static DOWNLOAD_PROGRESS_AT: AtomicU64 = AtomicU64::new(0);
|
|
|
|
|
|
|
|
|
|
fn now_ms() -> u64 {
|
|
|
|
|
use std::time::{SystemTime, UNIX_EPOCH};
|
|
|
|
|
SystemTime::now()
|
|
|
|
|
.duration_since(UNIX_EPOCH)
|
|
|
|
|
.map(|d| d.as_millis() as u64)
|
|
|
|
|
.unwrap_or(0)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
fn is_canceled() -> bool {
|
|
|
|
|
DOWNLOAD_CANCEL.load(Ordering::Relaxed)
|
|
|
|
|
}
|
2026-04-20 17:17:58 -04:00
|
|
|
|
2026-04-21 04:04:20 -04:00
|
|
|
/// Parse "MAJOR.MINOR.PATCH[-suffix]" into a tuple; suffix is ignored.
|
|
|
|
|
/// Returns None if the numeric portion can't be parsed — callers should
|
|
|
|
|
/// fall back to string comparison in that case so we don't silently
|
|
|
|
|
/// mis-rank versions we don't understand.
|
|
|
|
|
fn parse_version_triple(v: &str) -> Option<(u32, u32, u32)> {
|
|
|
|
|
let core = v.split('-').next().unwrap_or(v);
|
|
|
|
|
let mut parts = core.split('.');
|
|
|
|
|
let major: u32 = parts.next()?.parse().ok()?;
|
|
|
|
|
let minor: u32 = parts.next()?.parse().ok()?;
|
|
|
|
|
let patch: u32 = parts.next()?.parse().ok()?;
|
|
|
|
|
Some((major, minor, patch))
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Is `candidate` strictly newer than `current`? Used to guard against
|
|
|
|
|
/// the manifest offering a version we've already passed (e.g. a stale
|
|
|
|
|
/// cached manifest or a node that sideloaded past the manifest's
|
|
|
|
|
/// latest). Falls back to string inequality if either version doesn't
|
|
|
|
|
/// parse, preserving the old behaviour for unusual version strings.
|
|
|
|
|
fn is_newer(candidate: &str, current: &str) -> bool {
|
|
|
|
|
match (parse_version_triple(candidate), parse_version_triple(current)) {
|
|
|
|
|
(Some(a), Some(b)) => a > b,
|
|
|
|
|
_ => candidate != current,
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2026-03-11 10:57:33 +00:00
|
|
|
const DEFAULT_UPDATE_MANIFEST_URL: &str =
|
2026-03-25 15:52:26 +00:00
|
|
|
"https://git.tx1138.com/lfg2025/archy/raw/branch/main/releases/manifest.json";
|
2026-04-23 08:22:32 -04:00
|
|
|
/// Secondary mirror on an OVH VPS — independent network path so a
|
|
|
|
|
/// single-provider outage doesn't knock out both mirrors. Promoted to
|
|
|
|
|
/// primary default on 2026-04-23 after the Hetzner .160 VPS was
|
|
|
|
|
/// decommissioned.
|
2026-04-21 10:09:28 -04:00
|
|
|
const DEFAULT_SECONDARY_MIRROR_URL: &str =
|
2026-04-22 03:26:09 -04:00
|
|
|
"http://146.59.87.168:3000/lfg2025/archy/raw/branch/main/releases/manifest.json";
|
2026-03-09 07:43:12 +00:00
|
|
|
const UPDATE_STATE_FILE: &str = "update_state.json";
|
2026-04-21 10:09:28 -04:00
|
|
|
const UPDATE_MIRRORS_FILE: &str = "update-mirrors.json";
|
release(v1.7.41-alpha): post-OTA auto-rollback so a bad release cannot strand the fleet
Closes failure mode FM5 from docs/bulletproof-containers.md: the v1.7.38 +
v1.7.39 rollouts left every affected node on an unreachable UI (nginx 500)
with no recovery path short of SSH. This release adds a self-check
guardrail to the update flow.
What changed:
- apply_update() writes a pending-verify marker with old+new version and
a 150s deadline immediately before scheduling the service restart.
- verify_pending_update() runs from main.rs startup. If the marker is
present and within its freshness window, the new binary waits 15s for
nginx + backend to settle, then probes https://127.0.0.1/ every 5s for
up to 90s (self-signed certs accepted).
- On any probe success within the window, the marker is cleared and
nothing else happens.
- On window-exhaust, the new binary:
1. Moves the broken /opt/archipelago/web-ui to web-ui.failed.<ts>
(quarantined, not deleted, so we can post-mortem).
2. Restores web-ui.bak on top of web-ui.
3. Calls rollback_update() to restore the previous binary.
4. Updates state.current_version to reflect the rollback.
5. systemctl --no-block restart archipelago so the OLD binary boots.
- Markers older than 10 minutes are treated as stale and cleared without
probing, so a crashed-during-startup marker from weeks ago cannot
spontaneously roll back a healthy node on a later reboot.
- rollback_update() binary copy now goes through host_sudo instead of
tokio::fs::copy, so it escapes the service's ProtectSystem=strict
mount namespace. Without this, the rollback silently failed with
EROFS on /usr/local/bin and orphaned the rollback - the exact
opposite of what auto-rollback is for.
Tests: 4 new unit tests in update::tests covering marker round-trip,
absent-marker noop, no-panic on verify_pending_update with nothing to
verify, and an invariant assert that the 90s probe window stays below
the 600s stale threshold. All passing.
Side fix: scripts/create-release-manifest.sh was dying with exit 141
(SIGPIPE from tar tvzf pipe head pipe awk) under set -euo pipefail.
Replaced with a single awk NR==1 that doesn't short-circuit the upstream
pipe, so the release-build flow is idempotent again.
2026-04-22 16:14:35 -04:00
|
|
|
/// Marker written by apply_update() just before the service restart and
|
|
|
|
|
/// consumed by verify_pending_update() in the NEW binary's startup path.
|
|
|
|
|
/// If present, the new binary probes the frontend; if the probe fails,
|
|
|
|
|
/// rollback_update() runs and the service restarts on the old binary.
|
|
|
|
|
/// Closes the "OTA broke nginx fleet-wide with no auto-rollback" failure
|
|
|
|
|
/// mode from 2026-04-22 (v1.7.38/39 tarball-perms bug).
|
|
|
|
|
const PENDING_VERIFY_FILE: &str = "update-pending-verify.json";
|
|
|
|
|
/// Probe timeout for the frontend health check (total time including
|
|
|
|
|
/// retries). Generous: the new binary has to come fully up, health
|
|
|
|
|
/// monitor settles, nginx has to re-read any snippet changes. 90s is
|
|
|
|
|
/// comfortably longer than the slowest observed startup.
|
|
|
|
|
const PENDING_VERIFY_WINDOW_SECS: u64 = 90;
|
|
|
|
|
/// If the marker is older than this on read, treat it as stale and
|
|
|
|
|
/// delete without probing. Guards against a node that somehow failed
|
|
|
|
|
/// to run verification at all (e.g. crashed during startup) from
|
|
|
|
|
/// spontaneously rolling back days later when the user reboots.
|
|
|
|
|
const PENDING_VERIFY_MAX_AGE_SECS: i64 = 600;
|
2026-04-21 10:09:28 -04:00
|
|
|
|
|
|
|
|
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
|
|
|
|
|
pub struct UpdateMirror {
|
|
|
|
|
/// Full URL to `manifest.json`. Download URLs in the fetched
|
|
|
|
|
/// manifest are origin-rewritten to match this URL's scheme+host+
|
|
|
|
|
/// port, so hitting a mirror pulls its components from the same
|
|
|
|
|
/// mirror rather than whatever absolute host the publisher baked in.
|
|
|
|
|
pub url: String,
|
|
|
|
|
/// Human-readable label for the UI ("Server 1", "Home VPS", …).
|
|
|
|
|
#[serde(default)]
|
|
|
|
|
pub label: String,
|
|
|
|
|
}
|
2026-03-09 07:43:12 +00:00
|
|
|
|
2026-04-21 10:09:28 -04:00
|
|
|
fn mirrors_path(data_dir: &Path) -> std::path::PathBuf {
|
|
|
|
|
data_dir.join(UPDATE_MIRRORS_FILE)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
fn default_mirrors() -> Vec<UpdateMirror> {
|
|
|
|
|
vec![
|
|
|
|
|
UpdateMirror {
|
2026-04-21 15:06:37 -04:00
|
|
|
url: DEFAULT_SECONDARY_MIRROR_URL.to_string(),
|
2026-04-23 08:22:32 -04:00
|
|
|
label: "Server 1 (OVH)".to_string(),
|
2026-04-21 10:09:28 -04:00
|
|
|
},
|
|
|
|
|
UpdateMirror {
|
2026-04-21 15:06:37 -04:00
|
|
|
url: DEFAULT_UPDATE_MANIFEST_URL.to_string(),
|
|
|
|
|
label: "Server 2 (tx1138)".to_string(),
|
2026-04-21 10:09:28 -04:00
|
|
|
},
|
|
|
|
|
]
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Load the operator-configured mirror list. Returns defaults if the
|
|
|
|
|
/// file doesn't exist yet, so a node OTA'd from a pre-mirrors release
|
2026-04-22 03:26:09 -04:00
|
|
|
/// starts with the current default mirrors available without any
|
|
|
|
|
/// manual config.
|
|
|
|
|
///
|
|
|
|
|
/// Migration: any default mirror URL that isn't already in the saved
|
|
|
|
|
/// list gets appended at the end. This lets us add new default mirrors
|
|
|
|
|
/// (e.g. a new Server 3) and have them appear on existing nodes after
|
|
|
|
|
/// an update, without requiring manual config edits. Explicit removals
|
|
|
|
|
/// stick — once an operator removes a URL it stays gone unless it's
|
|
|
|
|
/// later re-added to defaults.
|
2026-04-21 10:09:28 -04:00
|
|
|
pub async fn load_mirrors(data_dir: &Path) -> Result<Vec<UpdateMirror>> {
|
|
|
|
|
let path = mirrors_path(data_dir);
|
|
|
|
|
if !path.exists() {
|
|
|
|
|
return Ok(default_mirrors());
|
|
|
|
|
}
|
|
|
|
|
let bytes = fs::read(&path)
|
|
|
|
|
.await
|
|
|
|
|
.with_context(|| format!("read {}", path.display()))?;
|
2026-04-22 03:26:09 -04:00
|
|
|
let mut list: Vec<UpdateMirror> =
|
2026-04-21 10:09:28 -04:00
|
|
|
serde_json::from_slice(&bytes).with_context(|| format!("parse {}", path.display()))?;
|
|
|
|
|
if list.is_empty() {
|
2026-04-22 03:26:09 -04:00
|
|
|
return Ok(default_mirrors());
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Merge in any default URLs the saved config is missing.
|
|
|
|
|
let known: std::collections::HashSet<String> =
|
|
|
|
|
list.iter().map(|m| m.url.clone()).collect();
|
|
|
|
|
let defaults = default_mirrors();
|
|
|
|
|
let mut added = false;
|
|
|
|
|
for def in &defaults {
|
|
|
|
|
if !known.contains(&def.url) {
|
|
|
|
|
list.push(def.clone());
|
|
|
|
|
added = true;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
if added {
|
|
|
|
|
let _ = save_mirrors(data_dir, &list).await;
|
2026-04-21 10:09:28 -04:00
|
|
|
}
|
2026-04-22 03:26:09 -04:00
|
|
|
Ok(list)
|
2026-04-21 10:09:28 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
pub async fn save_mirrors(data_dir: &Path, mirrors: &[UpdateMirror]) -> Result<()> {
|
|
|
|
|
fs::create_dir_all(data_dir)
|
|
|
|
|
.await
|
|
|
|
|
.with_context(|| format!("mkdir {}", data_dir.display()))?;
|
|
|
|
|
let path = mirrors_path(data_dir);
|
|
|
|
|
let tmp = path.with_extension("json.tmp");
|
|
|
|
|
let json = serde_json::to_vec_pretty(mirrors).context("serialize mirrors")?;
|
|
|
|
|
fs::write(&tmp, json)
|
|
|
|
|
.await
|
|
|
|
|
.with_context(|| format!("write {}", tmp.display()))?;
|
|
|
|
|
fs::rename(&tmp, &path)
|
|
|
|
|
.await
|
|
|
|
|
.with_context(|| format!("rename {} -> {}", tmp.display(), path.display()))?;
|
|
|
|
|
Ok(())
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Parse a manifest URL and return its `scheme://host[:port]` prefix.
|
|
|
|
|
/// Used by `rewrite_manifest_origins` so a manifest fetched from a
|
|
|
|
|
/// mirror points component downloads back at the same mirror rather
|
|
|
|
|
/// than whatever absolute URL the publisher baked in.
|
|
|
|
|
fn manifest_origin(manifest_url: &str) -> Option<String> {
|
|
|
|
|
let rest = manifest_url.strip_prefix("https://")
|
|
|
|
|
.map(|r| ("https", r))
|
|
|
|
|
.or_else(|| manifest_url.strip_prefix("http://").map(|r| ("http", r)))?;
|
|
|
|
|
let (scheme, after_scheme) = rest;
|
|
|
|
|
let host_and_port = after_scheme.split('/').next()?;
|
|
|
|
|
if host_and_port.is_empty() {
|
|
|
|
|
return None;
|
|
|
|
|
}
|
|
|
|
|
Some(format!("{}://{}", scheme, host_and_port))
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Rewrite every component `download_url` so its origin matches the
|
|
|
|
|
/// manifest URL we just fetched. Preserves the path portion (which is
|
|
|
|
|
/// consistent across mirrors — every gitea serves `/lfg2025/archy/raw/…`).
|
|
|
|
|
/// Leaves URLs with a different path shape untouched (some operator
|
|
|
|
|
/// might mirror with a custom layout; in that case we don't guess).
|
|
|
|
|
fn rewrite_manifest_origins(manifest: &mut UpdateManifest, manifest_url: &str) {
|
|
|
|
|
let Some(new_origin) = manifest_origin(manifest_url) else {
|
|
|
|
|
return;
|
|
|
|
|
};
|
|
|
|
|
for c in manifest.components.iter_mut() {
|
|
|
|
|
if let Some(orig_origin) = manifest_origin(&c.download_url) {
|
|
|
|
|
if orig_origin != new_origin {
|
|
|
|
|
let path = c.download_url.trim_start_matches(&orig_origin).to_string();
|
|
|
|
|
c.download_url = format!("{}{}", new_origin, path);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Which manifest URL to try FIRST — operator override via env wins,
|
|
|
|
|
/// otherwise the first entry in the mirrors list, otherwise the hard
|
|
|
|
|
/// default. Callers that need the full mirror walk should use
|
|
|
|
|
/// `load_mirrors` directly.
|
2026-03-11 10:57:33 +00:00
|
|
|
fn update_manifest_url() -> String {
|
chore(ci): rustfmt + clippy clean-up to unblock the Rust CI job
The .github/workflows/ci.yml Rust job runs cargo fmt --check, clippy
with -D warnings, and tests. All three were failing. This commit:
- Applies rustfmt across the tree (the bulk of the diff — untouched
since the last toolchain bump, so a wide sweep was unavoidable).
- Fixes the correctness-level clippy errors:
container/bitcoin_simulator.rs wildcard-in-or-pattern
container/manifest.rs from_str rename to parse (reserved name)
container/podman_client.rs .get(0) -> .first()
container/runtime.rs manual += collapse
archipelago/src/constants.rs doc-comment → module-doc
api/rpc/package/install.rs stray /// comment above a non-item
container/docker_packages.rs redundant field init
streaming/advertisement.rs missing Metric import in tests
tests/orchestration_tests.rs `vec!` in non-Vec contexts
mesh/listener/dispatch.rs unused store_plain_message import
api/rpc/tor/mod.rs and mesh/steganography.rs: push-after-new → vec!
- Quiets wide legacy surfaces with crate-level allows in main.rs for
stylistic lints (too_many_arguments, type_complexity, doc indent,
enum variant prefix, wildcard-in-or, assertions-on-constants,
drop_non_drop, unused_io_amount, ptr_arg) — these fired in dozens
of places with no correctness payoff and have been churning every
toolchain bump.
- Tags intentional-dead-code helpers: wallet/ and streaming/ modules
are WIP, mesh::send_chunked_payload and DM_V1_MARKER are kept for
rollback compatibility, vpn::get_nostr_vpn_status is surface-area
for a not-yet-landed RPC.
cargo fmt --check, cargo clippy --all-targets --all-features
-- -D warnings, and cargo test --all-features now all pass locally.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-18 17:23:46 -04:00
|
|
|
std::env::var("ARCHIPELAGO_UPDATE_URL")
|
|
|
|
|
.unwrap_or_else(|_| DEFAULT_UPDATE_MANIFEST_URL.to_string())
|
2026-03-11 10:57:33 +00:00
|
|
|
}
|
|
|
|
|
|
2026-03-09 07:43:12 +00:00
|
|
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
|
|
|
pub struct UpdateManifest {
|
|
|
|
|
pub version: String,
|
|
|
|
|
pub release_date: String,
|
|
|
|
|
pub changelog: Vec<String>,
|
|
|
|
|
pub components: Vec<ComponentUpdate>,
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
|
|
|
pub struct ComponentUpdate {
|
|
|
|
|
pub name: String,
|
|
|
|
|
pub current_version: String,
|
|
|
|
|
pub new_version: String,
|
|
|
|
|
pub download_url: String,
|
|
|
|
|
pub sha256: String,
|
|
|
|
|
pub size_bytes: u64,
|
|
|
|
|
}
|
|
|
|
|
|
2026-03-11 10:57:33 +00:00
|
|
|
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
|
|
|
|
#[serde(rename_all = "snake_case")]
|
chore(ci): rustfmt + clippy clean-up to unblock the Rust CI job
The .github/workflows/ci.yml Rust job runs cargo fmt --check, clippy
with -D warnings, and tests. All three were failing. This commit:
- Applies rustfmt across the tree (the bulk of the diff — untouched
since the last toolchain bump, so a wide sweep was unavoidable).
- Fixes the correctness-level clippy errors:
container/bitcoin_simulator.rs wildcard-in-or-pattern
container/manifest.rs from_str rename to parse (reserved name)
container/podman_client.rs .get(0) -> .first()
container/runtime.rs manual += collapse
archipelago/src/constants.rs doc-comment → module-doc
api/rpc/package/install.rs stray /// comment above a non-item
container/docker_packages.rs redundant field init
streaming/advertisement.rs missing Metric import in tests
tests/orchestration_tests.rs `vec!` in non-Vec contexts
mesh/listener/dispatch.rs unused store_plain_message import
api/rpc/tor/mod.rs and mesh/steganography.rs: push-after-new → vec!
- Quiets wide legacy surfaces with crate-level allows in main.rs for
stylistic lints (too_many_arguments, type_complexity, doc indent,
enum variant prefix, wildcard-in-or, assertions-on-constants,
drop_non_drop, unused_io_amount, ptr_arg) — these fired in dozens
of places with no correctness payoff and have been churning every
toolchain bump.
- Tags intentional-dead-code helpers: wallet/ and streaming/ modules
are WIP, mesh::send_chunked_payload and DM_V1_MARKER are kept for
rollback compatibility, vpn::get_nostr_vpn_status is surface-area
for a not-yet-landed RPC.
cargo fmt --check, cargo clippy --all-targets --all-features
-- -D warnings, and cargo test --all-features now all pass locally.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-18 17:23:46 -04:00
|
|
|
#[derive(Default)]
|
2026-03-11 10:57:33 +00:00
|
|
|
pub enum UpdateSchedule {
|
|
|
|
|
Manual,
|
chore(ci): rustfmt + clippy clean-up to unblock the Rust CI job
The .github/workflows/ci.yml Rust job runs cargo fmt --check, clippy
with -D warnings, and tests. All three were failing. This commit:
- Applies rustfmt across the tree (the bulk of the diff — untouched
since the last toolchain bump, so a wide sweep was unavoidable).
- Fixes the correctness-level clippy errors:
container/bitcoin_simulator.rs wildcard-in-or-pattern
container/manifest.rs from_str rename to parse (reserved name)
container/podman_client.rs .get(0) -> .first()
container/runtime.rs manual += collapse
archipelago/src/constants.rs doc-comment → module-doc
api/rpc/package/install.rs stray /// comment above a non-item
container/docker_packages.rs redundant field init
streaming/advertisement.rs missing Metric import in tests
tests/orchestration_tests.rs `vec!` in non-Vec contexts
mesh/listener/dispatch.rs unused store_plain_message import
api/rpc/tor/mod.rs and mesh/steganography.rs: push-after-new → vec!
- Quiets wide legacy surfaces with crate-level allows in main.rs for
stylistic lints (too_many_arguments, type_complexity, doc indent,
enum variant prefix, wildcard-in-or, assertions-on-constants,
drop_non_drop, unused_io_amount, ptr_arg) — these fired in dozens
of places with no correctness payoff and have been churning every
toolchain bump.
- Tags intentional-dead-code helpers: wallet/ and streaming/ modules
are WIP, mesh::send_chunked_payload and DM_V1_MARKER are kept for
rollback compatibility, vpn::get_nostr_vpn_status is surface-area
for a not-yet-landed RPC.
cargo fmt --check, cargo clippy --all-targets --all-features
-- -D warnings, and cargo test --all-features now all pass locally.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-18 17:23:46 -04:00
|
|
|
#[default]
|
2026-03-11 10:57:33 +00:00
|
|
|
DailyCheck,
|
|
|
|
|
AutoApply,
|
|
|
|
|
}
|
|
|
|
|
|
2026-03-09 07:43:12 +00:00
|
|
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
|
|
|
pub struct UpdateState {
|
|
|
|
|
pub current_version: String,
|
|
|
|
|
pub last_check: Option<String>,
|
|
|
|
|
pub available_update: Option<UpdateManifest>,
|
|
|
|
|
pub update_in_progress: bool,
|
|
|
|
|
pub rollback_available: bool,
|
2026-03-11 10:57:33 +00:00
|
|
|
#[serde(default)]
|
|
|
|
|
pub schedule: UpdateSchedule,
|
2026-04-21 13:05:42 -04:00
|
|
|
/// URL of the mirror whose manifest populated `available_update`.
|
|
|
|
|
/// Surfaces in the UI so operators can tell at a glance which mirror
|
|
|
|
|
/// their node actually hit (vs. just which is configured primary).
|
|
|
|
|
#[serde(default)]
|
|
|
|
|
pub manifest_mirror: Option<String>,
|
2026-03-09 07:43:12 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
impl Default for UpdateState {
|
|
|
|
|
fn default() -> Self {
|
|
|
|
|
Self {
|
|
|
|
|
current_version: env!("CARGO_PKG_VERSION").to_string(),
|
|
|
|
|
last_check: None,
|
|
|
|
|
available_update: None,
|
|
|
|
|
update_in_progress: false,
|
|
|
|
|
rollback_available: false,
|
2026-03-11 10:57:33 +00:00
|
|
|
schedule: UpdateSchedule::DailyCheck,
|
2026-04-21 13:05:42 -04:00
|
|
|
manifest_mirror: None,
|
2026-03-09 07:43:12 +00:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
release(v1.7.41-alpha): post-OTA auto-rollback so a bad release cannot strand the fleet
Closes failure mode FM5 from docs/bulletproof-containers.md: the v1.7.38 +
v1.7.39 rollouts left every affected node on an unreachable UI (nginx 500)
with no recovery path short of SSH. This release adds a self-check
guardrail to the update flow.
What changed:
- apply_update() writes a pending-verify marker with old+new version and
a 150s deadline immediately before scheduling the service restart.
- verify_pending_update() runs from main.rs startup. If the marker is
present and within its freshness window, the new binary waits 15s for
nginx + backend to settle, then probes https://127.0.0.1/ every 5s for
up to 90s (self-signed certs accepted).
- On any probe success within the window, the marker is cleared and
nothing else happens.
- On window-exhaust, the new binary:
1. Moves the broken /opt/archipelago/web-ui to web-ui.failed.<ts>
(quarantined, not deleted, so we can post-mortem).
2. Restores web-ui.bak on top of web-ui.
3. Calls rollback_update() to restore the previous binary.
4. Updates state.current_version to reflect the rollback.
5. systemctl --no-block restart archipelago so the OLD binary boots.
- Markers older than 10 minutes are treated as stale and cleared without
probing, so a crashed-during-startup marker from weeks ago cannot
spontaneously roll back a healthy node on a later reboot.
- rollback_update() binary copy now goes through host_sudo instead of
tokio::fs::copy, so it escapes the service's ProtectSystem=strict
mount namespace. Without this, the rollback silently failed with
EROFS on /usr/local/bin and orphaned the rollback - the exact
opposite of what auto-rollback is for.
Tests: 4 new unit tests in update::tests covering marker round-trip,
absent-marker noop, no-panic on verify_pending_update with nothing to
verify, and an invariant assert that the 90s probe window stays below
the 600s stale threshold. All passing.
Side fix: scripts/create-release-manifest.sh was dying with exit 141
(SIGPIPE from tar tvzf pipe head pipe awk) under set -euo pipefail.
Replaced with a single awk NR==1 that doesn't short-circuit the upstream
pipe, so the release-build flow is idempotent again.
2026-04-22 16:14:35 -04:00
|
|
|
/// Marker written by apply_update() just before the service restart and
|
|
|
|
|
/// consumed by verify_pending_update() in the NEW binary's startup path.
|
|
|
|
|
/// See PENDING_VERIFY_FILE for the full rationale — this is the hook
|
|
|
|
|
/// that turns "nginx 500 on every page after OTA" from an unrecoverable
|
|
|
|
|
/// field incident into an automatic rollback.
|
|
|
|
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
|
|
|
pub struct PendingVerification {
|
|
|
|
|
/// RFC3339 timestamp of the apply that wrote this marker.
|
|
|
|
|
pub applied_at: String,
|
|
|
|
|
/// Version we just applied (what the NEW binary should be running).
|
|
|
|
|
pub new_version: String,
|
|
|
|
|
/// Version the outgoing binary was running (what we roll back to).
|
|
|
|
|
pub previous_version: String,
|
|
|
|
|
/// Unix epoch seconds after which the probe should give up and
|
|
|
|
|
/// trigger rollback. Prevents a probe from retrying forever if e.g.
|
|
|
|
|
/// nginx is totally wedged.
|
|
|
|
|
pub deadline_ts: i64,
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
async fn write_pending_verification(
|
|
|
|
|
data_dir: &Path,
|
|
|
|
|
marker: &PendingVerification,
|
|
|
|
|
) -> Result<()> {
|
|
|
|
|
let path = data_dir.join(PENDING_VERIFY_FILE);
|
|
|
|
|
let data = serde_json::to_string_pretty(marker)
|
|
|
|
|
.context("serialize pending-verify marker")?;
|
|
|
|
|
fs::write(&path, data)
|
|
|
|
|
.await
|
|
|
|
|
.with_context(|| format!("write pending-verify marker to {}", path.display()))?;
|
|
|
|
|
Ok(())
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
async fn read_pending_verification(data_dir: &Path) -> Option<PendingVerification> {
|
|
|
|
|
let path = data_dir.join(PENDING_VERIFY_FILE);
|
|
|
|
|
let data = fs::read_to_string(&path).await.ok()?;
|
|
|
|
|
serde_json::from_str(&data).ok()
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
async fn clear_pending_verification(data_dir: &Path) {
|
|
|
|
|
let path = data_dir.join(PENDING_VERIFY_FILE);
|
|
|
|
|
let _ = fs::remove_file(&path).await;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Probe the local frontend through nginx. Returns Ok(()) on the first
|
|
|
|
|
/// response that's 2xx or 3xx; errors on timeout / connection refused /
|
|
|
|
|
/// any 4xx/5xx. `accept_self_signed` because nodes use a self-signed
|
|
|
|
|
/// cert the reqwest default root-set doesn't trust.
|
|
|
|
|
async fn probe_frontend_once() -> Result<()> {
|
|
|
|
|
let client = reqwest::Client::builder()
|
|
|
|
|
.danger_accept_invalid_certs(true)
|
|
|
|
|
.timeout(std::time::Duration::from_secs(5))
|
|
|
|
|
.build()
|
|
|
|
|
.context("build probe client")?;
|
|
|
|
|
// Prefer HTTPS since that's the failure mode we're catching (nginx
|
|
|
|
|
// 500 on the PWA). HTTP usually redirects to HTTPS and would mask
|
|
|
|
|
// the bug.
|
|
|
|
|
let resp = client
|
|
|
|
|
.get("https://127.0.0.1/")
|
|
|
|
|
.send()
|
|
|
|
|
.await
|
|
|
|
|
.context("probe GET https://127.0.0.1/")?;
|
|
|
|
|
let status = resp.status();
|
|
|
|
|
if status.is_success() || status.is_redirection() {
|
|
|
|
|
return Ok(());
|
|
|
|
|
}
|
|
|
|
|
anyhow::bail!("frontend probe returned HTTP {}", status);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Called from main.rs startup. If a pending-verification marker is
|
|
|
|
|
/// present, probe the frontend; on failure, trigger rollback and
|
|
|
|
|
/// restart the service so the OLD binary boots.
|
|
|
|
|
///
|
|
|
|
|
/// This is the "post-OTA auto-rollback" guardrail. If ANY problem in
|
|
|
|
|
/// the new version takes down the PWA (bad tarball perms as in v1.7.38,
|
|
|
|
|
/// a broken service worker, a missing asset, a backend panic on first
|
|
|
|
|
/// boot), the node self-heals back to the previous working state
|
|
|
|
|
/// without SSH intervention.
|
|
|
|
|
pub async fn verify_pending_update(data_dir: &Path) {
|
|
|
|
|
let marker = match read_pending_verification(data_dir).await {
|
|
|
|
|
Some(m) => m,
|
|
|
|
|
None => return, // No update pending; nothing to verify.
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
// Guard against a marker left behind by some earlier crash path —
|
|
|
|
|
// don't want a user who reboots days later to suddenly get
|
|
|
|
|
// rolled back because the marker was never cleared.
|
|
|
|
|
let applied_at = chrono::DateTime::parse_from_rfc3339(&marker.applied_at);
|
|
|
|
|
if let Ok(ts) = applied_at {
|
|
|
|
|
let age = chrono::Utc::now() - ts.with_timezone(&chrono::Utc);
|
|
|
|
|
if age.num_seconds() > PENDING_VERIFY_MAX_AGE_SECS {
|
|
|
|
|
tracing::warn!(
|
|
|
|
|
age_secs = age.num_seconds(),
|
|
|
|
|
"pending-verify marker is stale, clearing without probing"
|
|
|
|
|
);
|
|
|
|
|
clear_pending_verification(data_dir).await;
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
info!(
|
|
|
|
|
new_version = %marker.new_version,
|
|
|
|
|
previous_version = %marker.previous_version,
|
|
|
|
|
"Post-OTA verification: probing frontend at https://127.0.0.1/"
|
|
|
|
|
);
|
|
|
|
|
|
|
|
|
|
// Give the new service time to bind its listeners + nginx to
|
|
|
|
|
// pick up any config changes. 15s matches what we observed on
|
|
|
|
|
// .116 during the v1.7.40 rollout recovery.
|
|
|
|
|
tokio::time::sleep(std::time::Duration::from_secs(15)).await;
|
|
|
|
|
|
|
|
|
|
let deadline =
|
|
|
|
|
std::time::Instant::now() + std::time::Duration::from_secs(PENDING_VERIFY_WINDOW_SECS);
|
|
|
|
|
let mut attempt = 0u32;
|
|
|
|
|
let mut last_err: Option<String> = None;
|
|
|
|
|
|
|
|
|
|
while std::time::Instant::now() < deadline {
|
|
|
|
|
attempt += 1;
|
|
|
|
|
match probe_frontend_once().await {
|
|
|
|
|
Ok(()) => {
|
|
|
|
|
info!(
|
|
|
|
|
attempt,
|
|
|
|
|
"Post-OTA verification succeeded — clearing marker"
|
|
|
|
|
);
|
|
|
|
|
clear_pending_verification(data_dir).await;
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
Err(e) => {
|
|
|
|
|
let msg = e.to_string();
|
|
|
|
|
tracing::warn!(attempt, error = %msg, "Post-OTA probe failed, retrying");
|
|
|
|
|
last_err = Some(msg);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
tokio::time::sleep(std::time::Duration::from_secs(5)).await;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
tracing::error!(
|
|
|
|
|
attempts = attempt,
|
|
|
|
|
window_secs = PENDING_VERIFY_WINDOW_SECS,
|
|
|
|
|
last_error = last_err.as_deref().unwrap_or(""),
|
|
|
|
|
new_version = %marker.new_version,
|
|
|
|
|
previous_version = %marker.previous_version,
|
|
|
|
|
"Post-OTA verification FAILED — rolling back"
|
|
|
|
|
);
|
|
|
|
|
|
|
|
|
|
// Restore web-ui.bak on top of web-ui. update.rs keeps web-ui.bak
|
|
|
|
|
// from the previous apply; moving it back is the frontend half of
|
|
|
|
|
// the rollback. The binary half is handled by rollback_update().
|
|
|
|
|
let web_ui_bak = Path::new("/opt/archipelago/web-ui.bak");
|
|
|
|
|
let web_ui = "/opt/archipelago/web-ui";
|
|
|
|
|
if web_ui_bak.exists() {
|
|
|
|
|
let ts = chrono::Utc::now().timestamp_millis();
|
|
|
|
|
let quarantine = format!("/opt/archipelago/web-ui.failed.{}", ts);
|
|
|
|
|
let _ = host_sudo(&["mv", web_ui, &quarantine]).await;
|
|
|
|
|
let _ = host_sudo(&["mv", web_ui_bak.to_str().unwrap_or(""), web_ui]).await;
|
|
|
|
|
tracing::info!(quarantined = %quarantine, "Restored web-ui from web-ui.bak");
|
|
|
|
|
} else {
|
|
|
|
|
tracing::warn!(
|
|
|
|
|
"web-ui.bak not present — frontend cannot be rolled back, only binary"
|
|
|
|
|
);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if let Err(e) = rollback_update(data_dir).await {
|
|
|
|
|
tracing::error!(error = %e, "rollback_update() failed during post-OTA verification");
|
|
|
|
|
// Leave the marker in place so a future boot gets another shot.
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
clear_pending_verification(data_dir).await;
|
|
|
|
|
|
|
|
|
|
// Record why we rolled back so the UI can show it on the next boot.
|
|
|
|
|
if let Ok(mut state) = load_state(data_dir).await {
|
|
|
|
|
state.current_version = marker.previous_version.clone();
|
|
|
|
|
if let Err(e) = save_state(data_dir, &state).await {
|
|
|
|
|
tracing::warn!(error = %e, "Failed to update state after rollback");
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Restart so the old binary takes over. --no-block because we're
|
|
|
|
|
// the service; systemd can't wait for us to exit before starting
|
|
|
|
|
// the old process.
|
|
|
|
|
let _ = host_sudo(&["systemctl", "--no-block", "restart", "archipelago"]).await;
|
|
|
|
|
}
|
|
|
|
|
|
2026-03-09 07:43:12 +00:00
|
|
|
pub async fn load_state(data_dir: &Path) -> Result<UpdateState> {
|
|
|
|
|
let path = data_dir.join(UPDATE_STATE_FILE);
|
|
|
|
|
if !path.exists() {
|
|
|
|
|
let state = UpdateState::default();
|
|
|
|
|
save_state(data_dir, &state).await?;
|
|
|
|
|
return Ok(state);
|
|
|
|
|
}
|
|
|
|
|
let data = fs::read_to_string(&path)
|
|
|
|
|
.await
|
|
|
|
|
.context("Reading update state")?;
|
2026-04-20 10:03:38 -04:00
|
|
|
let mut state: UpdateState =
|
|
|
|
|
serde_json::from_str(&data).context("Parsing update state")?;
|
|
|
|
|
|
|
|
|
|
// Keep current_version in sync with the binary. Sideloaded nodes
|
|
|
|
|
// (ssh + cp /usr/local/bin/archipelago) don't touch the state file,
|
|
|
|
|
// so without this the running 1.7.0-alpha binary would keep seeing
|
|
|
|
|
// `current_version: "1.6.0-alpha"` and re-offer itself as an update.
|
|
|
|
|
let running = env!("CARGO_PKG_VERSION");
|
|
|
|
|
if state.current_version != running {
|
|
|
|
|
state.current_version = running.to_string();
|
2026-04-21 04:04:20 -04:00
|
|
|
// Binary version changed (sideload or apply). Any stored
|
|
|
|
|
// `available_update` is either redundant (points at the running
|
|
|
|
|
// version) or stale (points at a version we've already passed —
|
|
|
|
|
// which would surface as a "downgrade" offer in the UI). Clear
|
|
|
|
|
// it unconditionally; the next check_for_updates will repopulate
|
|
|
|
|
// if there's genuinely something newer.
|
|
|
|
|
state.available_update = None;
|
2026-04-21 13:05:42 -04:00
|
|
|
state.manifest_mirror = None;
|
2026-04-20 10:03:38 -04:00
|
|
|
save_state(data_dir, &state).await?;
|
|
|
|
|
}
|
|
|
|
|
Ok(state)
|
2026-03-09 07:43:12 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
pub async fn save_state(data_dir: &Path, state: &UpdateState) -> Result<()> {
|
|
|
|
|
let path = data_dir.join(UPDATE_STATE_FILE);
|
|
|
|
|
let data = serde_json::to_string_pretty(state)?;
|
chore(ci): rustfmt + clippy clean-up to unblock the Rust CI job
The .github/workflows/ci.yml Rust job runs cargo fmt --check, clippy
with -D warnings, and tests. All three were failing. This commit:
- Applies rustfmt across the tree (the bulk of the diff — untouched
since the last toolchain bump, so a wide sweep was unavoidable).
- Fixes the correctness-level clippy errors:
container/bitcoin_simulator.rs wildcard-in-or-pattern
container/manifest.rs from_str rename to parse (reserved name)
container/podman_client.rs .get(0) -> .first()
container/runtime.rs manual += collapse
archipelago/src/constants.rs doc-comment → module-doc
api/rpc/package/install.rs stray /// comment above a non-item
container/docker_packages.rs redundant field init
streaming/advertisement.rs missing Metric import in tests
tests/orchestration_tests.rs `vec!` in non-Vec contexts
mesh/listener/dispatch.rs unused store_plain_message import
api/rpc/tor/mod.rs and mesh/steganography.rs: push-after-new → vec!
- Quiets wide legacy surfaces with crate-level allows in main.rs for
stylistic lints (too_many_arguments, type_complexity, doc indent,
enum variant prefix, wildcard-in-or, assertions-on-constants,
drop_non_drop, unused_io_amount, ptr_arg) — these fired in dozens
of places with no correctness payoff and have been churning every
toolchain bump.
- Tags intentional-dead-code helpers: wallet/ and streaming/ modules
are WIP, mesh::send_chunked_payload and DM_V1_MARKER are kept for
rollback compatibility, vpn::get_nostr_vpn_status is surface-area
for a not-yet-landed RPC.
cargo fmt --check, cargo clippy --all-targets --all-features
-- -D warnings, and cargo test --all-features now all pass locally.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-18 17:23:46 -04:00
|
|
|
fs::write(&path, data).await.context("Writing update state")
|
2026-03-09 07:43:12 +00:00
|
|
|
}
|
|
|
|
|
|
2026-04-21 10:09:28 -04:00
|
|
|
/// Check for available updates by walking the mirror list. The first
|
|
|
|
|
/// mirror that returns a parseable manifest with a strictly-newer
|
|
|
|
|
/// version wins; if no mirror offers a newer version, the node is
|
|
|
|
|
/// reported as up-to-date. Per-mirror we retry up to 3 times on
|
|
|
|
|
/// transient failures.
|
|
|
|
|
///
|
|
|
|
|
/// Manifest `download_url`s are origin-rewritten to match the mirror
|
|
|
|
|
/// we fetched from, so switching mirrors in the UI also switches where
|
|
|
|
|
/// component downloads come from — even if the publisher baked an
|
|
|
|
|
/// absolute URL pointing at a different server into the manifest.
|
2026-03-09 07:43:12 +00:00
|
|
|
pub async fn check_for_updates(data_dir: &Path) -> Result<UpdateState> {
|
|
|
|
|
let mut state = load_state(data_dir).await?;
|
|
|
|
|
|
|
|
|
|
info!("Checking for updates...");
|
|
|
|
|
let client = reqwest::Client::builder()
|
2026-04-21 10:09:28 -04:00
|
|
|
// Short per-attempt HTTP timeout so a wedged mirror doesn't
|
|
|
|
|
// delay the whole check — we'd rather move on to the next
|
|
|
|
|
// mirror quickly than sit waiting on a slow one. 15s covers
|
|
|
|
|
// slow but alive mirrors.
|
2026-03-09 07:43:12 +00:00
|
|
|
.timeout(std::time::Duration::from_secs(15))
|
2026-04-20 17:17:58 -04:00
|
|
|
.connect_timeout(std::time::Duration::from_secs(10))
|
2026-03-09 07:43:12 +00:00
|
|
|
.build()
|
|
|
|
|
.context("Failed to create HTTP client")?;
|
|
|
|
|
|
2026-04-21 10:09:28 -04:00
|
|
|
// Env override (ARCHIPELAGO_UPDATE_URL) short-circuits the mirror
|
|
|
|
|
// list — used on dev boxes that point at a local gitea. Otherwise
|
|
|
|
|
// walk the operator-configured list and fall through on failure.
|
|
|
|
|
let mirrors: Vec<String> = if std::env::var("ARCHIPELAGO_UPDATE_URL").is_ok() {
|
|
|
|
|
vec![update_manifest_url()]
|
|
|
|
|
} else {
|
|
|
|
|
load_mirrors(data_dir)
|
|
|
|
|
.await
|
|
|
|
|
.unwrap_or_else(|_| default_mirrors())
|
|
|
|
|
.into_iter()
|
|
|
|
|
.map(|m| m.url)
|
|
|
|
|
.collect()
|
|
|
|
|
};
|
|
|
|
|
|
2026-04-20 17:17:58 -04:00
|
|
|
let mut last_err: Option<String> = None;
|
|
|
|
|
let mut handled = false;
|
2026-04-21 10:09:28 -04:00
|
|
|
'mirrors: for manifest_url in mirrors.iter() {
|
|
|
|
|
for attempt in 1..=3u8 {
|
|
|
|
|
if attempt > 1 {
|
|
|
|
|
tokio::time::sleep(std::time::Duration::from_secs(2)).await;
|
|
|
|
|
}
|
|
|
|
|
match client.get(manifest_url).send().await {
|
|
|
|
|
Ok(resp) if resp.status().is_success() => match resp.json::<UpdateManifest>().await {
|
|
|
|
|
Ok(mut manifest) => {
|
|
|
|
|
rewrite_manifest_origins(&mut manifest, manifest_url);
|
2026-04-21 04:04:20 -04:00
|
|
|
if is_newer(&manifest.version, &state.current_version) {
|
2026-04-20 17:17:58 -04:00
|
|
|
info!(
|
|
|
|
|
current = %state.current_version,
|
|
|
|
|
available = %manifest.version,
|
2026-04-21 10:09:28 -04:00
|
|
|
mirror = %manifest_url,
|
2026-04-20 17:17:58 -04:00
|
|
|
"Update available"
|
|
|
|
|
);
|
|
|
|
|
state.available_update = Some(manifest);
|
2026-04-21 13:05:42 -04:00
|
|
|
state.manifest_mirror = Some(manifest_url.clone());
|
2026-04-20 17:17:58 -04:00
|
|
|
} else {
|
2026-04-21 04:04:20 -04:00
|
|
|
// Manifest version matches us or is behind
|
2026-04-21 10:09:28 -04:00
|
|
|
// us — either we're current, or this mirror
|
|
|
|
|
// is stale. Try the next mirror; if all are
|
|
|
|
|
// stale or at our version we'll fall through
|
|
|
|
|
// to "up to date".
|
2026-04-21 04:04:20 -04:00
|
|
|
debug!(
|
|
|
|
|
current = %state.current_version,
|
|
|
|
|
manifest = %manifest.version,
|
2026-04-21 10:09:28 -04:00
|
|
|
mirror = %manifest_url,
|
2026-04-21 04:04:20 -04:00
|
|
|
"No newer version in manifest"
|
|
|
|
|
);
|
2026-04-21 10:09:28 -04:00
|
|
|
if state.available_update.is_some() {
|
|
|
|
|
// A later mirror might still have a
|
|
|
|
|
// newer version — don't clobber what an
|
|
|
|
|
// earlier mirror told us. But also don't
|
|
|
|
|
// break: another mirror could be ahead.
|
|
|
|
|
continue 'mirrors;
|
|
|
|
|
}
|
2026-04-21 13:05:42 -04:00
|
|
|
state.manifest_mirror = None;
|
2026-04-20 17:17:58 -04:00
|
|
|
state.available_update = None;
|
|
|
|
|
}
|
|
|
|
|
handled = true;
|
2026-04-21 10:09:28 -04:00
|
|
|
break 'mirrors;
|
2026-04-20 17:17:58 -04:00
|
|
|
}
|
2026-04-21 10:09:28 -04:00
|
|
|
Err(e) => last_err = Some(format!("{}: parse: {}", manifest_url, e)),
|
|
|
|
|
},
|
|
|
|
|
Ok(resp) => {
|
|
|
|
|
last_err = Some(format!("{}: HTTP {}", manifest_url, resp.status()));
|
|
|
|
|
}
|
|
|
|
|
Err(e) => {
|
|
|
|
|
last_err = Some(format!("{}: {}", manifest_url, e));
|
2026-04-20 17:17:58 -04:00
|
|
|
}
|
|
|
|
|
}
|
2026-03-09 07:43:12 +00:00
|
|
|
}
|
2026-04-21 10:09:28 -04:00
|
|
|
tracing::debug!(mirror = %manifest_url, "Mirror exhausted, trying next");
|
2026-04-20 17:17:58 -04:00
|
|
|
}
|
|
|
|
|
if !handled {
|
|
|
|
|
if let Some(e) = last_err {
|
2026-04-21 10:09:28 -04:00
|
|
|
debug!("Update check failed across all mirrors: {}", e);
|
2026-03-09 07:43:12 +00:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
state.last_check = Some(chrono::Utc::now().to_rfc3339());
|
|
|
|
|
save_state(data_dir, &state).await?;
|
|
|
|
|
Ok(state)
|
|
|
|
|
}
|
|
|
|
|
|
2026-04-21 13:05:42 -04:00
|
|
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
|
|
|
pub struct MirrorTestResult {
|
|
|
|
|
pub reachable: bool,
|
|
|
|
|
pub latency_ms: u64,
|
|
|
|
|
pub http_status: Option<u16>,
|
|
|
|
|
pub error: Option<String>,
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Ping a mirror's manifest URL and return reachability + wall-clock
|
|
|
|
|
/// latency. Used by the "Test mirror" button so operators can sanity-
|
|
|
|
|
/// check a newly added mirror without running a full update check.
|
|
|
|
|
pub async fn test_mirror(url: &str) -> MirrorTestResult {
|
|
|
|
|
let client = match reqwest::Client::builder()
|
|
|
|
|
.timeout(std::time::Duration::from_secs(10))
|
|
|
|
|
.connect_timeout(std::time::Duration::from_secs(5))
|
|
|
|
|
.build()
|
|
|
|
|
{
|
|
|
|
|
Ok(c) => c,
|
|
|
|
|
Err(e) => {
|
|
|
|
|
return MirrorTestResult {
|
|
|
|
|
reachable: false,
|
|
|
|
|
latency_ms: 0,
|
|
|
|
|
http_status: None,
|
|
|
|
|
error: Some(format!("client build failed: {}", e)),
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
};
|
|
|
|
|
let start = std::time::Instant::now();
|
|
|
|
|
match client.get(url).send().await {
|
|
|
|
|
Ok(resp) => {
|
|
|
|
|
let latency_ms = start.elapsed().as_millis() as u64;
|
|
|
|
|
let status = resp.status();
|
|
|
|
|
if status.is_success() {
|
|
|
|
|
MirrorTestResult {
|
|
|
|
|
reachable: true,
|
|
|
|
|
latency_ms,
|
|
|
|
|
http_status: Some(status.as_u16()),
|
|
|
|
|
error: None,
|
|
|
|
|
}
|
|
|
|
|
} else {
|
|
|
|
|
MirrorTestResult {
|
|
|
|
|
reachable: false,
|
|
|
|
|
latency_ms,
|
|
|
|
|
http_status: Some(status.as_u16()),
|
|
|
|
|
error: Some(format!("HTTP {}", status.as_u16())),
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
Err(e) => {
|
|
|
|
|
let latency_ms = start.elapsed().as_millis() as u64;
|
|
|
|
|
MirrorTestResult {
|
|
|
|
|
reachable: false,
|
|
|
|
|
latency_ms,
|
|
|
|
|
http_status: None,
|
|
|
|
|
error: Some(e.to_string()),
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2026-03-09 07:43:12 +00:00
|
|
|
/// Get current update status without checking remote.
|
|
|
|
|
pub async fn get_status(data_dir: &Path) -> Result<UpdateState> {
|
|
|
|
|
load_state(data_dir).await
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Dismiss the available update notification.
|
|
|
|
|
pub async fn dismiss_update(data_dir: &Path) -> Result<()> {
|
|
|
|
|
let mut state = load_state(data_dir).await?;
|
|
|
|
|
state.available_update = None;
|
|
|
|
|
save_state(data_dir, &state).await
|
|
|
|
|
}
|
2026-03-11 10:57:33 +00:00
|
|
|
|
|
|
|
|
/// Download update components to a staging directory.
|
|
|
|
|
/// Verifies SHA256 hash for each component.
|
2026-04-20 17:17:58 -04:00
|
|
|
///
|
|
|
|
|
/// Robustness: each component download is **resumable** via HTTP Range
|
|
|
|
|
/// requests and retried up to 6 times with exponential backoff. When
|
|
|
|
|
/// gitea drops the connection mid-stream (happens regularly at slow
|
|
|
|
|
/// raw-file throughput), the next attempt picks up where the previous
|
|
|
|
|
/// one left off instead of restarting from byte zero. SHA256 is
|
|
|
|
|
/// verified over the complete file at the end of each component, so a
|
|
|
|
|
/// partially-corrupt resume still fails cleanly.
|
2026-03-11 10:57:33 +00:00
|
|
|
pub async fn download_update(data_dir: &Path) -> Result<DownloadProgress> {
|
|
|
|
|
let state = load_state(data_dir).await?;
|
|
|
|
|
let manifest = state
|
|
|
|
|
.available_update
|
|
|
|
|
.as_ref()
|
|
|
|
|
.ok_or_else(|| anyhow::anyhow!("No update available to download"))?;
|
|
|
|
|
|
|
|
|
|
let staging_dir = data_dir.join("update-staging");
|
|
|
|
|
fs::create_dir_all(&staging_dir)
|
|
|
|
|
.await
|
|
|
|
|
.context("Failed to create staging dir")?;
|
|
|
|
|
|
|
|
|
|
let client = reqwest::Client::builder()
|
2026-04-20 17:17:58 -04:00
|
|
|
// Per-request budget; each attempt gets the full hour. A retry
|
|
|
|
|
// restarts the budget cleanly.
|
2026-04-20 16:40:25 -04:00
|
|
|
.timeout(std::time::Duration::from_secs(3600))
|
2026-04-20 09:03:24 -04:00
|
|
|
.connect_timeout(std::time::Duration::from_secs(30))
|
2026-03-11 10:57:33 +00:00
|
|
|
.build()
|
|
|
|
|
.context("Failed to create HTTP client")?;
|
|
|
|
|
|
|
|
|
|
let mut downloaded = 0u64;
|
|
|
|
|
let total_bytes: u64 = manifest.components.iter().map(|c| c.size_bytes).sum();
|
|
|
|
|
|
2026-04-20 20:20:36 -04:00
|
|
|
info!(
|
|
|
|
|
version = %manifest.version,
|
|
|
|
|
components = manifest.components.len(),
|
|
|
|
|
total_bytes,
|
|
|
|
|
staging = %staging_dir.display(),
|
|
|
|
|
"Starting update download"
|
|
|
|
|
);
|
|
|
|
|
|
2026-04-20 19:10:34 -04:00
|
|
|
// Clear any stale cancel flag from a prior aborted run, then seed
|
|
|
|
|
// the live counters so polls during the handshake show the right
|
|
|
|
|
// denominator immediately instead of 0/0 → NaN%.
|
|
|
|
|
DOWNLOAD_CANCEL.store(false, Ordering::Relaxed);
|
2026-04-20 17:17:58 -04:00
|
|
|
DOWNLOAD_TOTAL.store(total_bytes, Ordering::Relaxed);
|
|
|
|
|
DOWNLOAD_BYTES.store(0, Ordering::Relaxed);
|
2026-04-20 19:10:34 -04:00
|
|
|
DOWNLOAD_PROGRESS_AT.store(now_ms(), Ordering::Relaxed);
|
2026-04-20 17:17:58 -04:00
|
|
|
|
2026-03-11 10:57:33 +00:00
|
|
|
for component in &manifest.components {
|
2026-04-20 19:10:34 -04:00
|
|
|
if is_canceled() {
|
|
|
|
|
DOWNLOAD_TOTAL.store(0, Ordering::Relaxed);
|
|
|
|
|
DOWNLOAD_BYTES.store(0, Ordering::Relaxed);
|
|
|
|
|
anyhow::bail!("Download canceled");
|
|
|
|
|
}
|
2026-03-11 10:57:33 +00:00
|
|
|
info!(name = %component.name, url = %component.download_url, "Downloading component");
|
|
|
|
|
let dest = staging_dir.join(&component.name);
|
2026-04-20 17:17:58 -04:00
|
|
|
download_component_resumable(&client, component, &dest, downloaded).await?;
|
2026-03-11 10:57:33 +00:00
|
|
|
downloaded += component.size_bytes;
|
2026-04-20 17:17:58 -04:00
|
|
|
DOWNLOAD_BYTES.store(downloaded, Ordering::Relaxed);
|
2026-03-11 10:57:33 +00:00
|
|
|
info!(
|
|
|
|
|
name = %component.name,
|
2026-04-20 17:17:58 -04:00
|
|
|
bytes = component.size_bytes,
|
2026-03-11 10:57:33 +00:00
|
|
|
"Component downloaded and verified"
|
|
|
|
|
);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Mark update as downloaded
|
|
|
|
|
let mut state = load_state(data_dir).await?;
|
|
|
|
|
state.update_in_progress = true;
|
|
|
|
|
save_state(data_dir, &state).await?;
|
|
|
|
|
|
|
|
|
|
Ok(DownloadProgress {
|
|
|
|
|
total_bytes,
|
|
|
|
|
downloaded_bytes: downloaded,
|
|
|
|
|
components_downloaded: manifest.components.len(),
|
|
|
|
|
staging_dir: staging_dir.to_string_lossy().to_string(),
|
|
|
|
|
})
|
|
|
|
|
}
|
|
|
|
|
|
2026-04-20 17:17:58 -04:00
|
|
|
/// Download a single component to `dest`, resuming from the end of
|
|
|
|
|
/// any existing partial file via a Range request. Retries up to 6
|
|
|
|
|
/// times with exponential backoff (5s, 15s, 30s, 60s, 120s, 180s).
|
|
|
|
|
/// Verifies the SHA256 over the full file at the end.
|
|
|
|
|
async fn download_component_resumable(
|
|
|
|
|
client: &reqwest::Client,
|
|
|
|
|
component: &ComponentUpdate,
|
|
|
|
|
dest: &Path,
|
|
|
|
|
prior_total: u64,
|
|
|
|
|
) -> Result<()> {
|
|
|
|
|
use sha2::{Digest, Sha256};
|
|
|
|
|
use tokio::io::AsyncWriteExt;
|
|
|
|
|
const MAX_ATTEMPTS: u32 = 6;
|
|
|
|
|
const BACKOFFS: [u64; 5] = [5, 15, 30, 60, 120];
|
|
|
|
|
|
|
|
|
|
let mut last_err: Option<anyhow::Error> = None;
|
|
|
|
|
for attempt in 1..=MAX_ATTEMPTS {
|
|
|
|
|
let existing_len = match tokio::fs::metadata(dest).await {
|
|
|
|
|
Ok(m) => m.len(),
|
|
|
|
|
Err(_) => 0,
|
|
|
|
|
};
|
|
|
|
|
if existing_len >= component.size_bytes {
|
|
|
|
|
// File is already complete — break out and go verify.
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
if attempt > 1 {
|
|
|
|
|
let delay = BACKOFFS[(attempt as usize - 2).min(BACKOFFS.len() - 1)];
|
|
|
|
|
tracing::warn!(
|
|
|
|
|
name = %component.name,
|
|
|
|
|
attempt,
|
|
|
|
|
resume_at = existing_len,
|
|
|
|
|
"Retrying download in {}s (previous error: {})",
|
|
|
|
|
delay,
|
|
|
|
|
last_err.as_ref().map(|e| e.to_string()).unwrap_or_default()
|
|
|
|
|
);
|
2026-04-20 19:10:34 -04:00
|
|
|
// Sleep in 500ms slices so a Cancel during backoff wakes
|
|
|
|
|
// promptly instead of waiting out the full exponential window.
|
|
|
|
|
let slices = delay * 2;
|
|
|
|
|
for _ in 0..slices {
|
|
|
|
|
if is_canceled() {
|
|
|
|
|
anyhow::bail!("Download canceled");
|
|
|
|
|
}
|
|
|
|
|
tokio::time::sleep(std::time::Duration::from_millis(500)).await;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
if is_canceled() {
|
|
|
|
|
anyhow::bail!("Download canceled");
|
2026-04-20 17:17:58 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
let mut req = client.get(&component.download_url);
|
|
|
|
|
if existing_len > 0 {
|
|
|
|
|
req = req.header("Range", format!("bytes={}-", existing_len));
|
|
|
|
|
}
|
|
|
|
|
let resp = match req.send().await {
|
|
|
|
|
Ok(r) => r,
|
|
|
|
|
Err(e) => {
|
|
|
|
|
last_err = Some(anyhow::anyhow!(e));
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
};
|
|
|
|
|
let status = resp.status();
|
|
|
|
|
// 200 OK on a fresh start, 206 Partial Content on a resume
|
|
|
|
|
// that the server honoured. Anything else is a problem.
|
|
|
|
|
let is_resume = existing_len > 0 && status == reqwest::StatusCode::PARTIAL_CONTENT;
|
|
|
|
|
let is_fresh = existing_len == 0 && status.is_success();
|
|
|
|
|
let server_ignored_range = existing_len > 0 && status == reqwest::StatusCode::OK;
|
|
|
|
|
if !is_resume && !is_fresh && !server_ignored_range {
|
|
|
|
|
last_err = Some(anyhow::anyhow!(
|
|
|
|
|
"HTTP {} for {} (resume offset {})",
|
|
|
|
|
status,
|
|
|
|
|
component.name,
|
|
|
|
|
existing_len
|
|
|
|
|
));
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
// If the server ignored Range (returned 200 with the full
|
|
|
|
|
// body), wipe the partial file and start over.
|
|
|
|
|
let mut file = if server_ignored_range {
|
|
|
|
|
let _ = tokio::fs::remove_file(dest).await;
|
|
|
|
|
tokio::fs::OpenOptions::new()
|
|
|
|
|
.create(true)
|
|
|
|
|
.write(true)
|
|
|
|
|
.truncate(true)
|
|
|
|
|
.open(dest)
|
|
|
|
|
.await
|
|
|
|
|
.context("open staging file")?
|
|
|
|
|
} else if is_resume {
|
|
|
|
|
tokio::fs::OpenOptions::new()
|
|
|
|
|
.append(true)
|
|
|
|
|
.open(dest)
|
|
|
|
|
.await
|
|
|
|
|
.context("open staging file for append")?
|
|
|
|
|
} else {
|
|
|
|
|
tokio::fs::OpenOptions::new()
|
|
|
|
|
.create(true)
|
|
|
|
|
.write(true)
|
|
|
|
|
.truncate(true)
|
|
|
|
|
.open(dest)
|
|
|
|
|
.await
|
|
|
|
|
.context("open staging file")?
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
let mut resp = resp;
|
|
|
|
|
let mut stream_err = false;
|
|
|
|
|
let mut on_disk = existing_len;
|
2026-04-20 19:10:34 -04:00
|
|
|
let mut canceled = false;
|
2026-04-20 17:17:58 -04:00
|
|
|
loop {
|
2026-04-20 19:10:34 -04:00
|
|
|
if is_canceled() {
|
|
|
|
|
canceled = true;
|
|
|
|
|
break;
|
|
|
|
|
}
|
2026-04-20 17:17:58 -04:00
|
|
|
match resp.chunk().await {
|
|
|
|
|
Ok(Some(bytes)) => {
|
|
|
|
|
if let Err(e) = file.write_all(&bytes).await {
|
|
|
|
|
last_err = Some(anyhow::anyhow!(e).context("writing chunk"));
|
|
|
|
|
stream_err = true;
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
on_disk += bytes.len() as u64;
|
|
|
|
|
DOWNLOAD_BYTES.store(
|
|
|
|
|
prior_total + on_disk.min(component.size_bytes),
|
|
|
|
|
Ordering::Relaxed,
|
|
|
|
|
);
|
2026-04-20 19:10:34 -04:00
|
|
|
DOWNLOAD_PROGRESS_AT.store(now_ms(), Ordering::Relaxed);
|
2026-04-20 17:17:58 -04:00
|
|
|
}
|
|
|
|
|
Ok(None) => break, // stream ended cleanly
|
|
|
|
|
Err(e) => {
|
|
|
|
|
last_err = Some(anyhow::anyhow!(e).context("reading chunk"));
|
|
|
|
|
stream_err = true;
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
2026-04-20 19:10:34 -04:00
|
|
|
if canceled {
|
|
|
|
|
let _ = file.flush().await;
|
|
|
|
|
drop(file);
|
|
|
|
|
DOWNLOAD_TOTAL.store(0, Ordering::Relaxed);
|
|
|
|
|
DOWNLOAD_BYTES.store(0, Ordering::Relaxed);
|
|
|
|
|
anyhow::bail!("Download canceled");
|
|
|
|
|
}
|
2026-04-20 17:17:58 -04:00
|
|
|
let _ = file.flush().await;
|
|
|
|
|
let _ = file.sync_all().await;
|
|
|
|
|
drop(file);
|
|
|
|
|
if stream_err {
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Stream ended cleanly. If we've got the expected size, verify
|
|
|
|
|
// the SHA and succeed. Otherwise loop to resume from the new
|
|
|
|
|
// offset on the next attempt.
|
|
|
|
|
let final_len = tokio::fs::metadata(dest)
|
|
|
|
|
.await
|
|
|
|
|
.map(|m| m.len())
|
|
|
|
|
.unwrap_or(0);
|
|
|
|
|
if final_len < component.size_bytes {
|
|
|
|
|
last_err = Some(anyhow::anyhow!(
|
|
|
|
|
"download truncated: got {} of {} bytes",
|
|
|
|
|
final_len,
|
|
|
|
|
component.size_bytes
|
|
|
|
|
));
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Full file — verify hash.
|
|
|
|
|
let bytes = tokio::fs::read(dest)
|
|
|
|
|
.await
|
|
|
|
|
.context("read staging file for hash check")?;
|
|
|
|
|
let hash = hex::encode(Sha256::digest(&bytes));
|
|
|
|
|
if hash == component.sha256 {
|
|
|
|
|
return Ok(());
|
|
|
|
|
}
|
|
|
|
|
// SHA mismatch — the file on disk is garbage. Nuke it and
|
|
|
|
|
// start over from scratch on the next attempt.
|
|
|
|
|
let _ = tokio::fs::remove_file(dest).await;
|
|
|
|
|
last_err = Some(anyhow::anyhow!(
|
|
|
|
|
"SHA256 mismatch for {}: expected {}, got {}",
|
|
|
|
|
component.name,
|
|
|
|
|
component.sha256,
|
|
|
|
|
hash
|
|
|
|
|
));
|
|
|
|
|
}
|
|
|
|
|
Err(last_err.unwrap_or_else(|| anyhow::anyhow!("download failed without a captured error")))
|
|
|
|
|
}
|
|
|
|
|
|
2026-04-20 19:10:34 -04:00
|
|
|
/// Cancel an in-flight download. Sets the cancellation flag so the
|
|
|
|
|
/// download loop bails out at the next chunk or backoff boundary, then
|
|
|
|
|
/// zeros the live counters and wipes the staging directory so the UI
|
|
|
|
|
/// sees "no active download" immediately and the next attempt starts
|
|
|
|
|
/// clean. Safe to call even when no download is running.
|
|
|
|
|
pub async fn cancel_download(data_dir: &Path) -> Result<()> {
|
|
|
|
|
DOWNLOAD_CANCEL.store(true, Ordering::Relaxed);
|
|
|
|
|
DOWNLOAD_BYTES.store(0, Ordering::Relaxed);
|
|
|
|
|
DOWNLOAD_TOTAL.store(0, Ordering::Relaxed);
|
|
|
|
|
let staging = data_dir.join("update-staging");
|
2026-04-20 20:20:36 -04:00
|
|
|
let wiped = if staging.exists() {
|
|
|
|
|
tokio::fs::remove_dir_all(&staging).await.is_ok()
|
|
|
|
|
} else {
|
|
|
|
|
false
|
|
|
|
|
};
|
2026-04-20 19:10:34 -04:00
|
|
|
// Clear the "downloaded, ready to apply" marker too — a canceled
|
|
|
|
|
// download is not a staged update.
|
2026-04-20 20:20:36 -04:00
|
|
|
let mut cleared_marker = false;
|
2026-04-20 19:10:34 -04:00
|
|
|
if let Ok(mut state) = load_state(data_dir).await {
|
|
|
|
|
if state.update_in_progress {
|
|
|
|
|
state.update_in_progress = false;
|
|
|
|
|
let _ = save_state(data_dir, &state).await;
|
2026-04-20 20:20:36 -04:00
|
|
|
cleared_marker = true;
|
2026-04-20 19:10:34 -04:00
|
|
|
}
|
|
|
|
|
}
|
2026-04-20 20:20:36 -04:00
|
|
|
info!(
|
|
|
|
|
staging = %staging.display(),
|
|
|
|
|
wiped,
|
|
|
|
|
cleared_marker,
|
|
|
|
|
"Update download canceled"
|
|
|
|
|
);
|
2026-04-20 19:10:34 -04:00
|
|
|
Ok(())
|
|
|
|
|
}
|
|
|
|
|
|
2026-04-20 13:46:03 -04:00
|
|
|
/// Run a command as root, but *outside* the archipelago service's
|
|
|
|
|
/// restricted mount namespace.
|
|
|
|
|
///
|
|
|
|
|
/// archipelago.service uses `ProtectSystem=strict`, which makes `/opt`
|
|
|
|
|
/// and `/usr` read-only inside the service — and sudo inherits the
|
|
|
|
|
/// namespace, so `sudo mv /opt/archipelago/...` fails with EROFS even
|
|
|
|
|
/// though sudo itself is root. `systemd-run --wait` spawns a transient
|
|
|
|
|
/// service unit that inherits systemd's default protections (i.e. none
|
|
|
|
|
/// of ours), escaping the namespace.
|
2026-04-22 08:29:56 -04:00
|
|
|
pub(crate) async fn host_sudo(args: &[&str]) -> Result<std::process::ExitStatus> {
|
2026-04-20 13:46:03 -04:00
|
|
|
let mut full: Vec<&str> = vec![
|
|
|
|
|
"systemd-run",
|
|
|
|
|
"--wait",
|
|
|
|
|
"--quiet",
|
|
|
|
|
"--collect",
|
|
|
|
|
"--pipe",
|
|
|
|
|
"--",
|
|
|
|
|
];
|
|
|
|
|
full.extend_from_slice(args);
|
|
|
|
|
tokio::process::Command::new("sudo")
|
|
|
|
|
.args(&full)
|
|
|
|
|
.status()
|
|
|
|
|
.await
|
|
|
|
|
.context("sudo systemd-run spawn failed")
|
|
|
|
|
}
|
|
|
|
|
|
2026-03-11 10:57:33 +00:00
|
|
|
/// Apply a downloaded update. Backs up current binaries, replaces with staged versions.
|
|
|
|
|
pub async fn apply_update(data_dir: &Path) -> Result<()> {
|
|
|
|
|
let staging_dir = data_dir.join("update-staging");
|
|
|
|
|
if !staging_dir.exists() {
|
|
|
|
|
anyhow::bail!("No staged update found. Download first.");
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
let backup_dir = data_dir.join("update-backup");
|
|
|
|
|
fs::create_dir_all(&backup_dir)
|
|
|
|
|
.await
|
|
|
|
|
.context("Failed to create backup dir")?;
|
|
|
|
|
|
2026-04-20 20:20:36 -04:00
|
|
|
info!(
|
|
|
|
|
staging = %staging_dir.display(),
|
|
|
|
|
backup = %backup_dir.display(),
|
|
|
|
|
"Applying staged update"
|
|
|
|
|
);
|
|
|
|
|
|
2026-03-11 10:57:33 +00:00
|
|
|
// Back up current backend binary
|
|
|
|
|
let current_binary = Path::new("/usr/local/bin/archipelago");
|
|
|
|
|
if current_binary.exists() {
|
|
|
|
|
let backup_path = backup_dir.join("archipelago");
|
|
|
|
|
fs::copy(current_binary, &backup_path)
|
|
|
|
|
.await
|
|
|
|
|
.context("Failed to backup current binary")?;
|
|
|
|
|
info!("Current binary backed up");
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Apply staged components
|
|
|
|
|
let mut entries = fs::read_dir(&staging_dir)
|
|
|
|
|
.await
|
|
|
|
|
.context("Failed to read staging dir")?;
|
|
|
|
|
|
|
|
|
|
while let Some(entry) = entries.next_entry().await? {
|
|
|
|
|
let name = entry.file_name().to_string_lossy().to_string();
|
|
|
|
|
let src = entry.path();
|
|
|
|
|
|
2026-04-01 16:18:58 +01:00
|
|
|
match name.as_str() {
|
|
|
|
|
"archipelago" => {
|
2026-04-20 13:46:03 -04:00
|
|
|
// Two namespace gotchas this block works around:
|
|
|
|
|
// 1. We're running FROM /usr/local/bin/archipelago, so
|
|
|
|
|
// `install`/`cp` (O_TRUNC + write) fail with ETXTBSY.
|
|
|
|
|
// Use `mv`, which is atomic rename() and tolerates a
|
|
|
|
|
// busy destination.
|
|
|
|
|
// 2. archipelago.service sets ProtectSystem=strict, so
|
|
|
|
|
// even `sudo mv` into /usr/local/bin/ fails EROFS —
|
|
|
|
|
// sudo inherits the service's mount namespace. Route
|
|
|
|
|
// the rename through systemd-run so it runs in a
|
|
|
|
|
// transient unit with default protections.
|
2026-04-20 13:04:09 -04:00
|
|
|
let staged = src.to_string_lossy().to_string();
|
2026-04-20 13:46:03 -04:00
|
|
|
let _ = host_sudo(&["chmod", "0755", &staged]).await;
|
|
|
|
|
let _ = host_sudo(&["chown", "root:root", &staged]).await;
|
|
|
|
|
let status = host_sudo(&["mv", &staged, "/usr/local/bin/archipelago"])
|
2026-04-01 16:18:58 +01:00
|
|
|
.await
|
2026-04-20 13:04:09 -04:00
|
|
|
.with_context(|| format!("Failed to spawn mv for {}", name))?;
|
2026-04-20 11:25:10 -04:00
|
|
|
if !status.success() {
|
|
|
|
|
anyhow::bail!(
|
2026-04-20 13:46:03 -04:00
|
|
|
"mv into /usr/local/bin failed for {} (exit {:?})",
|
2026-04-20 11:25:10 -04:00
|
|
|
name,
|
|
|
|
|
status.code()
|
|
|
|
|
);
|
2026-03-11 10:57:33 +00:00
|
|
|
}
|
2026-04-01 16:18:58 +01:00
|
|
|
info!(name = %name, "Backend binary applied");
|
|
|
|
|
}
|
|
|
|
|
_ if name.contains("frontend") && name.ends_with(".tar.gz") => {
|
2026-04-20 12:33:10 -04:00
|
|
|
// Tarball contents are the *inside* of web-ui/ (root entries
|
|
|
|
|
// `./test-aiui.html`, `./assets/`, ...). Extract into a
|
|
|
|
|
// uniquely-named staging dir, then mv into place. No `rm
|
|
|
|
|
// -rf` pre-cleanup — that's what hit transient EROFS on
|
|
|
|
|
// .198 and aborted the apply mid-flight.
|
|
|
|
|
let ts = chrono::Utc::now().timestamp_millis();
|
|
|
|
|
let staging_new = format!("/opt/archipelago/web-ui.new.{}", ts);
|
|
|
|
|
let staging_old = format!("/opt/archipelago/web-ui.old.{}", ts);
|
|
|
|
|
let web_ui = "/opt/archipelago/web-ui";
|
2026-04-20 12:02:14 -04:00
|
|
|
let backup_path = "/opt/archipelago/web-ui.bak";
|
2026-04-20 12:33:10 -04:00
|
|
|
|
2026-04-20 13:46:03 -04:00
|
|
|
// All sudo calls that touch /opt/archipelago go through
|
|
|
|
|
// host_sudo so they see a normal root mount namespace.
|
|
|
|
|
let mk = host_sudo(&["mkdir", "-p", &staging_new])
|
2026-04-20 12:02:14 -04:00
|
|
|
.await
|
|
|
|
|
.context("Failed to create frontend staging dir")?;
|
|
|
|
|
if !mk.success() {
|
2026-04-20 12:33:10 -04:00
|
|
|
anyhow::bail!("mkdir {} failed", staging_new);
|
2026-04-01 16:18:58 +01:00
|
|
|
}
|
2026-04-20 13:46:03 -04:00
|
|
|
let extract = host_sudo(&[
|
|
|
|
|
"tar",
|
|
|
|
|
"-xzf",
|
|
|
|
|
&src.to_string_lossy(),
|
|
|
|
|
"-C",
|
|
|
|
|
&staging_new,
|
|
|
|
|
])
|
|
|
|
|
.await
|
|
|
|
|
.with_context(|| format!("Failed to extract {}", name))?;
|
2026-04-20 12:33:10 -04:00
|
|
|
if !extract.success() {
|
2026-04-20 13:46:03 -04:00
|
|
|
let _ = host_sudo(&["rm", "-rf", &staging_new]).await;
|
2026-04-01 16:18:58 +01:00
|
|
|
anyhow::bail!("tar extraction failed for {}", name);
|
|
|
|
|
}
|
2026-04-20 13:46:03 -04:00
|
|
|
let _ = host_sudo(&[
|
|
|
|
|
"chown",
|
|
|
|
|
"-R",
|
|
|
|
|
"archipelago:archipelago",
|
|
|
|
|
&staging_new,
|
|
|
|
|
])
|
|
|
|
|
.await;
|
2026-04-20 12:33:10 -04:00
|
|
|
|
2026-04-22 13:26:54 -04:00
|
|
|
// Set world-readable perms so nginx (runs as www-data)
|
|
|
|
|
// can stat + serve the files. Without this, the tar
|
|
|
|
|
// extraction inherits the staging-dir's 700 mode and
|
|
|
|
|
// nginx returns 403/500 for every request after the
|
|
|
|
|
// swap — exactly what bit .116 on the v1.7.38 rollout.
|
|
|
|
|
let _ = host_sudo(&["chmod", "755", &staging_new]).await;
|
|
|
|
|
let _ = host_sudo(&[
|
|
|
|
|
"find", &staging_new, "-type", "d", "-exec", "chmod", "755", "{}", "+",
|
|
|
|
|
])
|
|
|
|
|
.await;
|
|
|
|
|
let _ = host_sudo(&[
|
|
|
|
|
"find", &staging_new, "-type", "f", "-exec", "chmod", "644", "{}", "+",
|
|
|
|
|
])
|
|
|
|
|
.await;
|
|
|
|
|
|
2026-04-20 16:40:25 -04:00
|
|
|
// Preserve paths that are installed outside the Vue build
|
|
|
|
|
// (baked in by the ISO or sibling installers) and so
|
|
|
|
|
// aren't in the new tarball. Without this copy, every OTA
|
|
|
|
|
// wipes them — notably aiui/ (Claude Code sidebar) and
|
|
|
|
|
// the companion APK. `cp -a` preserves mode/ownership.
|
|
|
|
|
for preserved in ["aiui", "archipelago-companion.apk"] {
|
|
|
|
|
let src = format!("{}/{}", web_ui, preserved);
|
|
|
|
|
let dst = format!("{}/{}", staging_new, preserved);
|
|
|
|
|
// Only preserve the old copy if the new tarball
|
|
|
|
|
// doesn't already ship a fresher one.
|
|
|
|
|
if Path::new(&src).exists() && !Path::new(&dst).exists() {
|
|
|
|
|
let _ = host_sudo(&["cp", "-a", &src, &dst]).await;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2026-04-20 12:33:10 -04:00
|
|
|
// Swap: mv current web-ui aside, then mv new into place.
|
2026-04-20 12:02:14 -04:00
|
|
|
if Path::new(web_ui).exists() {
|
2026-04-20 13:46:03 -04:00
|
|
|
let mv_old = host_sudo(&["mv", web_ui, &staging_old])
|
2026-04-20 12:02:14 -04:00
|
|
|
.await
|
|
|
|
|
.context("Failed to rotate old web-ui")?;
|
|
|
|
|
if !mv_old.success() {
|
|
|
|
|
anyhow::bail!("failed to move old web-ui aside");
|
|
|
|
|
}
|
|
|
|
|
}
|
2026-04-20 13:46:03 -04:00
|
|
|
let mv_new = host_sudo(&["mv", &staging_new, web_ui])
|
2026-04-20 12:02:14 -04:00
|
|
|
.await
|
|
|
|
|
.context("Failed to swap new web-ui into place")?;
|
|
|
|
|
if !mv_new.success() {
|
2026-04-20 12:33:10 -04:00
|
|
|
if Path::new(&staging_old).exists() {
|
2026-04-20 13:46:03 -04:00
|
|
|
let _ = host_sudo(&["mv", &staging_old, web_ui]).await;
|
2026-04-20 12:33:10 -04:00
|
|
|
}
|
2026-04-20 12:02:14 -04:00
|
|
|
anyhow::bail!("failed to move new web-ui into place");
|
|
|
|
|
}
|
2026-04-20 12:33:10 -04:00
|
|
|
|
2026-04-20 13:46:03 -04:00
|
|
|
// Rotate previous rollback aside and install this apply's
|
|
|
|
|
// old copy as the new rollback.
|
2026-04-20 12:33:10 -04:00
|
|
|
if Path::new(&staging_old).exists() {
|
|
|
|
|
if Path::new(backup_path).exists() {
|
2026-04-20 13:46:03 -04:00
|
|
|
let _ = host_sudo(&[
|
|
|
|
|
"mv",
|
|
|
|
|
backup_path,
|
|
|
|
|
&format!("{}.{}", backup_path, ts),
|
|
|
|
|
])
|
2026-04-20 12:33:10 -04:00
|
|
|
.await;
|
2026-04-20 13:46:03 -04:00
|
|
|
}
|
|
|
|
|
let _ = host_sudo(&["mv", &staging_old, backup_path]).await;
|
2026-04-20 12:33:10 -04:00
|
|
|
}
|
2026-04-01 16:18:58 +01:00
|
|
|
info!(name = %name, "Frontend archive extracted to /opt/archipelago/web-ui");
|
|
|
|
|
}
|
|
|
|
|
_ => {
|
|
|
|
|
debug!(name = %name, "Unknown component, skipping");
|
2026-03-11 10:57:33 +00:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Update state
|
release(v1.7.41-alpha): post-OTA auto-rollback so a bad release cannot strand the fleet
Closes failure mode FM5 from docs/bulletproof-containers.md: the v1.7.38 +
v1.7.39 rollouts left every affected node on an unreachable UI (nginx 500)
with no recovery path short of SSH. This release adds a self-check
guardrail to the update flow.
What changed:
- apply_update() writes a pending-verify marker with old+new version and
a 150s deadline immediately before scheduling the service restart.
- verify_pending_update() runs from main.rs startup. If the marker is
present and within its freshness window, the new binary waits 15s for
nginx + backend to settle, then probes https://127.0.0.1/ every 5s for
up to 90s (self-signed certs accepted).
- On any probe success within the window, the marker is cleared and
nothing else happens.
- On window-exhaust, the new binary:
1. Moves the broken /opt/archipelago/web-ui to web-ui.failed.<ts>
(quarantined, not deleted, so we can post-mortem).
2. Restores web-ui.bak on top of web-ui.
3. Calls rollback_update() to restore the previous binary.
4. Updates state.current_version to reflect the rollback.
5. systemctl --no-block restart archipelago so the OLD binary boots.
- Markers older than 10 minutes are treated as stale and cleared without
probing, so a crashed-during-startup marker from weeks ago cannot
spontaneously roll back a healthy node on a later reboot.
- rollback_update() binary copy now goes through host_sudo instead of
tokio::fs::copy, so it escapes the service's ProtectSystem=strict
mount namespace. Without this, the rollback silently failed with
EROFS on /usr/local/bin and orphaned the rollback - the exact
opposite of what auto-rollback is for.
Tests: 4 new unit tests in update::tests covering marker round-trip,
absent-marker noop, no-panic on verify_pending_update with nothing to
verify, and an invariant assert that the 90s probe window stays below
the 600s stale threshold. All passing.
Side fix: scripts/create-release-manifest.sh was dying with exit 141
(SIGPIPE from tar tvzf pipe head pipe awk) under set -euo pipefail.
Replaced with a single awk NR==1 that doesn't short-circuit the upstream
pipe, so the release-build flow is idempotent again.
2026-04-22 16:14:35 -04:00
|
|
|
let previous_version = {
|
|
|
|
|
let state = load_state(data_dir).await?;
|
|
|
|
|
state.current_version.clone()
|
|
|
|
|
};
|
2026-03-11 10:57:33 +00:00
|
|
|
let mut state = load_state(data_dir).await?;
|
release(v1.7.41-alpha): post-OTA auto-rollback so a bad release cannot strand the fleet
Closes failure mode FM5 from docs/bulletproof-containers.md: the v1.7.38 +
v1.7.39 rollouts left every affected node on an unreachable UI (nginx 500)
with no recovery path short of SSH. This release adds a self-check
guardrail to the update flow.
What changed:
- apply_update() writes a pending-verify marker with old+new version and
a 150s deadline immediately before scheduling the service restart.
- verify_pending_update() runs from main.rs startup. If the marker is
present and within its freshness window, the new binary waits 15s for
nginx + backend to settle, then probes https://127.0.0.1/ every 5s for
up to 90s (self-signed certs accepted).
- On any probe success within the window, the marker is cleared and
nothing else happens.
- On window-exhaust, the new binary:
1. Moves the broken /opt/archipelago/web-ui to web-ui.failed.<ts>
(quarantined, not deleted, so we can post-mortem).
2. Restores web-ui.bak on top of web-ui.
3. Calls rollback_update() to restore the previous binary.
4. Updates state.current_version to reflect the rollback.
5. systemctl --no-block restart archipelago so the OLD binary boots.
- Markers older than 10 minutes are treated as stale and cleared without
probing, so a crashed-during-startup marker from weeks ago cannot
spontaneously roll back a healthy node on a later reboot.
- rollback_update() binary copy now goes through host_sudo instead of
tokio::fs::copy, so it escapes the service's ProtectSystem=strict
mount namespace. Without this, the rollback silently failed with
EROFS on /usr/local/bin and orphaned the rollback - the exact
opposite of what auto-rollback is for.
Tests: 4 new unit tests in update::tests covering marker round-trip,
absent-marker noop, no-panic on verify_pending_update with nothing to
verify, and an invariant assert that the 90s probe window stays below
the 600s stale threshold. All passing.
Side fix: scripts/create-release-manifest.sh was dying with exit 141
(SIGPIPE from tar tvzf pipe head pipe awk) under set -euo pipefail.
Replaced with a single awk NR==1 that doesn't short-circuit the upstream
pipe, so the release-build flow is idempotent again.
2026-04-22 16:14:35 -04:00
|
|
|
let new_version = if let Some(manifest) = &state.available_update {
|
2026-03-11 10:57:33 +00:00
|
|
|
state.current_version = manifest.version.clone();
|
release(v1.7.41-alpha): post-OTA auto-rollback so a bad release cannot strand the fleet
Closes failure mode FM5 from docs/bulletproof-containers.md: the v1.7.38 +
v1.7.39 rollouts left every affected node on an unreachable UI (nginx 500)
with no recovery path short of SSH. This release adds a self-check
guardrail to the update flow.
What changed:
- apply_update() writes a pending-verify marker with old+new version and
a 150s deadline immediately before scheduling the service restart.
- verify_pending_update() runs from main.rs startup. If the marker is
present and within its freshness window, the new binary waits 15s for
nginx + backend to settle, then probes https://127.0.0.1/ every 5s for
up to 90s (self-signed certs accepted).
- On any probe success within the window, the marker is cleared and
nothing else happens.
- On window-exhaust, the new binary:
1. Moves the broken /opt/archipelago/web-ui to web-ui.failed.<ts>
(quarantined, not deleted, so we can post-mortem).
2. Restores web-ui.bak on top of web-ui.
3. Calls rollback_update() to restore the previous binary.
4. Updates state.current_version to reflect the rollback.
5. systemctl --no-block restart archipelago so the OLD binary boots.
- Markers older than 10 minutes are treated as stale and cleared without
probing, so a crashed-during-startup marker from weeks ago cannot
spontaneously roll back a healthy node on a later reboot.
- rollback_update() binary copy now goes through host_sudo instead of
tokio::fs::copy, so it escapes the service's ProtectSystem=strict
mount namespace. Without this, the rollback silently failed with
EROFS on /usr/local/bin and orphaned the rollback - the exact
opposite of what auto-rollback is for.
Tests: 4 new unit tests in update::tests covering marker round-trip,
absent-marker noop, no-panic on verify_pending_update with nothing to
verify, and an invariant assert that the 90s probe window stays below
the 600s stale threshold. All passing.
Side fix: scripts/create-release-manifest.sh was dying with exit 141
(SIGPIPE from tar tvzf pipe head pipe awk) under set -euo pipefail.
Replaced with a single awk NR==1 that doesn't short-circuit the upstream
pipe, so the release-build flow is idempotent again.
2026-04-22 16:14:35 -04:00
|
|
|
manifest.version.clone()
|
|
|
|
|
} else {
|
|
|
|
|
state.current_version.clone()
|
|
|
|
|
};
|
2026-03-11 10:57:33 +00:00
|
|
|
state.available_update = None;
|
|
|
|
|
state.update_in_progress = false;
|
|
|
|
|
state.rollback_available = true;
|
|
|
|
|
save_state(data_dir, &state).await?;
|
|
|
|
|
|
release(v1.7.41-alpha): post-OTA auto-rollback so a bad release cannot strand the fleet
Closes failure mode FM5 from docs/bulletproof-containers.md: the v1.7.38 +
v1.7.39 rollouts left every affected node on an unreachable UI (nginx 500)
with no recovery path short of SSH. This release adds a self-check
guardrail to the update flow.
What changed:
- apply_update() writes a pending-verify marker with old+new version and
a 150s deadline immediately before scheduling the service restart.
- verify_pending_update() runs from main.rs startup. If the marker is
present and within its freshness window, the new binary waits 15s for
nginx + backend to settle, then probes https://127.0.0.1/ every 5s for
up to 90s (self-signed certs accepted).
- On any probe success within the window, the marker is cleared and
nothing else happens.
- On window-exhaust, the new binary:
1. Moves the broken /opt/archipelago/web-ui to web-ui.failed.<ts>
(quarantined, not deleted, so we can post-mortem).
2. Restores web-ui.bak on top of web-ui.
3. Calls rollback_update() to restore the previous binary.
4. Updates state.current_version to reflect the rollback.
5. systemctl --no-block restart archipelago so the OLD binary boots.
- Markers older than 10 minutes are treated as stale and cleared without
probing, so a crashed-during-startup marker from weeks ago cannot
spontaneously roll back a healthy node on a later reboot.
- rollback_update() binary copy now goes through host_sudo instead of
tokio::fs::copy, so it escapes the service's ProtectSystem=strict
mount namespace. Without this, the rollback silently failed with
EROFS on /usr/local/bin and orphaned the rollback - the exact
opposite of what auto-rollback is for.
Tests: 4 new unit tests in update::tests covering marker round-trip,
absent-marker noop, no-panic on verify_pending_update with nothing to
verify, and an invariant assert that the 90s probe window stays below
the 600s stale threshold. All passing.
Side fix: scripts/create-release-manifest.sh was dying with exit 141
(SIGPIPE from tar tvzf pipe head pipe awk) under set -euo pipefail.
Replaced with a single awk NR==1 that doesn't short-circuit the upstream
pipe, so the release-build flow is idempotent again.
2026-04-22 16:14:35 -04:00
|
|
|
// Write the post-OTA verification marker BEFORE we schedule the
|
|
|
|
|
// restart. The new binary will read it on startup, probe the
|
|
|
|
|
// frontend, and auto-rollback if nginx is serving 5xx. Covers the
|
|
|
|
|
// class of failure where "apply succeeds, restart succeeds, but
|
|
|
|
|
// the UI is dead" (v1.7.38/39 tarball-perms bug). Best-effort —
|
|
|
|
|
// a failed marker write shouldn't abort the apply.
|
|
|
|
|
let marker = PendingVerification {
|
|
|
|
|
applied_at: chrono::Utc::now().to_rfc3339(),
|
|
|
|
|
new_version,
|
|
|
|
|
previous_version,
|
|
|
|
|
deadline_ts: chrono::Utc::now().timestamp()
|
|
|
|
|
+ PENDING_VERIFY_WINDOW_SECS as i64
|
|
|
|
|
+ 60,
|
|
|
|
|
};
|
|
|
|
|
if let Err(e) = write_pending_verification(data_dir, &marker).await {
|
|
|
|
|
tracing::warn!(error = %e, "Failed to write post-OTA verify marker — rollback disabled for this OTA");
|
|
|
|
|
} else {
|
|
|
|
|
info!("Post-OTA verify marker written; new binary will probe on boot");
|
|
|
|
|
}
|
|
|
|
|
|
2026-03-11 10:57:33 +00:00
|
|
|
// Clean staging
|
|
|
|
|
let _ = fs::remove_dir_all(&staging_dir).await;
|
|
|
|
|
|
2026-04-20 11:25:10 -04:00
|
|
|
info!("Update applied — scheduling service restart in 2s so the RPC reply lands first");
|
|
|
|
|
|
|
|
|
|
// Restart asynchronously so the JSON-RPC response actually reaches the
|
|
|
|
|
// UI before systemd kills us. --no-block makes sure systemctl doesn't
|
|
|
|
|
// try to wait for the current service (us) to exit cleanly before
|
|
|
|
|
// starting the new process — it would deadlock otherwise.
|
|
|
|
|
tokio::spawn(async {
|
|
|
|
|
tokio::time::sleep(std::time::Duration::from_secs(2)).await;
|
2026-04-20 13:46:03 -04:00
|
|
|
// systemctl talks to PID 1 over D-Bus — doesn't need the host
|
|
|
|
|
// mount namespace, but routing through host_sudo keeps the
|
|
|
|
|
// apply flow's sudo calls uniform.
|
|
|
|
|
let _ = host_sudo(&["systemctl", "--no-block", "restart", "archipelago"]).await;
|
2026-04-20 11:25:10 -04:00
|
|
|
});
|
|
|
|
|
|
2026-03-11 10:57:33 +00:00
|
|
|
Ok(())
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Rollback to the previous version from backup.
|
|
|
|
|
pub async fn rollback_update(data_dir: &Path) -> Result<()> {
|
|
|
|
|
let backup_dir = data_dir.join("update-backup");
|
|
|
|
|
if !backup_dir.exists() {
|
|
|
|
|
anyhow::bail!("No rollback backup available");
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
let backup_binary = backup_dir.join("archipelago");
|
|
|
|
|
if backup_binary.exists() {
|
release(v1.7.41-alpha): post-OTA auto-rollback so a bad release cannot strand the fleet
Closes failure mode FM5 from docs/bulletproof-containers.md: the v1.7.38 +
v1.7.39 rollouts left every affected node on an unreachable UI (nginx 500)
with no recovery path short of SSH. This release adds a self-check
guardrail to the update flow.
What changed:
- apply_update() writes a pending-verify marker with old+new version and
a 150s deadline immediately before scheduling the service restart.
- verify_pending_update() runs from main.rs startup. If the marker is
present and within its freshness window, the new binary waits 15s for
nginx + backend to settle, then probes https://127.0.0.1/ every 5s for
up to 90s (self-signed certs accepted).
- On any probe success within the window, the marker is cleared and
nothing else happens.
- On window-exhaust, the new binary:
1. Moves the broken /opt/archipelago/web-ui to web-ui.failed.<ts>
(quarantined, not deleted, so we can post-mortem).
2. Restores web-ui.bak on top of web-ui.
3. Calls rollback_update() to restore the previous binary.
4. Updates state.current_version to reflect the rollback.
5. systemctl --no-block restart archipelago so the OLD binary boots.
- Markers older than 10 minutes are treated as stale and cleared without
probing, so a crashed-during-startup marker from weeks ago cannot
spontaneously roll back a healthy node on a later reboot.
- rollback_update() binary copy now goes through host_sudo instead of
tokio::fs::copy, so it escapes the service's ProtectSystem=strict
mount namespace. Without this, the rollback silently failed with
EROFS on /usr/local/bin and orphaned the rollback - the exact
opposite of what auto-rollback is for.
Tests: 4 new unit tests in update::tests covering marker round-trip,
absent-marker noop, no-panic on verify_pending_update with nothing to
verify, and an invariant assert that the 90s probe window stays below
the 600s stale threshold. All passing.
Side fix: scripts/create-release-manifest.sh was dying with exit 141
(SIGPIPE from tar tvzf pipe head pipe awk) under set -euo pipefail.
Replaced with a single awk NR==1 that doesn't short-circuit the upstream
pipe, so the release-build flow is idempotent again.
2026-04-22 16:14:35 -04:00
|
|
|
// Use host_sudo + mv so we escape the archipelago service's
|
|
|
|
|
// ProtectSystem=strict mount namespace. A plain fs::copy or
|
|
|
|
|
// `sudo cp` from inside the service hits EROFS on /usr/local/bin,
|
|
|
|
|
// which would silently orphan the rollback — exactly the
|
|
|
|
|
// opposite of what auto-rollback is for. Pattern matches
|
|
|
|
|
// apply_update()'s binary swap above.
|
|
|
|
|
let backup_str = backup_binary.to_string_lossy().to_string();
|
|
|
|
|
let _ = host_sudo(&["chmod", "0755", &backup_str]).await;
|
|
|
|
|
let _ = host_sudo(&["chown", "root:root", &backup_str]).await;
|
|
|
|
|
let status = host_sudo(&["cp", &backup_str, "/usr/local/bin/archipelago"])
|
2026-03-11 10:57:33 +00:00
|
|
|
.await
|
release(v1.7.41-alpha): post-OTA auto-rollback so a bad release cannot strand the fleet
Closes failure mode FM5 from docs/bulletproof-containers.md: the v1.7.38 +
v1.7.39 rollouts left every affected node on an unreachable UI (nginx 500)
with no recovery path short of SSH. This release adds a self-check
guardrail to the update flow.
What changed:
- apply_update() writes a pending-verify marker with old+new version and
a 150s deadline immediately before scheduling the service restart.
- verify_pending_update() runs from main.rs startup. If the marker is
present and within its freshness window, the new binary waits 15s for
nginx + backend to settle, then probes https://127.0.0.1/ every 5s for
up to 90s (self-signed certs accepted).
- On any probe success within the window, the marker is cleared and
nothing else happens.
- On window-exhaust, the new binary:
1. Moves the broken /opt/archipelago/web-ui to web-ui.failed.<ts>
(quarantined, not deleted, so we can post-mortem).
2. Restores web-ui.bak on top of web-ui.
3. Calls rollback_update() to restore the previous binary.
4. Updates state.current_version to reflect the rollback.
5. systemctl --no-block restart archipelago so the OLD binary boots.
- Markers older than 10 minutes are treated as stale and cleared without
probing, so a crashed-during-startup marker from weeks ago cannot
spontaneously roll back a healthy node on a later reboot.
- rollback_update() binary copy now goes through host_sudo instead of
tokio::fs::copy, so it escapes the service's ProtectSystem=strict
mount namespace. Without this, the rollback silently failed with
EROFS on /usr/local/bin and orphaned the rollback - the exact
opposite of what auto-rollback is for.
Tests: 4 new unit tests in update::tests covering marker round-trip,
absent-marker noop, no-panic on verify_pending_update with nothing to
verify, and an invariant assert that the 90s probe window stays below
the 600s stale threshold. All passing.
Side fix: scripts/create-release-manifest.sh was dying with exit 141
(SIGPIPE from tar tvzf pipe head pipe awk) under set -euo pipefail.
Replaced with a single awk NR==1 that doesn't short-circuit the upstream
pipe, so the release-build flow is idempotent again.
2026-04-22 16:14:35 -04:00
|
|
|
.context("Failed to restore backup binary via host_sudo")?;
|
|
|
|
|
if !status.success() {
|
|
|
|
|
anyhow::bail!(
|
|
|
|
|
"cp backup binary into /usr/local/bin failed (exit {:?})",
|
|
|
|
|
status.code()
|
|
|
|
|
);
|
|
|
|
|
}
|
2026-03-11 10:57:33 +00:00
|
|
|
info!("Binary rolled back to previous version");
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
let mut state = load_state(data_dir).await?;
|
|
|
|
|
state.rollback_available = false;
|
|
|
|
|
save_state(data_dir, &state).await?;
|
|
|
|
|
|
|
|
|
|
let _ = fs::remove_dir_all(&backup_dir).await;
|
|
|
|
|
|
|
|
|
|
info!("Rollback complete. Restart service to take effect.");
|
|
|
|
|
Ok(())
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
#[derive(Debug, Serialize, Deserialize)]
|
|
|
|
|
pub struct DownloadProgress {
|
|
|
|
|
pub total_bytes: u64,
|
|
|
|
|
pub downloaded_bytes: u64,
|
|
|
|
|
pub components_downloaded: usize,
|
|
|
|
|
pub staging_dir: String,
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Set the update schedule preference.
|
|
|
|
|
pub async fn set_schedule(data_dir: &Path, schedule: UpdateSchedule) -> Result<()> {
|
|
|
|
|
let mut state = load_state(data_dir).await?;
|
|
|
|
|
state.schedule = schedule;
|
|
|
|
|
save_state(data_dir, &state).await?;
|
|
|
|
|
info!(schedule = ?schedule, "Update schedule changed");
|
|
|
|
|
Ok(())
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Get the current schedule.
|
|
|
|
|
pub async fn get_schedule(data_dir: &Path) -> Result<UpdateSchedule> {
|
|
|
|
|
let state = load_state(data_dir).await?;
|
|
|
|
|
Ok(state.schedule)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Background update scheduler. Runs in a loop, checking/applying based on schedule.
|
|
|
|
|
/// Call this once at startup via `tokio::spawn`.
|
|
|
|
|
pub async fn run_update_scheduler(data_dir: std::path::PathBuf) {
|
|
|
|
|
use tokio::time::{interval, Duration};
|
|
|
|
|
|
|
|
|
|
// Check every hour; act based on schedule setting
|
|
|
|
|
let mut tick = interval(Duration::from_secs(3600));
|
|
|
|
|
|
|
|
|
|
loop {
|
|
|
|
|
tick.tick().await;
|
|
|
|
|
|
|
|
|
|
let state = match load_state(&data_dir).await {
|
|
|
|
|
Ok(s) => s,
|
|
|
|
|
Err(e) => {
|
|
|
|
|
debug!("Update scheduler: failed to load state: {}", e);
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
match state.schedule {
|
|
|
|
|
UpdateSchedule::Manual => {
|
|
|
|
|
debug!("Update scheduler: manual mode, skipping");
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
UpdateSchedule::DailyCheck => {
|
|
|
|
|
// Only check once per day
|
|
|
|
|
if let Some(ref last) = state.last_check {
|
|
|
|
|
if let Ok(last_time) = chrono::DateTime::parse_from_rfc3339(last) {
|
|
|
|
|
let elapsed = chrono::Utc::now() - last_time.with_timezone(&chrono::Utc);
|
|
|
|
|
if elapsed.num_hours() < 24 {
|
|
|
|
|
debug!("Update scheduler: checked recently, skipping");
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
info!("Update scheduler: running daily check");
|
|
|
|
|
if let Err(e) = check_for_updates(&data_dir).await {
|
|
|
|
|
debug!("Update scheduler: check failed: {}", e);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
UpdateSchedule::AutoApply => {
|
|
|
|
|
// Auto-apply: check, download, and apply during 3 AM window
|
|
|
|
|
let hour = chrono::Local::now().hour();
|
|
|
|
|
if hour != 3 {
|
|
|
|
|
// Still do daily check outside the window
|
|
|
|
|
if let Some(ref last) = state.last_check {
|
|
|
|
|
if let Ok(last_time) = chrono::DateTime::parse_from_rfc3339(last) {
|
|
|
|
|
let elapsed =
|
|
|
|
|
chrono::Utc::now() - last_time.with_timezone(&chrono::Utc);
|
|
|
|
|
if elapsed.num_hours() < 24 {
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
info!("Update scheduler: auto-apply check (outside window)");
|
|
|
|
|
if let Err(e) = check_for_updates(&data_dir).await {
|
|
|
|
|
debug!("Update scheduler: check failed: {}", e);
|
|
|
|
|
}
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// 3 AM — check, download, and apply
|
|
|
|
|
info!("Update scheduler: 3 AM auto-apply window");
|
|
|
|
|
match check_for_updates(&data_dir).await {
|
|
|
|
|
Ok(s) if s.available_update.is_some() => {
|
|
|
|
|
info!("Update scheduler: downloading update");
|
|
|
|
|
if let Err(e) = download_update(&data_dir).await {
|
|
|
|
|
debug!("Update scheduler: download failed: {}", e);
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
info!("Update scheduler: applying update");
|
|
|
|
|
if let Err(e) = apply_update(&data_dir).await {
|
|
|
|
|
debug!("Update scheduler: apply failed: {}", e);
|
|
|
|
|
continue;
|
|
|
|
|
}
|
2026-04-21 04:33:11 -04:00
|
|
|
info!("Update scheduler: update applied, restart scheduled by apply_update");
|
|
|
|
|
// apply_update has already spawned a 2s-delayed
|
|
|
|
|
// `systemctl restart archipelago`. Don't call
|
|
|
|
|
// std::process::exit here — that kills the runtime
|
|
|
|
|
// before the spawned restart task runs, and since
|
|
|
|
|
// the unit is Restart=on-failure a clean exit(0)
|
|
|
|
|
// leaves the service dead. Fall through; the
|
|
|
|
|
// scheduled restart will bring us back cleanly.
|
|
|
|
|
return;
|
2026-03-11 10:57:33 +00:00
|
|
|
}
|
|
|
|
|
Ok(_) => {
|
|
|
|
|
debug!("Update scheduler: no update available");
|
|
|
|
|
}
|
|
|
|
|
Err(e) => {
|
|
|
|
|
debug!("Update scheduler: check failed: {}", e);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
#[cfg(test)]
|
|
|
|
|
mod tests {
|
|
|
|
|
use super::*;
|
|
|
|
|
|
|
|
|
|
#[test]
|
|
|
|
|
fn test_update_schedule_default_is_daily_check() {
|
|
|
|
|
let schedule = UpdateSchedule::default();
|
|
|
|
|
assert_eq!(schedule, UpdateSchedule::DailyCheck);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
#[test]
|
2026-04-21 10:09:28 -04:00
|
|
|
fn test_manifest_origin_parses_https() {
|
|
|
|
|
assert_eq!(
|
|
|
|
|
manifest_origin("https://git.tx1138.com/lfg2025/archy/raw/branch/main/releases/manifest.json"),
|
|
|
|
|
Some("https://git.tx1138.com".to_string())
|
|
|
|
|
);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
#[test]
|
|
|
|
|
fn test_manifest_origin_parses_http_with_port() {
|
|
|
|
|
assert_eq!(
|
|
|
|
|
manifest_origin("http://23.182.128.160:3000/lfg2025/archy/raw/branch/main/releases/manifest.json"),
|
|
|
|
|
Some("http://23.182.128.160:3000".to_string())
|
|
|
|
|
);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
#[test]
|
|
|
|
|
fn test_manifest_origin_rejects_garbage() {
|
|
|
|
|
assert_eq!(manifest_origin("not a url"), None);
|
|
|
|
|
assert_eq!(manifest_origin("ftp://git.tx1138.com/x"), None);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
#[test]
|
|
|
|
|
fn test_rewrite_manifest_origins_swaps_all_components() {
|
|
|
|
|
let mut manifest = UpdateManifest {
|
|
|
|
|
version: "1.7.26-alpha".into(),
|
|
|
|
|
release_date: "2026-04-21".into(),
|
|
|
|
|
changelog: vec![],
|
|
|
|
|
components: vec![
|
|
|
|
|
ComponentUpdate {
|
|
|
|
|
name: "archipelago".into(),
|
|
|
|
|
current_version: "1.7.25-alpha".into(),
|
|
|
|
|
new_version: "1.7.26-alpha".into(),
|
|
|
|
|
download_url: "https://git.tx1138.com/lfg2025/archy/raw/branch/main/releases/v1.7.26-alpha/archipelago".into(),
|
|
|
|
|
sha256: "x".into(),
|
|
|
|
|
size_bytes: 1,
|
|
|
|
|
},
|
|
|
|
|
ComponentUpdate {
|
|
|
|
|
name: "frontend".into(),
|
|
|
|
|
current_version: "1.7.25-alpha".into(),
|
|
|
|
|
new_version: "1.7.26-alpha".into(),
|
|
|
|
|
download_url: "https://git.tx1138.com/lfg2025/archy/raw/branch/main/releases/v1.7.26-alpha/frontend.tar.gz".into(),
|
|
|
|
|
sha256: "y".into(),
|
|
|
|
|
size_bytes: 2,
|
|
|
|
|
},
|
|
|
|
|
],
|
|
|
|
|
};
|
|
|
|
|
rewrite_manifest_origins(&mut manifest, "http://23.182.128.160:3000/lfg2025/archy/raw/branch/main/releases/manifest.json");
|
|
|
|
|
assert_eq!(
|
|
|
|
|
manifest.components[0].download_url,
|
|
|
|
|
"http://23.182.128.160:3000/lfg2025/archy/raw/branch/main/releases/v1.7.26-alpha/archipelago"
|
|
|
|
|
);
|
|
|
|
|
assert_eq!(
|
|
|
|
|
manifest.components[1].download_url,
|
|
|
|
|
"http://23.182.128.160:3000/lfg2025/archy/raw/branch/main/releases/v1.7.26-alpha/frontend.tar.gz"
|
|
|
|
|
);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
#[tokio::test]
|
|
|
|
|
async fn test_load_mirrors_returns_defaults_when_absent() {
|
|
|
|
|
let dir = tempfile::tempdir().unwrap();
|
|
|
|
|
let list = load_mirrors(dir.path()).await.unwrap();
|
2026-04-23 08:22:32 -04:00
|
|
|
assert_eq!(list.len(), 2);
|
|
|
|
|
assert!(list[0].url.contains("146.59.87.168"));
|
2026-04-22 03:26:09 -04:00
|
|
|
assert!(list[1].url.contains("git.tx1138.com"));
|
2026-04-21 10:09:28 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
#[tokio::test]
|
|
|
|
|
async fn test_save_and_load_mirrors_roundtrip() {
|
|
|
|
|
let dir = tempfile::tempdir().unwrap();
|
|
|
|
|
let list = vec![UpdateMirror {
|
|
|
|
|
url: "https://example.com/m.json".into(),
|
|
|
|
|
label: "Example".into(),
|
|
|
|
|
}];
|
|
|
|
|
save_mirrors(dir.path(), &list).await.unwrap();
|
|
|
|
|
let back = load_mirrors(dir.path()).await.unwrap();
|
|
|
|
|
assert_eq!(back, list);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
#[test]
|
2026-03-11 10:57:33 +00:00
|
|
|
fn test_update_state_default_values() {
|
|
|
|
|
let state = UpdateState::default();
|
|
|
|
|
assert_eq!(state.current_version, env!("CARGO_PKG_VERSION"));
|
|
|
|
|
assert!(state.last_check.is_none());
|
|
|
|
|
assert!(state.available_update.is_none());
|
|
|
|
|
assert!(!state.update_in_progress);
|
|
|
|
|
assert!(!state.rollback_available);
|
|
|
|
|
assert_eq!(state.schedule, UpdateSchedule::DailyCheck);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
#[test]
|
|
|
|
|
fn test_update_state_serialization_roundtrip() {
|
|
|
|
|
let state = UpdateState {
|
|
|
|
|
current_version: "0.2.0".to_string(),
|
|
|
|
|
last_check: Some("2025-01-01T00:00:00Z".to_string()),
|
|
|
|
|
available_update: None,
|
|
|
|
|
update_in_progress: false,
|
|
|
|
|
rollback_available: true,
|
|
|
|
|
schedule: UpdateSchedule::AutoApply,
|
2026-04-21 13:05:42 -04:00
|
|
|
manifest_mirror: None,
|
2026-03-11 10:57:33 +00:00
|
|
|
};
|
|
|
|
|
let json = serde_json::to_string(&state).unwrap();
|
|
|
|
|
let deserialized: UpdateState = serde_json::from_str(&json).unwrap();
|
|
|
|
|
assert_eq!(deserialized.current_version, "0.2.0");
|
|
|
|
|
assert!(deserialized.rollback_available);
|
|
|
|
|
assert_eq!(deserialized.schedule, UpdateSchedule::AutoApply);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
#[test]
|
|
|
|
|
fn test_update_schedule_serde_rename() {
|
|
|
|
|
let json = serde_json::to_string(&UpdateSchedule::DailyCheck).unwrap();
|
|
|
|
|
assert_eq!(json, "\"daily_check\"");
|
|
|
|
|
let json = serde_json::to_string(&UpdateSchedule::Manual).unwrap();
|
|
|
|
|
assert_eq!(json, "\"manual\"");
|
|
|
|
|
let json = serde_json::to_string(&UpdateSchedule::AutoApply).unwrap();
|
|
|
|
|
assert_eq!(json, "\"auto_apply\"");
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
#[test]
|
|
|
|
|
fn test_update_state_schedule_defaults_on_missing_field() {
|
|
|
|
|
// When schedule field is missing from JSON, it should default to DailyCheck
|
|
|
|
|
let json = r#"{
|
|
|
|
|
"current_version": "0.1.0",
|
|
|
|
|
"last_check": null,
|
|
|
|
|
"available_update": null,
|
|
|
|
|
"update_in_progress": false,
|
|
|
|
|
"rollback_available": false
|
|
|
|
|
}"#;
|
|
|
|
|
let state: UpdateState = serde_json::from_str(json).unwrap();
|
|
|
|
|
assert_eq!(state.schedule, UpdateSchedule::DailyCheck);
|
|
|
|
|
}
|
|
|
|
|
|
2026-04-21 04:04:20 -04:00
|
|
|
#[test]
|
|
|
|
|
fn test_parse_version_triple() {
|
|
|
|
|
assert_eq!(parse_version_triple("1.7.18"), Some((1, 7, 18)));
|
|
|
|
|
assert_eq!(parse_version_triple("1.7.18-alpha"), Some((1, 7, 18)));
|
|
|
|
|
assert_eq!(parse_version_triple("0.0.1"), Some((0, 0, 1)));
|
|
|
|
|
assert_eq!(parse_version_triple("garbage"), None);
|
|
|
|
|
assert_eq!(parse_version_triple("1.2"), None);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
#[test]
|
|
|
|
|
fn test_is_newer() {
|
|
|
|
|
assert!(is_newer("1.7.19-alpha", "1.7.18-alpha"));
|
|
|
|
|
assert!(is_newer("1.8.0-alpha", "1.7.99-alpha"));
|
|
|
|
|
assert!(is_newer("1.7.10-alpha", "1.7.9-alpha")); // numeric, not lexical
|
|
|
|
|
assert!(!is_newer("1.7.18-alpha", "1.7.18-alpha"));
|
|
|
|
|
assert!(!is_newer("1.7.17-alpha", "1.7.18-alpha")); // would-be downgrade
|
|
|
|
|
assert!(!is_newer("1.7.9-alpha", "1.7.10-alpha"));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
#[tokio::test]
|
|
|
|
|
async fn test_load_state_clears_stale_available_on_version_bump() {
|
|
|
|
|
// Simulates a sideload: state file on disk says we're on
|
|
|
|
|
// 1.7.16-alpha with 1.7.17-alpha staged as the pending update,
|
|
|
|
|
// but the running binary is 1.7.18-alpha (skipped a version).
|
|
|
|
|
// load_state must drop the stale available_update so the UI
|
|
|
|
|
// doesn't offer a downgrade.
|
|
|
|
|
let dir = tempfile::tempdir().unwrap();
|
|
|
|
|
let stale = UpdateState {
|
|
|
|
|
current_version: "1.7.16-alpha".to_string(),
|
|
|
|
|
available_update: Some(UpdateManifest {
|
|
|
|
|
version: "1.7.17-alpha".to_string(),
|
|
|
|
|
release_date: "2026-04-20".to_string(),
|
|
|
|
|
changelog: vec![],
|
|
|
|
|
components: vec![],
|
|
|
|
|
}),
|
|
|
|
|
..UpdateState::default()
|
|
|
|
|
};
|
|
|
|
|
save_state(dir.path(), &stale).await.unwrap();
|
|
|
|
|
let loaded = load_state(dir.path()).await.unwrap();
|
|
|
|
|
assert_eq!(loaded.current_version, env!("CARGO_PKG_VERSION"));
|
|
|
|
|
assert!(
|
|
|
|
|
loaded.available_update.is_none(),
|
|
|
|
|
"stale available_update must be cleared after version bump"
|
|
|
|
|
);
|
|
|
|
|
}
|
|
|
|
|
|
2026-03-11 10:57:33 +00:00
|
|
|
#[tokio::test]
|
|
|
|
|
async fn test_load_state_creates_default_when_missing() {
|
|
|
|
|
let dir = tempfile::tempdir().unwrap();
|
|
|
|
|
let state = load_state(dir.path()).await.unwrap();
|
|
|
|
|
assert_eq!(state.current_version, env!("CARGO_PKG_VERSION"));
|
|
|
|
|
assert!(!state.update_in_progress);
|
|
|
|
|
// File should now exist after load created the default
|
|
|
|
|
assert!(dir.path().join(UPDATE_STATE_FILE).exists());
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
#[tokio::test]
|
|
|
|
|
async fn test_save_and_load_state_roundtrip() {
|
|
|
|
|
let dir = tempfile::tempdir().unwrap();
|
|
|
|
|
let state = UpdateState {
|
|
|
|
|
current_version: "1.0.0".to_string(),
|
|
|
|
|
last_check: Some("2025-06-15T12:00:00Z".to_string()),
|
|
|
|
|
available_update: Some(UpdateManifest {
|
|
|
|
|
version: "1.1.0".to_string(),
|
|
|
|
|
release_date: "2025-06-20".to_string(),
|
|
|
|
|
changelog: vec!["Fix bugs".to_string(), "New feature".to_string()],
|
|
|
|
|
components: vec![ComponentUpdate {
|
|
|
|
|
name: "archipelago".to_string(),
|
|
|
|
|
current_version: "1.0.0".to_string(),
|
|
|
|
|
new_version: "1.1.0".to_string(),
|
|
|
|
|
download_url: "https://example.com/binary".to_string(),
|
|
|
|
|
sha256: "abc123".to_string(),
|
|
|
|
|
size_bytes: 5000,
|
|
|
|
|
}],
|
|
|
|
|
}),
|
|
|
|
|
update_in_progress: true,
|
|
|
|
|
rollback_available: false,
|
|
|
|
|
schedule: UpdateSchedule::Manual,
|
2026-04-21 13:05:42 -04:00
|
|
|
manifest_mirror: Some(
|
|
|
|
|
"https://git.tx1138.com/lfg2025/archy/raw/branch/main/releases/manifest.json"
|
|
|
|
|
.to_string(),
|
|
|
|
|
),
|
2026-03-11 10:57:33 +00:00
|
|
|
};
|
|
|
|
|
save_state(dir.path(), &state).await.unwrap();
|
|
|
|
|
let loaded = load_state(dir.path()).await.unwrap();
|
2026-04-21 04:04:20 -04:00
|
|
|
// load_state rewrites current_version to match the running
|
|
|
|
|
// binary (sideload self-heal), so don't assert on the saved
|
|
|
|
|
// value. The migration also clears available_update when the
|
|
|
|
|
// version changes — check the other fields survived.
|
|
|
|
|
assert_eq!(loaded.current_version, env!("CARGO_PKG_VERSION"));
|
2026-03-11 10:57:33 +00:00
|
|
|
assert!(loaded.update_in_progress);
|
|
|
|
|
assert_eq!(loaded.schedule, UpdateSchedule::Manual);
|
2026-04-21 04:04:20 -04:00
|
|
|
assert!(loaded.available_update.is_none());
|
2026-03-11 10:57:33 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
#[tokio::test]
|
|
|
|
|
async fn test_dismiss_update_clears_available() {
|
|
|
|
|
let dir = tempfile::tempdir().unwrap();
|
|
|
|
|
let state = UpdateState {
|
|
|
|
|
available_update: Some(UpdateManifest {
|
|
|
|
|
version: "2.0.0".to_string(),
|
|
|
|
|
release_date: "2025-07-01".to_string(),
|
|
|
|
|
changelog: vec![],
|
|
|
|
|
components: vec![],
|
|
|
|
|
}),
|
|
|
|
|
..UpdateState::default()
|
|
|
|
|
};
|
|
|
|
|
save_state(dir.path(), &state).await.unwrap();
|
|
|
|
|
dismiss_update(dir.path()).await.unwrap();
|
|
|
|
|
let loaded = load_state(dir.path()).await.unwrap();
|
|
|
|
|
assert!(loaded.available_update.is_none());
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
#[tokio::test]
|
|
|
|
|
async fn test_set_and_get_schedule() {
|
|
|
|
|
let dir = tempfile::tempdir().unwrap();
|
|
|
|
|
// Initialize state
|
|
|
|
|
let _ = load_state(dir.path()).await.unwrap();
|
|
|
|
|
|
chore(ci): rustfmt + clippy clean-up to unblock the Rust CI job
The .github/workflows/ci.yml Rust job runs cargo fmt --check, clippy
with -D warnings, and tests. All three were failing. This commit:
- Applies rustfmt across the tree (the bulk of the diff — untouched
since the last toolchain bump, so a wide sweep was unavoidable).
- Fixes the correctness-level clippy errors:
container/bitcoin_simulator.rs wildcard-in-or-pattern
container/manifest.rs from_str rename to parse (reserved name)
container/podman_client.rs .get(0) -> .first()
container/runtime.rs manual += collapse
archipelago/src/constants.rs doc-comment → module-doc
api/rpc/package/install.rs stray /// comment above a non-item
container/docker_packages.rs redundant field init
streaming/advertisement.rs missing Metric import in tests
tests/orchestration_tests.rs `vec!` in non-Vec contexts
mesh/listener/dispatch.rs unused store_plain_message import
api/rpc/tor/mod.rs and mesh/steganography.rs: push-after-new → vec!
- Quiets wide legacy surfaces with crate-level allows in main.rs for
stylistic lints (too_many_arguments, type_complexity, doc indent,
enum variant prefix, wildcard-in-or, assertions-on-constants,
drop_non_drop, unused_io_amount, ptr_arg) — these fired in dozens
of places with no correctness payoff and have been churning every
toolchain bump.
- Tags intentional-dead-code helpers: wallet/ and streaming/ modules
are WIP, mesh::send_chunked_payload and DM_V1_MARKER are kept for
rollback compatibility, vpn::get_nostr_vpn_status is surface-area
for a not-yet-landed RPC.
cargo fmt --check, cargo clippy --all-targets --all-features
-- -D warnings, and cargo test --all-features now all pass locally.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-18 17:23:46 -04:00
|
|
|
set_schedule(dir.path(), UpdateSchedule::AutoApply)
|
|
|
|
|
.await
|
|
|
|
|
.unwrap();
|
2026-03-11 10:57:33 +00:00
|
|
|
let schedule = get_schedule(dir.path()).await.unwrap();
|
|
|
|
|
assert_eq!(schedule, UpdateSchedule::AutoApply);
|
|
|
|
|
|
chore(ci): rustfmt + clippy clean-up to unblock the Rust CI job
The .github/workflows/ci.yml Rust job runs cargo fmt --check, clippy
with -D warnings, and tests. All three were failing. This commit:
- Applies rustfmt across the tree (the bulk of the diff — untouched
since the last toolchain bump, so a wide sweep was unavoidable).
- Fixes the correctness-level clippy errors:
container/bitcoin_simulator.rs wildcard-in-or-pattern
container/manifest.rs from_str rename to parse (reserved name)
container/podman_client.rs .get(0) -> .first()
container/runtime.rs manual += collapse
archipelago/src/constants.rs doc-comment → module-doc
api/rpc/package/install.rs stray /// comment above a non-item
container/docker_packages.rs redundant field init
streaming/advertisement.rs missing Metric import in tests
tests/orchestration_tests.rs `vec!` in non-Vec contexts
mesh/listener/dispatch.rs unused store_plain_message import
api/rpc/tor/mod.rs and mesh/steganography.rs: push-after-new → vec!
- Quiets wide legacy surfaces with crate-level allows in main.rs for
stylistic lints (too_many_arguments, type_complexity, doc indent,
enum variant prefix, wildcard-in-or, assertions-on-constants,
drop_non_drop, unused_io_amount, ptr_arg) — these fired in dozens
of places with no correctness payoff and have been churning every
toolchain bump.
- Tags intentional-dead-code helpers: wallet/ and streaming/ modules
are WIP, mesh::send_chunked_payload and DM_V1_MARKER are kept for
rollback compatibility, vpn::get_nostr_vpn_status is surface-area
for a not-yet-landed RPC.
cargo fmt --check, cargo clippy --all-targets --all-features
-- -D warnings, and cargo test --all-features now all pass locally.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-18 17:23:46 -04:00
|
|
|
set_schedule(dir.path(), UpdateSchedule::Manual)
|
|
|
|
|
.await
|
|
|
|
|
.unwrap();
|
2026-03-11 10:57:33 +00:00
|
|
|
let schedule = get_schedule(dir.path()).await.unwrap();
|
|
|
|
|
assert_eq!(schedule, UpdateSchedule::Manual);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
#[tokio::test]
|
|
|
|
|
async fn test_get_status_returns_current_state() {
|
|
|
|
|
let dir = tempfile::tempdir().unwrap();
|
|
|
|
|
let state = UpdateState {
|
|
|
|
|
current_version: "3.0.0".to_string(),
|
|
|
|
|
rollback_available: true,
|
|
|
|
|
..UpdateState::default()
|
|
|
|
|
};
|
|
|
|
|
save_state(dir.path(), &state).await.unwrap();
|
|
|
|
|
let status = get_status(dir.path()).await.unwrap();
|
2026-04-21 04:04:20 -04:00
|
|
|
// get_status → load_state, which rewrites current_version to
|
|
|
|
|
// match the running binary (see the sideload-self-heal path).
|
|
|
|
|
assert_eq!(status.current_version, env!("CARGO_PKG_VERSION"));
|
2026-03-11 10:57:33 +00:00
|
|
|
assert!(status.rollback_available);
|
|
|
|
|
}
|
release(v1.7.41-alpha): post-OTA auto-rollback so a bad release cannot strand the fleet
Closes failure mode FM5 from docs/bulletproof-containers.md: the v1.7.38 +
v1.7.39 rollouts left every affected node on an unreachable UI (nginx 500)
with no recovery path short of SSH. This release adds a self-check
guardrail to the update flow.
What changed:
- apply_update() writes a pending-verify marker with old+new version and
a 150s deadline immediately before scheduling the service restart.
- verify_pending_update() runs from main.rs startup. If the marker is
present and within its freshness window, the new binary waits 15s for
nginx + backend to settle, then probes https://127.0.0.1/ every 5s for
up to 90s (self-signed certs accepted).
- On any probe success within the window, the marker is cleared and
nothing else happens.
- On window-exhaust, the new binary:
1. Moves the broken /opt/archipelago/web-ui to web-ui.failed.<ts>
(quarantined, not deleted, so we can post-mortem).
2. Restores web-ui.bak on top of web-ui.
3. Calls rollback_update() to restore the previous binary.
4. Updates state.current_version to reflect the rollback.
5. systemctl --no-block restart archipelago so the OLD binary boots.
- Markers older than 10 minutes are treated as stale and cleared without
probing, so a crashed-during-startup marker from weeks ago cannot
spontaneously roll back a healthy node on a later reboot.
- rollback_update() binary copy now goes through host_sudo instead of
tokio::fs::copy, so it escapes the service's ProtectSystem=strict
mount namespace. Without this, the rollback silently failed with
EROFS on /usr/local/bin and orphaned the rollback - the exact
opposite of what auto-rollback is for.
Tests: 4 new unit tests in update::tests covering marker round-trip,
absent-marker noop, no-panic on verify_pending_update with nothing to
verify, and an invariant assert that the 90s probe window stays below
the 600s stale threshold. All passing.
Side fix: scripts/create-release-manifest.sh was dying with exit 141
(SIGPIPE from tar tvzf pipe head pipe awk) under set -euo pipefail.
Replaced with a single awk NR==1 that doesn't short-circuit the upstream
pipe, so the release-build flow is idempotent again.
2026-04-22 16:14:35 -04:00
|
|
|
|
|
|
|
|
#[tokio::test]
|
|
|
|
|
async fn test_pending_verification_round_trip() {
|
|
|
|
|
let dir = tempfile::tempdir().unwrap();
|
|
|
|
|
let marker = PendingVerification {
|
|
|
|
|
applied_at: chrono::Utc::now().to_rfc3339(),
|
|
|
|
|
new_version: "1.7.41-alpha".into(),
|
|
|
|
|
previous_version: "1.7.40-alpha".into(),
|
|
|
|
|
deadline_ts: chrono::Utc::now().timestamp() + 150,
|
|
|
|
|
};
|
|
|
|
|
write_pending_verification(dir.path(), &marker).await.unwrap();
|
|
|
|
|
let read = read_pending_verification(dir.path()).await.unwrap();
|
|
|
|
|
assert_eq!(read.new_version, "1.7.41-alpha");
|
|
|
|
|
assert_eq!(read.previous_version, "1.7.40-alpha");
|
|
|
|
|
clear_pending_verification(dir.path()).await;
|
|
|
|
|
assert!(read_pending_verification(dir.path()).await.is_none());
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
#[tokio::test]
|
|
|
|
|
async fn test_pending_verification_absent_is_none() {
|
|
|
|
|
let dir = tempfile::tempdir().unwrap();
|
|
|
|
|
assert!(read_pending_verification(dir.path()).await.is_none());
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
#[tokio::test]
|
|
|
|
|
async fn test_verify_pending_update_noop_without_marker() {
|
|
|
|
|
let dir = tempfile::tempdir().unwrap();
|
|
|
|
|
// No marker written -- must return quickly without doing anything
|
|
|
|
|
// risky (network probes, rollback calls). We're just asserting
|
|
|
|
|
// it doesn't panic or hang.
|
|
|
|
|
verify_pending_update(dir.path()).await;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
#[test]
|
|
|
|
|
fn test_pending_verify_constants_are_sensible() {
|
|
|
|
|
// Window must be generous enough for nginx + backend startup,
|
|
|
|
|
// but less than the stale-marker threshold so a normal cycle
|
|
|
|
|
// can complete without the marker being considered stale.
|
|
|
|
|
assert!(PENDING_VERIFY_WINDOW_SECS < PENDING_VERIFY_MAX_AGE_SECS as u64);
|
|
|
|
|
assert!(PENDING_VERIFY_WINDOW_SECS >= 60);
|
|
|
|
|
}
|
2026-03-11 10:57:33 +00:00
|
|
|
}
|