archy/core/archipelago/src/update.rs

//! Update system: check for updates, download deltas, apply with rollback.

use anyhow::{Context, Result};
use chrono::Timelike;
use serde::{Deserialize, Serialize};
use std::path::Path;
use std::sync::atomic::{AtomicBool, AtomicU64, Ordering};
use tokio::fs;
use tracing::{debug, info, warn};

/// Live download progress counters. Updated by download_component_resumable
/// as bytes arrive and read by the update.status RPC so the UI can show
/// a real progress bar instead of a fake creep. Global because the
/// download runs in one place at a time; no need for per-handler state.
///
/// `DOWNLOAD_BYTES` is the running byte count: `download_update` zeroes it
/// when a run starts and stores the cumulative size after each component.
pub static DOWNLOAD_BYTES: AtomicU64 = AtomicU64::new(0);
/// Expected size of the whole download: the sum of every component's
/// `size_bytes`. Seeded by `download_update` before the first request and
/// zeroed again when a run is canceled between components.
pub static DOWNLOAD_TOTAL: AtomicU64 = AtomicU64::new(0);
/// Set true to ask the in-flight download loop to bail out at the next
/// chunk boundary. Read via `is_canceled`; reset at the start of every
/// `download_update` run. Also flipped by the `cancel_download` RPC.
pub static DOWNLOAD_CANCEL: AtomicBool = AtomicBool::new(false);
/// Wall-clock ms timestamp (Unix epoch, as returned by `now_ms`) of the
/// last time DOWNLOAD_BYTES advanced.
/// Lets `update.status` flag a download as "stalled" when no bytes have
/// arrived for a while, so the UI can offer a Cancel button with more
/// confidence than "looks stuck at 0%".
/// Not monotonic: `now_ms` reads `SystemTime`, so a system clock step can
/// make consecutive values go backwards.
pub static DOWNLOAD_PROGRESS_AT: AtomicU64 = AtomicU64::new(0);

/// Current wall-clock time as milliseconds since the Unix epoch.
///
/// Returns 0 when the system clock reports a time before the epoch, so
/// callers never have to handle an error.
fn now_ms() -> u64 {
    match std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH) {
        Ok(since_epoch) => since_epoch.as_millis() as u64,
        Err(_) => 0,
    }
}

/// Whether a cancel has been requested for the in-flight download.
///
/// A relaxed load of `DOWNLOAD_CANCEL`; the flag is only a hint that the
/// download loops poll, so no ordering with other memory is needed.
fn is_canceled() -> bool {
    let cancel_requested = DOWNLOAD_CANCEL.load(Ordering::Relaxed);
    cancel_requested
}

/// Split a "MAJOR.MINOR.PATCH[-suffix]" version into its numeric triple.
///
/// Everything from the first `-` onwards is discarded before parsing.
/// Any dot-separated fields after the third are ignored. Returns `None`
/// when fewer than three fields are present or one of the first three is
/// not a valid `u32` — callers should fall back to string comparison in
/// that case so versions we don't understand are never silently mis-ranked.
fn parse_version_triple(v: &str) -> Option<(u32, u32, u32)> {
    let numeric = match v.find('-') {
        Some(dash) => &v[..dash],
        None => v,
    };
    // Lazily parse each field; the double `?` bails on either a missing
    // field (outer Option) or an unparseable one (inner Option).
    let mut fields = numeric.split('.').map(|field| field.parse::<u32>().ok());
    let major = fields.next()??;
    let minor = fields.next()??;
    let patch = fields.next()??;
    Some((major, minor, patch))
}

/// Reports whether `candidate` is strictly newer than `current`.
///
/// Guards against the manifest offering a version we've already passed
/// (a stale cached manifest, or a node that sideloaded past the
/// manifest's latest). When both strings parse as numeric triples they
/// are compared numerically, suffix ignored. If either fails to parse we
/// fall back to plain string inequality, preserving the old behaviour
/// for unusual version strings.
fn is_newer(candidate: &str, current: &str) -> bool {
    let offered = parse_version_triple(candidate);
    let running = parse_version_triple(current);
    if let (Some(offered), Some(running)) = (offered, running) {
        offered > running
    } else {
        candidate != current
    }
}

/// Primary default manifest URL. Used when `ARCHIPELAGO_UPDATE_URL` is
/// unset and as the first entry of `default_mirrors()` ("Server 1 (OVH)").
/// NOTE(review): this is plain HTTP, so the manifest fetched from it is
/// not protected in transit — confirm that is intended.
const DEFAULT_UPDATE_MANIFEST_URL: &str =
    "http://146.59.87.168:3000/lfg2025/archy/raw/branch/main/releases/manifest.json";
/// Secondary mirror on tx1138 gitea — independent network path so a
/// single-provider outage doesn't knock out both mirrors.
const DEFAULT_SECONDARY_MIRROR_URL: &str =
    "https://git.tx1138.com/lfg2025/archy/raw/branch/main/releases/manifest.json";
/// File name, inside the data directory, of the serialized `UpdateState`.
const UPDATE_STATE_FILE: &str = "update_state.json";
/// File name, inside the data directory, of the saved mirror list
/// (a JSON array of `UpdateMirror`).
const UPDATE_MIRRORS_FILE: &str = "update-mirrors.json";
/// Marker written by apply_update() just before the service restart and
/// consumed by verify_pending_update() in the NEW binary's startup path.
/// If present, the new binary probes the frontend; if the probe fails,
/// rollback_update() runs and the service restarts on the old binary.
/// Closes the "OTA broke nginx fleet-wide with no auto-rollback" failure
/// mode from 2026-04-22 (v1.7.38/39 tarball-perms bug).
const PENDING_VERIFY_FILE: &str = "update-pending-verify.json";
/// Probe timeout for the frontend health check (total time including
/// retries). Generous: the new binary has to come fully up, health
/// monitor settles, nginx has to re-read any snippet changes. 90s is
/// comfortably longer than the slowest observed startup.
const PENDING_VERIFY_WINDOW_SECS: u64 = 90;
/// If the marker is older than this on read, treat it as stale and
/// delete without probing. Guards against a node that somehow failed
/// to run verification at all (e.g. crashed during startup) from
/// spontaneously rolling back days later when the user reboots.
const PENDING_VERIFY_MAX_AGE_SECS: i64 = 600;

/// One entry in the operator-configurable list of update mirrors.
/// Persisted as an element of the JSON array in the mirrors file.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct UpdateMirror {
    /// Full URL to `manifest.json`. Download URLs in the fetched
    /// manifest are origin-rewritten to match this URL's scheme+host+
    /// port, so hitting a mirror pulls its components from the same
    /// mirror rather than whatever absolute host the publisher baked in.
    pub url: String,
    /// Human-readable label for the UI ("Server 1", "Home VPS", …).
    /// Optional in the saved file: a missing label deserializes to "".
    #[serde(default)]
    pub label: String,
}

/// Location of the saved mirror list inside `data_dir`.
fn mirrors_path(data_dir: &Path) -> std::path::PathBuf {
    let mut location = data_dir.to_path_buf();
    location.push(UPDATE_MIRRORS_FILE);
    location
}

/// The built-in mirror list, in priority order: the OVH server first,
/// then the tx1138 gitea.
fn default_mirrors() -> Vec<UpdateMirror> {
    const BUILT_IN: [(&str, &str); 2] = [
        (DEFAULT_UPDATE_MANIFEST_URL, "Server 1 (OVH)"),
        (DEFAULT_SECONDARY_MIRROR_URL, "Server 2 (tx1138)"),
    ];
    BUILT_IN
        .iter()
        .map(|&(url, label)| UpdateMirror {
            url: url.to_string(),
            label: label.to_string(),
        })
        .collect()
}

/// Load the operator-configured mirror list. Returns defaults if the
/// file doesn't exist yet, so a node OTA'd from a pre-mirrors release
/// starts with the current default mirrors available without any
/// manual config.
///
/// Migration: any default mirror URL that isn't already in the saved
/// list gets appended at the end. This lets us add new default mirrors
/// (e.g. a new Server 3) and have them appear on existing nodes after
/// an update, without requiring manual config edits. Explicit removals
/// stick — once an operator removes a URL it stays gone unless it's
/// later re-added to defaults.
pub async fn load_mirrors(data_dir: &Path) -> Result<Vec<UpdateMirror>> {
    let path = mirrors_path(data_dir);
    if !path.exists() {
        return Ok(default_mirrors());
    }
    let bytes = fs::read(&path)
        .await
        .with_context(|| format!("read {}", path.display()))?;
    let mut list: Vec<UpdateMirror> =
        serde_json::from_slice(&bytes).with_context(|| format!("parse {}", path.display()))?;
    if list.is_empty() {
        return Ok(default_mirrors());
    }

    // One-time migration: the Hetzner VPS at 23.182.128.160 was
    // decommissioned 2026-04-23. Existing nodes have it baked into their
    // saved mirror list (was the original Server 1). Strip it on load so
    // we don't spend seconds per install timing out against a dead host.
    // Exception to the usual "explicit removals stick" rule: the user
    // never chose to add this — it was a default.
    let before = list.len();
    list.retain(|m| !m.url.contains("23.182.128.160"));
    let mut changed = list.len() != before;

    // Merge in any default URLs the saved config is missing.
    let known: std::collections::HashSet<String> = list.iter().map(|m| m.url.clone()).collect();
    let defaults = default_mirrors();
    for def in &defaults {
        if !known.contains(&def.url) {
            list.push(def.clone());
            changed = true;
        }
    }
    let before_order: Vec<String> = list.iter().map(|m| m.url.clone()).collect();
    force_ovh_update_primary(&mut list);
    changed = changed || before_order != list.iter().map(|m| m.url.clone()).collect::<Vec<_>>();
    if changed {
        let _ = save_mirrors(data_dir, &list).await;
    }
    Ok(list)
}

/// Normalise a mirror list so the built-in mirrors are present, carry
/// their canonical labels, and come first (OVH, then tx1138).
///
/// Operator-added mirrors keep their relative order after the defaults
/// because the sort is stable.
fn force_ovh_update_primary(list: &mut Vec<UpdateMirror>) {
    // 1. Make sure every built-in mirror is in the list.
    for builtin in default_mirrors() {
        let already_listed = list.iter().any(|m| m.url == builtin.url);
        if !already_listed {
            list.push(builtin);
        }
    }

    // 2. Overwrite the labels of the built-ins with the canonical ones.
    for entry in list.iter_mut() {
        match entry.url.as_str() {
            DEFAULT_UPDATE_MANIFEST_URL => entry.label = "Server 1 (OVH)".to_string(),
            DEFAULT_SECONDARY_MIRROR_URL => entry.label = "Server 2 (tx1138)".to_string(),
            _ => {}
        }
    }

    // 3. Stable sort by rank: OVH, tx1138, then everything else.
    list.sort_by_key(|m| match m.url.as_str() {
        DEFAULT_UPDATE_MANIFEST_URL => 0,
        DEFAULT_SECONDARY_MIRROR_URL => 1,
        _ => 2,
    });
}

/// Persist the mirror list to the mirrors file inside `data_dir`.
///
/// Creates `data_dir` if needed, writes pretty-printed JSON to a sibling
/// `.json.tmp` file, then renames it over the real file so readers never
/// see a half-written list.
///
/// # Errors
/// Fails if the directory cannot be created, the list cannot be
/// serialized, or the temp file cannot be written or renamed.
pub async fn save_mirrors(data_dir: &Path, mirrors: &[UpdateMirror]) -> Result<()> {
    fs::create_dir_all(data_dir)
        .await
        .with_context(|| format!("mkdir {}", data_dir.display()))?;

    let final_path = mirrors_path(data_dir);
    let scratch_path = final_path.with_extension("json.tmp");
    let payload = serde_json::to_vec_pretty(mirrors).context("serialize mirrors")?;

    fs::write(&scratch_path, payload)
        .await
        .with_context(|| format!("write {}", scratch_path.display()))?;
    fs::rename(&scratch_path, &final_path).await.with_context(|| {
        format!(
            "rename {} -> {}",
            scratch_path.display(),
            final_path.display()
        )
    })?;
    Ok(())
}

/// Extract the `scheme://host[:port]` prefix of an http(s) URL.
///
/// Used by `rewrite_manifest_origins` so a manifest fetched from a
/// mirror points component downloads back at the same mirror rather
/// than whatever absolute URL the publisher baked in.
///
/// Returns `None` for any scheme other than lowercase `http://` or
/// `https://`, and for URLs with nothing between the scheme and the
/// first `/`.
fn manifest_origin(manifest_url: &str) -> Option<String> {
    let (scheme, remainder) = if let Some(tail) = manifest_url.strip_prefix("https://") {
        ("https", tail)
    } else if let Some(tail) = manifest_url.strip_prefix("http://") {
        ("http", tail)
    } else {
        return None;
    };

    // Everything up to the first slash is the host (and optional port).
    let authority = match remainder.find('/') {
        Some(slash) => &remainder[..slash],
        None => remainder,
    };
    if authority.is_empty() {
        None
    } else {
        Some(format!("{}://{}", scheme, authority))
    }
}

/// Point every component `download_url` at the origin the manifest was
/// fetched from.
///
/// Only the `scheme://host[:port]` prefix is swapped; the path portion
/// is preserved (it is consistent across mirrors — every gitea serves
/// `/lfg2025/archy/raw/…`). URLs whose origin cannot be determined, or
/// that already match, are left untouched. Does nothing at all when
/// `manifest_url` itself has no recognisable origin.
fn rewrite_manifest_origins(manifest: &mut UpdateManifest, manifest_url: &str) {
    let target_origin = match manifest_origin(manifest_url) {
        Some(origin) => origin,
        None => return,
    };
    for component in &mut manifest.components {
        let current_origin = match manifest_origin(&component.download_url) {
            Some(origin) if origin != target_origin => origin,
            _ => continue,
        };
        // `manifest_origin` returns an exact prefix of the URL it was given,
        // so slicing at its length yields the path (plus any query).
        let tail = component.download_url[current_origin.len()..].to_string();
        component.download_url = format!("{}{}", target_origin, tail);
    }
}

/// Which single manifest URL to use when not walking the mirror list:
/// the `ARCHIPELAGO_UPDATE_URL` env override if it is set (and valid
/// Unicode), otherwise the hard-coded primary default.
///
/// This function does not consult the saved mirror list; callers that
/// need the full mirror walk should use `load_mirrors` directly.
fn update_manifest_url() -> String {
    match std::env::var("ARCHIPELAGO_UPDATE_URL") {
        Ok(override_url) => override_url,
        Err(_) => DEFAULT_UPDATE_MANIFEST_URL.to_string(),
    }
}

/// Release manifest as published at a mirror's `manifest.json` URL and
/// cached in `UpdateState::available_update` when it offers a newer
/// version.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct UpdateManifest {
    /// Version offered by this manifest; compared against the running
    /// version with `is_newer`.
    pub version: String,
    /// Release date string as published. Not interpreted by the code
    /// visible here — presumably display-only; confirm against the UI.
    pub release_date: String,
    /// Changelog lines as published. Not interpreted by the code visible
    /// here.
    pub changelog: Vec<String>,
    /// Files that make up the release. `download_update` fetches each one
    /// into the staging directory, in this order.
    pub components: Vec<ComponentUpdate>,
}

/// One downloadable file listed in an `UpdateManifest`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ComponentUpdate {
    /// Component name. `download_update` also uses it as the file name
    /// inside the staging directory.
    pub name: String,
    /// Version fields as published in the manifest. Neither is read by
    /// the code visible here — presumably informational; confirm.
    pub current_version: String,
    pub new_version: String,
    /// Where to fetch the component from. May be origin-rewritten by
    /// `rewrite_manifest_origins` to match the mirror that served the
    /// manifest.
    pub download_url: String,
    /// Expected SHA256 of the complete file (per `download_update`'s
    /// docs, verified after each component finishes downloading).
    pub sha256: String,
    /// Expected size in bytes. Summed into the progress total, and used
    /// to decide whether a partially downloaded file is already complete.
    pub size_bytes: u64,
}

/// Operator-selected update policy, stored in `UpdateState::schedule`.
/// Serialized in snake_case (`manual`, `daily_check`, `auto_apply`).
/// The scheduler that acts on these values is not in the code visible
/// here; the variant descriptions below are inferred from the names —
/// TODO confirm.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
#[derive(Default)]
pub enum UpdateSchedule {
    /// Presumably: never check automatically.
    Manual,
    /// Default. Presumably: check periodically but leave applying to the
    /// operator.
    #[default]
    DailyCheck,
    /// Presumably: check and apply without operator action.
    AutoApply,
}

/// Persistent update-system state, stored as JSON in the update state
/// file inside the data directory.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct UpdateState {
    /// Version of the running binary. `load_state` re-syncs this to
    /// `CARGO_PKG_VERSION` on every load.
    pub current_version: String,
    /// RFC3339 timestamp of the most recent `check_for_updates` run
    /// (set whether or not any mirror answered); `None` if never checked.
    pub last_check: Option<String>,
    /// Manifest of a strictly newer release, if one has been found and
    /// not dismissed.
    pub available_update: Option<UpdateManifest>,
    /// True once an update has been fully downloaded into the staging
    /// directory. `load_state` clears it if the staging directory is
    /// missing or empty.
    pub update_in_progress: bool,
    /// Not written by the code visible here — presumably set by the
    /// apply/rollback path; confirm.
    pub rollback_available: bool,
    /// Update policy. Missing from older state files, in which case it
    /// deserializes to the default (`DailyCheck`).
    #[serde(default)]
    pub schedule: UpdateSchedule,
    /// URL of the mirror whose manifest populated `available_update`.
    /// Surfaces in the UI so operators can tell at a glance which mirror
    /// their node actually hit (vs. just which is configured primary).
    #[serde(default)]
    pub manifest_mirror: Option<String>,
}

impl Default for UpdateState {
    /// Fresh state for a node with no state file yet: the compiled-in
    /// package version, nothing checked, nothing offered or staged, and
    /// the default schedule.
    fn default() -> Self {
        let running_version = String::from(env!("CARGO_PKG_VERSION"));
        Self {
            current_version: running_version,
            schedule: UpdateSchedule::default(),
            update_in_progress: false,
            rollback_available: false,
            last_check: None,
            available_update: None,
            manifest_mirror: None,
        }
    }
}

/// Marker written by apply_update() just before the service restart and
/// consumed by verify_pending_update() in the NEW binary's startup path.
/// See PENDING_VERIFY_FILE for the full rationale — this is the hook
/// that turns "nginx 500 on every page after OTA" from an unrecoverable
/// field incident into an automatic rollback.
///
/// Stored as pretty-printed JSON; a marker that fails to parse is
/// treated the same as no marker at all.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PendingVerification {
    /// RFC3339 timestamp of the apply that wrote this marker. Used by
    /// verify_pending_update() to discard markers older than
    /// PENDING_VERIFY_MAX_AGE_SECS.
    pub applied_at: String,
    /// Version we just applied (what the NEW binary should be running).
    pub new_version: String,
    /// Version the outgoing binary was running (what we roll back to).
    pub previous_version: String,
    /// Unix epoch seconds after which the probe should give up and
    /// trigger rollback. Prevents a probe from retrying forever if e.g.
    /// nginx is totally wedged.
    /// NOTE(review): verify_pending_update() does not read this field; it
    /// bounds the probe loop with PENDING_VERIFY_WINDOW_SECS instead.
    pub deadline_ts: i64,
}

/// Persist the post-OTA verification marker into `data_dir`.
///
/// The marker is written to a sibling `.json.tmp` file and then renamed
/// into place, the same way `save_mirrors` persists its file. A marker
/// that is only half-written (e.g. the process is killed mid-write, which
/// is plausible since this runs right before a service restart) would
/// fail to parse in `read_pending_verification`, which is
/// indistinguishable from "no update pending" and would silently disable
/// the auto-rollback guard. With the rename, a reader sees either no
/// marker or a complete one.
///
/// # Errors
/// Fails if the marker cannot be serialized, or the temp file cannot be
/// written or renamed.
async fn write_pending_verification(data_dir: &Path, marker: &PendingVerification) -> Result<()> {
    let path = data_dir.join(PENDING_VERIFY_FILE);
    let tmp = path.with_extension("json.tmp");
    let data = serde_json::to_string_pretty(marker).context("serialize pending-verify marker")?;
    fs::write(&tmp, data)
        .await
        .with_context(|| format!("write pending-verify marker to {}", tmp.display()))?;
    fs::rename(&tmp, &path)
        .await
        .with_context(|| format!("rename {} -> {}", tmp.display(), path.display()))?;
    Ok(())
}

/// Load the post-OTA verification marker from `data_dir`, if any.
///
/// Returns `None` both when the file cannot be read (typically: it does
/// not exist) and when its contents are not a valid marker.
async fn read_pending_verification(data_dir: &Path) -> Option<PendingVerification> {
    let marker_path = data_dir.join(PENDING_VERIFY_FILE);
    match fs::read_to_string(&marker_path).await {
        Ok(raw) => serde_json::from_str(&raw).ok(),
        Err(_) => None,
    }
}

/// Delete the post-OTA verification marker from `data_dir`.
///
/// Best-effort: a failure (including "file not found") is ignored.
async fn clear_pending_verification(data_dir: &Path) {
    let _ = fs::remove_file(data_dir.join(PENDING_VERIFY_FILE)).await;
}

/// Make a single health-check request to the local frontend via nginx.
///
/// Succeeds when `GET https://127.0.0.1/` answers with a 2xx or 3xx
/// status within 5 seconds. Errors on timeout, connection failure, or
/// any other status. Certificate validation is disabled
/// (`danger_accept_invalid_certs`) because nodes use a self-signed cert
/// that reqwest's default root set doesn't trust.
async fn probe_frontend_once() -> Result<()> {
    let per_probe_timeout = std::time::Duration::from_secs(5);
    let client = reqwest::Client::builder()
        .danger_accept_invalid_certs(true)
        .timeout(per_probe_timeout)
        .build()
        .context("build probe client")?;

    // HTTPS on purpose: the failure mode being caught is nginx returning
    // 500 on the PWA. HTTP usually redirects to HTTPS and would mask it.
    let status = client
        .get("https://127.0.0.1/")
        .send()
        .await
        .context("probe GET https://127.0.0.1/")?
        .status();

    if !(status.is_success() || status.is_redirection()) {
        anyhow::bail!("frontend probe returned HTTP {}", status);
    }
    Ok(())
}

/// Called from main.rs startup. If a pending-verification marker is
/// present, probe the frontend; on failure, trigger rollback and
/// restart the service so the OLD binary boots.
///
/// This is the "post-OTA auto-rollback" guardrail. If ANY problem in
/// the new version takes down the PWA (bad tarball perms as in v1.7.38,
/// a broken service worker, a missing asset, a backend panic on first
/// boot), the node self-heals back to the previous working state
/// without SSH intervention.
///
/// Flow:
/// 1. No marker → return immediately.
/// 2. Marker older than PENDING_VERIFY_MAX_AGE_SECS → delete it, return.
/// 3. Sleep 15s, then probe every 5s until a probe succeeds (marker is
///    cleared, return) or PENDING_VERIFY_WINDOW_SECS has elapsed.
/// 4. On failure: swap web-ui.bak back in, call rollback_update(), clear
///    the marker, rewrite the persisted version, and restart the service.
///
/// Never returns an error; every failure path is logged instead.
pub async fn verify_pending_update(data_dir: &Path) {
    let marker = match read_pending_verification(data_dir).await {
        Some(m) => m,
        None => return, // No update pending; nothing to verify.
    };

    // Guard against a marker left behind by some earlier crash path —
    // don't want a user who reboots days later to suddenly get
    // rolled back because the marker was never cleared.
    // NOTE(review): if `applied_at` fails to parse, this guard is skipped
    // entirely and the probe proceeds as if the marker were fresh.
    let applied_at = chrono::DateTime::parse_from_rfc3339(&marker.applied_at);
    if let Ok(ts) = applied_at {
        let age = chrono::Utc::now() - ts.with_timezone(&chrono::Utc);
        if age.num_seconds() > PENDING_VERIFY_MAX_AGE_SECS {
            tracing::warn!(
                age_secs = age.num_seconds(),
                "pending-verify marker is stale, clearing without probing"
            );
            clear_pending_verification(data_dir).await;
            return;
        }
    }

    info!(
        new_version = %marker.new_version,
        previous_version = %marker.previous_version,
        "Post-OTA verification: probing frontend at https://127.0.0.1/"
    );

    // Give the new service time to bind its listeners + nginx to
    // pick up any config changes. 15s matches what we observed on
    // .116 during the v1.7.40 rollout recovery.
    tokio::time::sleep(std::time::Duration::from_secs(15)).await;

    // The probe window starts after the initial sleep and is measured on
    // the monotonic clock, independent of the marker's `deadline_ts`.
    let deadline =
        std::time::Instant::now() + std::time::Duration::from_secs(PENDING_VERIFY_WINDOW_SECS);
    let mut attempt = 0u32;
    let mut last_err: Option<String> = None;

    while std::time::Instant::now() < deadline {
        attempt += 1;
        match probe_frontend_once().await {
            Ok(()) => {
                info!(attempt, "Post-OTA verification succeeded — clearing marker");
                clear_pending_verification(data_dir).await;
                return;
            }
            Err(e) => {
                let msg = e.to_string();
                tracing::warn!(attempt, error = %msg, "Post-OTA probe failed, retrying");
                last_err = Some(msg);
            }
        }
        tokio::time::sleep(std::time::Duration::from_secs(5)).await;
    }

    tracing::error!(
        attempts = attempt,
        window_secs = PENDING_VERIFY_WINDOW_SECS,
        last_error = last_err.as_deref().unwrap_or(""),
        new_version = %marker.new_version,
        previous_version = %marker.previous_version,
        "Post-OTA verification FAILED — rolling back"
    );

    // Restore web-ui.bak on top of web-ui. update.rs keeps web-ui.bak
    // from the previous apply; moving it back is the frontend half of
    // the rollback. The binary half is handled by rollback_update().
    let web_ui_bak = Path::new("/opt/archipelago/web-ui.bak");
    let web_ui = "/opt/archipelago/web-ui";
    if web_ui_bak.exists() {
        // Move the failed frontend aside under a timestamped name rather
        // than deleting it, then move the backup into its place.
        // NOTE(review): both `mv` results are discarded, so the "Restored"
        // log line below is emitted even if either move failed.
        let ts = chrono::Utc::now().timestamp_millis();
        let quarantine = format!("/opt/archipelago/web-ui.failed.{}", ts);
        let _ = host_sudo(&["mv", web_ui, &quarantine]).await;
        let _ = host_sudo(&["mv", web_ui_bak.to_str().unwrap_or(""), web_ui]).await;
        tracing::info!(quarantined = %quarantine, "Restored web-ui from web-ui.bak");
    } else {
        tracing::warn!("web-ui.bak not present — frontend cannot be rolled back, only binary");
    }

    if let Err(e) = rollback_update(data_dir).await {
        tracing::error!(error = %e, "rollback_update() failed during post-OTA verification");
        // Leave the marker in place so a future boot gets another shot.
        return;
    }

    clear_pending_verification(data_dir).await;

    // Point the persisted current_version back at the version we rolled
    // back to. A failure to load or save the state is non-fatal here.
    if let Ok(mut state) = load_state(data_dir).await {
        state.current_version = marker.previous_version.clone();
        if let Err(e) = save_state(data_dir, &state).await {
            tracing::warn!(error = %e, "Failed to update state after rollback");
        }
    }

    // Restart so the old binary takes over. --no-block because we're
    // the service; systemd can't wait for us to exit before starting
    // the old process.
    let _ = host_sudo(&["systemctl", "--no-block", "restart", "archipelago"]).await;
}

/// Load the persisted `UpdateState` from `data_dir`, creating it with
/// defaults on first use and healing two kinds of stale data.
///
/// Healing, applied on every load (and written back if anything changed):
/// - `current_version` is forced to the running binary's version. When
///   it differed, any cached `available_update` / `manifest_mirror` is
///   dropped too.
/// - `update_in_progress` is cleared when no staged OTA files exist.
///
/// # Errors
/// Fails if the file exists but cannot be read or parsed, or if a
/// required write-back fails.
pub async fn load_state(data_dir: &Path) -> Result<UpdateState> {
    let state_path = data_dir.join(UPDATE_STATE_FILE);
    if !state_path.exists() {
        let fresh = UpdateState::default();
        save_state(data_dir, &fresh).await?;
        return Ok(fresh);
    }

    let raw = fs::read_to_string(&state_path)
        .await
        .context("Reading update state")?;
    let mut state: UpdateState = serde_json::from_str(&raw).context("Parsing update state")?;

    // Keep current_version in sync with the binary. Sideloaded nodes
    // (ssh + cp /usr/local/bin/archipelago) don't touch the state file,
    // so without this the running 1.7.0-alpha binary would keep seeing
    // `current_version: "1.6.0-alpha"` and re-offer itself as an update.
    let running = env!("CARGO_PKG_VERSION");
    let version_drifted = state.current_version != running;
    if version_drifted {
        state.current_version = running.to_string();
        // The binary changed (sideload or apply), so a stored offer is
        // either redundant (it points at what we now run) or stale (it
        // points at something we've passed and would show up as a
        // "downgrade"). Drop it; the next check_for_updates repopulates
        // it if something newer genuinely exists.
        state.available_update = None;
        state.manifest_mirror = None;
    }

    // `update_in_progress` means a manifest OTA is downloaded and staged,
    // ready for apply. Older git/self-build update paths could leave the
    // flag stuck true without a staging directory, trapping the UI in an
    // unrecoverable state. The staging check only runs when the flag is set.
    let stale_in_progress = state.update_in_progress && !has_staged_update(data_dir).await;
    if stale_in_progress {
        warn!(
            staging = %data_dir.join("update-staging").display(),
            "Clearing stale update_in_progress without staged OTA files"
        );
        state.update_in_progress = false;
    }

    if version_drifted || stale_in_progress {
        save_state(data_dir, &state).await?;
    }
    Ok(state)
}

/// Whether `data_dir/update-staging` exists and contains at least one
/// entry. Any error while opening or reading the directory counts as
/// "nothing staged".
async fn has_staged_update(data_dir: &Path) -> bool {
    match fs::read_dir(data_dir.join("update-staging")).await {
        Ok(mut listing) => matches!(listing.next_entry().await, Ok(Some(_))),
        Err(_) => false,
    }
}

/// Persist `state` as pretty-printed JSON to the update state file in
/// `data_dir`.
///
/// The JSON is written to a sibling `.json.tmp` file and then renamed
/// over the real file, the same way `save_mirrors` persists its file.
/// Writing in place risks leaving a truncated file if the process dies
/// mid-write; `load_state` cannot parse such a file and would then fail
/// on every subsequent call. With the rename, a reader sees either the
/// previous state or the new one, never a partial write.
///
/// # Errors
/// Fails if the state cannot be serialized, or the temp file cannot be
/// written or renamed.
pub async fn save_state(data_dir: &Path, state: &UpdateState) -> Result<()> {
    let path = data_dir.join(UPDATE_STATE_FILE);
    let tmp = path.with_extension("json.tmp");
    let data = serde_json::to_string_pretty(state)?;
    fs::write(&tmp, data).await.context("Writing update state")?;
    fs::rename(&tmp, &path)
        .await
        .with_context(|| format!("rename {} -> {}", tmp.display(), path.display()))?;
    Ok(())
}

/// Check for available updates by walking the mirror list.
///
/// The first mirror that returns a parseable manifest with a
/// strictly-newer version wins and is recorded in
/// `available_update` / `manifest_mirror`. A mirror whose manifest is not
/// newer clears any previously recorded offer and the walk moves on; if
/// no mirror offers something newer, the node ends up reported as
/// up-to-date. Each mirror gets up to 3 attempts, 2s apart. If every
/// attempt on every mirror fails, the previously stored offer is left
/// untouched.
///
/// Manifest `download_url`s are origin-rewritten to match the mirror
/// we fetched from, so switching mirrors in the UI also switches where
/// component downloads come from — even if the publisher baked an
/// absolute URL pointing at a different server into the manifest.
///
/// `last_check` is updated and the state saved regardless of outcome.
///
/// # Errors
/// Fails only if the state cannot be loaded or saved, or the HTTP client
/// cannot be built; mirror failures are logged, not returned.
pub async fn check_for_updates(data_dir: &Path) -> Result<UpdateState> {
    let mut state = load_state(data_dir).await?;

    info!("Checking for updates...");
    // Short per-attempt HTTP timeout so a wedged mirror doesn't delay
    // the whole check — better to move on to the next mirror quickly
    // than sit waiting on a slow one. 15s covers slow but alive mirrors.
    let request_timeout = std::time::Duration::from_secs(15);
    let connect_timeout = std::time::Duration::from_secs(10);
    let client = reqwest::Client::builder()
        .timeout(request_timeout)
        .connect_timeout(connect_timeout)
        .build()
        .context("Failed to create HTTP client")?;

    // Env override (ARCHIPELAGO_UPDATE_URL) short-circuits the mirror
    // list — used on dev boxes that point at a local gitea. Otherwise
    // walk the operator-configured list, falling back to the built-in
    // defaults if that list cannot be loaded.
    let candidates: Vec<String> = match std::env::var("ARCHIPELAGO_UPDATE_URL") {
        Ok(_) => vec![update_manifest_url()],
        Err(_) => {
            let configured = match load_mirrors(data_dir).await {
                Ok(list) => list,
                Err(_) => default_mirrors(),
            };
            configured.into_iter().map(|m| m.url).collect()
        }
    };

    let mut last_err: Option<String> = None;
    let mut handled = false;
    'mirrors: for manifest_url in &candidates {
        for attempt in 1..=3u8 {
            if attempt > 1 {
                tokio::time::sleep(std::time::Duration::from_secs(2)).await;
            }

            // Each failure below records the error and moves on to the
            // next attempt against the same mirror.
            let resp = match client.get(manifest_url).send().await {
                Ok(resp) => resp,
                Err(e) => {
                    last_err = Some(format!("{}: {}", manifest_url, e));
                    continue;
                }
            };
            if !resp.status().is_success() {
                last_err = Some(format!("{}: HTTP {}", manifest_url, resp.status()));
                continue;
            }
            let mut manifest = match resp.json::<UpdateManifest>().await {
                Ok(manifest) => manifest,
                Err(e) => {
                    last_err = Some(format!("{}: parse: {}", manifest_url, e));
                    continue;
                }
            };

            // A usable manifest: this mirror has given a definitive answer.
            rewrite_manifest_origins(&mut manifest, manifest_url);
            handled = true;

            if !is_newer(&manifest.version, &state.current_version) {
                // Manifest version matches us or is behind us — either
                // we're current, or this mirror is stale. Try the next
                // mirror; if all are stale or at our version we fall
                // through to "up to date".
                debug!(
                    current = %state.current_version,
                    manifest = %manifest.version,
                    mirror = %manifest_url,
                    "No newer version in manifest"
                );
                state.manifest_mirror = None;
                state.available_update = None;
                continue 'mirrors;
            }

            info!(
                current = %state.current_version,
                available = %manifest.version,
                mirror = %manifest_url,
                "Update available"
            );
            state.available_update = Some(manifest);
            state.manifest_mirror = Some(manifest_url.clone());
            break 'mirrors;
        }
        tracing::debug!(mirror = %manifest_url, "Mirror exhausted, trying next");
    }

    if let (false, Some(e)) = (handled, last_err) {
        debug!("Update check failed across all mirrors: {}", e);
    }

    state.last_check = Some(chrono::Utc::now().to_rfc3339());
    save_state(data_dir, &state).await?;
    Ok(state)
}

/// Outcome of a single `test_mirror` probe.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MirrorTestResult {
    /// True only when the mirror answered with a 2xx status.
    pub reachable: bool,
    /// Wall-clock time from sending the request until a response or an
    /// error came back, in milliseconds. 0 if the HTTP client could not
    /// even be built.
    pub latency_ms: u64,
    /// HTTP status code of the response, if one was received.
    pub http_status: Option<u16>,
    /// Why the mirror counts as unreachable: "HTTP <code>" for a non-2xx
    /// response, otherwise the transport or client-build error text.
    /// `None` on success.
    pub error: Option<String>,
}

/// Probe a mirror's manifest URL once and report reachability plus
/// wall-clock latency.
///
/// Backs the "Test mirror" button so operators can sanity-check a newly
/// added mirror without running a full update check. Uses a 10s request
/// timeout and a 5s connect timeout. Never fails: every error is folded
/// into the returned `MirrorTestResult`.
pub async fn test_mirror(url: &str) -> MirrorTestResult {
    let builder = reqwest::Client::builder()
        .timeout(std::time::Duration::from_secs(10))
        .connect_timeout(std::time::Duration::from_secs(5));
    let client = match builder.build() {
        Ok(client) => client,
        Err(e) => {
            return MirrorTestResult {
                reachable: false,
                latency_ms: 0,
                http_status: None,
                error: Some(format!("client build failed: {}", e)),
            };
        }
    };

    // Latency covers the full round trip, whether it ends in a response
    // or in a transport error.
    let started = std::time::Instant::now();
    let outcome = client.get(url).send().await;
    let latency_ms = started.elapsed().as_millis() as u64;

    match outcome {
        Ok(resp) => {
            let status = resp.status();
            let code = status.as_u16();
            let ok = status.is_success();
            MirrorTestResult {
                reachable: ok,
                latency_ms,
                http_status: Some(code),
                error: if ok {
                    None
                } else {
                    Some(format!("HTTP {}", code))
                },
            }
        }
        Err(e) => MirrorTestResult {
            reachable: false,
            latency_ms,
            http_status: None,
            error: Some(e.to_string()),
        },
    }
}

/// Return the current update state without contacting any mirror.
///
/// Delegates to `load_state`, so it may still rewrite the state file
/// when that function heals stale data.
pub async fn get_status(data_dir: &Path) -> Result<UpdateState> {
    let state = load_state(data_dir).await?;
    Ok(state)
}

/// Dismiss the available update notification.
///
/// Clears both the cached manifest and the record of which mirror it
/// came from. `manifest_mirror` only describes `available_update`, and
/// every other place that drops the offer (`load_state`,
/// `check_for_updates`) clears the two together; leaving the mirror set
/// here would make the state claim a mirror for an offer that no longer
/// exists.
///
/// # Errors
/// Fails if the state cannot be loaded or saved.
pub async fn dismiss_update(data_dir: &Path) -> Result<()> {
    let mut state = load_state(data_dir).await?;
    state.available_update = None;
    state.manifest_mirror = None;
    save_state(data_dir, &state).await
}

/// Download update components to a staging directory.
/// Verifies SHA256 hash for each component.
///
/// If no update is cached in the state, a fresh `check_for_updates` is
/// run first. On success the state's `update_in_progress` flag is set,
/// marking the update as downloaded and staged.
///
/// Robustness: each component download is **resumable** via HTTP Range
/// requests and retried up to 6 times with exponential backoff. When
/// gitea drops the connection mid-stream (happens regularly at slow
/// raw-file throughput), the next attempt picks up where the previous
/// one left off instead of restarting from byte zero. SHA256 is
/// verified over the complete file at the end of each component, so a
/// partially-corrupt resume still fails cleanly.
///
/// Safety: component names come from a remotely fetched manifest and are
/// used as file names inside the staging directory. Every name must be a
/// single plain file name; anything containing a path separator, `..`,
/// or an absolute path is rejected before any download starts, so a
/// hostile or broken manifest cannot write outside the staging directory.
///
/// # Errors
/// Fails if no update is available, a component name is unsafe, the
/// staging directory or HTTP client cannot be created, the download is
/// canceled, a component fails to download, or the state cannot be
/// loaded or saved.
pub async fn download_update(data_dir: &Path) -> Result<DownloadProgress> {
    let mut state = load_state(data_dir).await?;
    if state.available_update.is_none() {
        state = check_for_updates(data_dir).await?;
    }
    let manifest = state
        .available_update
        .as_ref()
        .ok_or_else(|| anyhow::anyhow!("No update is available to download"))?;

    // Validate every staging file name up front. `Path::join` replaces the
    // base entirely when given an absolute path, and `..` segments walk
    // out of it, so only a lone `Normal` path component is acceptable.
    for component in &manifest.components {
        let mut parts = Path::new(&component.name).components();
        let single_normal = matches!(
            (parts.next(), parts.next()),
            (Some(std::path::Component::Normal(_)), None)
        );
        let has_separator = component.name.contains(|c: char| c == '/' || c == '\\');
        if !single_normal || has_separator {
            anyhow::bail!(
                "Refusing to stage component with unsafe name {:?}",
                component.name
            );
        }
    }

    let staging_dir = data_dir.join("update-staging");
    fs::create_dir_all(&staging_dir)
        .await
        .context("Failed to create staging dir")?;

    let client = reqwest::Client::builder()
        // Per-request budget; each attempt gets the full hour. A retry
        // restarts the budget cleanly.
        .timeout(std::time::Duration::from_secs(3600))
        .connect_timeout(std::time::Duration::from_secs(30))
        .build()
        .context("Failed to create HTTP client")?;

    let mut downloaded = 0u64;
    let total_bytes: u64 = manifest.components.iter().map(|c| c.size_bytes).sum();

    info!(
        version = %manifest.version,
        components = manifest.components.len(),
        total_bytes,
        staging = %staging_dir.display(),
        "Starting update download"
    );

    // Clear any stale cancel flag from a prior aborted run, then seed
    // the live counters so polls during the handshake show the right
    // denominator immediately instead of 0/0 → NaN%.
    DOWNLOAD_CANCEL.store(false, Ordering::Relaxed);
    DOWNLOAD_TOTAL.store(total_bytes, Ordering::Relaxed);
    DOWNLOAD_BYTES.store(0, Ordering::Relaxed);
    DOWNLOAD_PROGRESS_AT.store(now_ms(), Ordering::Relaxed);

    for component in &manifest.components {
        if is_canceled() {
            // Canceled between components: zero the counters so the UI
            // stops showing progress for a download that is no longer running.
            DOWNLOAD_TOTAL.store(0, Ordering::Relaxed);
            DOWNLOAD_BYTES.store(0, Ordering::Relaxed);
            anyhow::bail!("Download canceled");
        }
        info!(name = %component.name, url = %component.download_url, "Downloading component");
        let dest = staging_dir.join(&component.name);
        download_component_resumable(&client, component, &dest, downloaded).await?;
        // Snap the live counter to the manifest-declared cumulative size
        // once the component is complete.
        downloaded += component.size_bytes;
        DOWNLOAD_BYTES.store(downloaded, Ordering::Relaxed);
        info!(
            name = %component.name,
            bytes = component.size_bytes,
            "Component downloaded and verified"
        );
    }

    // Mark update as downloaded. Reload first so changes made to the
    // state file while the download was running are not overwritten.
    let mut state = load_state(data_dir).await?;
    state.update_in_progress = true;
    save_state(data_dir, &state).await?;

    Ok(DownloadProgress {
        total_bytes,
        downloaded_bytes: downloaded,
        components_downloaded: manifest.components.len(),
        staging_dir: staging_dir.to_string_lossy().to_string(),
    })
}

/// Download a single component to `dest`, resuming from the end of
/// any existing partial file via a Range request. Makes up to 6
/// attempts, sleeping with exponential backoff (5s, 15s, 30s, 60s,
/// 120s) before each retry, and verifies the SHA256 over the full
/// file at the end.
///
/// A non-empty file that is already at least `component.size_bytes`
/// long when an attempt starts (left behind by an earlier run, or
/// completed just before a stream error) is hash-checked in place: a
/// match returns success without touching the network, a mismatch
/// deletes the file and downloads it again from scratch.
///
/// `prior_total` is the byte count of the components finished before
/// this one; it is added to this component's on-disk size whenever
/// `DOWNLOAD_BYTES` is published.
///
/// # Errors
/// Returns "Download canceled" when the cancel flag is observed, an
/// I/O error if the staging file cannot be opened or read back, or the
/// last captured network/HTTP/hash error once all attempts are used up.
async fn download_component_resumable(
    client: &reqwest::Client,
    component: &ComponentUpdate,
    dest: &Path,
    prior_total: u64,
) -> Result<()> {
    use tokio::io::AsyncWriteExt;
    const MAX_ATTEMPTS: u32 = 6;
    const BACKOFFS: [u64; 5] = [5, 15, 30, 60, 120];

    let mut last_err: Option<anyhow::Error> = None;
    for attempt in 1..=MAX_ATTEMPTS {
        let mut existing_len = match tokio::fs::metadata(dest).await {
            Ok(m) => m.len(),
            Err(_) => 0,
        };
        if existing_len > 0 && existing_len >= component.size_bytes {
            // The file is already complete (or oversized). Verify it
            // here instead of leaving the loop: falling out of the loop
            // only ever reaches the error return at the bottom.
            let hash = staged_file_sha256(dest).await?;
            if hash == component.sha256 {
                return Ok(());
            }
            // Garbage on disk — drop it and fall through to a fresh
            // (non-Range) download within this same attempt.
            let _ = tokio::fs::remove_file(dest).await;
            last_err = Some(anyhow::anyhow!(
                "SHA256 mismatch for {}: expected {}, got {}",
                component.name,
                component.sha256,
                hash
            ));
            existing_len = 0;
        }
        if attempt > 1 {
            let delay = BACKOFFS[(attempt as usize - 2).min(BACKOFFS.len() - 1)];
            tracing::warn!(
                name = %component.name,
                attempt,
                resume_at = existing_len,
                "Retrying download in {}s (previous error: {})",
                delay,
                last_err.as_ref().map(|e| e.to_string()).unwrap_or_default()
            );
            // Sleep in 500ms slices so a Cancel during backoff wakes
            // promptly instead of waiting out the full exponential window.
            let slices = delay * 2;
            for _ in 0..slices {
                if is_canceled() {
                    anyhow::bail!("Download canceled");
                }
                tokio::time::sleep(std::time::Duration::from_millis(500)).await;
            }
        }
        if is_canceled() {
            anyhow::bail!("Download canceled");
        }

        let mut req = client.get(&component.download_url);
        if existing_len > 0 {
            req = req.header("Range", format!("bytes={}-", existing_len));
        }
        let resp = match req.send().await {
            Ok(r) => r,
            Err(e) => {
                last_err = Some(anyhow::anyhow!(e));
                continue;
            }
        };
        let status = resp.status();
        // 200 OK on a fresh start, 206 Partial Content on a resume
        // that the server honoured. Anything else is a problem.
        let is_resume = existing_len > 0 && status == reqwest::StatusCode::PARTIAL_CONTENT;
        let is_fresh = existing_len == 0 && status.is_success();
        let server_ignored_range = existing_len > 0 && status == reqwest::StatusCode::OK;
        if !is_resume && !is_fresh && !server_ignored_range {
            last_err = Some(anyhow::anyhow!(
                "HTTP {} for {} (resume offset {})",
                status,
                component.name,
                existing_len
            ));
            continue;
        }
        // If the server ignored Range (returned 200 with the full
        // body), wipe the partial file and start over.
        let mut file = if server_ignored_range {
            let _ = tokio::fs::remove_file(dest).await;
            tokio::fs::OpenOptions::new()
                .create(true)
                .write(true)
                .truncate(true)
                .open(dest)
                .await
                .context("open staging file")?
        } else if is_resume {
            tokio::fs::OpenOptions::new()
                .append(true)
                .open(dest)
                .await
                .context("open staging file for append")?
        } else {
            tokio::fs::OpenOptions::new()
                .create(true)
                .write(true)
                .truncate(true)
                .open(dest)
                .await
                .context("open staging file")?
        };

        let mut resp = resp;
        let mut stream_err = false;
        // Only a resume the server honoured keeps the old bytes; in the
        // other two cases the file was just truncated, so the progress
        // counter must restart from zero for this component.
        let mut on_disk = if is_resume { existing_len } else { 0 };
        let mut canceled = false;
        loop {
            if is_canceled() {
                canceled = true;
                break;
            }
            match resp.chunk().await {
                Ok(Some(bytes)) => {
                    if let Err(e) = file.write_all(&bytes).await {
                        last_err = Some(anyhow::anyhow!(e).context("writing chunk"));
                        stream_err = true;
                        break;
                    }
                    on_disk += bytes.len() as u64;
                    // Clamp so an oversized body can't push the global
                    // counter past this component's share of the total.
                    DOWNLOAD_BYTES.store(
                        prior_total + on_disk.min(component.size_bytes),
                        Ordering::Relaxed,
                    );
                    DOWNLOAD_PROGRESS_AT.store(now_ms(), Ordering::Relaxed);
                }
                Ok(None) => break, // stream ended cleanly
                Err(e) => {
                    last_err = Some(anyhow::anyhow!(e).context("reading chunk"));
                    stream_err = true;
                    break;
                }
            }
        }
        if canceled {
            let _ = file.flush().await;
            drop(file);
            DOWNLOAD_TOTAL.store(0, Ordering::Relaxed);
            DOWNLOAD_BYTES.store(0, Ordering::Relaxed);
            anyhow::bail!("Download canceled");
        }
        let _ = file.flush().await;
        let _ = file.sync_all().await;
        drop(file);
        if stream_err {
            // Whatever reached the disk stays there; the next attempt
            // resumes from it (or verifies it, if it turned out complete).
            continue;
        }

        // Stream ended cleanly. If we've got the expected size, verify
        // the SHA and succeed. Otherwise loop to resume from the new
        // offset on the next attempt.
        let final_len = tokio::fs::metadata(dest)
            .await
            .map(|m| m.len())
            .unwrap_or(0);
        if final_len < component.size_bytes {
            last_err = Some(anyhow::anyhow!(
                "download truncated: got {} of {} bytes",
                final_len,
                component.size_bytes
            ));
            continue;
        }

        // Full file — verify hash.
        let hash = staged_file_sha256(dest).await?;
        if hash == component.sha256 {
            return Ok(());
        }
        // SHA mismatch — the file on disk is garbage. Nuke it and
        // start over from scratch on the next attempt.
        let _ = tokio::fs::remove_file(dest).await;
        last_err = Some(anyhow::anyhow!(
            "SHA256 mismatch for {}: expected {}, got {}",
            component.name,
            component.sha256,
            hash
        ));
    }
    Err(last_err.unwrap_or_else(|| anyhow::anyhow!("download failed without a captured error")))
}

/// Read the staged file at `path` and return its SHA256 as lowercase hex.
async fn staged_file_sha256(path: &Path) -> Result<String> {
    use sha2::{Digest, Sha256};
    let bytes = tokio::fs::read(path)
        .await
        .context("read staging file for hash check")?;
    Ok(hex::encode(Sha256::digest(&bytes)))
}

/// Cancel an in-flight download. Sets the cancellation flag so the
/// download loop bails out at the next chunk or backoff boundary, then
/// zeros the live counters and wipes the staging directory so the UI
/// sees "no active download" immediately and the next attempt starts
/// clean. Safe to call even when no download is running.
pub async fn cancel_download(data_dir: &Path) -> Result<()> {
    DOWNLOAD_CANCEL.store(true, Ordering::Relaxed);
    DOWNLOAD_BYTES.store(0, Ordering::Relaxed);
    DOWNLOAD_TOTAL.store(0, Ordering::Relaxed);
    let staging = data_dir.join("update-staging");
    let wiped = if staging.exists() {
        tokio::fs::remove_dir_all(&staging).await.is_ok()
    } else {
        false
    };
    // Clear the "downloaded, ready to apply" marker too — a canceled
    // download is not a staged update.
    let mut cleared_marker = false;
    if let Ok(mut state) = load_state(data_dir).await {
        if state.update_in_progress {
            state.update_in_progress = false;
            let _ = save_state(data_dir, &state).await;
            cleared_marker = true;
        }
    }
    info!(
        staging = %staging.display(),
        wiped,
        cleared_marker,
        "Update download canceled"
    );
    Ok(())
}

/// Execute `args` as root from a transient systemd unit rather than
/// from inside the archipelago service's own mount namespace.
///
/// archipelago.service runs with `ProtectSystem=strict`, so `/opt` and
/// `/usr` are read-only for anything the service spawns directly —
/// sudo included, since it inherits the namespace — and a plain
/// `sudo mv /opt/archipelago/...` dies with EROFS. Wrapping the command
/// in `systemd-run --wait` hands it to a transient unit that carries
/// none of our hardening.
///
/// Returns the exit status of the `sudo systemd-run …` invocation; an
/// `Err` is only produced when the process could not be spawned.
pub(crate) async fn host_sudo(args: &[&str]) -> Result<std::process::ExitStatus> {
    // Fixed wrapper placed in front of the caller's command line.
    const SYSTEMD_RUN_PREFIX: [&str; 6] = [
        "systemd-run",
        "--wait",
        "--quiet",
        "--collect",
        "--pipe",
        "--",
    ];

    let mut command = tokio::process::Command::new("sudo");
    command.args(SYSTEMD_RUN_PREFIX).args(args);
    command
        .status()
        .await
        .context("sudo systemd-run spawn failed")
}

/// Apply a downloaded update. Backs up current binaries, replaces with staged versions.
///
/// Steps, in order:
/// 1. Require `<data_dir>/update-staging` to exist and create
///    `<data_dir>/update-backup`.
/// 2. Copy the current `/usr/local/bin/archipelago` (if present) into
///    the backup dir.
/// 3. Walk the staging dir and dispatch on each entry's file name:
///    `archipelago` (backend binary), `*frontend*.tar.gz` (web UI),
///    `*runtime*.tar.gz` (apps/scripts/docker + doctor units). Anything
///    else is skipped.
/// 4. Persist the new version in the update state, write the post-OTA
///    verification marker (best-effort), remove the staging dir and
///    schedule a service restart two seconds later.
///
/// # Errors
/// Returns an error as soon as a mandatory step fails (missing staging
/// dir, backup copy, a failed `mv`/`tar`/`cp`/`install`, state
/// load/save). Components applied earlier in the loop are not undone
/// here. Steps whose result is bound to `_` are deliberately
/// best-effort and never abort the apply.
pub async fn apply_update(data_dir: &Path) -> Result<()> {
    let staging_dir = data_dir.join("update-staging");
    if !staging_dir.exists() {
        anyhow::bail!("No staged update found. Download first.");
    }

    let backup_dir = data_dir.join("update-backup");
    fs::create_dir_all(&backup_dir)
        .await
        .context("Failed to create backup dir")?;

    info!(
        staging = %staging_dir.display(),
        backup = %backup_dir.display(),
        "Applying staged update"
    );

    // Back up current backend binary
    let current_binary = Path::new("/usr/local/bin/archipelago");
    if current_binary.exists() {
        let backup_path = backup_dir.join("archipelago");
        fs::copy(current_binary, &backup_path)
            .await
            .context("Failed to backup current binary")?;
        info!("Current binary backed up");
    }

    // Apply staged components
    let mut entries = fs::read_dir(&staging_dir)
        .await
        .context("Failed to read staging dir")?;

    while let Some(entry) = entries.next_entry().await? {
        let name = entry.file_name().to_string_lossy().to_string();
        let src = entry.path();

        match name.as_str() {
            "archipelago" => {
                // Two namespace gotchas this block works around:
                //   1. We're running FROM /usr/local/bin/archipelago, so
                //      `install`/`cp` (O_TRUNC + write) fail with ETXTBSY.
                //      Use `mv`, which is atomic rename() and tolerates a
                //      busy destination.
                //   2. archipelago.service sets ProtectSystem=strict, so
                //      even `sudo mv` into /usr/local/bin/ fails EROFS —
                //      sudo inherits the service's mount namespace. Route
                //      the rename through systemd-run so it runs in a
                //      transient unit with default protections.
                let staged = src.to_string_lossy().to_string();
                // Mode/owner fix-ups are best-effort; only the mv is fatal.
                let _ = host_sudo(&["chmod", "0755", &staged]).await;
                let _ = host_sudo(&["chown", "root:root", &staged]).await;
                let status = host_sudo(&["mv", &staged, "/usr/local/bin/archipelago"])
                    .await
                    .with_context(|| format!("Failed to spawn mv for {}", name))?;
                if !status.success() {
                    anyhow::bail!(
                        "mv into /usr/local/bin failed for {} (exit {:?})",
                        name,
                        status.code()
                    );
                }
                info!(name = %name, "Backend binary applied");
            }
            _ if name.contains("frontend") && name.ends_with(".tar.gz") => {
                // Tarball contents are the *inside* of web-ui/ (root entries
                // `./test-aiui.html`, `./assets/`, ...). Extract into a
                // uniquely-named staging dir, then mv into place. No `rm
                // -rf` pre-cleanup — that's what hit transient EROFS on
                // .198 and aborted the apply mid-flight.
                let ts = chrono::Utc::now().timestamp_millis();
                let staging_new = format!("/opt/archipelago/web-ui.new.{}", ts);
                let staging_old = format!("/opt/archipelago/web-ui.old.{}", ts);
                let web_ui = "/opt/archipelago/web-ui";
                let backup_path = "/opt/archipelago/web-ui.bak";

                // All sudo calls that touch /opt/archipelago go through
                // host_sudo so they see a normal root mount namespace.
                let mk = host_sudo(&["mkdir", "-p", &staging_new])
                    .await
                    .context("Failed to create frontend staging dir")?;
                if !mk.success() {
                    anyhow::bail!("mkdir {} failed", staging_new);
                }
                let extract =
                    host_sudo(&["tar", "-xzf", &src.to_string_lossy(), "-C", &staging_new])
                        .await
                        .with_context(|| format!("Failed to extract {}", name))?;
                if !extract.success() {
                    // Don't leave a half-extracted tree behind.
                    let _ = host_sudo(&["rm", "-rf", &staging_new]).await;
                    anyhow::bail!("tar extraction failed for {}", name);
                }
                let _ = host_sudo(&["chown", "-R", "archipelago:archipelago", &staging_new]).await;

                // Set world-readable perms so nginx (runs as www-data)
                // can stat + serve the files. Without this, the tar
                // extraction inherits the staging-dir's 700 mode and
                // nginx returns 403/500 for every request after the
                // swap — exactly what bit .116 on the v1.7.38 rollout.
                let _ = host_sudo(&["chmod", "755", &staging_new]).await;
                let _ = host_sudo(&[
                    "find",
                    &staging_new,
                    "-type",
                    "d",
                    "-exec",
                    "chmod",
                    "755",
                    "{}",
                    "+",
                ])
                .await;
                let _ = host_sudo(&[
                    "find",
                    &staging_new,
                    "-type",
                    "f",
                    "-exec",
                    "chmod",
                    "644",
                    "{}",
                    "+",
                ])
                .await;

                // Preserve paths that are installed outside the Vue build
                // (baked in by the ISO or sibling installers) and so
                // aren't in the new tarball. Without this copy, every OTA
                // wipes them — notably aiui/ (Claude Code sidebar) and
                // the companion APK. `cp -a` preserves mode/ownership.
                for preserved in ["aiui", "archipelago-companion.apk"] {
                    let src = format!("{}/{}", web_ui, preserved);
                    let dst = format!("{}/{}", staging_new, preserved);
                    // Only preserve the old copy if the new tarball
                    // doesn't already ship a fresher one.
                    if Path::new(&src).exists() && !Path::new(&dst).exists() {
                        let _ = host_sudo(&["cp", "-a", &src, &dst]).await;
                    }
                }

                // Swap: mv current web-ui aside, then mv new into place.
                if Path::new(web_ui).exists() {
                    let mv_old = host_sudo(&["mv", web_ui, &staging_old])
                        .await
                        .context("Failed to rotate old web-ui")?;
                    if !mv_old.success() {
                        anyhow::bail!("failed to move old web-ui aside");
                    }
                }
                let mv_new = host_sudo(&["mv", &staging_new, web_ui])
                    .await
                    .context("Failed to swap new web-ui into place")?;
                if !mv_new.success() {
                    // Put the previous web-ui back so the UI isn't left
                    // missing. Note this only covers a non-zero exit; a
                    // spawn failure above returns before reaching here.
                    if Path::new(&staging_old).exists() {
                        let _ = host_sudo(&["mv", &staging_old, web_ui]).await;
                    }
                    anyhow::bail!("failed to move new web-ui into place");
                }

                // Rotate previous rollback aside and install this apply's
                // old copy as the new rollback.
                if Path::new(&staging_old).exists() {
                    if Path::new(backup_path).exists() {
                        let _ = host_sudo(&["mv", backup_path, &format!("{}.{}", backup_path, ts)])
                            .await;
                    }
                    let _ = host_sudo(&["mv", &staging_old, backup_path]).await;
                }
                info!(name = %name, "Frontend archive extracted to /opt/archipelago/web-ui");
            }
            _ if name.contains("runtime") && name.ends_with(".tar.gz") => {
                // Extract the runtime archive into a scratch dir under
                // /opt/archipelago, then install the known paths from it.
                let ts = chrono::Utc::now().timestamp_millis();
                let staging_new = format!("/opt/archipelago/runtime.new.{}", ts);
                let archive = src.to_string_lossy().to_string();

                let mk = host_sudo(&["mkdir", "-p", &staging_new])
                    .await
                    .context("Failed to create runtime staging dir")?;
                if !mk.success() {
                    anyhow::bail!("mkdir {} failed", staging_new);
                }

                let extract = host_sudo(&["tar", "-xzf", &archive, "-C", &staging_new])
                    .await
                    .with_context(|| format!("Failed to extract {}", name))?;
                if !extract.success() {
                    let _ = host_sudo(&["rm", "-rf", &staging_new]).await;
                    anyhow::bail!("tar extraction failed for {}", name);
                }

                // (path inside the archive, label selecting the install
                // strategy in the match below)
                let runtime_paths = [
                    ("apps", "apps"),
                    ("scripts", "scripts"),
                    ("docker", "docker"),
                    (
                        "image-recipe/configs/archipelago-doctor.service",
                        "archipelago-doctor.service",
                    ),
                    (
                        "image-recipe/configs/archipelago-doctor.timer",
                        "archipelago-doctor.timer",
                    ),
                ];

                for (relative, label) in runtime_paths {
                    let staged_path = format!("{}/{}", staging_new, relative);
                    if !Path::new(&staged_path).exists() {
                        tracing::debug!(path = %relative, "Runtime artifact path absent, skipping");
                        continue;
                    }

                    match label {
                        "apps" | "scripts" | "docker" => {
                            // Directory payloads: copy into a temp sibling
                            // first, empty the live dir, then copy the
                            // temp contents in. The live dir itself is
                            // kept; only its children are replaced.
                            let dest = format!("/opt/archipelago/{}", label);
                            let tmp_dest =
                                format!("{}.new.{}", dest, chrono::Utc::now().timestamp_millis());
                            let _ = host_sudo(&["mkdir", "-p", &tmp_dest]).await;
                            // Trailing "/." makes cp copy the directory's
                            // contents rather than the directory itself.
                            let staged_dot = format!("{}/.", staged_path);
                            let copy = host_sudo(&["cp", "-a", &staged_dot, &tmp_dest])
                                .await
                                .with_context(|| format!("Failed to copy runtime {}", label))?;
                            if !copy.success() {
                                let _ = host_sudo(&["rm", "-rf", &tmp_dest]).await;
                                anyhow::bail!("runtime copy failed for {}", label);
                            }
                            let _ = host_sudo(&["mkdir", "-p", &dest]).await;
                            let clean = host_sudo(&[
                                "find",
                                &dest,
                                "-mindepth",
                                "1",
                                "-maxdepth",
                                "1",
                                "-exec",
                                "rm",
                                "-rf",
                                "{}",
                                "+",
                            ])
                            .await
                            .with_context(|| format!("Failed to clean runtime {}", label))?;
                            if !clean.success() {
                                let _ = host_sudo(&["rm", "-rf", &tmp_dest]).await;
                                anyhow::bail!("runtime clean failed for {}", label);
                            }
                            let tmp_dot = format!("{}/.", tmp_dest);
                            let promote = host_sudo(&["cp", "-a", &tmp_dot, &dest])
                                .await
                                .with_context(|| format!("Failed to promote runtime {}", label))?;
                            // The temp copy is removed whether or not the
                            // promote succeeded.
                            let _ = host_sudo(&["rm", "-rf", &tmp_dest]).await;
                            if !promote.success() {
                                anyhow::bail!("runtime promote failed for {}", label);
                            }
                            if label == "scripts" {
                                let _ = host_sudo(&[
                                    "find", &dest, "-type", "f", "-name", "*.sh", "-exec", "chmod",
                                    "755", "{}", "+",
                                ])
                                .await;
                            }
                        }
                        "archipelago-doctor.service" | "archipelago-doctor.timer" => {
                            // Unit files go straight into the systemd
                            // unit directory with mode 644.
                            let dest = format!("/etc/systemd/system/{}", label);
                            let install = host_sudo(&["install", "-m", "644", &staged_path, &dest])
                                .await
                                .with_context(|| format!("Failed to install {}", label))?;
                            if !install.success() {
                                anyhow::bail!("runtime unit install failed for {}", label);
                            }
                        }
                        _ => {}
                    }
                }

                // Also publish image-versions.sh at the top of
                // /opt/archipelago when the archive ships one (best-effort).
                if Path::new(&format!("{}/scripts/image-versions.sh", staging_new)).exists() {
                    let _ = host_sudo(&[
                        "cp",
                        &format!("{}/scripts/image-versions.sh", staging_new),
                        "/opt/archipelago/image-versions.sh",
                    ])
                    .await;
                }

                let _ = host_sudo(&["systemctl", "daemon-reload"]).await;
                let _ =
                    host_sudo(&["systemctl", "enable", "--now", "archipelago-doctor.timer"]).await;
                let _ = host_sudo(&["rm", "-rf", &staging_new]).await;
                info!(name = %name, "Runtime assets applied to /opt/archipelago");
            }
            _ => {
                debug!(name = %name, "Unknown component, skipping");
            }
        }
    }

    // Update state
    // The state is loaded twice: once only to capture the version we
    // are upgrading from, then again as the copy that gets mutated.
    let previous_version = {
        let state = load_state(data_dir).await?;
        state.current_version.clone()
    };
    let mut state = load_state(data_dir).await?;
    // With no recorded manifest the version string is left unchanged.
    let new_version = if let Some(manifest) = &state.available_update {
        state.current_version = manifest.version.clone();
        manifest.version.clone()
    } else {
        state.current_version.clone()
    };
    state.available_update = None;
    state.update_in_progress = false;
    state.rollback_available = true;
    save_state(data_dir, &state).await?;

    // Write the post-OTA verification marker BEFORE we schedule the
    // restart. The new binary will read it on startup, probe the
    // frontend, and auto-rollback if nginx is serving 5xx. Covers the
    // class of failure where "apply succeeds, restart succeeds, but
    // the UI is dead" (v1.7.38/39 tarball-perms bug). Best-effort —
    // a failed marker write shouldn't abort the apply.
    let marker = PendingVerification {
        applied_at: chrono::Utc::now().to_rfc3339(),
        new_version,
        previous_version,
        deadline_ts: chrono::Utc::now().timestamp() + PENDING_VERIFY_WINDOW_SECS as i64 + 60,
    };
    if let Err(e) = write_pending_verification(data_dir, &marker).await {
        tracing::warn!(error = %e, "Failed to write post-OTA verify marker — rollback disabled for this OTA");
    } else {
        info!("Post-OTA verify marker written; new binary will probe on boot");
    }

    // Clean staging
    let _ = fs::remove_dir_all(&staging_dir).await;

    info!("Update applied — scheduling service restart in 2s so the RPC reply lands first");

    // Restart asynchronously so the JSON-RPC response actually reaches the
    // UI before systemd kills us. --no-block makes sure systemctl doesn't
    // try to wait for the current service (us) to exit cleanly before
    // starting the new process — it would deadlock otherwise.
    tokio::spawn(async {
        tokio::time::sleep(std::time::Duration::from_secs(2)).await;
        // systemctl talks to PID 1 over D-Bus — doesn't need the host
        // mount namespace, but routing through host_sudo keeps the
        // apply flow's sudo calls uniform.
        let _ = host_sudo(&["systemctl", "--no-block", "restart", "archipelago"]).await;
    });

    Ok(())
}

/// Rollback to the previous version from backup.
///
/// Moves `<data_dir>/update-backup/archipelago` (when present) over
/// `/usr/local/bin/archipelago`, clears `rollback_available` in the
/// persisted update state and removes the backup directory. The
/// service is not restarted here; the restored binary takes effect on
/// the next restart.
///
/// # Errors
/// Fails when no backup directory exists, when the restore command
/// cannot be spawned or exits non-zero, or when the update state
/// cannot be loaded or saved.
pub async fn rollback_update(data_dir: &Path) -> Result<()> {
    let backup_dir = data_dir.join("update-backup");
    if !backup_dir.exists() {
        anyhow::bail!("No rollback backup available");
    }

    let backup_binary = backup_dir.join("archipelago");
    if backup_binary.exists() {
        // Use host_sudo + mv so we escape the archipelago service's
        // ProtectSystem=strict mount namespace. A plain fs::copy or
        // `sudo cp` from inside the service hits EROFS on /usr/local/bin,
        // which would silently orphan the rollback — exactly the
        // opposite of what auto-rollback is for.
        //
        // It has to be `mv`, not `cp`: the rollback runs FROM
        // /usr/local/bin/archipelago, and `cp` opens the destination
        // with O_TRUNC + write, which fails with ETXTBSY on a running
        // executable. `mv` replaces the directory entry instead. This
        // is the same pattern as apply_update()'s binary swap. The
        // backup dir is deleted below anyway, so consuming the backup
        // file loses nothing.
        let backup_str = backup_binary.to_string_lossy().to_string();
        let _ = host_sudo(&["chmod", "0755", &backup_str]).await;
        let _ = host_sudo(&["chown", "root:root", &backup_str]).await;
        let status = host_sudo(&["mv", &backup_str, "/usr/local/bin/archipelago"])
            .await
            .context("Failed to restore backup binary via host_sudo")?;
        if !status.success() {
            anyhow::bail!(
                "mv backup binary into /usr/local/bin failed (exit {:?})",
                status.code()
            );
        }
        info!("Binary rolled back to previous version");
    }

    let mut state = load_state(data_dir).await?;
    state.rollback_available = false;
    save_state(data_dir, &state).await?;

    // Best-effort: a leftover backup dir is harmless.
    let _ = fs::remove_dir_all(&backup_dir).await;

    info!("Rollback complete. Restart service to take effect.");
    Ok(())
}

/// Summary of a finished update download, as returned to the caller
/// once every manifest component has been fetched and verified.
#[derive(Debug, Serialize, Deserialize)]
pub struct DownloadProgress {
    /// Sum of `size_bytes` over all components in the manifest.
    pub total_bytes: u64,
    /// Running total of the `size_bytes` of the components downloaded.
    pub downloaded_bytes: u64,
    /// Number of components in the manifest that was downloaded.
    pub components_downloaded: usize,
    /// Staging directory path, converted lossily to a string.
    pub staging_dir: String,
}

/// Persist a new update-schedule preference.
///
/// Loads the stored update state, overwrites its `schedule` field with
/// `schedule`, writes the state back and logs the change.
///
/// # Errors
/// Propagates any failure from loading or saving the state.
pub async fn set_schedule(data_dir: &Path, schedule: UpdateSchedule) -> Result<()> {
    let updated = {
        let mut current = load_state(data_dir).await?;
        current.schedule = schedule;
        current
    };
    save_state(data_dir, &updated).await?;

    info!(schedule = ?schedule, "Update schedule changed");
    Ok(())
}

/// Read the update-schedule preference from the stored update state.
///
/// # Errors
/// Propagates any failure from loading the state.
pub async fn get_schedule(data_dir: &Path) -> Result<UpdateSchedule> {
    let schedule = load_state(data_dir).await?.schedule;
    Ok(schedule)
}

/// Returns `true` when `last_check` holds an RFC 3339 timestamp whose age
/// is below 24 whole hours.
///
/// A missing or unparseable timestamp yields `false`, so the scheduler
/// treats it as "not checked recently" and goes ahead with a check. A
/// timestamp in the future also counts as recent, because the elapsed
/// time is then negative.
fn scheduler_checked_within_24h(last_check: Option<&str>) -> bool {
    last_check
        .and_then(|raw| chrono::DateTime::parse_from_rfc3339(raw).ok())
        .map(|at| (chrono::Utc::now() - at.with_timezone(&chrono::Utc)).num_hours() < 24)
        .unwrap_or(false)
}

/// Background update scheduler. Runs in a loop, checking/applying based on schedule.
/// Call this once at startup via `tokio::spawn`.
///
/// The loop wakes once an hour (the first tick of a tokio interval
/// completes immediately, so the first pass runs right at startup),
/// reloads the persisted state, and then acts on `state.schedule`:
///
/// * `Manual` — does nothing.
/// * `DailyCheck` — runs `check_for_updates` unless the last check is
///   less than 24 hours old.
/// * `AutoApply` — outside local hour 3 behaves like `DailyCheck`; during
///   local hour 3 it checks, and if an update is available, downloads and
///   applies it.
///
/// Every failure is logged and the loop carries on with the next tick.
/// The function only returns after an update was applied successfully.
pub async fn run_update_scheduler(data_dir: std::path::PathBuf) {
    use tokio::time::{interval, Duration};

    // Check every hour; act based on schedule setting
    let mut tick = interval(Duration::from_secs(3600));

    loop {
        tick.tick().await;

        // Reload on every pass so schedule changes made elsewhere (e.g. via
        // `set_schedule`) are picked up without restarting the scheduler.
        let state = match load_state(&data_dir).await {
            Ok(s) => s,
            Err(e) => {
                debug!("Update scheduler: failed to load state: {}", e);
                continue;
            }
        };

        match state.schedule {
            UpdateSchedule::Manual => {
                debug!("Update scheduler: manual mode, skipping");
            }
            UpdateSchedule::DailyCheck => {
                // Only check once per day
                if scheduler_checked_within_24h(state.last_check.as_deref()) {
                    debug!("Update scheduler: checked recently, skipping");
                    continue;
                }
                info!("Update scheduler: running daily check");
                if let Err(e) = check_for_updates(&data_dir).await {
                    debug!("Update scheduler: check failed: {}", e);
                }
            }
            UpdateSchedule::AutoApply => {
                // Auto-apply: check, download, and apply during 3 AM window
                if chrono::Local::now().hour() != 3 {
                    // Still do daily check outside the window
                    if scheduler_checked_within_24h(state.last_check.as_deref()) {
                        continue;
                    }
                    info!("Update scheduler: auto-apply check (outside window)");
                    if let Err(e) = check_for_updates(&data_dir).await {
                        debug!("Update scheduler: check failed: {}", e);
                    }
                    continue;
                }

                // 3 AM — check, download, and apply
                info!("Update scheduler: 3 AM auto-apply window");
                match check_for_updates(&data_dir).await {
                    Ok(s) if s.available_update.is_some() => {
                        info!("Update scheduler: downloading update");
                        if let Err(e) = download_update(&data_dir).await {
                            debug!("Update scheduler: download failed: {}", e);
                            continue;
                        }
                        info!("Update scheduler: applying update");
                        if let Err(e) = apply_update(&data_dir).await {
                            debug!("Update scheduler: apply failed: {}", e);
                            continue;
                        }
                        info!(
                            "Update scheduler: update applied, restart scheduled by apply_update"
                        );
                        // apply_update has already spawned a 2s-delayed
                        // `systemctl restart archipelago`. Don't call
                        // std::process::exit here — that kills the runtime
                        // before the spawned restart task runs, and since
                        // the unit is Restart=on-failure a clean exit(0)
                        // leaves the service dead. Just return instead:
                        // that ends only this scheduler task, and the
                        // scheduled restart will bring us back cleanly.
                        return;
                    }
                    Ok(_) => {
                        debug!("Update scheduler: no update available");
                    }
                    Err(e) => {
                        debug!("Update scheduler: check failed: {}", e);
                    }
                }
            }
        }
    }
}

// Unit tests for the update module: schedule defaults and serde names,
// manifest origin parsing/rewriting, mirror persistence, state load/save
// behavior, version comparison, and the pending-verification marker.
// Tests that touch the filesystem each work inside their own temporary
// directory.
#[cfg(test)]
mod tests {
    use super::*;

    // The default schedule is DailyCheck.
    #[test]
    fn test_update_schedule_default_is_daily_check() {
        let schedule = UpdateSchedule::default();
        assert_eq!(schedule, UpdateSchedule::DailyCheck);
    }

    // manifest_origin reduces an https manifest URL to scheme + host.
    #[test]
    fn test_manifest_origin_parses_https() {
        assert_eq!(
            manifest_origin(
                "https://git.tx1138.com/lfg2025/archy/raw/branch/main/releases/manifest.json"
            ),
            Some("https://git.tx1138.com".to_string())
        );
    }

    // An explicit port is kept as part of the origin for http URLs.
    #[test]
    fn test_manifest_origin_parses_http_with_port() {
        assert_eq!(
            manifest_origin(
                "http://23.182.128.160:3000/lfg2025/archy/raw/branch/main/releases/manifest.json"
            ),
            Some("http://23.182.128.160:3000".to_string())
        );
    }

    // Non-URL text and an ftp:// URL both yield None.
    #[test]
    fn test_manifest_origin_rejects_garbage() {
        assert_eq!(manifest_origin("not a url"), None);
        assert_eq!(manifest_origin("ftp://git.tx1138.com/x"), None);
    }

    // rewrite_manifest_origins replaces the origin of every component's
    // download_url with the origin of the given manifest URL, keeping the
    // path unchanged.
    #[test]
    fn test_rewrite_manifest_origins_swaps_all_components() {
        let mut manifest = UpdateManifest {
            version: "1.7.26-alpha".into(),
            release_date: "2026-04-21".into(),
            changelog: vec![],
            components: vec![
                ComponentUpdate {
                    name: "archipelago".into(),
                    current_version: "1.7.25-alpha".into(),
                    new_version: "1.7.26-alpha".into(),
                    download_url: "https://git.tx1138.com/lfg2025/archy/raw/branch/main/releases/v1.7.26-alpha/archipelago".into(),
                    sha256: "x".into(),
                    size_bytes: 1,
                },
                ComponentUpdate {
                    name: "frontend".into(),
                    current_version: "1.7.25-alpha".into(),
                    new_version: "1.7.26-alpha".into(),
                    download_url: "https://git.tx1138.com/lfg2025/archy/raw/branch/main/releases/v1.7.26-alpha/frontend.tar.gz".into(),
                    sha256: "y".into(),
                    size_bytes: 2,
                },
            ],
        };
        rewrite_manifest_origins(
            &mut manifest,
            "http://23.182.128.160:3000/lfg2025/archy/raw/branch/main/releases/manifest.json",
        );
        assert_eq!(
            manifest.components[0].download_url,
            "http://23.182.128.160:3000/lfg2025/archy/raw/branch/main/releases/v1.7.26-alpha/archipelago"
        );
        assert_eq!(
            manifest.components[1].download_url,
            "http://23.182.128.160:3000/lfg2025/archy/raw/branch/main/releases/v1.7.26-alpha/frontend.tar.gz"
        );
    }

    // With nothing saved in the data dir, load_mirrors returns exactly two
    // entries, in a fixed order (146.59.87.168 first, git.tx1138.com second).
    #[tokio::test]
    async fn test_load_mirrors_returns_defaults_when_absent() {
        let dir = tempfile::tempdir().unwrap();
        let list = load_mirrors(dir.path()).await.unwrap();
        assert_eq!(list.len(), 2);
        assert!(list[0].url.contains("146.59.87.168"));
        assert!(list[1].url.contains("git.tx1138.com"));
    }

    // A saved custom mirror survives a save/load cycle, and every entry of
    // default_mirrors() is present in the loaded list as well.
    #[tokio::test]
    async fn test_save_and_load_mirrors_roundtrip() {
        let dir = tempfile::tempdir().unwrap();
        let list = vec![UpdateMirror {
            url: "https://example.com/m.json".into(),
            label: "Example".into(),
        }];
        save_mirrors(dir.path(), &list).await.unwrap();
        let back = load_mirrors(dir.path()).await.unwrap();
        // load_mirrors merges in any missing default mirrors so a node
        // that explicitly added a single custom mirror still gets the
        // built-in OVH + tx1138 fallbacks. The custom mirror is preserved.
        assert!(
            back.iter().any(|m| m.url == "https://example.com/m.json"),
            "custom mirror should round-trip; got {:?}",
            back
        );
        for def in default_mirrors() {
            assert!(
                back.iter().any(|m| m.url == def.url),
                "default mirror {} should be present after load; got {:?}",
                def.url,
                back
            );
        }
    }

    // UpdateState::default() reports the crate version, has no last check
    // or pending update, no in-progress/rollback flags, and DailyCheck.
    #[test]
    fn test_update_state_default_values() {
        let state = UpdateState::default();
        assert_eq!(state.current_version, env!("CARGO_PKG_VERSION"));
        assert!(state.last_check.is_none());
        assert!(state.available_update.is_none());
        assert!(!state.update_in_progress);
        assert!(!state.rollback_available);
        assert_eq!(state.schedule, UpdateSchedule::DailyCheck);
    }

    // Pure serde round-trip through JSON (no load_state involved), so the
    // stored current_version comes back unchanged.
    #[test]
    fn test_update_state_serialization_roundtrip() {
        let state = UpdateState {
            current_version: "0.2.0".to_string(),
            last_check: Some("2025-01-01T00:00:00Z".to_string()),
            available_update: None,
            update_in_progress: false,
            rollback_available: true,
            schedule: UpdateSchedule::AutoApply,
            manifest_mirror: None,
        };
        let json = serde_json::to_string(&state).unwrap();
        let deserialized: UpdateState = serde_json::from_str(&json).unwrap();
        assert_eq!(deserialized.current_version, "0.2.0");
        assert!(deserialized.rollback_available);
        assert_eq!(deserialized.schedule, UpdateSchedule::AutoApply);
    }

    // Pins the snake_case JSON names of the schedule variants.
    #[test]
    fn test_update_schedule_serde_rename() {
        let json = serde_json::to_string(&UpdateSchedule::DailyCheck).unwrap();
        assert_eq!(json, "\"daily_check\"");
        let json = serde_json::to_string(&UpdateSchedule::Manual).unwrap();
        assert_eq!(json, "\"manual\"");
        let json = serde_json::to_string(&UpdateSchedule::AutoApply).unwrap();
        assert_eq!(json, "\"auto_apply\"");
    }

    // State JSON without a "schedule" key must still deserialize.
    #[test]
    fn test_update_state_schedule_defaults_on_missing_field() {
        // When schedule field is missing from JSON, it should default to DailyCheck
        let json = r#"{
            "current_version": "0.1.0",
            "last_check": null,
            "available_update": null,
            "update_in_progress": false,
            "rollback_available": false
        }"#;
        let state: UpdateState = serde_json::from_str(json).unwrap();
        assert_eq!(state.schedule, UpdateSchedule::DailyCheck);
    }

    // parse_version_triple ignores a "-suffix" and returns None for
    // non-numeric input or fewer than three components.
    #[test]
    fn test_parse_version_triple() {
        assert_eq!(parse_version_triple("1.7.18"), Some((1, 7, 18)));
        assert_eq!(parse_version_triple("1.7.18-alpha"), Some((1, 7, 18)));
        assert_eq!(parse_version_triple("0.0.1"), Some((0, 0, 1)));
        assert_eq!(parse_version_triple("garbage"), None);
        assert_eq!(parse_version_triple("1.2"), None);
    }

    // is_newer(a, b) is true only when a is strictly greater than b,
    // comparing components numerically.
    #[test]
    fn test_is_newer() {
        assert!(is_newer("1.7.19-alpha", "1.7.18-alpha"));
        assert!(is_newer("1.8.0-alpha", "1.7.99-alpha"));
        assert!(is_newer("1.7.10-alpha", "1.7.9-alpha")); // numeric, not lexical
        assert!(!is_newer("1.7.18-alpha", "1.7.18-alpha"));
        assert!(!is_newer("1.7.17-alpha", "1.7.18-alpha")); // would-be downgrade
        assert!(!is_newer("1.7.9-alpha", "1.7.10-alpha"));
    }

    // load_state must rewrite current_version to the running binary's
    // version and drop an available_update recorded under an older one.
    #[tokio::test]
    async fn test_load_state_clears_stale_available_on_version_bump() {
        // Simulates a sideload: state file on disk says we're on
        // 1.7.16-alpha with 1.7.17-alpha staged as the pending update,
        // but the running binary is 1.7.18-alpha (skipped a version).
        // load_state must drop the stale available_update so the UI
        // doesn't offer a downgrade.
        let dir = tempfile::tempdir().unwrap();
        let stale = UpdateState {
            current_version: "1.7.16-alpha".to_string(),
            available_update: Some(UpdateManifest {
                version: "1.7.17-alpha".to_string(),
                release_date: "2026-04-20".to_string(),
                changelog: vec![],
                components: vec![],
            }),
            ..UpdateState::default()
        };
        save_state(dir.path(), &stale).await.unwrap();
        let loaded = load_state(dir.path()).await.unwrap();
        assert_eq!(loaded.current_version, env!("CARGO_PKG_VERSION"));
        assert!(
            loaded.available_update.is_none(),
            "stale available_update must be cleared after version bump"
        );
    }

    // Loading from an empty directory yields a default state and leaves
    // the state file on disk.
    #[tokio::test]
    async fn test_load_state_creates_default_when_missing() {
        let dir = tempfile::tempdir().unwrap();
        let state = load_state(dir.path()).await.unwrap();
        assert_eq!(state.current_version, env!("CARGO_PKG_VERSION"));
        assert!(!state.update_in_progress);
        // File should now exist after load created the default
        assert!(dir.path().join(UPDATE_STATE_FILE).exists());
    }

    // Saves a fully populated state and checks which fields survive
    // load_state. An "update-staging/archipelago" file is created first;
    // presumably that is what lets update_in_progress stay true (compare
    // the "without_staging" test below) — confirm against load_state.
    #[tokio::test]
    async fn test_save_and_load_state_roundtrip() {
        let dir = tempfile::tempdir().unwrap();
        let staging = dir.path().join("update-staging");
        tokio::fs::create_dir_all(&staging).await.unwrap();
        tokio::fs::write(staging.join("archipelago"), b"staged")
            .await
            .unwrap();
        let state = UpdateState {
            current_version: "1.0.0".to_string(),
            last_check: Some("2025-06-15T12:00:00Z".to_string()),
            available_update: Some(UpdateManifest {
                version: "1.1.0".to_string(),
                release_date: "2025-06-20".to_string(),
                changelog: vec!["Fix bugs".to_string(), "New feature".to_string()],
                components: vec![ComponentUpdate {
                    name: "archipelago".to_string(),
                    current_version: "1.0.0".to_string(),
                    new_version: "1.1.0".to_string(),
                    download_url: "https://example.com/binary".to_string(),
                    sha256: "abc123".to_string(),
                    size_bytes: 5000,
                }],
            }),
            update_in_progress: true,
            rollback_available: false,
            schedule: UpdateSchedule::Manual,
            manifest_mirror: Some(
                "https://git.tx1138.com/lfg2025/archy/raw/branch/main/releases/manifest.json"
                    .to_string(),
            ),
        };
        save_state(dir.path(), &state).await.unwrap();
        let loaded = load_state(dir.path()).await.unwrap();
        // load_state rewrites current_version to match the running
        // binary (sideload self-heal), so don't assert on the saved
        // value. The migration also clears available_update when the
        // version changes — check the other fields survived.
        assert_eq!(loaded.current_version, env!("CARGO_PKG_VERSION"));
        assert!(loaded.update_in_progress);
        assert_eq!(loaded.schedule, UpdateSchedule::Manual);
        assert!(loaded.available_update.is_none());
    }

    // A saved update_in_progress=true with no staging directory present is
    // reset to false by load_state, and a second load still reports false.
    #[tokio::test]
    async fn test_load_state_clears_stale_in_progress_without_staging() {
        let dir = tempfile::tempdir().unwrap();
        let state = UpdateState {
            update_in_progress: true,
            ..UpdateState::default()
        };
        save_state(dir.path(), &state).await.unwrap();

        let loaded = load_state(dir.path()).await.unwrap();

        assert!(!loaded.update_in_progress);
        let persisted = load_state(dir.path()).await.unwrap();
        assert!(!persisted.update_in_progress);
    }

    // dismiss_update removes the pending available_update from the
    // persisted state.
    #[tokio::test]
    async fn test_dismiss_update_clears_available() {
        let dir = tempfile::tempdir().unwrap();
        let state = UpdateState {
            available_update: Some(UpdateManifest {
                version: "2.0.0".to_string(),
                release_date: "2025-07-01".to_string(),
                changelog: vec![],
                components: vec![],
            }),
            ..UpdateState::default()
        };
        save_state(dir.path(), &state).await.unwrap();
        dismiss_update(dir.path()).await.unwrap();
        let loaded = load_state(dir.path()).await.unwrap();
        assert!(loaded.available_update.is_none());
    }

    // set_schedule followed by get_schedule returns the value just set,
    // for two successive changes.
    #[tokio::test]
    async fn test_set_and_get_schedule() {
        let dir = tempfile::tempdir().unwrap();
        // Initialize state
        let _ = load_state(dir.path()).await.unwrap();

        set_schedule(dir.path(), UpdateSchedule::AutoApply)
            .await
            .unwrap();
        let schedule = get_schedule(dir.path()).await.unwrap();
        assert_eq!(schedule, UpdateSchedule::AutoApply);

        set_schedule(dir.path(), UpdateSchedule::Manual)
            .await
            .unwrap();
        let schedule = get_schedule(dir.path()).await.unwrap();
        assert_eq!(schedule, UpdateSchedule::Manual);
    }

    // get_status reflects the persisted rollback flag while reporting the
    // running binary's version rather than the saved one.
    #[tokio::test]
    async fn test_get_status_returns_current_state() {
        let dir = tempfile::tempdir().unwrap();
        let state = UpdateState {
            current_version: "3.0.0".to_string(),
            rollback_available: true,
            ..UpdateState::default()
        };
        save_state(dir.path(), &state).await.unwrap();
        let status = get_status(dir.path()).await.unwrap();
        // get_status → load_state, which rewrites current_version to
        // match the running binary (see the sideload-self-heal path).
        assert_eq!(status.current_version, env!("CARGO_PKG_VERSION"));
        assert!(status.rollback_available);
    }

    // A written marker can be read back with the same versions, and is
    // gone after clear_pending_verification.
    #[tokio::test]
    async fn test_pending_verification_round_trip() {
        let dir = tempfile::tempdir().unwrap();
        let marker = PendingVerification {
            applied_at: chrono::Utc::now().to_rfc3339(),
            new_version: "1.7.41-alpha".into(),
            previous_version: "1.7.40-alpha".into(),
            deadline_ts: chrono::Utc::now().timestamp() + 150,
        };
        write_pending_verification(dir.path(), &marker)
            .await
            .unwrap();
        let read = read_pending_verification(dir.path()).await.unwrap();
        assert_eq!(read.new_version, "1.7.41-alpha");
        assert_eq!(read.previous_version, "1.7.40-alpha");
        clear_pending_verification(dir.path()).await;
        assert!(read_pending_verification(dir.path()).await.is_none());
    }

    // Reading from a directory with no marker yields None.
    #[tokio::test]
    async fn test_pending_verification_absent_is_none() {
        let dir = tempfile::tempdir().unwrap();
        assert!(read_pending_verification(dir.path()).await.is_none());
    }

    // Smoke test: verify_pending_update completes when no marker exists.
    #[tokio::test]
    async fn test_verify_pending_update_noop_without_marker() {
        let dir = tempfile::tempdir().unwrap();
        // No marker written -- must return quickly without doing anything
        // risky (network probes, rollback calls). We're just asserting
        // it doesn't panic or hang.
        verify_pending_update(dir.path()).await;
    }

    // Guards the relationship between the verification window and the
    // stale-marker threshold, plus a 60-second lower bound on the window.
    #[test]
    fn test_pending_verify_constants_are_sensible() {
        // Window must be generous enough for nginx + backend startup,
        // but less than the stale-marker threshold so a normal cycle
        // can complete without the marker being considered stale.
        assert!(PENDING_VERIFY_WINDOW_SECS < PENDING_VERIFY_MAX_AGE_SECS as u64);
        assert!(PENDING_VERIFY_WINDOW_SECS >= 60);
    }
}