diff --git a/core/archipelago/src/bootstrap.rs b/core/archipelago/src/bootstrap.rs
index f7892471..cbccf2dd 100644
--- a/core/archipelago/src/bootstrap.rs
+++ b/core/archipelago/src/bootstrap.rs
@@ -516,6 +516,63 @@ async fn write_root_if_needed(path: &str, content: &str) -> Result {
     Ok(true)
 }
 
+const ARCHIPELAGO_SERVICE_PATH: &str = "/etc/systemd/system/archipelago.service";
+const MOUNT_REQUIRE_LINE: &str = "RequiresMountsFor=/var/lib/archipelago";
+
+/// B17 self-heal: ensure the installed archipelago.service waits for the data
+/// volume to mount before it starts. On production nodes `/var/lib/archipelago`
+/// (the app data dir AND podman's graphroot) is a separate device-mapper volume;
+/// without a mount dependency the service can start before `var-lib-archipelago.mount`,
+/// write to the bare mountpoint on rootfs, fail every podman call, exit, and be
+/// restarted every 5s until the volume mounts (~5 min of "[FAILED] Failed to start"
+/// on cold boots). Fresh ISOs already ship the directive; this heals already-deployed
+/// nodes. The change is boot-ordering only — it takes effect on the NEXT reboot, so we
+/// never restart the running service here. Idempotent; no-op if the unit is absent
+/// (dev runs) or already patched. Harmless when the data dir is on rootfs (systemd maps
+/// the requirement to the always-mounted root).
+pub async fn ensure_archipelago_mount_ordering() {
+    let current = match fs::read_to_string(ARCHIPELAGO_SERVICE_PATH).await {
+        Ok(c) => c,
+        Err(e) => {
+            tracing::debug!(
+                "mount-ordering self-heal: {} not readable ({}) — skipping",
+                ARCHIPELAGO_SERVICE_PATH,
+                e
+            );
+            return;
+        }
+    };
+    if current.contains(MOUNT_REQUIRE_LINE) {
+        return; // already healed
+    }
+    // Insert the directive into the [Unit] section, immediately before [Service].
+    let Some(idx) = current.find("\n[Service]") else {
+        tracing::warn!(
+            "mount-ordering self-heal: no [Service] section in {} — skipping",
+            ARCHIPELAGO_SERVICE_PATH
+        );
+        return;
+    };
+    let mut patched = String::with_capacity(current.len() + MOUNT_REQUIRE_LINE.len() + 96);
+    patched.push_str(&current[..idx]);
+    patched.push_str("\n# B17: start only after the data volume (+ podman graphroot) is mounted\n");
+    patched.push_str(MOUNT_REQUIRE_LINE);
+    patched.push_str(&current[idx..]);
+    match write_root_if_needed(ARCHIPELAGO_SERVICE_PATH, &patched).await {
+        Ok(true) => {
+            info!(
+                "B17: added '{}' to archipelago.service (effective next reboot)",
+                MOUNT_REQUIRE_LINE
+            );
+            if let Err(e) = host_sudo(&["systemctl", "daemon-reload"]).await {
+                tracing::warn!("B17 self-heal: daemon-reload failed: {:#}", e);
+            }
+        }
+        Ok(false) => {}
+        Err(e) => tracing::warn!("B17 mount-ordering self-heal failed: {:#}", e),
+    }
+}
+
 /// Patch the nginx site config to add missing backend proxy blocks. Older ISO
 /// configs shipped individual per-endpoint `location` blocks, so missing
 /// endpoints silently fell through to the SPA `index.html` and the frontend
diff --git a/core/archipelago/src/main.rs b/core/archipelago/src/main.rs
index 93e24bdd..b78720fe 100644
--- a/core/archipelago/src/main.rs
+++ b/core/archipelago/src/main.rs
@@ -271,6 +271,11 @@ async fn main() -> Result<()> {
     // delays server readiness; best-effort, warnings only.
     tokio::spawn(bootstrap::ensure_doctor_installed());
 
+    // B17: heal already-deployed nodes whose archipelago.service lacks a mount
+    // dependency on the data volume, so cold boots stop flapping. Boot-ordering
+    // only — effective next reboot; never restarts the running service.
+    tokio::spawn(bootstrap::ensure_archipelago_mount_ordering());
+
     // Spawn periodic container snapshot (for crash recovery)
     crash_recovery::spawn_snapshot_task(config.data_dir.clone());
 
diff --git a/image-recipe/configs/archipelago.service b/image-recipe/configs/archipelago.service
index 0cddae7c..87f89149 100644
--- a/image-recipe/configs/archipelago.service
+++ b/image-recipe/configs/archipelago.service
@@ -2,6 +2,14 @@
 Description=Archipelago Backend
 After=network-online.target archipelago-setup-tor.service
 Wants=network-online.target
+# The data dir AND podman's graphroot (containers/storage) both live on the
+# separate /var/lib/archipelago volume. Without this, on a cold boot the service
+# (and its ExecStartPre) can start BEFORE var-lib-archipelago.mount, write to the
+# bare mountpoint on rootfs, fail every podman call, exit, and get restarted every
+# 5s until the volume mounts (~5 min of "[FAILED] Failed to start" on boot — B17).
+# RequiresMountsFor adds both Requires= and After= on the mount unit so we never
+# start until the data volume is mounted.
+RequiresMountsFor=/var/lib/archipelago
 
 [Service]
 Type=notify