fix(boot): order archipelago.service after the data volume mount (B17)
On production nodes /var/lib/archipelago (the app data dir AND podman's graphroot=/var/lib/archipelago/containers/storage) is a separate device-mapper volume. archipelago.service was ordered only After=network-online.target, so on cold boots it (and its ExecStartPre) could start BEFORE var-lib-archipelago.mount, write to the bare mountpoint on rootfs, fail every podman call, exit, and be restarted every 5s until the volume mounted — the "~20x [FAILED] Failed to start over ~5min" boot flap. Proven live on .198: "var-lib-archipelago.mount: Directory /var/lib/archipelago to mount over is not empty, mounting anyway" — the service had written there pre-mount. Fix: RequiresMountsFor=/var/lib/archipelago (adds Requires= + After= on the mount unit). - image-recipe/configs/archipelago.service: ships the directive on fresh ISOs. - bootstrap::ensure_archipelago_mount_ordering(): self-heals already-deployed nodes' installed unit + daemon-reload (boot-ordering only, effective next reboot; never restarts the running service). Idempotent; harmless on rootfs installs (maps to the always-mounted root). Verified on .198: after applying, systemctl shows After=var-lib-archipelago.mount and systemd-analyze verify is clean. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
2943fd0c5e
commit
34b1fdc1a3
@ -516,6 +516,63 @@ async fn write_root_if_needed(path: &str, content: &str) -> Result<bool> {
|
||||
Ok(true)
|
||||
}
|
||||
|
||||
/// Path of the installed archipelago systemd unit that the B17 mount-ordering
/// self-heal reads and, when needed, rewrites.
const ARCHIPELAGO_SERVICE_PATH: &str = "/etc/systemd/system/archipelago.service";
|
||||
/// The exact unit-file directive the B17 self-heal looks for and inserts: it
/// makes the service require (and order itself after) the mount backing
/// `/var/lib/archipelago`.
const MOUNT_REQUIRE_LINE: &str = "RequiresMountsFor=/var/lib/archipelago";
|
||||
|
||||
/// B17 self-heal: ensure the installed archipelago.service waits for the data
|
||||
/// volume to mount before it starts. On production nodes `/var/lib/archipelago`
|
||||
/// (the app data dir AND podman's graphroot) is a separate device-mapper volume;
|
||||
/// without a mount dependency the service can start before `var-lib-archipelago.mount`,
|
||||
/// write to the bare mountpoint on rootfs, fail every podman call, exit, and be
|
||||
/// restarted every 5s until the volume mounts (~5 min of "[FAILED] Failed to start"
|
||||
/// on cold boots). Fresh ISOs already ship the directive; this heals already-deployed
|
||||
/// nodes. The change is boot-ordering only — it takes effect on the NEXT reboot, so we
|
||||
/// never restart the running service here. Idempotent; no-op if the unit is absent
|
||||
/// (dev runs) or already patched. Harmless when the data dir is on rootfs (systemd maps
|
||||
/// the requirement to the always-mounted root).
|
||||
pub async fn ensure_archipelago_mount_ordering() {
|
||||
let current = match fs::read_to_string(ARCHIPELAGO_SERVICE_PATH).await {
|
||||
Ok(c) => c,
|
||||
Err(e) => {
|
||||
tracing::debug!(
|
||||
"mount-ordering self-heal: {} not readable ({}) — skipping",
|
||||
ARCHIPELAGO_SERVICE_PATH,
|
||||
e
|
||||
);
|
||||
return;
|
||||
}
|
||||
};
|
||||
if current.contains(MOUNT_REQUIRE_LINE) {
|
||||
return; // already healed
|
||||
}
|
||||
// Insert the directive into the [Unit] section, immediately before [Service].
|
||||
let Some(idx) = current.find("\n[Service]") else {
|
||||
tracing::warn!(
|
||||
"mount-ordering self-heal: no [Service] section in {} — skipping",
|
||||
ARCHIPELAGO_SERVICE_PATH
|
||||
);
|
||||
return;
|
||||
};
|
||||
let mut patched = String::with_capacity(current.len() + MOUNT_REQUIRE_LINE.len() + 96);
|
||||
patched.push_str(¤t[..idx]);
|
||||
patched.push_str("\n# B17: start only after the data volume (+ podman graphroot) is mounted\n");
|
||||
patched.push_str(MOUNT_REQUIRE_LINE);
|
||||
patched.push_str(¤t[idx..]);
|
||||
match write_root_if_needed(ARCHIPELAGO_SERVICE_PATH, &patched).await {
|
||||
Ok(true) => {
|
||||
info!(
|
||||
"B17: added '{}' to archipelago.service (effective next reboot)",
|
||||
MOUNT_REQUIRE_LINE
|
||||
);
|
||||
if let Err(e) = host_sudo(&["systemctl", "daemon-reload"]).await {
|
||||
tracing::warn!("B17 self-heal: daemon-reload failed: {:#}", e);
|
||||
}
|
||||
}
|
||||
Ok(false) => {}
|
||||
Err(e) => tracing::warn!("B17 mount-ordering self-heal failed: {:#}", e),
|
||||
}
|
||||
}
|
||||
|
||||
/// Patch the nginx site config to add missing backend proxy blocks. Older ISO
|
||||
/// configs shipped individual per-endpoint `location` blocks, so missing
|
||||
/// endpoints silently fell through to the SPA `index.html` and the frontend
|
||||
|
||||
@ -271,6 +271,11 @@ async fn main() -> Result<()> {
|
||||
// delays server readiness; best-effort, warnings only.
|
||||
tokio::spawn(bootstrap::ensure_doctor_installed());
|
||||
|
||||
// B17: heal already-deployed nodes whose archipelago.service lacks a mount
|
||||
// dependency on the data volume, so cold boots stop flapping. Boot-ordering
|
||||
// only — effective next reboot; never restarts the running service.
|
||||
tokio::spawn(bootstrap::ensure_archipelago_mount_ordering());
|
||||
|
||||
// Spawn periodic container snapshot (for crash recovery)
|
||||
crash_recovery::spawn_snapshot_task(config.data_dir.clone());
|
||||
|
||||
|
||||
@ -2,6 +2,14 @@
|
||||
Description=Archipelago Backend
|
||||
After=network-online.target archipelago-setup-tor.service
|
||||
Wants=network-online.target
|
||||
# The data dir AND podman's graphroot (containers/storage) both live on the
|
||||
# separate /var/lib/archipelago volume. Without this, on a cold boot the service
|
||||
# (and its ExecStartPre) can start BEFORE var-lib-archipelago.mount, write to the
|
||||
# bare mountpoint on rootfs, fail every podman call, exit, and get restarted every
|
||||
# 5s until the volume mounts (~5 min of "[FAILED] Failed to start" on boot — B17).
|
||||
# RequiresMountsFor adds both Requires= and After= on the mount unit so we never
|
||||
# start until the data volume is mounted.
|
||||
RequiresMountsFor=/var/lib/archipelago
|
||||
|
||||
[Service]
|
||||
Type=notify
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user