feat(orchestrator): desired-state recovery + recreate volume-ownership [UNVALIDATED WIP]

NOT yet validated on a node or deployed to the fleet — `cargo check` passes, but the
release build and the .228 canary validation are still pending. Committed as a
checkpoint so the work survives.

Two fixes the immich .198 incident exposed:

Fix A (reconcile_all_with_mode): a previously-running app whose container vanished
(e.g. a wedged podman teardown cleared by a reboot) was left absent on boot. Now,
when boot reconcile would leave an app 'absent' but it was running at the last
running-containers snapshot, recreate it (install_fresh). New
crash_recovery::load_last_running_names() reads the snapshot without the PID/crash
gate (+2 unit tests). The match is exact on compute_container_name (incl. stack
members); user-stopped and uninstalled apps are already excluded, so there are no
false positives.

Fix B (ensure_bind_mount_dirs): a freshly-created bind dir was left root:root, so a
no-data_uid app running as container-root (→ host rootless user) hit EACCES and
crash-looped (the exact immich upload-dir failure). Now a newly-created bind dir
for a no-data_uid app is chowned via --reference=<parent> to match the rootless
data root — no host-uid guessing, only fresh dirs (no regression for existing
installs).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
archipelago 2026-06-24 09:28:40 -04:00
parent 80f49cac1c
commit a721532f55
2 changed files with 127 additions and 0 deletions

View File

@ -1262,6 +1262,11 @@ impl ProdContainerOrchestrator {
async fn reconcile_all_with_mode(&self, mode: ReconcileMode) -> ReconcileReport { async fn reconcile_all_with_mode(&self, mode: ReconcileMode) -> ReconcileReport {
let user_stopped = crate::crash_recovery::load_user_stopped(&self.data_dir).await; let user_stopped = crate::crash_recovery::load_user_stopped(&self.data_dir).await;
// Durable desired-state signal: the container names that were running at
// the last periodic snapshot. Used below to recreate a previously-running
// app whose container vanished (e.g. a wedged teardown cleared by a
// reboot) instead of leaving it down. See the immich .198 incident.
let was_running = crate::crash_recovery::load_last_running_names(&self.data_dir).await;
let manifests: Vec<LoadedManifest> = { let manifests: Vec<LoadedManifest> = {
let state = self.state.read().await; let state = self.state.read().await;
let dependency_required = dependency_manifests_required_by_active_apps( let dependency_required = dependency_manifests_required_by_active_apps(
@ -1295,6 +1300,34 @@ impl ProdContainerOrchestrator {
continue; continue;
} }
match self.ensure_running_with_mode(&lm, mode).await { match self.ensure_running_with_mode(&lm, mode).await {
// Desired-state recovery: the app has no container and was left
// "absent" by boot reconcile, BUT it was running at the last
// snapshot — so its container vanished unexpectedly (a wedged
// teardown cleared by a reboot, a lost container record after a
// crash). It isn't user-stopped (those are filtered out of
// `manifests` above) and it's still installed (manifest present),
// so recreate it rather than leave a previously-running app down.
// Match is exact: compute_container_name == the snapshot's podman
// name (incl. each stack member), so no false positives. The only
// "absent" Left reason is the optional-missing case, so this never
// fires for paused/unknown states.
Ok(ReconcileAction::Left(reason))
if mode == ReconcileMode::ExistingOnly
&& reason == "absent"
&& was_running.contains(&compute_container_name(&lm.manifest)) =>
{
tracing::warn!(
app_id = %app_id,
"previously-running app has no container after boot — recreating (desired-state recovery)"
);
match self.install_fresh(&lm).await {
Ok(()) => report.record(&app_id, ReconcileAction::Installed),
Err(e) => {
tracing::error!(app_id = %app_id, error = %e, "desired-state recovery (recreate) failed");
report.failures.push((app_id, e.to_string()));
}
}
}
Ok(action) => report.record(&app_id, action), Ok(action) => report.record(&app_id, action),
Err(e) => { Err(e) => {
tracing::error!(app_id = %app_id, error = %e, "reconcile failed"); tracing::error!(app_id = %app_id, error = %e, "reconcile failed");
@ -2774,6 +2807,10 @@ impl ProdContainerOrchestrator {
continue; continue;
} }
// Whether the bind source already existed BEFORE we (root) create it,
// so the ownership fix-up below only touches a dir we just made.
let source_existed = Path::new(&volume.source).exists();
let mkdir_status = host_sudo(&["mkdir", "-p", &volume.source]) let mkdir_status = host_sudo(&["mkdir", "-p", &volume.source])
.await .await
.with_context(|| format!("mkdir {}", volume.source))?; .with_context(|| format!("mkdir {}", volume.source))?;
@ -2784,6 +2821,43 @@ impl ProdContainerOrchestrator {
mkdir_status.code() mkdir_status.code()
)); ));
} }
// A bind dir we JUST created is owned root:root (mkdir ran via sudo).
// An app that declares no `data_uid` runs as its own root inside the
// container, which rootless Podman maps to the host user running
// archipelago — so a root:root dir is UNWRITABLE from inside and the
// app EACCES-crash-loops the moment it tries to create a subdir
// (observed: immich upload dir `/var/lib/archipelago/immich` after a
// recreate). The in-container ownership self-heal only runs on RUNNING
// containers, so it never fires for an app that crashes on startup.
// Match the new dir to its parent's owner — the rootless data root
// (`/var/lib/archipelago`, owned by the service user) — via
// `--reference`, so there's no host-uid guessing. Only on fresh
// creation, and only when apply_data_uid won't already chown it.
if !source_existed && manifest.app.container.data_uid.is_none() {
if let Some(parent) = Path::new(&volume.source)
.parent()
.map(|p| p.display().to_string())
{
match host_sudo(&[
"chown",
&format!("--reference={parent}"),
&volume.source,
])
.await
{
Ok(s) if s.success() => {}
Ok(s) => tracing::warn!(
app_id = %manifest.app.id, dir = %volume.source,
"bind-dir ownership match exited {:?} (app may EACCES)", s.code()
),
Err(e) => tracing::warn!(
app_id = %manifest.app.id, dir = %volume.source,
"bind-dir ownership match failed (non-fatal): {e}"
),
}
}
}
} }
Ok(()) Ok(())
} }

View File

@ -61,6 +61,22 @@ pub async fn load_user_stopped(data_dir: &Path) -> std::collections::HashSet<Str
} }
} }
/// Returns the names of every container recorded in the most recent periodic
/// snapshot (`running-containers.json`, saved every ~120s by
/// `save_container_snapshot`).
///
/// In contrast to `check_for_crash`, no PID/crash gate is applied: the snapshot
/// is read unconditionally. It serves as the durable "what was running" signal
/// that lets the boot reconciler recreate a previously-running app whose
/// container vanished.
///
/// The result is an empty set when the snapshot file cannot be read or its
/// contents do not deserialize into a `ContainerSnapshot`.
pub async fn load_last_running_names(data_dir: &Path) -> std::collections::HashSet<String> {
    let snapshot_path = data_dir.join(CONTAINER_STATE_FILE);
    fs::read_to_string(&snapshot_path)
        .await
        .ok()
        // A snapshot that fails to parse is treated exactly like a missing one.
        .and_then(|raw| serde_json::from_str::<ContainerSnapshot>(&raw).ok())
        .map(|snapshot| {
            snapshot
                .containers
                .into_iter()
                .map(|record| record.name)
                .collect::<std::collections::HashSet<String>>()
        })
        .unwrap_or_default()
}
/// Save the set of user-stopped containers to disk. /// Save the set of user-stopped containers to disk.
pub async fn save_user_stopped(data_dir: &Path, stopped: &std::collections::HashSet<String>) { pub async fn save_user_stopped(data_dir: &Path, stopped: &std::collections::HashSet<String>) {
let path = data_dir.join(USER_STOPPED_FILE); let path = data_dir.join(USER_STOPPED_FILE);
@ -898,6 +914,43 @@ mod tests {
assert_eq!(containers[1].name, "archy-mempool-web"); assert_eq!(containers[1].name, "archy-mempool-web");
} }
#[tokio::test]
async fn test_load_last_running_names_reads_snapshot_without_pid_gate() {
    // Arrange: a snapshot on disk and deliberately NO PID file — the loader
    // must hand back the recorded names without any crash having been detected.
    let dir = TempDir::new().unwrap();
    let record = |name: &str, image: &str| RunningContainerRecord {
        name: name.to_string(),
        image: image.to_string(),
    };
    let snapshot = ContainerSnapshot {
        timestamp: 1000,
        containers: vec![
            record("immich_server", "immich:2.7"),
            record("immich_postgres", "postgres:16"),
        ],
    };
    let serialized = serde_json::to_string(&snapshot).unwrap();
    fs::write(dir.path().join(CONTAINER_STATE_FILE), serialized)
        .await
        .unwrap();

    // Act.
    let running = load_last_running_names(dir.path()).await;

    // Assert: exactly the two recorded names, and nothing that was never saved.
    assert_eq!(running.len(), 2);
    assert!(running.contains("immich_server"));
    assert!(running.contains("immich_postgres"));
    assert!(!running.contains("immich_redis"));
}
#[tokio::test]
async fn test_load_last_running_names_empty_when_absent() {
    // A fresh temp dir holds no snapshot file, so the loader must fall back to
    // an empty set.
    let dir = TempDir::new().unwrap();
    let running = load_last_running_names(dir.path()).await;
    assert!(running.is_empty());
}
#[tokio::test] #[tokio::test]
async fn test_write_and_remove_pid_marker() { async fn test_write_and_remove_pid_marker() {
let tmp = TempDir::new().unwrap(); let tmp = TempDir::new().unwrap();