feat(orchestrator): desired-state recovery + recreate volume-ownership [UNVALIDATED WIP]
NOT yet validated on a node or fleet-deployed — cargo check passes, release build + .228 canary validation pending. Committed as a checkpoint so the work survives. Two fixes the immich .198 incident exposed: Fix A (reconcile_all_with_mode): a previously-running app whose container vanished (e.g. a wedged podman teardown cleared by a reboot) was left absent on boot. Now, when boot reconcile would leave an app 'absent' but it was running at the last running-containers snapshot, recreate it (install_fresh). New crash_recovery::load_last_running_names() reads the snapshot without the PID/crash gate (+2 unit tests). Match is exact on compute_container_name (incl stack members); user-stopped + uninstalled apps are already excluded, so no false positives. Fix B (ensure_bind_mount_dirs): a freshly-created bind dir was left root:root, so a no-data_uid app running as container-root (→ host rootless user) hit EACCES and crash-looped (the exact immich upload-dir failure). Now a newly-created bind dir for a no-data_uid app is chowned via --reference=<parent> to match the rootless data root — no host-uid guessing, only fresh dirs (no regression for existing installs). Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
80f49cac1c
commit
a721532f55
@ -1262,6 +1262,11 @@ impl ProdContainerOrchestrator {
|
||||
|
||||
async fn reconcile_all_with_mode(&self, mode: ReconcileMode) -> ReconcileReport {
|
||||
let user_stopped = crate::crash_recovery::load_user_stopped(&self.data_dir).await;
|
||||
// Durable desired-state signal: the container names that were running at
|
||||
// the last periodic snapshot. Used below to recreate a previously-running
|
||||
// app whose container vanished (e.g. a wedged teardown cleared by a
|
||||
// reboot) instead of leaving it down. See the immich .198 incident.
|
||||
let was_running = crate::crash_recovery::load_last_running_names(&self.data_dir).await;
|
||||
let manifests: Vec<LoadedManifest> = {
|
||||
let state = self.state.read().await;
|
||||
let dependency_required = dependency_manifests_required_by_active_apps(
|
||||
@ -1295,6 +1300,34 @@ impl ProdContainerOrchestrator {
|
||||
continue;
|
||||
}
|
||||
match self.ensure_running_with_mode(&lm, mode).await {
|
||||
// Desired-state recovery: the app has no container and was left
|
||||
// "absent" by boot reconcile, BUT it was running at the last
|
||||
// snapshot — so its container vanished unexpectedly (a wedged
|
||||
// teardown cleared by a reboot, a lost container record after a
|
||||
// crash). It isn't user-stopped (those are filtered out of
|
||||
// `manifests` above) and it's still installed (manifest present),
|
||||
// so recreate it rather than leave a previously-running app down.
|
||||
// Match is exact: compute_container_name == the snapshot's podman
|
||||
// name (incl. each stack member), so no false positives. The only
|
||||
// "absent" Left reason is the optional-missing case, so this never
|
||||
// fires for paused/unknown states.
|
||||
Ok(ReconcileAction::Left(reason))
|
||||
if mode == ReconcileMode::ExistingOnly
|
||||
&& reason == "absent"
|
||||
&& was_running.contains(&compute_container_name(&lm.manifest)) =>
|
||||
{
|
||||
tracing::warn!(
|
||||
app_id = %app_id,
|
||||
"previously-running app has no container after boot — recreating (desired-state recovery)"
|
||||
);
|
||||
match self.install_fresh(&lm).await {
|
||||
Ok(()) => report.record(&app_id, ReconcileAction::Installed),
|
||||
Err(e) => {
|
||||
tracing::error!(app_id = %app_id, error = %e, "desired-state recovery (recreate) failed");
|
||||
report.failures.push((app_id, e.to_string()));
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(action) => report.record(&app_id, action),
|
||||
Err(e) => {
|
||||
tracing::error!(app_id = %app_id, error = %e, "reconcile failed");
|
||||
@ -2774,6 +2807,10 @@ impl ProdContainerOrchestrator {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Whether the bind source already existed BEFORE we (root) create it,
|
||||
// so the ownership fix-up below only touches a dir we just made.
|
||||
let source_existed = Path::new(&volume.source).exists();
|
||||
|
||||
let mkdir_status = host_sudo(&["mkdir", "-p", &volume.source])
|
||||
.await
|
||||
.with_context(|| format!("mkdir {}", volume.source))?;
|
||||
@ -2784,6 +2821,43 @@ impl ProdContainerOrchestrator {
|
||||
mkdir_status.code()
|
||||
));
|
||||
}
|
||||
|
||||
// A bind dir we JUST created is owned root:root (mkdir ran via sudo).
|
||||
// An app that declares no `data_uid` runs as its own root inside the
|
||||
// container, which rootless Podman maps to the host user running
|
||||
// archipelago — so a root:root dir is UNWRITABLE from inside and the
|
||||
// app EACCES-crash-loops the moment it tries to create a subdir
|
||||
// (observed: immich upload dir `/var/lib/archipelago/immich` after a
|
||||
// recreate). The in-container ownership self-heal only runs on RUNNING
|
||||
// containers, so it never fires for an app that crashes on startup.
|
||||
// Match the new dir to its parent's owner — the rootless data root
|
||||
// (`/var/lib/archipelago`, owned by the service user) — via
|
||||
// `--reference`, so there's no host-uid guessing. Only on fresh
|
||||
// creation, and only when apply_data_uid won't already chown it.
|
||||
if !source_existed && manifest.app.container.data_uid.is_none() {
|
||||
if let Some(parent) = Path::new(&volume.source)
|
||||
.parent()
|
||||
.map(|p| p.display().to_string())
|
||||
{
|
||||
match host_sudo(&[
|
||||
"chown",
|
||||
&format!("--reference={parent}"),
|
||||
&volume.source,
|
||||
])
|
||||
.await
|
||||
{
|
||||
Ok(s) if s.success() => {}
|
||||
Ok(s) => tracing::warn!(
|
||||
app_id = %manifest.app.id, dir = %volume.source,
|
||||
"bind-dir ownership match exited {:?} (app may EACCES)", s.code()
|
||||
),
|
||||
Err(e) => tracing::warn!(
|
||||
app_id = %manifest.app.id, dir = %volume.source,
|
||||
"bind-dir ownership match failed (non-fatal): {e}"
|
||||
),
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@ -61,6 +61,22 @@ pub async fn load_user_stopped(data_dir: &Path) -> std::collections::HashSet<Str
|
||||
}
|
||||
}
|
||||
|
||||
/// Names of the containers that were running at the last periodic snapshot
|
||||
/// (`running-containers.json`, saved every ~120s by `save_container_snapshot`).
|
||||
/// Unlike `check_for_crash`, this reads the snapshot unconditionally (no PID/crash
|
||||
/// gate) — it's the durable "what was running" signal the boot reconciler uses to
|
||||
/// recreate a previously-running app whose container vanished. Empty if absent.
|
||||
pub async fn load_last_running_names(data_dir: &Path) -> std::collections::HashSet<String> {
|
||||
let path = data_dir.join(CONTAINER_STATE_FILE);
|
||||
match fs::read_to_string(&path).await {
|
||||
Ok(content) => match serde_json::from_str::<ContainerSnapshot>(&content) {
|
||||
Ok(snapshot) => snapshot.containers.into_iter().map(|c| c.name).collect(),
|
||||
Err(_) => std::collections::HashSet::new(),
|
||||
},
|
||||
Err(_) => std::collections::HashSet::new(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Save the set of user-stopped containers to disk.
|
||||
pub async fn save_user_stopped(data_dir: &Path, stopped: &std::collections::HashSet<String>) {
|
||||
let path = data_dir.join(USER_STOPPED_FILE);
|
||||
@ -898,6 +914,43 @@ mod tests {
|
||||
assert_eq!(containers[1].name, "archy-mempool-web");
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_load_last_running_names_reads_snapshot_without_pid_gate() {
|
||||
let tmp = TempDir::new().unwrap();
|
||||
// No PID file written — load_last_running_names must NOT require a crash.
|
||||
let snapshot = ContainerSnapshot {
|
||||
timestamp: 1000,
|
||||
containers: vec![
|
||||
RunningContainerRecord {
|
||||
name: "immich_server".to_string(),
|
||||
image: "immich:2.7".to_string(),
|
||||
},
|
||||
RunningContainerRecord {
|
||||
name: "immich_postgres".to_string(),
|
||||
image: "postgres:16".to_string(),
|
||||
},
|
||||
],
|
||||
};
|
||||
fs::write(
|
||||
tmp.path().join(CONTAINER_STATE_FILE),
|
||||
serde_json::to_string(&snapshot).unwrap(),
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let names = load_last_running_names(tmp.path()).await;
|
||||
assert_eq!(names.len(), 2);
|
||||
assert!(names.contains("immich_server"));
|
||||
assert!(names.contains("immich_postgres"));
|
||||
assert!(!names.contains("immich_redis"));
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_load_last_running_names_empty_when_absent() {
|
||||
let tmp = TempDir::new().unwrap();
|
||||
assert!(load_last_running_names(tmp.path()).await.is_empty());
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_write_and_remove_pid_marker() {
|
||||
let tmp = TempDir::new().unwrap();
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user