feat(orchestrator): desired-state recovery + recreate volume-ownership [UNVALIDATED WIP]
NOT yet validated on a node or fleet-deployed — cargo check passes, release build + .228 canary validation pending. Committed as a checkpoint so the work survives. Two fixes the immich .198 incident exposed: Fix A (reconcile_all_with_mode): a previously-running app whose container vanished (e.g. a wedged podman teardown cleared by a reboot) was left absent on boot. Now, when boot reconcile would leave an app 'absent' but it was running at the last running-containers snapshot, recreate it (install_fresh). New crash_recovery::load_last_running_names() reads the snapshot without the PID/crash gate (+2 unit tests). Match is exact on compute_container_name (incl stack members); user-stopped + uninstalled apps are already excluded, so no false positives. Fix B (ensure_bind_mount_dirs): a freshly-created bind dir was left root:root, so a no-data_uid app running as container-root (→ host rootless user) hit EACCES and crash-looped (the exact immich upload-dir failure). Now a newly-created bind dir for a no-data_uid app is chowned via --reference=<parent> to match the rootless data root — no host-uid guessing, only fresh dirs (no regression for existing installs). Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
80f49cac1c
commit
a721532f55
@ -1262,6 +1262,11 @@ impl ProdContainerOrchestrator {
|
|||||||
|
|
||||||
async fn reconcile_all_with_mode(&self, mode: ReconcileMode) -> ReconcileReport {
|
async fn reconcile_all_with_mode(&self, mode: ReconcileMode) -> ReconcileReport {
|
||||||
let user_stopped = crate::crash_recovery::load_user_stopped(&self.data_dir).await;
|
let user_stopped = crate::crash_recovery::load_user_stopped(&self.data_dir).await;
|
||||||
|
// Durable desired-state signal: the container names that were running at
|
||||||
|
// the last periodic snapshot. Used below to recreate a previously-running
|
||||||
|
// app whose container vanished (e.g. a wedged teardown cleared by a
|
||||||
|
// reboot) instead of leaving it down. See the immich .198 incident.
|
||||||
|
let was_running = crate::crash_recovery::load_last_running_names(&self.data_dir).await;
|
||||||
let manifests: Vec<LoadedManifest> = {
|
let manifests: Vec<LoadedManifest> = {
|
||||||
let state = self.state.read().await;
|
let state = self.state.read().await;
|
||||||
let dependency_required = dependency_manifests_required_by_active_apps(
|
let dependency_required = dependency_manifests_required_by_active_apps(
|
||||||
@ -1295,6 +1300,34 @@ impl ProdContainerOrchestrator {
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
match self.ensure_running_with_mode(&lm, mode).await {
|
match self.ensure_running_with_mode(&lm, mode).await {
|
||||||
|
// Desired-state recovery: the app has no container and was left
|
||||||
|
// "absent" by boot reconcile, BUT it was running at the last
|
||||||
|
// snapshot — so its container vanished unexpectedly (a wedged
|
||||||
|
// teardown cleared by a reboot, a lost container record after a
|
||||||
|
// crash). It isn't user-stopped (those are filtered out of
|
||||||
|
// `manifests` above) and it's still installed (manifest present),
|
||||||
|
// so recreate it rather than leave a previously-running app down.
|
||||||
|
// Match is exact: compute_container_name == the snapshot's podman
|
||||||
|
// name (incl. each stack member), so no false positives. The only
|
||||||
|
// "absent" Left reason is the optional-missing case, so this never
|
||||||
|
// fires for paused/unknown states.
|
||||||
|
Ok(ReconcileAction::Left(reason))
|
||||||
|
if mode == ReconcileMode::ExistingOnly
|
||||||
|
&& reason == "absent"
|
||||||
|
&& was_running.contains(&compute_container_name(&lm.manifest)) =>
|
||||||
|
{
|
||||||
|
tracing::warn!(
|
||||||
|
app_id = %app_id,
|
||||||
|
"previously-running app has no container after boot — recreating (desired-state recovery)"
|
||||||
|
);
|
||||||
|
match self.install_fresh(&lm).await {
|
||||||
|
Ok(()) => report.record(&app_id, ReconcileAction::Installed),
|
||||||
|
Err(e) => {
|
||||||
|
tracing::error!(app_id = %app_id, error = %e, "desired-state recovery (recreate) failed");
|
||||||
|
report.failures.push((app_id, e.to_string()));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
Ok(action) => report.record(&app_id, action),
|
Ok(action) => report.record(&app_id, action),
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
tracing::error!(app_id = %app_id, error = %e, "reconcile failed");
|
tracing::error!(app_id = %app_id, error = %e, "reconcile failed");
|
||||||
@ -2774,6 +2807,10 @@ impl ProdContainerOrchestrator {
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Whether the bind source already existed BEFORE we (root) create it,
|
||||||
|
// so the ownership fix-up below only touches a dir we just made.
|
||||||
|
let source_existed = Path::new(&volume.source).exists();
|
||||||
|
|
||||||
let mkdir_status = host_sudo(&["mkdir", "-p", &volume.source])
|
let mkdir_status = host_sudo(&["mkdir", "-p", &volume.source])
|
||||||
.await
|
.await
|
||||||
.with_context(|| format!("mkdir {}", volume.source))?;
|
.with_context(|| format!("mkdir {}", volume.source))?;
|
||||||
@ -2784,6 +2821,43 @@ impl ProdContainerOrchestrator {
|
|||||||
mkdir_status.code()
|
mkdir_status.code()
|
||||||
));
|
));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// A bind dir we JUST created is owned root:root (mkdir ran via sudo).
|
||||||
|
// An app that declares no `data_uid` runs as its own root inside the
|
||||||
|
// container, which rootless Podman maps to the host user running
|
||||||
|
// archipelago — so a root:root dir is UNWRITABLE from inside and the
|
||||||
|
// app EACCES-crash-loops the moment it tries to create a subdir
|
||||||
|
// (observed: immich upload dir `/var/lib/archipelago/immich` after a
|
||||||
|
// recreate). The in-container ownership self-heal only runs on RUNNING
|
||||||
|
// containers, so it never fires for an app that crashes on startup.
|
||||||
|
// Match the new dir to its parent's owner — the rootless data root
|
||||||
|
// (`/var/lib/archipelago`, owned by the service user) — via
|
||||||
|
// `--reference`, so there's no host-uid guessing. Only on fresh
|
||||||
|
// creation, and only when apply_data_uid won't already chown it.
|
||||||
|
if !source_existed && manifest.app.container.data_uid.is_none() {
|
||||||
|
if let Some(parent) = Path::new(&volume.source)
|
||||||
|
.parent()
|
||||||
|
.map(|p| p.display().to_string())
|
||||||
|
{
|
||||||
|
match host_sudo(&[
|
||||||
|
"chown",
|
||||||
|
&format!("--reference={parent}"),
|
||||||
|
&volume.source,
|
||||||
|
])
|
||||||
|
.await
|
||||||
|
{
|
||||||
|
Ok(s) if s.success() => {}
|
||||||
|
Ok(s) => tracing::warn!(
|
||||||
|
app_id = %manifest.app.id, dir = %volume.source,
|
||||||
|
"bind-dir ownership match exited {:?} (app may EACCES)", s.code()
|
||||||
|
),
|
||||||
|
Err(e) => tracing::warn!(
|
||||||
|
app_id = %manifest.app.id, dir = %volume.source,
|
||||||
|
"bind-dir ownership match failed (non-fatal): {e}"
|
||||||
|
),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|||||||
@ -61,6 +61,22 @@ pub async fn load_user_stopped(data_dir: &Path) -> std::collections::HashSet<Str
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Names of the containers that were running at the last periodic snapshot
|
||||||
|
/// (`running-containers.json`, saved every ~120s by `save_container_snapshot`).
|
||||||
|
/// Unlike `check_for_crash`, this reads the snapshot unconditionally (no PID/crash
|
||||||
|
/// gate) — it's the durable "what was running" signal the boot reconciler uses to
|
||||||
|
/// recreate a previously-running app whose container vanished. Empty if absent.
|
||||||
|
pub async fn load_last_running_names(data_dir: &Path) -> std::collections::HashSet<String> {
|
||||||
|
let path = data_dir.join(CONTAINER_STATE_FILE);
|
||||||
|
match fs::read_to_string(&path).await {
|
||||||
|
Ok(content) => match serde_json::from_str::<ContainerSnapshot>(&content) {
|
||||||
|
Ok(snapshot) => snapshot.containers.into_iter().map(|c| c.name).collect(),
|
||||||
|
Err(_) => std::collections::HashSet::new(),
|
||||||
|
},
|
||||||
|
Err(_) => std::collections::HashSet::new(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// Save the set of user-stopped containers to disk.
|
/// Save the set of user-stopped containers to disk.
|
||||||
pub async fn save_user_stopped(data_dir: &Path, stopped: &std::collections::HashSet<String>) {
|
pub async fn save_user_stopped(data_dir: &Path, stopped: &std::collections::HashSet<String>) {
|
||||||
let path = data_dir.join(USER_STOPPED_FILE);
|
let path = data_dir.join(USER_STOPPED_FILE);
|
||||||
@ -898,6 +914,43 @@ mod tests {
|
|||||||
assert_eq!(containers[1].name, "archy-mempool-web");
|
assert_eq!(containers[1].name, "archy-mempool-web");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn test_load_last_running_names_reads_snapshot_without_pid_gate() {
|
||||||
|
let tmp = TempDir::new().unwrap();
|
||||||
|
// No PID file written — load_last_running_names must NOT require a crash.
|
||||||
|
let snapshot = ContainerSnapshot {
|
||||||
|
timestamp: 1000,
|
||||||
|
containers: vec![
|
||||||
|
RunningContainerRecord {
|
||||||
|
name: "immich_server".to_string(),
|
||||||
|
image: "immich:2.7".to_string(),
|
||||||
|
},
|
||||||
|
RunningContainerRecord {
|
||||||
|
name: "immich_postgres".to_string(),
|
||||||
|
image: "postgres:16".to_string(),
|
||||||
|
},
|
||||||
|
],
|
||||||
|
};
|
||||||
|
fs::write(
|
||||||
|
tmp.path().join(CONTAINER_STATE_FILE),
|
||||||
|
serde_json::to_string(&snapshot).unwrap(),
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
let names = load_last_running_names(tmp.path()).await;
|
||||||
|
assert_eq!(names.len(), 2);
|
||||||
|
assert!(names.contains("immich_server"));
|
||||||
|
assert!(names.contains("immich_postgres"));
|
||||||
|
assert!(!names.contains("immich_redis"));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn test_load_last_running_names_empty_when_absent() {
|
||||||
|
let tmp = TempDir::new().unwrap();
|
||||||
|
assert!(load_last_running_names(tmp.path()).await.is_empty());
|
||||||
|
}
|
||||||
|
|
||||||
#[tokio::test]
|
#[tokio::test]
|
||||||
async fn test_write_and_remove_pid_marker() {
|
async fn test_write_and_remove_pid_marker() {
|
||||||
let tmp = TempDir::new().unwrap();
|
let tmp = TempDir::new().unwrap();
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user