From d0710e7491d8e4f6e81b657cf117ccd9240956a8 Mon Sep 17 00:00:00 2001
From: archipelago <archipelago@localhost>
Date: Wed, 1 Jul 2026 08:19:54 -0400
Subject: [PATCH] fix(orchestrator,content): bound repair-recreate loops;
 self-heal stale content catalog entries

- prod_orchestrator.rs: the boot reconciler's zombie-guard and start-failed
  recreate paths (Created/Stopped/Exited states) had no attempt cap, unlike
  health_monitor's independent restart tracker. A container whose entrypoint
  fatally crashes right after `podman start` succeeds got stop+remove+
  install_fresh'd every ~30s reconcile tick forever (portainer on .198,
  2026-07-01: a DB schema newer than the pinned binary could read -- no
  amount of recreating fixes that). Added a 5-attempts/30-minute circuit
  breaker; once exhausted the container is left alone with an error! log
  instead of looping, and an explicit install/start clears the counter.
- content_server.rs: serve_content now prunes a catalog entry whose backing
  file is missing on disk, instead of leaving it advertised to every peer
  forever with no way to distinguish "gone" from "transient failure."

Co-Authored-By: Claude Sonnet 5 <noreply@anthropic.com>
---
 .../src/container/prod_orchestrator.rs        | 162 ++++++++++++++++++
 core/archipelago/src/content_server.rs        | 123 ++++++++++++-
 docs/PRODUCTION-MASTER-PLAN.md                | 139 +++++++++++++--
 3 files changed, 408 insertions(+), 16 deletions(-)
diff --git a/core/archipelago/src/container/prod_orchestrator.rs b/core/archipelago/src/container/prod_orchestrator.rs
index cb10ec69..572dd5ad 100644
--- a/core/archipelago/src/container/prod_orchestrator.rs
+++ b/core/archipelago/src/container/prod_orchestrator.rs
@@ -1025,6 +1025,10 @@ struct OrchestratorState {
     disabled: HashSet<String>,
     /// app_id → per-app mutex, created lazily the first time we touch an app
     locks: HashMap<String, Arc<Mutex<()>>>,
+    /// container name → (attempt count, first-attempt time) for the
+    /// stop+remove+install_fresh "repair" recreate paths below. See
+    /// `should_attempt_repair`.
+    repair_attempts: HashMap<String, (u32, std::time::Instant)>,
 }
 
 impl OrchestratorState {
@@ -1033,10 +1037,29 @@ impl OrchestratorState {
             manifests: HashMap::new(),
             disabled: HashSet::new(),
             locks: HashMap::new(),
+            repair_attempts: HashMap::new(),
         }
     }
 }
 
+/// Cap on how many times the boot reconciler will recreate the same
+/// container within `REPAIR_ATTEMPT_RESET_WINDOW` before giving up on it.
+///
+/// Without this, a container whose entrypoint process fatally exits right
+/// after `podman start` succeeds (podman itself reports no error — the crash
+/// happens inside the app a moment later) gets stop+remove+install_fresh'd
+/// again on every ~30s reconcile tick, forever. `health_monitor.rs`'s
+/// restart tracker already bounds ITS OWN independent restart path
+/// (`MAX_RESTART_ATTEMPTS`) and eventually surfaces a user-facing
+/// notification — but the boot reconciler's repair-recreate path had no
+/// equivalent circuit breaker, so the two could race indefinitely on the
+/// same fatally-broken container (portainer on `.198`, 2026-07-01: crashed
+/// on every start because its on-disk DB was written by a newer binary than
+/// the pinned image — a mismatch no recreate could fix; it looped until someone intervened).
+const MAX_REPAIR_ATTEMPTS: u32 = 5;
+/// Window, measured from the first counted attempt, after which the attempt count starts over.
+const REPAIR_ATTEMPT_RESET_WINDOW: std::time::Duration = std::time::Duration::from_secs(1800);
+
 pub struct ProdContainerOrchestrator {
     runtime: Arc<dyn ContainerRuntimeTrait>,
     manifests_dir: PathBuf,
@@ -1499,6 +1522,48 @@ impl ProdContainerOrchestrator {
             .await
     }
 
+    /// Records one more repair attempt for `name` and reports whether the
+    /// reconciler may go ahead with the stop+remove+install_fresh recreate.
+    /// Returns `false` (and logs an `error!`) once more than `MAX_REPAIR_ATTEMPTS`
+    /// were counted within `REPAIR_ATTEMPT_RESET_WINDOW` of the first one.
+    async fn should_attempt_repair(&self, name: &str) -> bool {
+        let mut state = self.state.write().await;
+        let now = std::time::Instant::now();
+        let entry = state
+            .repair_attempts
+            .entry(name.to_string())
+            .or_insert((0, now));
+        if now.duration_since(entry.1) > REPAIR_ATTEMPT_RESET_WINDOW {
+            *entry = (0, now);
+        }
+        entry.0 += 1;
+        if entry.0 > MAX_REPAIR_ATTEMPTS {
+            tracing::error!(
+                container = %name,
+                attempts = entry.0,
+                window_secs = REPAIR_ATTEMPT_RESET_WINDOW.as_secs(),
+                "giving up on repairing container after too many recreate attempts — it keeps failing to \
+                 come up cleanly on its own; check `podman logs {name}` for the real cause (a data/version \
+                 mismatch or another fatal startup error is likely, not something recreating the container \
+                 again will fix). Leaving it as-is instead of recreating it forever; a manual fix + restart \
+                 (or a subsequent explicit install/start, which clears this counter) is needed to recover it.",
+                name = name,
+            );
+            false
+        } else {
+            true
+        }
+    }
+
+    /// Drops the repair-attempt counter for `name`, so its next counted
+    /// attempt starts a fresh count and window. Call on any path that reaches
+    /// a stable Running/NoOp/Started outcome, so a container that recovers
+    /// doesn't inherit a near-exhausted counter from an earlier failure.
+    async fn clear_repair_attempts(&self, name: &str) {
+        let mut state = self.state.write().await;
+        state.repair_attempts.remove(name);
+    }
+
     async fn ensure_running_with_mode(
         &self,
         lm: &LoadedManifest,
@@ -1611,6 +1676,11 @@ impl ProdContainerOrchestrator {
                         // "Up" → proxy 502 → NetBird login broke). Conservative:
                         // only fires on a concrete dead PID, never on uncertainty.
                         if !container_running_process_alive(&name).await {
+                            if !self.should_attempt_repair(&name).await {
+                                return Ok(ReconcileAction::Left(
+                                    "repair-attempts-exhausted".into(),
+                                ));
+                            }
                             tracing::warn!(
                                 app_id = %app_id,
                                 container = %name,
@@ -1718,6 +1788,7 @@ impl ProdContainerOrchestrator {
                                 return Ok(ReconcileAction::Installed);
                             }
                         }
+                        self.clear_repair_attempts(&name).await;
                         Ok(ReconcileAction::NoOp)
                     }
                     ContainerState::Stopped | ContainerState::Exited => {
@@ -1743,6 +1814,11 @@ impl ProdContainerOrchestrator {
                         )
                         .await
                         {
+                            if !self.should_attempt_repair(&name).await {
+                                return Ok(ReconcileAction::Left(
+                                    "repair-attempts-exhausted".into(),
+                                ));
+                            }
                             tracing::warn!(
                                 app_id = %app_id,
                                 container = %name,
@@ -1764,6 +1840,7 @@ impl ProdContainerOrchestrator {
                             wait_for_container_stable_running(self.runtime.as_ref(), &name, 15, 90)
                                 .await?;
                         }
+                        self.clear_repair_attempts(&name).await;
                         Ok(ReconcileAction::Started)
                     }
                     ContainerState::Stopping => {
@@ -1792,6 +1869,11 @@ impl ProdContainerOrchestrator {
                         )
                         .await
                         {
+                            if !self.should_attempt_repair(&name).await {
+                                return Ok(ReconcileAction::Left(
+                                    "repair-attempts-exhausted".into(),
+                                ));
+                            }
                             tracing::warn!(
                                 app_id = %app_id,
                                 container = %name,
@@ -1813,6 +1895,7 @@ impl ProdContainerOrchestrator {
                             wait_for_container_stable_running(self.runtime.as_ref(), &name, 15, 90)
                                 .await?;
                         }
+                        self.clear_repair_attempts(&name).await;
                         Ok(ReconcileAction::Started)
                     }
                     ContainerState::Paused => Ok(ReconcileAction::Left("paused".to_string())),
@@ -4594,6 +4677,85 @@ app:
             .any(|c| c == "create_container:bitcoin-knots:offset=0"));
     }
 
+    #[tokio::test]
+    async fn repair_recreate_stops_after_max_attempts_instead_of_looping_forever() {
+        // A container whose entrypoint fatally crashes every time (portainer
+        // on .198, 2026-07-01: DB schema too new for the pinned binary) must
+        // not be stop+remove+install_fresh'd forever by the boot reconciler.
+        let rt = Arc::new(MockRuntime::default());
+        rt.mark_image_present("docker.io/bitcoin/knots:28");
+        rt.set_state("bitcoin-knots", ContainerState::Exited);
+        let orch = orch_with(rt.clone()).await;
+        orch.insert_manifest_for_test(
+            pull_manifest("bitcoin-knots", "docker.io/bitcoin/knots:28"),
+            PathBuf::from("/tmp/bk"),
+        )
+        .await;
+
+        let mut last_report = None;
+        for _ in 0..MAX_REPAIR_ATTEMPTS {
+            // fail_start entries are consumed on use — re-arm every pass so
+            // this container fails to start EVERY time, not just once.
+            rt.fail_start
+                .lock()
+                .unwrap()
+                .insert("bitcoin-knots".into(), "fatal startup error".into());
+            rt.set_state("bitcoin-knots", ContainerState::Exited);
+            let report = orch.reconcile_all().await;
+            assert_eq!(
+                report.actions,
+                vec![("bitcoin-knots".to_string(), ReconcileAction::Installed)],
+                "expected a recreate attempt within the attempt budget"
+            );
+            last_report = Some(report);
+        }
+        assert!(last_report.is_some());
+
+        // One more pass exceeds MAX_REPAIR_ATTEMPTS — the breaker must trip:
+        // no further remove/create calls (it may still probe status/attempt
+        // a start, same as any other pass — it just must not recreate).
+        let remove_calls_before = rt
+            .calls()
+            .iter()
+            .filter(|c| c.starts_with("remove_container:"))
+            .count();
+        let create_calls_before = rt
+            .calls()
+            .iter()
+            .filter(|c| c.starts_with("create_container:"))
+            .count();
+        rt.fail_start
+            .lock()
+            .unwrap()
+            .insert("bitcoin-knots".into(), "fatal startup error".into());
+        rt.set_state("bitcoin-knots", ContainerState::Exited);
+        let report = orch.reconcile_all().await;
+        assert_eq!(
+            report.actions,
+            vec![(
+                "bitcoin-knots".to_string(),
+                ReconcileAction::Left("repair-attempts-exhausted".to_string())
+            )]
+        );
+        let calls_after = rt.calls();
+        assert_eq!(
+            calls_after
+                .iter()
+                .filter(|c| c.starts_with("remove_container:"))
+                .count(),
+            remove_calls_before,
+            "breaker must skip the recreate's remove_container entirely"
+        );
+        assert_eq!(
+            calls_after
+                .iter()
+                .filter(|c| c.starts_with("create_container:"))
+                .count(),
+            create_calls_before,
+            "breaker must skip the recreate's create_container entirely"
+        );
+    }
+
     #[tokio::test]
     async fn reconcile_installs_missing_container() {
         let rt = Arc::new(MockRuntime::default());
diff --git a/core/archipelago/src/content_server.rs b/core/archipelago/src/content_server.rs
index 0118c8ea..c9bae8f2 100644
--- a/core/archipelago/src/content_server.rs
+++ b/core/archipelago/src/content_server.rs
@@ -7,7 +7,7 @@ use anyhow::{Context, Result};
 use serde::{Deserialize, Serialize};
 use std::path::{Path, PathBuf};
 use tokio::fs;
-use tracing::debug;
+use tracing::{debug, warn};
 
 const CATALOG_FILE: &str = "content/catalog.json";
 const CONTENT_DIR: &str = "content/files";
@@ -86,6 +86,22 @@ pub async fn save_catalog(data_dir: &Path, catalog: &ContentCatalog) -> Result<(
     Ok(())
 }
 
+/// Removes `id` from the on-disk catalog. Best-effort: a failure here just
+/// means the entry gets pruned again next time it's requested, so nothing is
+/// propagated — a save error is logged; a catalog load error returns silently.
+async fn prune_missing_content_entry(data_dir: &Path, id: &str) {
+    let Ok(mut catalog) = load_catalog(data_dir).await else {
+        return;
+    };
+    let before = catalog.items.len();
+    catalog.items.retain(|i| i.id != id);
+    if catalog.items.len() != before {
+        if let Err(e) = save_catalog(data_dir, &catalog).await {
+            warn!(error = %e, content_id = %id, "failed to save catalog after pruning missing content entry");
+        }
+    }
+}
+
 /// Get the full filesystem path for a content item.
 /// Checks the dedicated content/files/ directory first, then falls back to the
 /// FileBrowser data directory (where users manage files via the web UI).
@@ -268,6 +284,19 @@ pub async fn serve_content(
 
     let file_path = content_file_path(data_dir, item);
     if !file_path.exists() {
+        // The catalog entry survived (it's a separate JSON file) but its
+        // backing file is gone — most likely lost in an unrelated data-dir
+        // reset (a shared filebrowser file, 2026-07-01: two catalog entries
+        // outlived a filebrowser reinstall that wiped the files themselves).
+        // Leaving the entry in place would keep advertising it as available
+        // to every peer forever, each hitting the exact same dead end this
+        // one just did. Prune it so it stops being offered.
+        warn!(
+            content_id = %id,
+            filename = %item.filename,
+            "content catalog entry's file is missing on disk — pruning the stale entry"
+        );
+        prune_missing_content_entry(data_dir, id).await;
         return Ok(ServeResult::NotFound);
     }
 
@@ -555,3 +584,95 @@ mod faststart_tests {
         assert_eq!(mp4_is_faststart(&p).await, Some(false));
     }
 }
+
+#[cfg(test)]
+mod prune_missing_content_tests {
+    use super::*;
+
+    #[tokio::test]
+    async fn serve_content_prunes_catalog_entry_whose_file_is_missing() {
+        // Simulates a catalog entry that outlived its backing file (a shared
+        // filebrowser file lost in an unrelated data-dir reset, 2026-07-01) —
+        // every peer request for it would otherwise 404 forever with no way
+        // to tell it apart from a transient failure.
+        let dir = tempfile::tempdir().unwrap();
+        let data_dir = dir.path();
+        let item = ContentItem {
+            id: "missing-item".to_string(),
+            filename: "gone.mp4".to_string(),
+            mime_type: "video/mp4".to_string(),
+            size_bytes: 123,
+            description: String::new(),
+            access: AccessControl::Free,
+            availability: Availability::AllPeers,
+            added_at: "2026-01-01T00:00:00Z".to_string(),
+        };
+        save_catalog(
+            data_dir,
+            &ContentCatalog {
+                items: vec![item],
+            },
+        )
+        .await
+        .unwrap();
+
+        // File was never written to disk under content/files/ or filebrowser/.
+        let result = serve_content(data_dir, "missing-item", None, None, None, None)
+            .await
+            .unwrap();
+        assert!(matches!(result, ServeResult::NotFound));
+
+        let reloaded = load_catalog(data_dir).await.unwrap();
+        assert!(
+            reloaded.items.is_empty(),
+            "stale entry should have been pruned after the 404"
+        );
+    }
+
+    #[tokio::test]
+    async fn serve_content_leaves_other_entries_untouched_when_pruning() {
+        let dir = tempfile::tempdir().unwrap();
+        let data_dir = dir.path();
+        let missing = ContentItem {
+            id: "missing-item".to_string(),
+            filename: "gone.mp4".to_string(),
+            mime_type: "video/mp4".to_string(),
+            size_bytes: 123,
+            description: String::new(),
+            access: AccessControl::Free,
+            availability: Availability::AllPeers,
+            added_at: "2026-01-01T00:00:00Z".to_string(),
+        };
+        let present = ContentItem {
+            id: "present-item".to_string(),
+            filename: "here.mp4".to_string(),
+            mime_type: "video/mp4".to_string(),
+            size_bytes: 4,
+            description: String::new(),
+            access: AccessControl::Free,
+            availability: Availability::AllPeers,
+            added_at: "2026-01-01T00:00:00Z".to_string(),
+        };
+        save_catalog(
+            data_dir,
+            &ContentCatalog {
+                items: vec![missing, present],
+            },
+        )
+        .await
+        .unwrap();
+        let content_dir = data_dir.join("content").join("files");
+        tokio::fs::create_dir_all(&content_dir).await.unwrap();
+        tokio::fs::write(content_dir.join("here.mp4"), b"data")
+            .await
+            .unwrap();
+
+        let _ = serve_content(data_dir, "missing-item", None, None, None, None)
+            .await
+            .unwrap();
+
+        let reloaded = load_catalog(data_dir).await.unwrap();
+        assert_eq!(reloaded.items.len(), 1);
+        assert_eq!(reloaded.items[0].id, "present-item");
+    }
+}
diff --git a/docs/PRODUCTION-MASTER-PLAN.md b/docs/PRODUCTION-MASTER-PLAN.md
index 079899d8..c3363499 100644
--- a/docs/PRODUCTION-MASTER-PLAN.md
+++ b/docs/PRODUCTION-MASTER-PLAN.md
@@ -1071,18 +1071,127 @@ non-mesh thread**; they route to the mesh/Reticulum agent (§10d owner).
   bulletproof switch mechanism itself — `package.set-config {id: "bitcoin-knots", version:
   "29.3.knots20260508"}` (an upgrade, so no downgrade-confirm gate) — to move `.228` onto the real
   latest image. Confirmed: `bitcoind --version` now reports `v29.3.knots20260508`, no reindex
-  triggered, tip advancing normally. Not yet committed/pushed — pending user go-ahead, same batch as
-  the uninstall-durability fix above.
-- **[NON-MESH, untriaged]** `.198` — `bitcoin-knots` RPC is saturated: logs flooded with "Request
-  rejected because http work queue depth exceeded" despite `-rpcworkqueue=256` already applied
-  (confirmed via `podman inspect`/entrypoint). This cascades into fedimint: `fedimint` /
-  `fedimint-gateway` / `fedimint-clientd` have been stuck in `(starting)` for 36–46h because their
-  RPC calls to `bitcoin-knots` time out (45s) — this is almost certainly what the user meant by
-  "fedimint guardian keeps going down" (not `.228`, whose fedimint stack looks healthy). Root cause
-  of the saturation itself not yet found — suspect a multi-service retry storm (health_monitor +
-  fedimint x2 + electrumx + mempool + UI all polling without backoff) compounding under any bitcoind
-  slowdown, but not confirmed.
-- **[NON-MESH, untriaged]** `.198` — portainer is completely absent from `podman ps -a` (not just
-  crashed/stopped — no container record at all). `.228`'s portainer is healthy for comparison. No
-  `/var/lib/archipelago/install.log` found on `.198` to check install history; needs a
-  package_data/state check via RPC or `journalctl` for the archipelago service.
+  triggered, tip advancing normally. **Committed + pushed** `5b7cd5d5` (same batch as the
+  uninstall-durability fix above).
+- **[NON-MESH] ROOT-CAUSED 2026-07-01, NOT A CODE BUG — needs a capacity/ops decision** — `.198`
+  `bitcoin-knots` RPC saturation ("work queue depth exceeded" despite `-rpcworkqueue=256`),
+  cascading into stuck `fedimint`/`fedimint-gateway`/`fedimint-clientd` (`(starting)` 36-46h — this
+  is what the user meant by "fedimint guardian keeps going down," not `.228`) and portainer
+  flapping (seen completely absent from `podman ps -a` at one check, `Up 12 seconds` moments later
+  at a follow-up check — it's being killed+recreated repeatedly, not missing). Real root cause:
+  **`.198`'s `bitcoin-knots` is still only ~21% synced (height 507247, unchanged from the ~21%
+  noted 2026-06-28 in [[project_bitcoin_multiversion_integration]] three days ago) and its root
+  disk is nearly I/O-saturated** (`iostat -x`: `%util` 92-97%, `w_await` ~82ms) from IBD validation
+  competing with ~30 other containers' disk I/O on a small (29GB) root partition on an OptiPlex
+  3020M. CPU is mostly idle (bitcoin-knots at 3.68%) — this is a **disk I/O bottleneck**, not the
+  retry-storm hypothesis first suspected. Every RPC caller (health_monitor, fedimint, electrumx,
+  UI) times out waiting on a disk that can't keep up, and portainer's health-check failures trigger
+  the orchestrator's zombie/drift-repair kill+recreate cycle, which never stabilizes because the
+  underlying I/O contention never resolves. **Not fixed** — this needs a user decision (accept slow
+  IBD and wait, uninstall some of the ~15 other apps competing for I/O on this node, or a hardware
+  upgrade), not a code change. `docs/multinode-testing-plan.md` already treats `.198` IBD status as
+  a pre-req to check before the multinode pass, consistent with this finding.
+- **[NON-MESH] ROOT-CAUSED + FIXED 2026-07-01** — Indeedhub wouldn't install on Arch Dev (`.116`).
+  Root cause: orphan leftover containers (`indeedhub-api`, `indeedhub-ffmpeg`) from a prior
+  partial/failed install, with `indeedhub-postgres` and the rest of the stack never created.
+  `health_monitor` correctly saw these as orphans (no `package_data` entry) and left them alone, but
+  a separate runtime crash-recovery loop (`start_stopped_app_stacks` in `crash_recovery.rs`, runs
+  every 120s — see `main.rs` "Stack supervisor") fired on ANY existing stack container regardless of
+  whether the stack's core dependency existed, force-restarting `indeedhub-api` forever against a
+  `postgres` hostname that could never resolve (`indeedhub-postgres` doesn't exist) — an infinite
+  crash loop that also blocked a real reinstall via container-name conflicts. **Fixed**: added an
+  `anchor` field to `StackRecoverySpec` (the stack's core DB/server container — `immich_postgres`,
+  `indeedhub-postgres`, `netbird-server`) and gated recovery on that anchor existing first, not on
+  any container existing. New test `stack_recovery_anchor_is_the_stacks_own_core_dependency`.
+  **Committed + pushed** `d414ae3d`.
+- **[NON-MESH] ROOT-CAUSED + FIXED 2026-07-01** — Electrum launch/app-loader UI overlapped with the
+  ElectrumX syncing screen. Root cause (found via a parallel Explore-agent investigation):
+  `AppSessionFrame.vue` rendered the generic `AppLoadingScreen` and the ElectrumX sync overlay
+  simultaneously at the same `z-index: 10` — both conditions (`loading` and
+  `electrsSync && !electrsSync.stale`) could be true at once during launch. **Fixed**: the generic
+  loader now also checks `!(electrsSync && !electrsSync.stale)` so the more-informative sync screen
+  takes precedence instead of the two stacking. `vue-tsc --noEmit` clean. **Committed + pushed**
+  `d414ae3d`.
+
+## 12. `.198` portainer + boot-reconciler circuit breaker (2026-07-01)
+
+**`.198` portainer flapping was NOT the same root cause as the disk-I/O issue above** — user
+correctly pushed back on that assumption. Actual cause: fatal, permanent — `podman logs portainer`
+showed `The database schema version does not align with the server version`. `.116`/`.228` both run
+the same pinned `portainer:2.19.4` and are healthy, so this was `.198`-specific data drift: its
+`portainer.db` was created/upgraded by a newer binary at some point in that node's own history,
+independent of the other nodes (git history has no record of the pin ever being anything but
+2.19.4, so this was very likely a manual/ad-hoc podman operation on `.198` outside the normal
+install/update path, not a platform bug in version selection). **Fixed live**: backed up
+`portainer.db` to `_reset-backup-2026-07-01/` (not deleted) and let the pinned `2.19.4` reinitialize
+fresh — portainer only holds its own dashboard/endpoint config, not irreplaceable user data, and the
+user approved a reset over attempting recovery. Confirmed stable afterward.
+
+**Follow-up "make sure this can't happen again" (user request)** — root-caused why this could loop
+forever undetected: `BootReconciler` (`boot_reconciler.rs`, ticks every 30s, `reconcile_existing()`)
+recreates containers via `ensure_running_with_mode`'s `ContainerState::Created`/`Stopped`/`Exited`
+"start failed → stop+remove+install_fresh" branches with **no bound at all** — unlike
+`health_monitor.rs`'s independent restart path, which already has `MAX_RESTART_ATTEMPTS=10` +
+backoff + a persistent user-facing notification after giving up. A container whose entrypoint
+process fatally crashes moments after `podman start` succeeds (podman itself sees no error) has its
+container recreated every single tick, forever, with only debug/warn-level logs — exactly
+portainer's failure mode, and the reason it could keep looping (crash_recovery's periodic
+supervisor doesn't cover single-container apps like portainer — only stack members — so this was
+the actual mechanism, not the one used for indeedhub above).
+
+**Fixed**: added `MAX_REPAIR_ATTEMPTS=5` / `REPAIR_ATTEMPT_RESET_WINDOW=30min` circuit breaker
+(`should_attempt_repair`/`clear_repair_attempts`, `prod_orchestrator.rs`) gating the zombie-guard
+recreate and both "start failed" recreate branches (`Created` and `Stopped|Exited` states). Once
+exhausted, reconcile leaves the container alone (`ReconcileAction::Left("repair-attempts-exhausted")`)
+and logs an `error!` pointing at `podman logs <name>` instead of recreating forever; an explicit
+`install()`/`start()` clears the counter, same pattern as `user_stopped`. New test
+`repair_recreate_stops_after_max_attempts_instead_of_looping_forever`. **Scoped deliberately**: left
+the drift-detection recreates (port/env drift, `Stopping`-stuck) unguarded for this pass — those are
+host-state-corrections that normally resolve in one shot, a materially different failure shape from
+"the app itself is fatally broken," and touching all ~8 recreate call sites in one pass risked
+regressing carefully-tuned existing behavior for low incremental benefit. Full breaker coverage
+(and/or wiring a persistent `Notification` through, which needs `StateManager` threaded into
+`BootReconciler` — a bigger `main.rs` startup-order change not attempted here) is a reasonable
+future follow-up if another single-container app hits this same failure class.
+
+**Also answered**: "why does portainer's setup wizard not have podman as an option?" —
+`apps/portainer/manifest.yml` bind-mounts the rootless podman socket
+(`/run/user/1000/podman/podman.sock`) to `/var/run/docker.sock` inside the container. Portainer
+never knows it's talking to podman — it just sees the standard Docker socket path and speaks the
+Docker Engine API, which podman's socket implements compatibly. Not a bug: pick "Docker" (local) in
+the wizard.
+
+## 13. Peer-federated content 404s over FIPS (2026-07-01) — DATA LOSS, not a code bug in the transport
+
+User report: `.116 → .228` streaming/downloading peer-federated content over FIPS failed with
+`/api/peer-content/<onion>/<id>` 404s, surfacing in the browser as `NotSupportedError: no supported
+source`. Investigated the full path: nginx's `/api/peer-content/` proxy block is present on `.116`;
+`handle_peer_content_stream` (`api/handler/proxy.rs`) correctly dials `.228` over FIPS and passes
+the peer's real HTTP status straight through — not a routing bug. `.228`'s `content/catalog.json`
+genuinely lists both content IDs from the error log as `access: free`, `availability: allpeers` (so
+not a permissions bug either), **but the backing files don't exist anywhere on `.228`** — checked
+both `content/files/` (empty except `catalog.json`) and the FileBrowser fallback path (`Music/`,
+`Photos/` dirs exist but are empty, `mtime` 2026-06-26). The catalog's last real edit was
+2026-06-19, so these files were lost in a data-dir reset that post-dates the catalog (most likely
+the same window as other 2026-06-26 fixes in `docs/PRODUCTION-MASTER-PLAN.md` §6c) and nobody
+pruned the stale catalog entries or re-uploaded the files since. **This is real data loss on `.228`,
+not recoverable via code** — ask the user whether the original files (a screen recording + an mp3)
+still exist somewhere else so they can be re-added.
+
+**Code fix shipped regardless** (self-healing, generalizable): `content_server::serve_content` now
+prunes a catalog entry from disk the moment it 404s because its backing file is missing
+(`prune_missing_content_entry`), instead of leaving it advertised to every peer forever with no way
+to distinguish "gone" from "transient failure." New tests
+`serve_content_prunes_catalog_entry_whose_file_is_missing` +
+`serve_content_leaves_other_entries_untouched_when_pruning`.
+
+## 14. Known test flakiness (not investigated, low priority)
+
+`credentials::operations::tests::*` has thrown 3 different failures (including
+`test_list_credentials_no_filter` and `test_list_credentials_filter_by_did`) across separate
+`cargo test --workspace` runs this session — `invalid utf-8 sequence` panics from
+`credentials/operations.rs:336`. Passes reliably in isolation and under `--test-threads=1`; only
+fails under full-parallel `--workspace` runs, and never on the same test twice — points to a shared
+test-fixture/tempfile collision generating non-UTF8 bytes under parallelism, not a real credentials
+bug and not related to anything touched this session. Worth a real fix at some point (a test isolation
+issue makes CI flaky) but out of scope here.