From d0710e7491d8e4f6e81b657cf117ccd9240956a8 Mon Sep 17 00:00:00 2001 From: archipelago Date: Wed, 1 Jul 2026 08:19:54 -0400 Subject: [PATCH] fix(orchestrator,content): bound repair-recreate loops; self-heal stale content catalog entries - prod_orchestrator.rs: the boot reconciler's zombie-guard and start-failed recreate paths (Created/Stopped/Exited states) had no attempt cap, unlike health_monitor's independent restart tracker. A container whose entrypoint fatally crashes right after `podman start` succeeds got stop+remove+ install_fresh'd every ~30s reconcile tick forever (portainer on .198, 2026-07-01: a DB schema newer than the pinned binary could read -- no amount of recreating fixes that). Added a 5-attempts/30-minute circuit breaker; once exhausted the container is left alone with an error! log instead of looping, and an explicit install/start clears the counter. - content_server.rs: serve_content now prunes a catalog entry whose backing file is missing on disk, instead of leaving it advertised to every peer forever with no way to distinguish "gone" from "transient failure." Co-Authored-By: Claude Sonnet 5 --- .../src/container/prod_orchestrator.rs | 162 ++++++++++++++++++ core/archipelago/src/content_server.rs | 123 ++++++++++++- docs/PRODUCTION-MASTER-PLAN.md | 139 +++++++++++++-- 3 files changed, 408 insertions(+), 16 deletions(-) diff --git a/core/archipelago/src/container/prod_orchestrator.rs b/core/archipelago/src/container/prod_orchestrator.rs index cb10ec69..572dd5ad 100644 --- a/core/archipelago/src/container/prod_orchestrator.rs +++ b/core/archipelago/src/container/prod_orchestrator.rs @@ -1025,6 +1025,10 @@ struct OrchestratorState { disabled: HashSet, /// app_id → per-app mutex, created lazily the first time we touch an app locks: HashMap>>, + /// container name → (attempt count, first-attempt time) for the + /// stop+remove+install_fresh "repair" recreate paths below. See + /// `should_attempt_repair`. 
+ repair_attempts: HashMap, } impl OrchestratorState { @@ -1033,10 +1037,29 @@ impl OrchestratorState { manifests: HashMap::new(), disabled: HashSet::new(), locks: HashMap::new(), + repair_attempts: HashMap::new(), } } } +/// Cap on how many times the boot reconciler will recreate the same +/// container within `REPAIR_ATTEMPT_RESET_WINDOW` before giving up on it. +/// +/// Without this, a container whose entrypoint process fatally exits right +/// after `podman start` succeeds (podman itself reports no error — the crash +/// happens inside the app a moment later) gets stop+remove+install_fresh'd +/// again on every ~30s reconcile tick, forever. `health_monitor.rs`'s +/// restart tracker already bounds ITS OWN independent restart path +/// (`MAX_RESTART_ATTEMPTS`) and eventually surfaces a user-facing +/// notification — but the boot reconciler's repair-recreate path had no +/// equivalent circuit breaker, so the two could race indefinitely on the +/// same fatally-broken container (portainer on `.198`, 2026-07-01: crashed +/// on every start because its on-disk DB was written by a newer binary than +/// the pinned image — a data/version mismatch no amount of recreating could +/// fix, yet it kept looping every 30s until manually intervened on). +const MAX_REPAIR_ATTEMPTS: u32 = 5; +const REPAIR_ATTEMPT_RESET_WINDOW: std::time::Duration = std::time::Duration::from_secs(1800); + pub struct ProdContainerOrchestrator { runtime: Arc, manifests_dir: PathBuf, @@ -1499,6 +1522,48 @@ impl ProdContainerOrchestrator { .await } + /// Whether the reconciler should attempt another stop+remove+install_fresh + /// repair recreate for `name`, or has already tried too many times + /// recently and should leave it alone instead of looping forever. See + /// `MAX_REPAIR_ATTEMPTS`. 
+ async fn should_attempt_repair(&self, name: &str) -> bool { + let mut state = self.state.write().await; + let now = std::time::Instant::now(); + let entry = state + .repair_attempts + .entry(name.to_string()) + .or_insert((0, now)); + if now.duration_since(entry.1) > REPAIR_ATTEMPT_RESET_WINDOW { + *entry = (0, now); + } + entry.0 += 1; + if entry.0 > MAX_REPAIR_ATTEMPTS { + tracing::error!( + container = %name, + attempts = entry.0, + window_secs = REPAIR_ATTEMPT_RESET_WINDOW.as_secs(), + "giving up on repairing container after too many recreate attempts — it keeps failing to \ + come up cleanly on its own; check `podman logs {name}` for the real cause (a data/version \ + mismatch or another fatal startup error is likely, not something recreating the container \ + again will fix). Leaving it as-is instead of recreating it forever; a manual fix + restart \ + (or a subsequent explicit install/start, which clears this counter) is needed to recover it.", + name = name, + ); + false + } else { + true + } + } + + /// Clears the repair-attempt counter for `name` — call on any path that + /// reaches a stable Running/NoOp/Started outcome, so a container that + /// recovers (on its own, or after a real fix) doesn't inherit a stale + /// near-exhausted counter from an earlier unrelated failure. + async fn clear_repair_attempts(&self, name: &str) { + let mut state = self.state.write().await; + state.repair_attempts.remove(name); + } + async fn ensure_running_with_mode( &self, lm: &LoadedManifest, @@ -1611,6 +1676,11 @@ impl ProdContainerOrchestrator { // "Up" → proxy 502 → NetBird login broke). Conservative: // only fires on a concrete dead PID, never on uncertainty. 
if !container_running_process_alive(&name).await { + if !self.should_attempt_repair(&name).await { + return Ok(ReconcileAction::Left( + "repair-attempts-exhausted".into(), + )); + } tracing::warn!( app_id = %app_id, container = %name, @@ -1718,6 +1788,7 @@ impl ProdContainerOrchestrator { return Ok(ReconcileAction::Installed); } } + self.clear_repair_attempts(&name).await; Ok(ReconcileAction::NoOp) } ContainerState::Stopped | ContainerState::Exited => { @@ -1743,6 +1814,11 @@ impl ProdContainerOrchestrator { ) .await { + if !self.should_attempt_repair(&name).await { + return Ok(ReconcileAction::Left( + "repair-attempts-exhausted".into(), + )); + } tracing::warn!( app_id = %app_id, container = %name, @@ -1764,6 +1840,7 @@ impl ProdContainerOrchestrator { wait_for_container_stable_running(self.runtime.as_ref(), &name, 15, 90) .await?; } + self.clear_repair_attempts(&name).await; Ok(ReconcileAction::Started) } ContainerState::Stopping => { @@ -1792,6 +1869,11 @@ impl ProdContainerOrchestrator { ) .await { + if !self.should_attempt_repair(&name).await { + return Ok(ReconcileAction::Left( + "repair-attempts-exhausted".into(), + )); + } tracing::warn!( app_id = %app_id, container = %name, @@ -1813,6 +1895,7 @@ impl ProdContainerOrchestrator { wait_for_container_stable_running(self.runtime.as_ref(), &name, 15, 90) .await?; } + self.clear_repair_attempts(&name).await; Ok(ReconcileAction::Started) } ContainerState::Paused => Ok(ReconcileAction::Left("paused".to_string())), @@ -4594,6 +4677,85 @@ app: .any(|c| c == "create_container:bitcoin-knots:offset=0")); } + #[tokio::test] + async fn repair_recreate_stops_after_max_attempts_instead_of_looping_forever() { + // A container whose entrypoint fatally crashes every time (portainer + // on .198, 2026-07-01: DB schema too new for the pinned binary) must + // not be stop+remove+install_fresh'd forever by the boot reconciler. 
+ let rt = Arc::new(MockRuntime::default()); + rt.mark_image_present("docker.io/bitcoin/knots:28"); + rt.set_state("bitcoin-knots", ContainerState::Exited); + let orch = orch_with(rt.clone()).await; + orch.insert_manifest_for_test( + pull_manifest("bitcoin-knots", "docker.io/bitcoin/knots:28"), + PathBuf::from("/tmp/bk"), + ) + .await; + + let mut last_report = None; + for _ in 0..MAX_REPAIR_ATTEMPTS { + // fail_start entries are consumed on use — re-arm every pass so + // this container fails to start EVERY time, not just once. + rt.fail_start + .lock() + .unwrap() + .insert("bitcoin-knots".into(), "fatal startup error".into()); + rt.set_state("bitcoin-knots", ContainerState::Exited); + let report = orch.reconcile_all().await; + assert_eq!( + report.actions, + vec![("bitcoin-knots".to_string(), ReconcileAction::Installed)], + "expected a recreate attempt within the attempt budget" + ); + last_report = Some(report); + } + assert!(last_report.is_some()); + + // One more pass exceeds MAX_REPAIR_ATTEMPTS — the breaker must trip: + // no further remove/create calls (it may still probe status/attempt + // a start, same as any other pass — it just must not recreate). 
+ let remove_calls_before = rt + .calls() + .iter() + .filter(|c| c.starts_with("remove_container:")) + .count(); + let create_calls_before = rt + .calls() + .iter() + .filter(|c| c.starts_with("create_container:")) + .count(); + rt.fail_start + .lock() + .unwrap() + .insert("bitcoin-knots".into(), "fatal startup error".into()); + rt.set_state("bitcoin-knots", ContainerState::Exited); + let report = orch.reconcile_all().await; + assert_eq!( + report.actions, + vec![( + "bitcoin-knots".to_string(), + ReconcileAction::Left("repair-attempts-exhausted".to_string()) + )] + ); + let calls_after = rt.calls(); + assert_eq!( + calls_after + .iter() + .filter(|c| c.starts_with("remove_container:")) + .count(), + remove_calls_before, + "breaker must skip the recreate's remove_container entirely" + ); + assert_eq!( + calls_after + .iter() + .filter(|c| c.starts_with("create_container:")) + .count(), + create_calls_before, + "breaker must skip the recreate's create_container entirely" + ); + } + #[tokio::test] async fn reconcile_installs_missing_container() { let rt = Arc::new(MockRuntime::default()); diff --git a/core/archipelago/src/content_server.rs b/core/archipelago/src/content_server.rs index 0118c8ea..c9bae8f2 100644 --- a/core/archipelago/src/content_server.rs +++ b/core/archipelago/src/content_server.rs @@ -7,7 +7,7 @@ use anyhow::{Context, Result}; use serde::{Deserialize, Serialize}; use std::path::{Path, PathBuf}; use tokio::fs; -use tracing::debug; +use tracing::{debug, warn}; const CATALOG_FILE: &str = "content/catalog.json"; const CONTENT_DIR: &str = "content/files"; @@ -86,6 +86,22 @@ pub async fn save_catalog(data_dir: &Path, catalog: &ContentCatalog) -> Result<( Ok(()) } +/// Removes `id` from the on-disk catalog. Best-effort: a failure here just +/// means the entry gets pruned again next time it's requested, so errors are +/// logged rather than propagated. 
+async fn prune_missing_content_entry(data_dir: &Path, id: &str) { + let Ok(mut catalog) = load_catalog(data_dir).await else { + return; + }; + let before = catalog.items.len(); + catalog.items.retain(|i| i.id != id); + if catalog.items.len() != before { + if let Err(e) = save_catalog(data_dir, &catalog).await { + warn!(error = %e, content_id = %id, "failed to save catalog after pruning missing content entry"); + } + } +} + /// Get the full filesystem path for a content item. /// Checks the dedicated content/files/ directory first, then falls back to the /// FileBrowser data directory (where users manage files via the web UI). @@ -268,6 +284,19 @@ pub async fn serve_content( let file_path = content_file_path(data_dir, item); if !file_path.exists() { + // The catalog entry survived (it's a separate JSON file) but its + // backing file is gone — most likely lost in an unrelated data-dir + // reset (a shared filebrowser file, 2026-07-01: two catalog entries + // outlived a filebrowser reinstall that wiped the files themselves). + // Leaving the entry in place would keep advertising it as available + // to every peer forever, each hitting the exact same dead end this + // one just did. Prune it so it stops being offered. + warn!( + content_id = %id, + filename = %item.filename, + "content catalog entry's file is missing on disk — pruning the stale entry" + ); + prune_missing_content_entry(data_dir, id).await; return Ok(ServeResult::NotFound); } @@ -555,3 +584,95 @@ mod faststart_tests { assert_eq!(mp4_is_faststart(&p).await, Some(false)); } } + +#[cfg(test)] +mod prune_missing_content_tests { + use super::*; + + #[tokio::test] + async fn serve_content_prunes_catalog_entry_whose_file_is_missing() { + // Simulates a catalog entry that outlived its backing file (a shared + // filebrowser file lost in an unrelated data-dir reset, 2026-07-01) — + // every peer request for it would otherwise 404 forever with no way + // to tell it apart from a transient failure. 
+ let dir = tempfile::tempdir().unwrap(); + let data_dir = dir.path(); + let item = ContentItem { + id: "missing-item".to_string(), + filename: "gone.mp4".to_string(), + mime_type: "video/mp4".to_string(), + size_bytes: 123, + description: String::new(), + access: AccessControl::Free, + availability: Availability::AllPeers, + added_at: "2026-01-01T00:00:00Z".to_string(), + }; + save_catalog( + data_dir, + &ContentCatalog { + items: vec![item], + }, + ) + .await + .unwrap(); + + // File was never written to disk under content/files/ or filebrowser/. + let result = serve_content(data_dir, "missing-item", None, None, None, None) + .await + .unwrap(); + assert!(matches!(result, ServeResult::NotFound)); + + let reloaded = load_catalog(data_dir).await.unwrap(); + assert!( + reloaded.items.is_empty(), + "stale entry should have been pruned after the 404" + ); + } + + #[tokio::test] + async fn serve_content_leaves_other_entries_untouched_when_pruning() { + let dir = tempfile::tempdir().unwrap(); + let data_dir = dir.path(); + let missing = ContentItem { + id: "missing-item".to_string(), + filename: "gone.mp4".to_string(), + mime_type: "video/mp4".to_string(), + size_bytes: 123, + description: String::new(), + access: AccessControl::Free, + availability: Availability::AllPeers, + added_at: "2026-01-01T00:00:00Z".to_string(), + }; + let present = ContentItem { + id: "present-item".to_string(), + filename: "here.mp4".to_string(), + mime_type: "video/mp4".to_string(), + size_bytes: 4, + description: String::new(), + access: AccessControl::Free, + availability: Availability::AllPeers, + added_at: "2026-01-01T00:00:00Z".to_string(), + }; + save_catalog( + data_dir, + &ContentCatalog { + items: vec![missing, present], + }, + ) + .await + .unwrap(); + let content_dir = data_dir.join("content").join("files"); + tokio::fs::create_dir_all(&content_dir).await.unwrap(); + tokio::fs::write(content_dir.join("here.mp4"), b"data") + .await + .unwrap(); + + let _ = serve_content(data_dir, 
"missing-item", None, None, None, None) + .await + .unwrap(); + + let reloaded = load_catalog(data_dir).await.unwrap(); + assert_eq!(reloaded.items.len(), 1); + assert_eq!(reloaded.items[0].id, "present-item"); + } +} diff --git a/docs/PRODUCTION-MASTER-PLAN.md b/docs/PRODUCTION-MASTER-PLAN.md index 079899d8..c3363499 100644 --- a/docs/PRODUCTION-MASTER-PLAN.md +++ b/docs/PRODUCTION-MASTER-PLAN.md @@ -1071,18 +1071,127 @@ non-mesh thread**; they route to the mesh/Reticulum agent (§10d owner). bulletproof switch mechanism itself — `package.set-config {id: "bitcoin-knots", version: "29.3.knots20260508"}` (an upgrade, so no downgrade-confirm gate) — to move `.228` onto the real latest image. Confirmed: `bitcoind --version` now reports `v29.3.knots20260508`, no reindex - triggered, tip advancing normally. Not yet committed/pushed — pending user go-ahead, same batch as - the uninstall-durability fix above. -- **[NON-MESH, untriaged]** `.198` — `bitcoin-knots` RPC is saturated: logs flooded with "Request - rejected because http work queue depth exceeded" despite `-rpcworkqueue=256` already applied - (confirmed via `podman inspect`/entrypoint). This cascades into fedimint: `fedimint` / - `fedimint-gateway` / `fedimint-clientd` have been stuck in `(starting)` for 36–46h because their - RPC calls to `bitcoin-knots` time out (45s) — this is almost certainly what the user meant by - "fedimint guardian keeps going down" (not `.228`, whose fedimint stack looks healthy). Root cause - of the saturation itself not yet found — suspect a multi-service retry storm (health_monitor + - fedimint x2 + electrumx + mempool + UI all polling without backoff) compounding under any bitcoind - slowdown, but not confirmed. -- **[NON-MESH, untriaged]** `.198` — portainer is completely absent from `podman ps -a` (not just - crashed/stopped — no container record at all). `.228`'s portainer is healthy for comparison. 
No - `/var/lib/archipelago/install.log` found on `.198` to check install history; needs a - package_data/state check via RPC or `journalctl` for the archipelago service. + triggered, tip advancing normally. **Committed + pushed** `5b7cd5d5` (same batch as the + uninstall-durability fix above). +- **[NON-MESH] ROOT-CAUSED 2026-07-01, NOT A CODE BUG — needs a capacity/ops decision** — `.198` + `bitcoin-knots` RPC saturation ("work queue depth exceeded" despite `-rpcworkqueue=256`), + cascading into stuck `fedimint`/`fedimint-gateway`/`fedimint-clientd` (`(starting)` 36-46h — this + is what the user meant by "fedimint guardian keeps going down," not `.228`) and portainer + flapping (seen completely absent from `podman ps -a` at one check, `Up 12 seconds` moments later + at a follow-up check — it's being killed+recreated repeatedly, not missing). Real root cause: + **`.198`'s `bitcoin-knots` is still only ~21% synced (height 507247, unchanged from the ~21% + noted 2026-06-28 in [[project_bitcoin_multiversion_integration]] three days ago) and its root + disk is nearly I/O-saturated** (`iostat -x`: `%util` 92-97%, `w_await` ~82ms) from IBD validation + competing with ~30 other containers' disk I/O on a small (29GB) root partition on an OptiPlex + 3020M. CPU is mostly idle (bitcoin-knots at 3.68%) — this is a **disk I/O bottleneck**, not the + retry-storm hypothesis first suspected. Every RPC caller (health_monitor, fedimint, electrumx, + UI) times out waiting on a disk that can't keep up, and portainer's health-check failures trigger + the orchestrator's zombie/drift-repair kill+recreate cycle, which never stabilizes because the + underlying I/O contention never resolves. **Not fixed** — this needs a user decision (accept slow + IBD and wait, uninstall some of the ~15 other apps competing for I/O on this node, or a hardware + upgrade), not a code change. 
`docs/multinode-testing-plan.md` already treats `.198` IBD status as + a pre-req to check before the multinode pass, consistent with this finding. +- **[NON-MESH] ROOT-CAUSED + FIXED 2026-07-01** — Indeedhub wouldn't install on Arch Dev (`.116`). + Root cause: orphan leftover containers (`indeedhub-api`, `indeedhub-ffmpeg`) from a prior + partial/failed install, with `indeedhub-postgres` and the rest of the stack never created. + `health_monitor` correctly saw these as orphans (no `package_data` entry) and left them alone, but + a separate runtime crash-recovery loop (`start_stopped_app_stacks` in `crash_recovery.rs`, runs + every 120s — see `main.rs` "Stack supervisor") fired on ANY existing stack container regardless of + whether the stack's core dependency existed, force-restarting `indeedhub-api` forever against a + `postgres` hostname that could never resolve (`indeedhub-postgres` doesn't exist) — an infinite + crash loop that also blocked a real reinstall via container-name conflicts. **Fixed**: added an + `anchor` field to `StackRecoverySpec` (the stack's core DB/server container — `immich_postgres`, + `indeedhub-postgres`, `netbird-server`) and gated recovery on that anchor existing first, not on + any container existing. New test `stack_recovery_anchor_is_the_stacks_own_core_dependency`. + **Committed + pushed** `d414ae3d`. +- **[NON-MESH] ROOT-CAUSED + FIXED 2026-07-01** — Electrum launch/app-loader UI overlapped with the + ElectrumX syncing screen. Root cause (found via a parallel Explore-agent investigation): + `AppSessionFrame.vue` rendered the generic `AppLoadingScreen` and the ElectrumX sync overlay + simultaneously at the same `z-index: 10` — both conditions (`loading` and + `electrsSync && !electrsSync.stale`) could be true at once during launch. **Fixed**: the generic + loader now also checks `!(electrsSync && !electrsSync.stale)` so the more-informative sync screen + takes precedence instead of the two stacking. `vue-tsc --noEmit` clean. 
**Committed + pushed** + `d414ae3d`. + +## 12. `.198` portainer + boot-reconciler circuit breaker (2026-07-01) + +**`.198` portainer flapping was NOT the same root cause as the disk-I/O issue above** — user +correctly pushed back on that assumption. Actual cause: fatal, permanent — `podman logs portainer` +showed `The database schema version does not align with the server version`. `.116`/`.228` both run +the same pinned `portainer:2.19.4` and are healthy, so this was `.198`-specific data drift: its +`portainer.db` was created/upgraded by a newer binary at some point in that node's own history, +independent of the other nodes (git history has no record of the pin ever being anything but +2.19.4, so this was very likely a manual/ad-hoc podman operation on `.198` outside the normal +install/update path, not a platform bug in version selection). **Fixed live**: backed up +`portainer.db` to `_reset-backup-2026-07-01/` (not deleted) and let the pinned `2.19.4` reinitialize +fresh — portainer only holds its own dashboard/endpoint config, not irreplaceable user data, and the +user approved a reset over attempting recovery. Confirmed stable afterward. + +**Follow-up "make sure this can't happen again" (user request)** — root-caused why this could loop +forever undetected: `BootReconciler` (`boot_reconciler.rs`, ticks every 30s, `reconcile_existing()`) +recreates containers via `ensure_running_with_mode`'s `ContainerState::Created`/`Stopped`/`Exited` +"start failed → stop+remove+install_fresh" branches with **no bound at all** — unlike +`health_monitor.rs`'s independent restart path, which already has `MAX_RESTART_ATTEMPTS=10` + +backoff + a persistent user-facing notification after giving up. 
A container whose entrypoint +process fatally crashes moments after `podman start` succeeds (podman itself sees no error) has its +container recreated every single tick, forever, with only debug/warn-level logs — exactly +portainer's failure mode, and the reason it could keep looping (crash_recovery's periodic +supervisor doesn't cover single-container apps like portainer — only stack members — so this was +the actual mechanism, not the one used for indeedhub above). + +**Fixed**: added `MAX_REPAIR_ATTEMPTS=5` / `REPAIR_ATTEMPT_RESET_WINDOW=30min` circuit breaker +(`should_attempt_repair`/`clear_repair_attempts`, `prod_orchestrator.rs`) gating the zombie-guard +recreate and both "start failed" recreate branches (`Created` and `Stopped|Exited` states). Once +exhausted, reconcile leaves the container alone until the 30-minute window resets (`ReconcileAction::Left("repair-attempts-exhausted")`) +and logs an `error!` pointing at `podman logs <name>` instead of recreating forever; an explicit +`install()`/`start()` clears the counter, same pattern as `user_stopped`. New test +`repair_recreate_stops_after_max_attempts_instead_of_looping_forever`. **Scoped deliberately**: left +the drift-detection recreates (port/env drift, `Stopping`-stuck) unguarded for this pass — those are +host-state-corrections that normally resolve in one shot, a materially different failure shape from +"the app itself is fatally broken," and touching all ~8 recreate call sites in one pass risked +regressing carefully-tuned existing behavior for low incremental benefit. Full breaker coverage +(and/or wiring a persistent `Notification` through, which needs `StateManager` threaded into +`BootReconciler` — a bigger `main.rs` startup-order change not attempted here) is a reasonable +future follow-up if another single-container app hits this same failure class. + +**Also answered**: "why does portainer's setup wizard not have podman as an option?"
— +`apps/portainer/manifest.yml` bind-mounts the rootless podman socket +(`/run/user/1000/podman/podman.sock`) to `/var/run/docker.sock` inside the container. Portainer +never knows it's talking to podman — it just sees the standard Docker socket path and speaks the +Docker Engine API, which podman's socket implements compatibly. Not a bug: pick "Docker" (local) in +the wizard. + +## 13. Peer-federated content 404s over FIPS (2026-07-01) — DATA LOSS, not a code bug in the transport + +User report: `.116 → .228` streaming/downloading peer-federated content over FIPS failed with +`/api/peer-content/<peer>/<content-id>` 404s, surfacing in the browser as `NotSupportedError: no supported +source`. Investigated the full path: nginx's `/api/peer-content/` proxy block is present on `.116`; +`handle_peer_content_stream` (`api/handler/proxy.rs`) correctly dials `.228` over FIPS and passes +the peer's real HTTP status straight through — not a routing bug. `.228`'s `content/catalog.json` +genuinely lists both content IDs from the error log as `access: free`, `availability: allpeers` (so +not a permissions bug either), **but the backing files don't exist anywhere on `.228`** — checked +both `content/files/` (empty except `catalog.json`) and the FileBrowser fallback path (`Music/`, +`Photos/` dirs exist but are empty, `mtime` 2026-06-26). The catalog's last real edit was +2026-06-19, so these files were lost in a data-dir reset that post-dates the catalog (most likely +the same window as other 2026-06-26 fixes in `docs/PRODUCTION-MASTER-PLAN.md` §6c) and nobody +pruned the stale catalog entries or re-uploaded the files since. **This is real data loss on `.228`, +not recoverable via code** — ask the user whether the original files (a screen recording + an mp3) +still exist somewhere else so they can be re-added.
+ +**Code fix shipped regardless** (self-healing, generalizable): `content_server::serve_content` now +prunes a catalog entry from disk the moment it 404s because its backing file is missing +(`prune_missing_content_entry`), instead of leaving it advertised to every peer forever with no way +to distinguish "gone" from "transient failure." New tests +`serve_content_prunes_catalog_entry_whose_file_is_missing` + +`serve_content_leaves_other_entries_untouched_when_pruning`. + +## 14. Known test flakiness (not investigated, low priority) + +`credentials::operations::tests::*` has thrown 3 different failures +(among them `test_list_credentials_no_filter` and `test_list_credentials_filter_by_did`) across separate +`cargo test --workspace` runs this session — `invalid utf-8 sequence` panics from +`credentials/operations.rs:336`. Passes reliably in isolation and under `--test-threads=1`; only +fails under full-parallel `--workspace` runs, and never on the same test twice — points to a shared +test-fixture/tempfile collision generating non-UTF8 bytes under parallelism, not a real credentials +bug and not related to anything touched this session. Worth a real fix at some point (a test isolation +issue makes CI flaky) but out of scope here.