fix(orchestrator): stop retrying unrepairable volume chowns every reconcile

ensure_running_container_ownership re-probed and re-attempted the in-container chown on every reconcile pass. For a mount that can't be re-owned from inside the userns (observed: mempool-api /data -> 'Operation not permitted'), this burned CPU and logged a WARN on every pass, forever (~6x/30min on .228/.116).

Remember hard chown failures in a process-lifetime set keyed by (container-id, dest) and skip the probe+chown for known-unrepairable mounts. Keyed by Id (not name) so a recreated container gets a fresh repair attempt.

Verified on .116: one recorded failure at startup, then silent across subsequent reconciles.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
fix(ui): debounce connection-lost banner so transient ws blips don't flash
2026-06-24 04:58:57 -04:00 · 2026-06-24 04:58:54 -04:00
2 changed files with 109 additions and 10 deletions
--- a/core/archipelago/src/container/prod_orchestrator.rs
+++ b/core/archipelago/src/container/prod_orchestrator.rs
@ -294,6 +294,20 @@ async fn chown_for_rootless_container(uid_gid: &str, path: &str) -> Result<()> {
    ))
 }

+/// Process-wide set of `(container-id, mount-dest)` pairs whose in-container chown
+/// was reported as failed (e.g. "Operation not permitted" on a mount that can't be
+/// re-owned from inside the userns). Built empty on first call and kept for the
+/// life of the process so the per-reconcile repair can skip recorded pairs —
+/// otherwise a single unrepairable mount (observed: mempool-api `/data`) burns CPU
+/// + floods the journal on every pass. Keyed by Id so a recreated container retries afresh.
+fn unrepairable_ownership() -> &'static std::sync::Mutex<std::collections::HashSet<(String, String)>>
+{
+    static SET: std::sync::OnceLock<
+        std::sync::Mutex<std::collections::HashSet<(String, String)>>,
+    > = std::sync::OnceLock::new();
+    SET.get_or_init(|| std::sync::Mutex::new(std::collections::HashSet::new()))
+}
+
 /// App-agnostic, userns-mapping-proof volume-ownership repair for a RUNNING
 /// container.
 ///
@ -332,6 +346,13 @@ async fn ensure_running_container_ownership(name: &str) -> bool {
        .filter(|g| !g.is_empty())
        .unwrap_or_else(|| uid.clone());

+    // Stable identity of THIS container instance — used to remember mounts whose
+    // chown is hard-unrepairable so we stop hammering them every reconcile. Keyed
+    // by Id (not name) so a recreated container gets a fresh repair attempt.
+    let cid = podman_stdout(&["inspect", name, "--format", "{{.Id}}"])
+        .await
+        .unwrap_or_default();
+
    // Writable bind-mount destinations only.
    let dests = match podman_stdout(&[
        "inspect",
@ -359,6 +380,19 @@ async fn ensure_running_container_ownership(name: &str) -> bool {
            continue;
        }

+        // Known hard-unrepairable for this container instance (a previous chown
+        // returned a permanent error like "Operation not permitted"). Skip the
+        // probe+chown entirely — retrying every reconcile only burns CPU and
+        // floods the journal; it will never succeed for this instance.
+        if !cid.is_empty()
+            && unrepairable_ownership()
+                .lock()
+                .map(|s| s.contains(&(cid.clone(), dest.to_string())))
+                .unwrap_or(false)
+        {
+            continue;
+        }
+
        // Drift check: can the service user write here already?
        let probe = format!(
            "t=\"{dest}/.archy-wtest.$$\"; touch \"$t\" 2>/dev/null && rm -f \"$t\" 2>/dev/null"
@ -395,11 +429,21 @@ async fn ensure_running_container_ownership(name: &str) -> bool {
                    "repaired unwritable volume ownership (in-container chown)"
                );
            }
-            Ok(o) => tracing::warn!(
-                container = %name, dest,
-                "volume ownership repair failed: {}",
-                String::from_utf8_lossy(&o.stderr).trim()
-            ),
+            Ok(o) => {
+                // Permanent failure (e.g. "Operation not permitted" on a mount
+                // that simply can't be re-owned from inside the userns). Record
+                // it so we don't re-attempt every reconcile — log once, loudly.
+                if !cid.is_empty() {
+                    if let Ok(mut s) = unrepairable_ownership().lock() {
+                        s.insert((cid.clone(), dest.to_string()));
+                    }
+                }
+                tracing::warn!(
+                    container = %name, dest,
+                    "volume ownership repair failed (won't retry for this container instance): {}",
+                    String::from_utf8_lossy(&o.stderr).trim()
+                )
+            }
            Err(e) => {
                tracing::warn!(container = %name, dest, "volume ownership repair errored: {e}")
            }
--- a/neode-ui/src/views/dashboard/ConnectionBanner.vue
+++ b/neode-ui/src/views/dashboard/ConnectionBanner.vue
@ -1,9 +1,12 @@
 <template>
  <Teleport to="body">
-    <!-- Offline Banner -->
+    <!-- Lifecycle / Offline Banner.
+         Server restart/shutdown is deliberate → shown immediately. A plain
+         connection blip is debounced (showConnIssue) so transient sub-grace
+         reconnects don't flash. -->
    <Transition name="conn-banner">
      <div
-        v-if="isOffline && !store.isReconnecting && store.isAuthenticated"
+        v-if="(showLifecycle || showConnectionLost)"
        class="conn-banner-overlay"
      >
        <div class="path-option-card px-6 py-3 border-l-4 border-yellow-500 inline-flex items-center gap-2 text-yellow-200 shadow-2xl">
@ -17,10 +20,10 @@
      </div>
    </Transition>

-    <!-- Reconnecting Banner -->
+    <!-- Reconnecting Banner (debounced) -->
    <Transition name="conn-banner">
      <div
-        v-if="store.isReconnecting && store.isAuthenticated"
+        v-if="showReconnecting"
        class="conn-banner-overlay"
      >
        <div class="path-option-card px-6 py-3 border-l-4 border-blue-500 inline-flex items-center gap-2 text-blue-200 shadow-2xl">
@ -35,7 +38,7 @@
 </template>

 <script setup lang="ts">
-import { computed } from 'vue'
+import { computed, ref, watch, onUnmounted } from 'vue'
 import { useAppStore } from '@/stores/app'

 const store = useAppStore()
@ -43,6 +46,58 @@ const store = useAppStore()
 const isOffline = computed(() => store.isOffline)
 const isRestarting = computed(() => store.isRestarting)
 const isShuttingDown = computed(() => store.isShuttingDown)
+
+// A deliberate server lifecycle transition (restart/shutdown) is real and
+// user-initiated — surface it immediately, no debounce.
+const isLifecycleTransition = computed(() => isRestarting.value || isShuttingDown.value)
+const showLifecycle = computed(() => isLifecycleTransition.value && store.isAuthenticated)
+
+// A plain connection blip (offline or reconnecting, not a lifecycle transition).
+// The overwhelming majority recover within a second or two (load spikes,
+// Tailscale/relay TCP resets), so showing the banner instantly makes a healthy
+// node read as unstable. Debounce: only surface after the issue persists past a
+// grace window; hide immediately on recovery.
+const hasConnIssue = computed(
+  () => (store.isReconnecting || isOffline.value) && !isLifecycleTransition.value
+)
+
+const SHOW_DELAY_MS = 2500 // grace window (ms) an issue must outlast before a banner shows
+const showConnIssue = ref(false) // true once the current issue has outlasted the grace window
+let pendingTimer: ReturnType<typeof setTimeout> | null = null // scheduled-but-unfired show, if any
+
+function clearTimer() { // cancel a scheduled show; no-op when nothing is pending
+  if (pendingTimer) {
+    clearTimeout(pendingTimer)
+    pendingTimer = null
+  }
+}
+
+watch(
+  hasConnIssue,
+  (issue) => {
+    clearTimer() // hasConnIssue changed, so any previously scheduled show is stale
+    if (issue) {
+      pendingTimer = setTimeout(() => {
+        showConnIssue.value = true
+        pendingTimer = null
+      }, SHOW_DELAY_MS)
+    } else {
+      // No issue (cleared, or none at setup) — hide at once, shown or not.
+      showConnIssue.value = false
+    }
+  },
+  { immediate: true } // also run at setup so an issue already present starts the grace timer
+)
+
+onUnmounted(clearTimer) // a pending timer must not fire after the component is gone
+
+// Debounced visual states the template renders.
+const showReconnecting = computed(
+  () => showConnIssue.value && store.isReconnecting && store.isAuthenticated
+)
+const showConnectionLost = computed(
+  () => showConnIssue.value && isOffline.value && !store.isReconnecting && store.isAuthenticated
+)
 </script>

 <style scoped>