diff --git a/core/archipelago/src/container/boot_reconciler.rs b/core/archipelago/src/container/boot_reconciler.rs
index d52bd3ed..a3e0f65a 100644
--- a/core/archipelago/src/container/boot_reconciler.rs
+++ b/core/archipelago/src/container/boot_reconciler.rs
@@ -96,6 +96,49 @@ impl BootReconciler {
             }
         }
 
+        // Companion self-heal runs on its OWN cadence, decoupled from the
+        // per-app reconcile pass. On a heavily loaded node `reconcile_existing`
+        // over dozens of apps can take well over a minute, which would delay a
+        // companion-unit repair (deleted/lost unit file) past any reasonable
+        // safety window. Detecting + rewriting a companion unit is cheap, so it
+        // gets a dedicated task that pauses `interval` between passes.
+        //
+        // The task is stopped cooperatively instead of via `JoinHandle::abort`:
+        // abort cancels at whatever `.await` the task is parked on, which may
+        // be the middle of a companion reconcile pass. It watches a private
+        // oneshot channel only while idle between passes; dropping the sender
+        // (explicitly after the main loop, or implicitly if this future is
+        // dropped) ends the task once its current pass completes. Shutdown
+        // uses `notify_one`, so we must NOT add a second waiter on
+        // `self.shutdown` — it would steal the single wake permit.
+        let companion_task = if self.companion_stage {
+            let orchestrator = self.orchestrator.clone();
+            let interval = self.interval;
+            let (stop_tx, mut stop_rx) = tokio::sync::oneshot::channel::<()>();
+            let handle = tokio::spawn(async move {
+                loop {
+                    let installed = orchestrator.manifest_ids().await;
+                    for (companion, err) in crate::container::companion::reconcile(&installed).await
+                    {
+                        tracing::warn!(
+                            companion = %companion,
+                            error = %err,
+                            "companion reconcile failed"
+                        );
+                    }
+                    // `Ok(_)` means the stop side fired or was dropped before
+                    // the pause elapsed; `Err(_)` is the normal timeout that
+                    // starts the next pass.
+                    if time::timeout(interval, &mut stop_rx).await.is_ok() {
+                        break;
+                    }
+                }
+            });
+            Some((stop_tx, handle))
+        } else {
+            None
+        };
+
         // Initial pass: no delay.
         self.tick().await;
 
@@ -111,23 +154,20 @@ impl BootReconciler {
                 }
             }
         }
+
+        if let Some((stop_tx, handle)) = companion_task {
+            // Closing the channel wakes the task's idle wait; an in-flight
+            // pass is allowed to finish before the task exits.
+            drop(stop_tx);
+            if let Err(err) = handle.await {
+                tracing::warn!(error = %err, "companion reconcile task ended abnormally");
+            }
+        }
     }
 
     async fn tick(&self) {
         let report = self.orchestrator.reconcile_existing().await;
         Self::log_report(&report);
-
-        if !self.companion_stage {
-            return;
-        }
-        let installed = self.orchestrator.manifest_ids().await;
-        for (companion, err) in crate::container::companion::reconcile(&installed).await {
-            tracing::warn!(
-                companion = %companion,
-                error = %err,
-                "companion reconcile failed"
-            );
-        }
     }
 
     fn log_report(report: &ReconcileReport) {