From 452f05d849fab731e41ddacc3059a9e5b89fc942 Mon Sep 17 00:00:00 2001 From: archipelago Date: Mon, 22 Jun 2026 13:04:28 -0400 Subject: [PATCH] fix(reconciler): decouple companion self-heal onto its own cadence MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The companion-unit repair stage ran at the END of each boot-reconciler tick, after reconcile_existing(). On a heavily loaded node that per-app pass takes >60-90s, so a deleted/lost companion unit (electrs-ui, bitcoin-ui, …) wasn't repaired within any reasonable window (gate test 31 'deleted unit recreated within one reconcile tick' timed out at 90s on the 45-app .228 node). Detecting + rewriting a companion unit is cheap, so spawn it as its own ~interval(30s) loop, independent of the slow app pass. Handle is aborted when the main loop exits (shutdown uses notify_one, so a second waiter would steal the wake permit). tick() is now app-reconcile only. All 4 boot_reconciler cadence tests still green (companion_stage=false in tests). Co-Authored-By: Claude Opus 4.8 (1M context) --- .../src/container/boot_reconciler.rs | 45 ++++++++++++++----- 1 file changed, 33 insertions(+), 12 deletions(-) diff --git a/core/archipelago/src/container/boot_reconciler.rs b/core/archipelago/src/container/boot_reconciler.rs index d52bd3ed..a3e0f65a 100644 --- a/core/archipelago/src/container/boot_reconciler.rs +++ b/core/archipelago/src/container/boot_reconciler.rs @@ -96,6 +96,35 @@ impl BootReconciler { } } + // Companion self-heal runs on its OWN cadence, decoupled from the + // per-app reconcile pass. On a heavily loaded node `reconcile_existing` + // over dozens of apps can take well over a minute, which would delay a + // companion-unit repair (deleted/lost unit file) past any reasonable + // safety window. Detecting + rewriting a companion unit is cheap, so it + // gets a dedicated `interval` loop. The handle is aborted when the main + // loop exits (shutdown uses `notify_one`, so we must NOT add a second + // waiter on `self.shutdown` — it would steal the single wake permit). + let companion_handle = if self.companion_stage { + let orchestrator = self.orchestrator.clone(); + let interval = self.interval; + Some(tokio::spawn(async move { + loop { + let installed = orchestrator.manifest_ids().await; + for (companion, err) in crate::container::companion::reconcile(&installed).await + { + tracing::warn!( + companion = %companion, + error = %err, + "companion reconcile failed" + ); + } + time::sleep(interval).await; + } + })) + } else { + None + }; + // Initial pass: no delay. self.tick().await; @@ -111,23 +140,15 @@ impl BootReconciler { } } } + + if let Some(handle) = companion_handle { + handle.abort(); + } } async fn tick(&self) { let report = self.orchestrator.reconcile_existing().await; Self::log_report(&report); - - if !self.companion_stage { - return; - } - let installed = self.orchestrator.manifest_ids().await; - for (companion, err) in crate::container::companion::reconcile(&installed).await { - tracing::warn!( - companion = %companion, - error = %err, - "companion reconcile failed" - ); - } } fn log_report(report: &ReconcileReport) {