fix(reconciler): decouple companion self-heal onto its own cadence
The companion-unit repair stage ran at the END of each boot-reconciler tick, after reconcile_existing(). On a heavily loaded node that per-app pass takes more than 60-90s, so a deleted/lost companion unit (electrs-ui, bitcoin-ui, …) wasn't repaired within any reasonable window (gate test 31, 'deleted unit recreated within one reconcile tick', timed out at 90s on the 45-app .228 node). Detecting + rewriting a companion unit is cheap, so it is now spawned as its own ~interval(30s) loop, independent of the slow app pass. The handle is aborted when the main loop exits (shutdown uses notify_one, so a second waiter on the shutdown signal would steal the wake permit). tick() is now app-reconcile only. All 4 boot_reconciler cadence tests are still green (companion_stage=false in tests). Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
de7d3d83dc
commit
452f05d849
@ -96,6 +96,35 @@ impl BootReconciler {
|
||||
}
|
||||
}
|
||||
|
||||
// Companion self-heal runs on its OWN cadence, decoupled from the
|
||||
// per-app reconcile pass. On a heavily loaded node `reconcile_existing`
|
||||
// over dozens of apps can take well over a minute, which would delay a
|
||||
// companion-unit repair (deleted/lost unit file) past any reasonable
|
||||
// safety window. Detecting + rewriting a companion unit is cheap, so it
|
||||
// gets a dedicated `interval` loop. The handle is aborted when the main
|
||||
// loop exits (shutdown uses `notify_one`, so we must NOT add a second
|
||||
// waiter on `self.shutdown` — it would steal the single wake permit).
|
||||
let companion_handle = if self.companion_stage {
|
||||
let orchestrator = self.orchestrator.clone();
|
||||
let interval = self.interval;
|
||||
Some(tokio::spawn(async move {
|
||||
loop {
|
||||
let installed = orchestrator.manifest_ids().await;
|
||||
for (companion, err) in crate::container::companion::reconcile(&installed).await
|
||||
{
|
||||
tracing::warn!(
|
||||
companion = %companion,
|
||||
error = %err,
|
||||
"companion reconcile failed"
|
||||
);
|
||||
}
|
||||
time::sleep(interval).await;
|
||||
}
|
||||
}))
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
// Initial pass: no delay.
|
||||
self.tick().await;
|
||||
|
||||
@ -111,23 +140,15 @@ impl BootReconciler {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(handle) = companion_handle {
|
||||
handle.abort();
|
||||
}
|
||||
}
|
||||
|
||||
async fn tick(&self) {
|
||||
let report = self.orchestrator.reconcile_existing().await;
|
||||
Self::log_report(&report);
|
||||
|
||||
if !self.companion_stage {
|
||||
return;
|
||||
}
|
||||
let installed = self.orchestrator.manifest_ids().await;
|
||||
for (companion, err) in crate::container::companion::reconcile(&installed).await {
|
||||
tracing::warn!(
|
||||
companion = %companion,
|
||||
error = %err,
|
||||
"companion reconcile failed"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
fn log_report(report: &ReconcileReport) {
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user