fix(reconciler): decouple companion self-heal onto its own cadence
The companion-unit repair stage ran at the END of each boot-reconciler tick, after reconcile_existing(). On a heavily loaded node that per-app pass takes more than 60-90s, so a deleted/lost companion unit (electrs-ui, bitcoin-ui, …) wasn't repaired within any reasonable window (gate test 31, 'deleted unit recreated within one reconcile tick', timed out at 90s on the 45-app .228 node). Detecting and rewriting a companion unit is cheap, so it now runs in its own spawned loop that sleeps for the reconciler interval (~30s) between passes, independent of the slow app pass. The task handle is aborted when the main loop exits; the companion loop does not wait on the shutdown signal itself, because shutdown uses notify_one and a second waiter would steal the single wake permit. tick() is now app-reconcile only. All 4 boot_reconciler cadence tests are still green (companion_stage=false in tests). Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
de7d3d83dc
commit
452f05d849
@ -96,6 +96,35 @@ impl BootReconciler {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Companion self-heal runs on its OWN cadence, decoupled from the
|
||||||
|
// per-app reconcile pass. On a heavily loaded node `reconcile_existing`
|
||||||
|
// over dozens of apps can take well over a minute, which would delay a
|
||||||
|
// companion-unit repair (deleted/lost unit file) past any reasonable
|
||||||
|
// safety window. Detecting + rewriting a companion unit is cheap, so it
|
||||||
|
// gets a dedicated `interval` loop. The handle is aborted when the main
|
||||||
|
// loop exits (shutdown uses `notify_one`, so we must NOT add a second
|
||||||
|
// waiter on `self.shutdown` — it would steal the single wake permit).
|
||||||
|
let companion_handle = if self.companion_stage {
|
||||||
|
let orchestrator = self.orchestrator.clone();
|
||||||
|
let interval = self.interval;
|
||||||
|
Some(tokio::spawn(async move {
|
||||||
|
loop {
|
||||||
|
let installed = orchestrator.manifest_ids().await;
|
||||||
|
for (companion, err) in crate::container::companion::reconcile(&installed).await
|
||||||
|
{
|
||||||
|
tracing::warn!(
|
||||||
|
companion = %companion,
|
||||||
|
error = %err,
|
||||||
|
"companion reconcile failed"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
time::sleep(interval).await;
|
||||||
|
}
|
||||||
|
}))
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
};
|
||||||
|
|
||||||
// Initial pass: no delay.
|
// Initial pass: no delay.
|
||||||
self.tick().await;
|
self.tick().await;
|
||||||
|
|
||||||
@ -111,23 +140,15 @@ impl BootReconciler {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if let Some(handle) = companion_handle {
|
||||||
|
handle.abort();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn tick(&self) {
|
async fn tick(&self) {
|
||||||
let report = self.orchestrator.reconcile_existing().await;
|
let report = self.orchestrator.reconcile_existing().await;
|
||||||
Self::log_report(&report);
|
Self::log_report(&report);
|
||||||
|
|
||||||
if !self.companion_stage {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
let installed = self.orchestrator.manifest_ids().await;
|
|
||||||
for (companion, err) in crate::container::companion::reconcile(&installed).await {
|
|
||||||
tracing::warn!(
|
|
||||||
companion = %companion,
|
|
||||||
error = %err,
|
|
||||||
"companion reconcile failed"
|
|
||||||
);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn log_report(report: &ReconcileReport) {
|
fn log_report(report: &ReconcileReport) {
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user