fix(reconcile): keep user-stopped apps stopped (reconciler was resurrecting them)
Problem: calling package.stop on a dependency (e.g. electrumx, which mempool
depends on) does not stick — the reconciler restarts it within ~8s. Two things
combine to cause this: the reconcile filter's dependency_required override
re-includes a user-stopped app that an active app depends on, and the in-memory
disabled set is wiped on manifest reload. As a result ensure_running runs, the
stopped app's unreachable ports look like a fault, the host-port repair restarts
it, and package.stop never sticks (the 'transitions to stopped' gate times out).
Fix: guard ensure_running_with_mode on the on-disk user_stopped marker (the single
choke point every reconcile flows through); when the marker is set for the app,
it returns early with ReconcileAction::Left("user-stopped"). Explicit
install/start clear the marker first (clear_user_stopped is now called in the
orchestrator's install/start, symmetric with disabled.remove; the start/restart
RPC handlers already cleared it), so user actions are unaffected. The container
itself was already being stopped correctly — this change only prevents the
resurrection.
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
29cd167894
commit
760a32bccf
@ -1326,6 +1326,27 @@ impl ProdContainerOrchestrator {
|
||||
self.resolve_dynamic_env(&mut resolved_manifest)?;
|
||||
let name = compute_container_name(&lm.manifest);
|
||||
|
||||
// An explicitly user-stopped app MUST stay stopped. The reconcile filter
|
||||
// already drops user-stopped apps, but its `dependency_required` override
|
||||
// re-includes a stopped app that an *active* app depends on (e.g. mempool
|
||||
// keeps electrumx in the list), and the in-memory `disabled` set is wiped
|
||||
// on manifest reload — so reconcile would resurrect it: its now-unreachable
|
||||
// ports look like a fault, the host-port "repair" restarts it, and
|
||||
// package.stop never sticks. Honour the on-disk marker here, the single
|
||||
// choke point every reconcile flows through. Explicit install/start/restart
|
||||
// clear the marker BEFORE calling this, so they are unaffected.
|
||||
{
|
||||
let user_stopped = crate::crash_recovery::load_user_stopped(&self.data_dir).await;
|
||||
if user_stopped.contains(&app_id) || user_stopped.contains(&name) {
|
||||
tracing::debug!(
|
||||
app_id = %app_id,
|
||||
container = %name,
|
||||
"reconcile skipped — app is user-stopped (must stay stopped)"
|
||||
);
|
||||
return Ok(ReconcileAction::Left("user-stopped".into()));
|
||||
}
|
||||
}
|
||||
|
||||
match self.runtime.get_container_status(&name).await {
|
||||
Ok(status) => {
|
||||
// Phase 3.3: migrate pre-Phase-3 containers in place, but only
|
||||
@ -2839,6 +2860,11 @@ impl ContainerOrchestrator for ProdContainerOrchestrator {
|
||||
let mut state = self.state.write().await;
|
||||
state.disabled.remove(app_id);
|
||||
}
|
||||
// Installing is an explicit "I want this running" action — clear the
|
||||
// user-stopped marker so the reconcile guard in
|
||||
// `ensure_running_with_mode` doesn't skip the very container we're
|
||||
// installing. (start/restart RPC handlers clear it on their side too.)
|
||||
crate::crash_recovery::clear_user_stopped(&self.data_dir, app_id).await;
|
||||
// Idempotent: if the container is already up and healthy, just
|
||||
// refresh hooks and return. If it's stopped, start it. If it's
|
||||
// missing or in a wedged state, install fresh.
|
||||
@ -2882,6 +2908,10 @@ impl ContainerOrchestrator for ProdContainerOrchestrator {
|
||||
let mut state = self.state.write().await;
|
||||
state.disabled.remove(app_id);
|
||||
}
|
||||
// Explicit start clears the user-stopped marker so the reconcile guard in
|
||||
// `ensure_running_with_mode` doesn't skip this container (symmetric with
|
||||
// install; the start/restart RPC handlers also clear it).
|
||||
crate::crash_recovery::clear_user_stopped(&self.data_dir, app_id).await;
|
||||
let lm = self.loaded(app_id).await?;
|
||||
let action = self.ensure_running(&lm).await?;
|
||||
match action {
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user