fix(reconcile): keep user-stopped apps stopped (reconciler was resurrecting them)
Stopping a dependency with package.stop (e.g. electrumx, a mempool dep) does not
stick: the reconciler restarts it within ~8s. The reconcile filter's
dependency_required override re-includes a user-stopped app that an active app
depends on, and the in-memory disabled set is wiped on manifest reload — so
ensure_running runs, the stopped app's unreachable ports look like a fault, the
host-port repair restarts it, and package.stop never sticks (the 'transitions to
stopped' gate times out).
Fix: guard ensure_running_with_mode (the single choke point every reconcile flows
through) on the on-disk user_stopped marker, returning Left('user-stopped') when
the marker is set. Explicit install/start clear the marker first (added
clear_user_stopped to orchestrator install/start, symmetric with disabled.remove;
the start/restart RPC handlers already cleared it), so user actions are
unaffected. The container itself already stopped correctly — this change stops
the resurrection.
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
29cd167894
commit
760a32bccf
@ -1326,6 +1326,27 @@ impl ProdContainerOrchestrator {
|
|||||||
self.resolve_dynamic_env(&mut resolved_manifest)?;
|
self.resolve_dynamic_env(&mut resolved_manifest)?;
|
||||||
let name = compute_container_name(&lm.manifest);
|
let name = compute_container_name(&lm.manifest);
|
||||||
|
|
||||||
|
// An explicitly user-stopped app MUST stay stopped. The reconcile filter
|
||||||
|
// already drops user-stopped apps, but its `dependency_required` override
|
||||||
|
// re-includes a stopped app that an *active* app depends on (e.g. mempool
|
||||||
|
// keeps electrumx in the list), and the in-memory `disabled` set is wiped
|
||||||
|
// on manifest reload — so reconcile would resurrect it: its now-unreachable
|
||||||
|
// ports look like a fault, the host-port "repair" restarts it, and
|
||||||
|
// package.stop never sticks. Honour the on-disk marker here, the single
|
||||||
|
// choke point every reconcile flows through. Explicit install/start/restart
|
||||||
|
// clear the marker BEFORE calling this, so they are unaffected.
|
||||||
|
{
|
||||||
|
let user_stopped = crate::crash_recovery::load_user_stopped(&self.data_dir).await;
|
||||||
|
if user_stopped.contains(&app_id) || user_stopped.contains(&name) {
|
||||||
|
tracing::debug!(
|
||||||
|
app_id = %app_id,
|
||||||
|
container = %name,
|
||||||
|
"reconcile skipped — app is user-stopped (must stay stopped)"
|
||||||
|
);
|
||||||
|
return Ok(ReconcileAction::Left("user-stopped".into()));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
match self.runtime.get_container_status(&name).await {
|
match self.runtime.get_container_status(&name).await {
|
||||||
Ok(status) => {
|
Ok(status) => {
|
||||||
// Phase 3.3: migrate pre-Phase-3 containers in place, but only
|
// Phase 3.3: migrate pre-Phase-3 containers in place, but only
|
||||||
@ -2839,6 +2860,11 @@ impl ContainerOrchestrator for ProdContainerOrchestrator {
|
|||||||
let mut state = self.state.write().await;
|
let mut state = self.state.write().await;
|
||||||
state.disabled.remove(app_id);
|
state.disabled.remove(app_id);
|
||||||
}
|
}
|
||||||
|
// Installing is an explicit "I want this running" action — clear the
|
||||||
|
// user-stopped marker so the reconcile guard in
|
||||||
|
// `ensure_running_with_mode` doesn't skip the very container we're
|
||||||
|
// installing. (start/restart RPC handlers clear it on their side too.)
|
||||||
|
crate::crash_recovery::clear_user_stopped(&self.data_dir, app_id).await;
|
||||||
// Idempotent: if the container is already up and healthy, just
|
// Idempotent: if the container is already up and healthy, just
|
||||||
// refresh hooks and return. If it's stopped, start it. If it's
|
// refresh hooks and return. If it's stopped, start it. If it's
|
||||||
// missing or in a wedged state, install fresh.
|
// missing or in a wedged state, install fresh.
|
||||||
@ -2882,6 +2908,10 @@ impl ContainerOrchestrator for ProdContainerOrchestrator {
|
|||||||
let mut state = self.state.write().await;
|
let mut state = self.state.write().await;
|
||||||
state.disabled.remove(app_id);
|
state.disabled.remove(app_id);
|
||||||
}
|
}
|
||||||
|
// Explicit start clears the user-stopped marker so the reconcile guard in
|
||||||
|
// `ensure_running_with_mode` doesn't skip this container (symmetric with
|
||||||
|
// install; the start/restart RPC handlers also clear it).
|
||||||
|
crate::crash_recovery::clear_user_stopped(&self.data_dir, app_id).await;
|
||||||
let lm = self.loaded(app_id).await?;
|
let lm = self.loaded(app_id).await?;
|
||||||
let action = self.ensure_running(&lm).await?;
|
let action = self.ensure_running(&lm).await?;
|
||||||
match action {
|
match action {
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user