From 760a32bccf1e64b51e5f821ef08b8804e85514b2 Mon Sep 17 00:00:00 2001 From: archipelago Date: Mon, 22 Jun 2026 09:04:02 -0400 Subject: [PATCH] fix(reconcile): keep user-stopped apps stopped (reconciler was resurrecting them) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Calling package.stop on a dependency (e.g. electrumx, a mempool dep) makes the reconciler restart it within ~8s: the reconcile filter's dependency_required override re-includes a user-stopped app that an active app depends on, and the in-memory disabled set is wiped on manifest reload — so ensure_running runs, the stopped app's unreachable ports look like a fault, the host-port repair restarts it, and package.stop never sticks (gate 'transitions to stopped' times out). Fix: guard ensure_running_with_mode on the on-disk user_stopped marker (the single choke point every reconcile flows through) so that it returns Left('user-stopped') for such apps. Explicit install/start clear the marker first (added clear_user_stopped to orchestrator install/start, symmetric with disabled.remove; start/restart RPC already cleared it) so user actions are unaffected. The container itself already stopped correctly — this change only stops the resurrection. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../src/container/prod_orchestrator.rs | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/core/archipelago/src/container/prod_orchestrator.rs b/core/archipelago/src/container/prod_orchestrator.rs index 1dac2496..a45cee5d 100644 --- a/core/archipelago/src/container/prod_orchestrator.rs +++ b/core/archipelago/src/container/prod_orchestrator.rs @@ -1326,6 +1326,27 @@ impl ProdContainerOrchestrator { self.resolve_dynamic_env(&mut resolved_manifest)?; let name = compute_container_name(&lm.manifest); + // An explicitly user-stopped app MUST stay stopped. 
The reconcile filter + // already drops user-stopped apps, but its `dependency_required` override + // re-includes a stopped app that an *active* app depends on (e.g. mempool + // keeps electrumx in the list), and the in-memory `disabled` set is wiped + // on manifest reload — so reconcile would resurrect it: its now-unreachable + // ports look like a fault, the host-port "repair" restarts it, and + // package.stop never sticks. Honour the on-disk marker here, the single + // choke point every reconcile flows through. Explicit install/start/restart + // clear the marker BEFORE calling this, so they are unaffected. + { + let user_stopped = crate::crash_recovery::load_user_stopped(&self.data_dir).await; + if user_stopped.contains(&app_id) || user_stopped.contains(&name) { + tracing::debug!( + app_id = %app_id, + container = %name, + "reconcile skipped — app is user-stopped (must stay stopped)" + ); + return Ok(ReconcileAction::Left("user-stopped".into())); + } + } + match self.runtime.get_container_status(&name).await { Ok(status) => { // Phase 3.3: migrate pre-Phase-3 containers in place, but only @@ -2839,6 +2860,11 @@ impl ContainerOrchestrator for ProdContainerOrchestrator { let mut state = self.state.write().await; state.disabled.remove(app_id); } + // Installing is an explicit "I want this running" action — clear the + // user-stopped marker so the new reconcile guard in + // `ensure_running_with_mode` doesn't skip the very container we're + // installing. (start/restart RPC handlers clear it on their side too.) + crate::crash_recovery::clear_user_stopped(&self.data_dir, app_id).await; // Idempotent: if the container is already up and healthy, just // refresh hooks and return. If it's stopped, start it. If it's // missing or in a wedged state, install fresh. 
@@ -2882,6 +2908,10 @@ impl ContainerOrchestrator for ProdContainerOrchestrator { let mut state = self.state.write().await; state.disabled.remove(app_id); } + // Explicit start clears the user-stopped marker so the reconcile guard in + // `ensure_running_with_mode` doesn't skip this container (symmetric with + // install; the start/restart RPC handlers also clear it). + crate::crash_recovery::clear_user_stopped(&self.data_dir, app_id).await; let lm = self.loaded(app_id).await?; let action = self.ensure_running(&lm).await?; match action {