diff --git a/core/archipelago/src/container/prod_orchestrator.rs b/core/archipelago/src/container/prod_orchestrator.rs
index 1dac2496..a45cee5d 100644
--- a/core/archipelago/src/container/prod_orchestrator.rs
+++ b/core/archipelago/src/container/prod_orchestrator.rs
@@ -1326,6 +1326,27 @@ impl ProdContainerOrchestrator {
         self.resolve_dynamic_env(&mut resolved_manifest)?;
         let name = compute_container_name(&lm.manifest);
 
+        // An explicitly user-stopped app MUST stay stopped. The reconcile filter
+        // already drops user-stopped apps, but its `dependency_required` override
+        // re-includes a stopped app that an *active* app depends on (e.g. mempool
+        // keeps electrumx in the list), and the in-memory `disabled` set is wiped
+        // on manifest reload — so reconcile would resurrect it: its now-unreachable
+        // ports look like a fault, the host-port "repair" restarts it, and
+        // package.stop never sticks. Honour the on-disk marker here, the single
+        // choke point every reconcile flows through. Explicit install/start/restart
+        // clear the marker BEFORE calling this, so they are unaffected.
+        {
+            let user_stopped = crate::crash_recovery::load_user_stopped(&self.data_dir).await;
+            if user_stopped.contains(&app_id) || user_stopped.contains(&name) {
+                tracing::debug!(
+                    app_id = %app_id,
+                    container = %name,
+                    "reconcile skipped — app is user-stopped (must stay stopped)"
+                );
+                return Ok(ReconcileAction::Left("user-stopped".into()));
+            }
+        }
+
         match self.runtime.get_container_status(&name).await {
             Ok(status) => {
                 // Phase 3.3: migrate pre-Phase-3 containers in place, but only
@@ -2839,6 +2860,11 @@ impl ContainerOrchestrator for ProdContainerOrchestrator {
             let mut state = self.state.write().await;
             state.disabled.remove(app_id);
         }
+        // Installing is an explicit "I want this running" action — clear the
+        // user-stopped marker so the new reconcile guard in
+        // `ensure_running_with_mode` doesn't skip the very container we're
+        // installing. (start/restart RPC handlers clear it on their side too.)
+        crate::crash_recovery::clear_user_stopped(&self.data_dir, app_id).await;
         // Idempotent: if the container is already up and healthy, just
         // refresh hooks and return. If it's stopped, start it. If it's
         // missing or in a wedged state, install fresh.
@@ -2882,6 +2908,10 @@ impl ContainerOrchestrator for ProdContainerOrchestrator {
             let mut state = self.state.write().await;
             state.disabled.remove(app_id);
         }
+        // Explicit start clears the user-stopped marker so the reconcile guard in
+        // `ensure_running_with_mode` doesn't skip this container (symmetric with
+        // install; the start/restart RPC handlers also clear it).
+        crate::crash_recovery::clear_user_stopped(&self.data_dir, app_id).await;
         let lm = self.loaded(app_id).await?;
         let action = self.ensure_running(&lm).await?;
         match action {