archy/core/archipelago/src/api/rpc/transitional.rs
archipelago cd6f8bad70 fix(install-log): pre-create /var/log/archipelago/ so non-root backend can write
The backend runs as `archipelago` and calls `install_log()` to append
audit lines to the install log on every install / update / remove /
start / stop / restart. Target path was /var/log/archipelago-container-installs.log,
which does not exist and cannot be created by the service because
/var/log/ is root-owned. OpenOptions errors were silently swallowed,
so the log was never written on any node.

Ship a tmpfiles.d rule that pre-creates /var/log/archipelago/ and
container-installs.log with archipelago:archipelago ownership. Move
the const path to match, keeping logs inside the directory logrotate
already rotates (image-recipe/configs/logrotate.conf). Install the
rule from both the ISO build and self-update, and apply it
immediately on self-update so existing nodes get a working log
without needing a reboot.

Verified on .228: file created, backend user can write, backend
binary rebuilt with new const.
2026-04-23 12:02:46 -04:00

172 lines
7.0 KiB
Rust

//! Async lifecycle helper for container Stop/Start/Restart RPCs.
//!
//! The `ContainerOrchestrator` trait is intentionally synchronous in the
//! run-to-completion sense: its methods are `async`, but each call is awaited
//! until the operation has finished, which keeps the reconciler, boot flow,
//! chaos harness, and unit tests deterministic. But the RPC layer must return
//! to the UI in <1s so the dashboard can render a transitional "Stopping…" /
//! "Starting…" label while the underlying `podman stop` (up to 600s for
//! bitcoin-core) runs in the background.
//!
//! `RpcHandler::spawn_transitional` bridges the two: it
//! 1. flips the package state in `StateManager` to the appropriate
//! transitional variant (`Stopping` / `Starting` / `Restarting`),
//! which fans out to WebSocket clients immediately.
//! 2. `tokio::spawn`s the actual orchestrator call.
//! 3. on success, writes the final state (`Stopped` / `Running`).
//! 4. on error, reverts to the pre-transition state and logs via
//! `install_log()` so the incident shows up in
//! `/var/log/archipelago/container-installs.log`.
//!
//! The server.rs package-scan loop must also be taught to preserve
//! transitional states — see `server.rs:scan_and_update_packages`'s merge
//! logic and the companion `merge_preserving_transitional` helper.
use super::package::install_log;
use super::RpcHandler;
use crate::container::ContainerOrchestrator;
use crate::data_model::PackageState;
use crate::state::StateManager;
use anyhow::Result;
use std::sync::Arc;
use tracing::{error, info, warn};
/// The three transitional lifecycle operations that run asynchronously from
/// the RPC handler. `Install` and `Remove` are intentionally NOT here — they
/// already have their own progress-tracking paths (`install_progress`,
/// `uninstall_stage`) with multi-step UI feedback.
///
/// The type is `Copy` because its helper methods take `self` by value and a
/// single `Op` is consulted several times while one request is handled.
#[derive(Debug, Clone, Copy)]
pub(super) enum Op {
/// Stop the container: `Stopping` while in flight, `Stopped` on success.
Stop,
/// Start the container: `Starting` while in flight, `Running` on success.
Start,
/// Restart the container: `Restarting` while in flight, `Running` on success.
Restart,
}
impl Op {
    /// In-flight `PackageState` shown on the entry while this operation runs.
    ///
    /// The package-scan merge in server.rs (`merge_preserving_transitional`)
    /// is expected to keep this variant in place instead of replacing it with
    /// whatever podman currently reports.
    fn transitional_state(self) -> PackageState {
        match self {
            Self::Stop => PackageState::Stopping,
            Self::Start => PackageState::Starting,
            Self::Restart => PackageState::Restarting,
        }
    }

    /// Resting `PackageState` recorded once the operation has succeeded.
    ///
    /// Failures never use this value; the caller restores the pre-transition
    /// state instead.
    fn final_state_on_success(self) -> PackageState {
        match self {
            Self::Stop => PackageState::Stopped,
            // A successful start and a successful restart both end up running.
            Self::Start | Self::Restart => PackageState::Running,
        }
    }

    /// Upper-case tag that begins every `install_log` line for this
    /// operation, so a failed operation can be grepped out of the log later.
    fn log_prefix(self) -> &'static str {
        match self {
            Self::Stop => "STOP",
            Self::Start => "START",
            Self::Restart => "RESTART",
        }
    }

    /// Run this operation against `orch` for `app_id` and wait for its result.
    ///
    /// This is the single place that maps an `Op` onto the matching
    /// orchestrator method, so the spawned task does not need its own match.
    async fn dispatch(self, orch: &dyn ContainerOrchestrator, app_id: &str) -> Result<()> {
        match self {
            Self::Stop => orch.stop(app_id).await,
            Self::Start => orch.start(app_id).await,
            Self::Restart => orch.restart(app_id).await,
        }
    }
}
impl RpcHandler {
    /// Start `op` for `app_id` in the background and return immediately.
    ///
    /// Steps, in order:
    /// 1. Fail fast if no container orchestrator is configured.
    /// 2. Flip the package entry to `op.transitional_state()`, remembering
    ///    the previous state for a possible revert.
    /// 3. Append a `<PREFIX>: <app_id>` line via `install_log`.
    /// 4. `tokio::spawn` a task that runs `op.dispatch()`. On success it
    ///    writes `op.final_state_on_success()`; on error it logs the failure
    ///    and restores the previous state.
    ///
    /// If no package entry exists for `app_id` (e.g. Start on a container
    /// that was never installed), no pre-state is recorded and the spawn
    /// still runs — the post-success state write no-ops unless an entry has
    /// appeared in the meantime, and on failure nothing is reverted. Either
    /// way the next scan is left to reconcile the entry. This keeps the
    /// helper usable for stacks that lazily create their entries.
    ///
    /// # Errors
    ///
    /// Returns an error only when the orchestrator is unavailable. A failure
    /// of the operation itself happens after this function has returned and
    /// is reported through `tracing` and `install_log`, not through the
    /// returned `Result`.
    pub(super) async fn spawn_transitional(&self, op: Op, app_id: String) -> Result<()> {
        let orchestrator = self
            .orchestrator
            .as_ref()
            .ok_or_else(|| anyhow::anyhow!("Container orchestrator not available"))?
            .clone();
        let state_manager = Arc::clone(&self.state_manager);

        // Snapshot pre-transition state (for revert on error) and flip to the
        // transitional variant. Done BEFORE the spawn so the WebSocket push
        // beats the RPC response — the UI should see "Stopping…" the moment
        // it gets the RPC ok, not on the next scan.
        //
        // NOTE(review): if the entry is already in a transitional state when
        // this runs (e.g. a second Stop arrives while the first is still in
        // flight), `pre_state` is that transitional variant and a failure
        // below restores it. Confirm that callers reject overlapping
        // operations, otherwise the entry may be left showing a transition
        // that nothing is driving.
        let pre_state =
            flip_to_transitional(&state_manager, &app_id, op.transitional_state()).await;

        let log_prefix = op.log_prefix();
        // `app_id` is still owned here; it is only moved into the task below,
        // so no extra clone is needed for the audit line.
        install_log(&format!("{}: {}", log_prefix, app_id)).await;

        // Detached on purpose: the RPC must not wait for the orchestrator.
        tokio::spawn(async move {
            match op.dispatch(orchestrator.as_ref(), &app_id).await {
                Ok(()) => {
                    info!("{} complete: {}", log_prefix, app_id);
                    set_state(&state_manager, &app_id, op.final_state_on_success()).await;
                }
                Err(e) => {
                    error!("{} failed for {}: {:#}", log_prefix, app_id, e);
                    // Keep a separator between the app id and the error text
                    // so the audit line stays readable and greppable.
                    install_log(&format!("{} FAIL: {}: {:#}", log_prefix, app_id, e)).await;
                    // Revert to pre-transition state if we had one; otherwise
                    // leave the entry untouched so the next scan reconciles.
                    if let Some(prev) = pre_state {
                        set_state(&state_manager, &app_id, prev).await;
                    } else {
                        warn!(
                            "{}: no pre-transition state recorded for {}; leaving entry to next scan",
                            log_prefix, app_id
                        );
                    }
                }
            }
        });

        Ok(())
    }
}
/// Put the entry for `app_id` into the `transitional` state.
///
/// Works on a snapshot from `state_manager`: the entry's state is swapped for
/// `transitional` and the modified snapshot is handed back through
/// `update_data`.
///
/// Returns the state the entry held before the swap, or `None` when there is
/// no entry for `app_id` (in which case nothing is written).
async fn flip_to_transitional(
    state_manager: &StateManager,
    app_id: &str,
    transitional: PackageState,
) -> Option<PackageState> {
    let (mut snapshot, _) = state_manager.get_snapshot().await;

    // Swap the new state in and take the old one out in a single lookup.
    let previous = match snapshot.package_data.get_mut(app_id) {
        Some(entry) => std::mem::replace(&mut entry.state, transitional),
        None => return None,
    };

    state_manager.update_data(snapshot).await;
    Some(previous)
}
/// Store `new_state` on the entry for `app_id`.
///
/// Nothing is written when the entry no longer exists (e.g. an uninstall ran
/// concurrently) or when it already holds `new_state`, so no redundant update
/// is pushed through `update_data`.
async fn set_state(state_manager: &StateManager, app_id: &str, new_state: PackageState) {
    let (mut snapshot, _) = state_manager.get_snapshot().await;

    match snapshot.package_data.get_mut(app_id) {
        // Only an existing entry whose state actually changes is worth a write.
        Some(entry) if entry.state != new_state => entry.state = new_state,
        _ => return,
    }

    state_manager.update_data(snapshot).await;
}