fix(kiosk): cap chromium resources + drop GPU rasterization when headless (#36)

The kiosk chromium pinned ~92% of a core (software-compositing spin from
--enable-gpu-rasterization on a GPU-less/headless node), saturating the machine
and starving the backend + container builds — it caused the .198 receive timeout
and the deploy storms.

- archipelago-kiosk.service: CPUQuota=75% + MemoryMax/High + Delegate, so a
  runaway kiosk can never take the whole node down.
- archipelago-kiosk-launcher.sh: detect /dev/dri — use GPU rasterization only
  when a GPU exists, else --disable-gpu (avoids the headless spin).
- bootstrap::ensure_kiosk_hardened: OTA self-heal that installs the updated
  unit+launcher on already-deployed nodes, daemon-reloads, and only try-restarts
  a *running* kiosk (never re-enables an operator-disabled one).

cargo check clean; launcher bash -n clean; unit syntax valid.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
archipelago 2026-06-16 11:10:26 -04:00
parent ab6fcef6f3
commit 45ac9be965
4 changed files with 63 additions and 2 deletions

View File

@ -30,6 +30,15 @@ const DOCTOR_SH_PATH: &str = "/home/archipelago/archy/scripts/container-doctor.s
const DOCTOR_SERVICE_PATH: &str = "/etc/systemd/system/archipelago-doctor.service";
const DOCTOR_TIMER_PATH: &str = "/etc/systemd/system/archipelago-doctor.timer";
// Kiosk hardening (#36): keep the deployed unit + launcher in sync with the
// repo so the CPU/memory cap and the GPU-vs-headless flag selection reach
// already-installed nodes via OTA, not just fresh ISOs.
const KIOSK_SERVICE: &str = include_str!("../../../image-recipe/configs/archipelago-kiosk.service");
const KIOSK_LAUNCHER: &str =
include_str!("../../../image-recipe/configs/archipelago-kiosk-launcher.sh");
const KIOSK_SERVICE_PATH: &str = "/etc/systemd/system/archipelago-kiosk.service";
const KIOSK_LAUNCHER_PATH: &str = "/usr/local/bin/archipelago-kiosk-launcher";
const NGINX_CONF_PATH: &str = "/etc/nginx/sites-available/archipelago";
const NGINX_ENABLED_CONF_PATH: &str = "/etc/nginx/sites-enabled/archipelago";
/// Per-app proxy snippet included by the HTTPS (:443) server block. Carries its
@ -573,6 +582,35 @@ pub async fn ensure_archipelago_mount_ordering() {
}
}
/// #36 self-heal: keep the kiosk unit + launcher current on already-deployed
/// nodes so the CPU/memory cap (a runaway chromium was saturating the node and
/// starving the backend) and the GPU-vs-headless flag selection arrive via OTA.
/// No-op on nodes without the kiosk installed; only restarts the kiosk if it's
/// actually running (so it never re-enables an operator-disabled kiosk).
pub async fn ensure_kiosk_hardened() {
if fs::metadata(KIOSK_SERVICE_PATH).await.is_err() {
return; // kiosk not installed on this node
}
let svc_changed = write_root_if_needed(KIOSK_SERVICE_PATH, KIOSK_SERVICE)
.await
.unwrap_or(false);
let launcher_changed = write_root_if_needed(KIOSK_LAUNCHER_PATH, KIOSK_LAUNCHER)
.await
.unwrap_or(false);
if launcher_changed {
let _ = host_sudo(&["chmod", "+x", KIOSK_LAUNCHER_PATH]).await;
}
if svc_changed || launcher_changed {
if let Err(e) = host_sudo(&["systemctl", "daemon-reload"]).await {
warn!("kiosk hardening: daemon-reload failed: {:#}", e);
}
// try-restart only restarts a currently-active unit — leaves a stopped/
// disabled kiosk alone.
let _ = host_sudo(&["systemctl", "try-restart", "archipelago-kiosk.service"]).await;
info!("kiosk: applied resource cap + GPU-flag hardening (#36)");
}
}
/// Patch the nginx site config to add missing backend proxy blocks. Older ISO
/// configs shipped individual per-endpoint `location` blocks, so missing
/// endpoints silently fell through to the SPA `index.html` and the frontend

View File

@ -277,6 +277,10 @@ async fn main() -> Result<()> {
// only — effective next reboot; never restarts the running service.
tokio::spawn(bootstrap::ensure_archipelago_mount_ordering());
// #36: keep the kiosk unit + launcher hardened (CPU/mem cap + GPU-vs-headless
// flags) on already-deployed nodes via OTA; no-op if the kiosk isn't installed.
tokio::spawn(bootstrap::ensure_kiosk_hardened());
// Spawn periodic container snapshot (for crash recovery)
crash_recovery::spawn_snapshot_task(config.data_dir.clone());

View File

@ -79,6 +79,16 @@ xset s noblank 2>/dev/null || true
pkill -u archipelago -f 'chromium.*localhost' 2>/dev/null || true
sleep 1
# GPU vs headless (#36). On a real kiosk display with a GPU, GPU rasterization is
# fast. On a GPU-less / headless server (no /dev/dri), --enable-gpu-rasterization
# forces GPU paths that fall back to software compositing and SPIN a full core at
# ~92% CPU, saturating the node. Detect the GPU and pick safe flags accordingly.
if [ -e /dev/dri/card0 ] || [ -e /dev/dri/renderD128 ]; then
GPU_FLAGS="--enable-gpu-rasterization --num-raster-threads=2"
else
GPU_FLAGS="--disable-gpu --num-raster-threads=1"
fi
while true; do
sudo -u archipelago env DISPLAY=:0 HOME=/home/archipelago chromium --kiosk \
--app=http://localhost/kiosk?safe_area_x=${KIOSK_SAFE_AREA_X_PX:-0}\&safe_area_y=${KIOSK_SAFE_AREA_Y_PX:-0} \
@ -92,8 +102,7 @@ while true; do
--disable-save-password-bubble \
--disable-suggestions-service \
--disable-component-update \
--enable-gpu-rasterization \
--num-raster-threads=2 \
$GPU_FLAGS \
--renderer-process-limit=2 \
--window-size=1920,1080 \
--window-position=0,0 \

View File

@ -20,5 +20,15 @@ TimeoutStartSec=360
Restart=always
RestartSec=5
# Resource guardrail (#36). On GPU-less / headless hardware chromium could spin
# software compositing at ~92% of a core, saturating the node and starving the
# backend (it caused the .198 receive timeout + deploy storms). Cap CPU + memory
# so a runaway kiosk can never take the whole machine down; Delegate so the cap
# also binds the chromium/Xorg children in this unit's cgroup.
Delegate=yes
CPUQuota=75%
MemoryMax=1500M
MemoryHigh=1200M
[Install]
WantedBy=multi-user.target