From 45ac9be965550d3e342667e8a31851b027b6c840 Mon Sep 17 00:00:00 2001
From: archipelago
Date: Tue, 16 Jun 2026 11:10:26 -0400
Subject: [PATCH] fix(kiosk): cap chromium resources + drop GPU rasterization
 when headless (#36)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The kiosk chromium pinned ~92% of a core (software-compositing spin from
--enable-gpu-rasterization on a GPU-less/headless node), saturating the
machine and starving the backend + container builds — it caused the .198
receive timeout and the deploy storms.

- archipelago-kiosk.service: CPUQuota=75% + MemoryMax/High + Delegate, so a
  runaway kiosk can never take the whole node down.
- archipelago-kiosk-launcher.sh: detect /dev/dri — use GPU rasterization only
  when a GPU exists, else --disable-gpu (avoids the headless spin).
- bootstrap::ensure_kiosk_hardened: OTA self-heal that installs the updated
  unit+launcher on already-deployed nodes, daemon-reloads, and only
  try-restarts a *running* kiosk (never re-enables an operator-disabled one).

cargo check clean; launcher bash -n clean; unit syntax valid.

Co-Authored-By: Claude Opus 4.8 (1M context)
---
 core/archipelago/src/bootstrap.rs             | 57 +++++++++++++++++++
 core/archipelago/src/main.rs                  |  4 ++
 .../configs/archipelago-kiosk-launcher.sh     | 18 +++++-
 .../configs/archipelago-kiosk.service         | 14 +++++
 4 files changed, 91 insertions(+), 2 deletions(-)

diff --git a/core/archipelago/src/bootstrap.rs b/core/archipelago/src/bootstrap.rs
index cbccf2dd..a39e8254 100644
--- a/core/archipelago/src/bootstrap.rs
+++ b/core/archipelago/src/bootstrap.rs
@@ -30,6 +30,15 @@ const DOCTOR_SH_PATH: &str = "/home/archipelago/archy/scripts/container-doctor.s
 const DOCTOR_SERVICE_PATH: &str = "/etc/systemd/system/archipelago-doctor.service";
 const DOCTOR_TIMER_PATH: &str = "/etc/systemd/system/archipelago-doctor.timer";
 
+// Kiosk hardening (#36): keep the deployed unit + launcher in sync with the
+// repo so the CPU/memory cap and the GPU-vs-headless flag selection reach
+// already-installed nodes via OTA, not just fresh ISOs.
+const KIOSK_SERVICE: &str = include_str!("../../../image-recipe/configs/archipelago-kiosk.service");
+const KIOSK_LAUNCHER: &str =
+    include_str!("../../../image-recipe/configs/archipelago-kiosk-launcher.sh");
+const KIOSK_SERVICE_PATH: &str = "/etc/systemd/system/archipelago-kiosk.service";
+const KIOSK_LAUNCHER_PATH: &str = "/usr/local/bin/archipelago-kiosk-launcher";
+
 const NGINX_CONF_PATH: &str = "/etc/nginx/sites-available/archipelago";
 const NGINX_ENABLED_CONF_PATH: &str = "/etc/nginx/sites-enabled/archipelago";
 /// Per-app proxy snippet included by the HTTPS (:443) server block. Carries its
@@ -573,6 +582,54 @@ pub async fn ensure_archipelago_mount_ordering() {
     }
 }
 
+/// #36 self-heal: keep the kiosk unit + launcher current on already-deployed
+/// nodes so the CPU/memory cap (a runaway chromium was saturating the node and
+/// starving the backend) and the GPU-vs-headless flag selection arrive via OTA.
+///
+/// No-op on nodes without the kiosk installed. Best-effort: every failure is
+/// logged and the run continues; nothing here panics or aborts startup. Only
+/// `try-restart` is used, so a stopped or operator-disabled kiosk is never
+/// started by this function.
+pub async fn ensure_kiosk_hardened() {
+    if fs::metadata(KIOSK_SERVICE_PATH).await.is_err() {
+        return; // kiosk not installed on this node
+    }
+
+    // A failed write is logged and counted as "unchanged" so it cannot, on its
+    // own, trigger a reload/restart.
+    let svc_changed = write_root_if_needed(KIOSK_SERVICE_PATH, KIOSK_SERVICE)
+        .await
+        .unwrap_or_else(|e| {
+            warn!("kiosk hardening: writing {} failed: {:#}", KIOSK_SERVICE_PATH, e);
+            false
+        });
+    let launcher_changed = write_root_if_needed(KIOSK_LAUNCHER_PATH, KIOSK_LAUNCHER)
+        .await
+        .unwrap_or_else(|e| {
+            warn!("kiosk hardening: writing {} failed: {:#}", KIOSK_LAUNCHER_PATH, e);
+            false
+        });
+    if !(svc_changed || launcher_changed) {
+        return; // already current
+    }
+
+    if launcher_changed {
+        // Make sure the freshly written launcher is executable.
+        if let Err(e) = host_sudo(&["chmod", "+x", KIOSK_LAUNCHER_PATH]).await {
+            warn!("kiosk hardening: chmod +x {} failed: {:#}", KIOSK_LAUNCHER_PATH, e);
+        }
+    }
+    if let Err(e) = host_sudo(&["systemctl", "daemon-reload"]).await {
+        warn!("kiosk hardening: daemon-reload failed: {:#}", e);
+    }
+    // try-restart only restarts a currently-active unit — leaves a stopped/
+    // disabled kiosk alone.
+    match host_sudo(&["systemctl", "try-restart", "archipelago-kiosk.service"]).await {
+        Ok(_) => info!("kiosk: applied resource cap + GPU-flag hardening (#36)"),
+        Err(e) => warn!("kiosk hardening: files updated but try-restart failed: {:#}", e),
+    }
+}
+
 /// Patch the nginx site config to add missing backend proxy blocks. Older ISO
 /// configs shipped individual per-endpoint `location` blocks, so missing
 /// endpoints silently fell through to the SPA `index.html` and the frontend
diff --git a/core/archipelago/src/main.rs b/core/archipelago/src/main.rs
index cfea4950..0850a813 100644
--- a/core/archipelago/src/main.rs
+++ b/core/archipelago/src/main.rs
@@ -277,6 +277,10 @@ async fn main() -> Result<()> {
     // only — effective next reboot; never restarts the running service.
     tokio::spawn(bootstrap::ensure_archipelago_mount_ordering());
 
+    // #36: keep the kiosk unit + launcher hardened (CPU/mem cap + GPU-vs-headless
+    // flags) on already-deployed nodes via OTA; no-op if the kiosk isn't installed.
+    tokio::spawn(bootstrap::ensure_kiosk_hardened());
+
     // Spawn periodic container snapshot (for crash recovery)
     crash_recovery::spawn_snapshot_task(config.data_dir.clone());
 
diff --git a/image-recipe/configs/archipelago-kiosk-launcher.sh b/image-recipe/configs/archipelago-kiosk-launcher.sh
index 17a60203..90d77e43 100644
--- a/image-recipe/configs/archipelago-kiosk-launcher.sh
+++ b/image-recipe/configs/archipelago-kiosk-launcher.sh
@@ -79,6 +79,21 @@ xset s noblank 2>/dev/null || true
 pkill -u archipelago -f 'chromium.*localhost' 2>/dev/null || true
 sleep 1
 
+# GPU vs headless (#36). On a real kiosk display with a GPU, GPU rasterization is
+# fast. On a GPU-less / headless server, --enable-gpu-rasterization forces GPU
+# paths that fall back to software compositing and SPIN a full core at ~92% CPU,
+# saturating the node. Detect a render-capable GPU and pick safe flags
+# accordingly. Only DRM *render* nodes (/dev/dri/renderD*) count: KMS-only
+# display devices (BMC/VGA framebuffers, simpledrm) expose /dev/dri/cardN without
+# any 3D engine, and card numbering is not stable, so card0 proves nothing.
+GPU_FLAGS="--disable-gpu --num-raster-threads=1"
+for node in /dev/dri/renderD*; do
+    if [ -e "$node" ]; then
+        GPU_FLAGS="--enable-gpu-rasterization --num-raster-threads=2"
+        break
+    fi
+done
+
 while true; do
     sudo -u archipelago env DISPLAY=:0 HOME=/home/archipelago chromium --kiosk \
         --app=http://localhost/kiosk?safe_area_x=${KIOSK_SAFE_AREA_X_PX:-0}\&safe_area_y=${KIOSK_SAFE_AREA_Y_PX:-0} \
@@ -92,8 +107,7 @@ while true; do
         --disable-save-password-bubble \
         --disable-suggestions-service \
         --disable-component-update \
-        --enable-gpu-rasterization \
-        --num-raster-threads=2 \
+        $GPU_FLAGS \
         --renderer-process-limit=2 \
         --window-size=1920,1080 \
         --window-position=0,0 \
diff --git a/image-recipe/configs/archipelago-kiosk.service b/image-recipe/configs/archipelago-kiosk.service
index 6047fc6e..5db41f3c 100644
--- a/image-recipe/configs/archipelago-kiosk.service
+++ b/image-recipe/configs/archipelago-kiosk.service
@@ -20,5 +20,19 @@ TimeoutStartSec=360
 Restart=always
 RestartSec=5
 
+# Resource guardrail (#36). On GPU-less / headless hardware chromium could spin
+# software compositing at ~92% of a core, saturating the node and starving the
+# backend (it caused the .198 receive timeout + deploy storms). Cap CPU + memory
+# so a runaway kiosk can never take the whole machine down. The limits are set
+# on this unit's cgroup, so they cover every process in it (including the
+# chromium/Xorg children) with or without Delegate=; Delegate=yes only hands the
+# unit's cgroup subtree over to the service itself.
+# CPUQuota is relative to ONE CPU: 75% = at most three quarters of a core.
+# MemoryHigh is the soft throttle/reclaim threshold, MemoryMax the hard limit.
+Delegate=yes
+CPUQuota=75%
+MemoryMax=1500M
+MemoryHigh=1200M
+
 [Install]
 WantedBy=multi-user.target