fix(kiosk): cap chromium resources + drop GPU rasterization when headless (#36)
The kiosk chromium pinned ~92% of a core (software-compositing spin from --enable-gpu-rasterization on a GPU-less/headless node), saturating the machine and starving the backend + container builds — it caused the .198 receive timeout and the deploy storms. - archipelago-kiosk.service: CPUQuota=75% + MemoryMax/High + Delegate, so a runaway kiosk can never take the whole node down. - archipelago-kiosk-launcher.sh: detect /dev/dri — use GPU rasterization only when a GPU exists, else --disable-gpu (avoids the headless spin). - bootstrap::ensure_kiosk_hardened: OTA self-heal that installs the updated unit+launcher on already-deployed nodes, daemon-reloads, and only try-restarts a *running* kiosk (never re-enables an operator-disabled one). cargo check clean; launcher bash -n clean; unit syntax valid. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
ab6fcef6f3
commit
45ac9be965
@ -30,6 +30,15 @@ const DOCTOR_SH_PATH: &str = "/home/archipelago/archy/scripts/container-doctor.s
|
||||
const DOCTOR_SERVICE_PATH: &str = "/etc/systemd/system/archipelago-doctor.service";
|
||||
const DOCTOR_TIMER_PATH: &str = "/etc/systemd/system/archipelago-doctor.timer";
|
||||
|
||||
// Kiosk hardening (#36): keep the deployed unit + launcher in sync with the
|
||||
// repo so the CPU/memory cap and the GPU-vs-headless flag selection reach
|
||||
// already-installed nodes via OTA, not just fresh ISOs.
|
||||
const KIOSK_SERVICE: &str = include_str!("../../../image-recipe/configs/archipelago-kiosk.service");
|
||||
const KIOSK_LAUNCHER: &str =
|
||||
include_str!("../../../image-recipe/configs/archipelago-kiosk-launcher.sh");
|
||||
const KIOSK_SERVICE_PATH: &str = "/etc/systemd/system/archipelago-kiosk.service";
|
||||
const KIOSK_LAUNCHER_PATH: &str = "/usr/local/bin/archipelago-kiosk-launcher";
|
||||
|
||||
const NGINX_CONF_PATH: &str = "/etc/nginx/sites-available/archipelago";
|
||||
const NGINX_ENABLED_CONF_PATH: &str = "/etc/nginx/sites-enabled/archipelago";
|
||||
/// Per-app proxy snippet included by the HTTPS (:443) server block. Carries its
|
||||
@ -573,6 +582,35 @@ pub async fn ensure_archipelago_mount_ordering() {
|
||||
}
|
||||
}
|
||||
|
||||
/// #36 self-heal: keep the kiosk unit + launcher current on already-deployed
|
||||
/// nodes so the CPU/memory cap (a runaway chromium was saturating the node and
|
||||
/// starving the backend) and the GPU-vs-headless flag selection arrive via OTA.
|
||||
/// No-op on nodes without the kiosk installed; only restarts the kiosk if it's
|
||||
/// actually running (so it never re-enables an operator-disabled kiosk).
|
||||
pub async fn ensure_kiosk_hardened() {
|
||||
if fs::metadata(KIOSK_SERVICE_PATH).await.is_err() {
|
||||
return; // kiosk not installed on this node
|
||||
}
|
||||
let svc_changed = write_root_if_needed(KIOSK_SERVICE_PATH, KIOSK_SERVICE)
|
||||
.await
|
||||
.unwrap_or(false);
|
||||
let launcher_changed = write_root_if_needed(KIOSK_LAUNCHER_PATH, KIOSK_LAUNCHER)
|
||||
.await
|
||||
.unwrap_or(false);
|
||||
if launcher_changed {
|
||||
let _ = host_sudo(&["chmod", "+x", KIOSK_LAUNCHER_PATH]).await;
|
||||
}
|
||||
if svc_changed || launcher_changed {
|
||||
if let Err(e) = host_sudo(&["systemctl", "daemon-reload"]).await {
|
||||
warn!("kiosk hardening: daemon-reload failed: {:#}", e);
|
||||
}
|
||||
// try-restart only restarts a currently-active unit — leaves a stopped/
|
||||
// disabled kiosk alone.
|
||||
let _ = host_sudo(&["systemctl", "try-restart", "archipelago-kiosk.service"]).await;
|
||||
info!("kiosk: applied resource cap + GPU-flag hardening (#36)");
|
||||
}
|
||||
}
|
||||
|
||||
/// Patch the nginx site config to add missing backend proxy blocks. Older ISO
|
||||
/// configs shipped individual per-endpoint `location` blocks, so missing
|
||||
/// endpoints silently fell through to the SPA `index.html` and the frontend
|
||||
|
||||
@ -277,6 +277,10 @@ async fn main() -> Result<()> {
|
||||
// only — effective next reboot; never restarts the running service.
|
||||
tokio::spawn(bootstrap::ensure_archipelago_mount_ordering());
|
||||
|
||||
// #36: keep the kiosk unit + launcher hardened (CPU/mem cap + GPU-vs-headless
|
||||
// flags) on already-deployed nodes via OTA; no-op if the kiosk isn't installed.
|
||||
tokio::spawn(bootstrap::ensure_kiosk_hardened());
|
||||
|
||||
// Spawn periodic container snapshot (for crash recovery)
|
||||
crash_recovery::spawn_snapshot_task(config.data_dir.clone());
|
||||
|
||||
|
||||
@ -79,6 +79,16 @@ xset s noblank 2>/dev/null || true
|
||||
pkill -u archipelago -f 'chromium.*localhost' 2>/dev/null || true
|
||||
sleep 1
|
||||
|
||||
# GPU vs headless (#36). On a real kiosk display with a GPU, GPU rasterization is
|
||||
# fast. On a GPU-less / headless server (no /dev/dri), --enable-gpu-rasterization
|
||||
# forces GPU paths that fall back to software compositing and SPIN a full core at
|
||||
# ~92% CPU, saturating the node. Detect the GPU and pick safe flags accordingly.
|
||||
# Treat any DRM card or render node as "GPU present". Probing only card0 /
# renderD128 misses devices enumerated at other minors (e.g. card1 on hosts
# where the GPU is not the first DRM device).
HAS_GPU=0
for dri_node in /dev/dri/card* /dev/dri/renderD*; do
    # An unmatched glob is left as the literal pattern, which -e rejects.
    if [ -e "$dri_node" ]; then
        HAS_GPU=1
        break
    fi
done
if [ "$HAS_GPU" -eq 1 ]; then
    GPU_FLAGS="--enable-gpu-rasterization --num-raster-threads=2"
else
    GPU_FLAGS="--disable-gpu --num-raster-threads=1"
fi
|
||||
|
||||
while true; do
|
||||
sudo -u archipelago env DISPLAY=:0 HOME=/home/archipelago chromium --kiosk \
|
||||
--app=http://localhost/kiosk?safe_area_x=${KIOSK_SAFE_AREA_X_PX:-0}\&safe_area_y=${KIOSK_SAFE_AREA_Y_PX:-0} \
|
||||
@ -92,8 +102,7 @@ while true; do
|
||||
--disable-save-password-bubble \
|
||||
--disable-suggestions-service \
|
||||
--disable-component-update \
|
||||
--enable-gpu-rasterization \
|
||||
--num-raster-threads=2 \
|
||||
$GPU_FLAGS \
|
||||
--renderer-process-limit=2 \
|
||||
--window-size=1920,1080 \
|
||||
--window-position=0,0 \
|
||||
|
||||
@ -20,5 +20,15 @@ TimeoutStartSec=360
|
||||
Restart=always
|
||||
RestartSec=5
|
||||
|
||||
# Resource guardrail (#36). On GPU-less / headless hardware chromium could spin
|
||||
# software compositing at ~92% of a core, saturating the node and starving the
|
||||
# backend (it caused the .198 receive timeout + deploy storms). Cap CPU + memory
|
||||
# so a runaway kiosk can never take the whole machine down. The limits apply to
# every process in this unit's cgroup (chromium/Xorg children included);
# Delegate only lets the unit manage its own sub-cgroups.
|
||||
Delegate=yes
|
||||
CPUQuota=75%
|
||||
MemoryMax=1500M
|
||||
MemoryHigh=1200M
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user