fix(kiosk): cap chromium resources + drop GPU rasterization when headless (#36)
The kiosk chromium pinned ~92% of a core (software-compositing spin from --enable-gpu-rasterization on a GPU-less/headless node), saturating the machine and starving the backend + container builds — it caused the .198 receive timeout and the deploy storms.
- archipelago-kiosk.service: CPUQuota=75% + MemoryMax/High + Delegate, so a runaway kiosk can never take the whole node down.
- archipelago-kiosk-launcher.sh: detect a GPU via /dev/dri (card0 or renderD128) — use GPU rasterization only when one exists, else --disable-gpu (avoids the headless spin).
- bootstrap::ensure_kiosk_hardened: OTA self-heal that installs the updated unit+launcher on already-deployed nodes, daemon-reloads, and only try-restarts a *running* kiosk (never re-enables an operator-disabled one).
cargo check clean; launcher bash -n clean; unit syntax valid.
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
ab6fcef6f3
commit
45ac9be965
@ -30,6 +30,15 @@ const DOCTOR_SH_PATH: &str = "/home/archipelago/archy/scripts/container-doctor.s
|
|||||||
const DOCTOR_SERVICE_PATH: &str = "/etc/systemd/system/archipelago-doctor.service";
|
const DOCTOR_SERVICE_PATH: &str = "/etc/systemd/system/archipelago-doctor.service";
|
||||||
const DOCTOR_TIMER_PATH: &str = "/etc/systemd/system/archipelago-doctor.timer";
|
const DOCTOR_TIMER_PATH: &str = "/etc/systemd/system/archipelago-doctor.timer";
|
||||||
|
|
||||||
|
// Kiosk hardening (#36): keep the deployed unit + launcher in sync with the
|
||||||
|
// repo so the CPU/memory cap and the GPU-vs-headless flag selection reach
|
||||||
|
// already-installed nodes via OTA, not just fresh ISOs.
|
||||||
|
const KIOSK_SERVICE: &str = include_str!("../../../image-recipe/configs/archipelago-kiosk.service");
|
||||||
|
const KIOSK_LAUNCHER: &str =
|
||||||
|
include_str!("../../../image-recipe/configs/archipelago-kiosk-launcher.sh");
|
||||||
|
const KIOSK_SERVICE_PATH: &str = "/etc/systemd/system/archipelago-kiosk.service";
|
||||||
|
const KIOSK_LAUNCHER_PATH: &str = "/usr/local/bin/archipelago-kiosk-launcher";
|
||||||
|
|
||||||
const NGINX_CONF_PATH: &str = "/etc/nginx/sites-available/archipelago";
|
const NGINX_CONF_PATH: &str = "/etc/nginx/sites-available/archipelago";
|
||||||
const NGINX_ENABLED_CONF_PATH: &str = "/etc/nginx/sites-enabled/archipelago";
|
const NGINX_ENABLED_CONF_PATH: &str = "/etc/nginx/sites-enabled/archipelago";
|
||||||
/// Per-app proxy snippet included by the HTTPS (:443) server block. Carries its
|
/// Per-app proxy snippet included by the HTTPS (:443) server block. Carries its
|
||||||
@ -573,6 +582,35 @@ pub async fn ensure_archipelago_mount_ordering() {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// #36 self-heal: keep the kiosk unit + launcher current on already-deployed
|
||||||
|
/// nodes so the CPU/memory cap (a runaway chromium was saturating the node and
|
||||||
|
/// starving the backend) and the GPU-vs-headless flag selection arrive via OTA.
|
||||||
|
/// No-op on nodes without the kiosk installed; only restarts the kiosk if it's
|
||||||
|
/// actually running (so it never re-enables an operator-disabled kiosk).
|
||||||
|
pub async fn ensure_kiosk_hardened() {
|
||||||
|
if fs::metadata(KIOSK_SERVICE_PATH).await.is_err() {
|
||||||
|
return; // kiosk not installed on this node
|
||||||
|
}
|
||||||
|
let svc_changed = write_root_if_needed(KIOSK_SERVICE_PATH, KIOSK_SERVICE)
|
||||||
|
.await
|
||||||
|
.unwrap_or(false);
|
||||||
|
let launcher_changed = write_root_if_needed(KIOSK_LAUNCHER_PATH, KIOSK_LAUNCHER)
|
||||||
|
.await
|
||||||
|
.unwrap_or(false);
|
||||||
|
if launcher_changed {
|
||||||
|
let _ = host_sudo(&["chmod", "+x", KIOSK_LAUNCHER_PATH]).await;
|
||||||
|
}
|
||||||
|
if svc_changed || launcher_changed {
|
||||||
|
if let Err(e) = host_sudo(&["systemctl", "daemon-reload"]).await {
|
||||||
|
warn!("kiosk hardening: daemon-reload failed: {:#}", e);
|
||||||
|
}
|
||||||
|
// try-restart only restarts a currently-active unit — leaves a stopped/
|
||||||
|
// disabled kiosk alone.
|
||||||
|
let _ = host_sudo(&["systemctl", "try-restart", "archipelago-kiosk.service"]).await;
|
||||||
|
info!("kiosk: applied resource cap + GPU-flag hardening (#36)");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// Patch the nginx site config to add missing backend proxy blocks. Older ISO
|
/// Patch the nginx site config to add missing backend proxy blocks. Older ISO
|
||||||
/// configs shipped individual per-endpoint `location` blocks, so missing
|
/// configs shipped individual per-endpoint `location` blocks, so missing
|
||||||
/// endpoints silently fell through to the SPA `index.html` and the frontend
|
/// endpoints silently fell through to the SPA `index.html` and the frontend
|
||||||
|
|||||||
@ -277,6 +277,10 @@ async fn main() -> Result<()> {
|
|||||||
// only — effective next reboot; never restarts the running service.
|
// only — effective next reboot; never restarts the running service.
|
||||||
tokio::spawn(bootstrap::ensure_archipelago_mount_ordering());
|
tokio::spawn(bootstrap::ensure_archipelago_mount_ordering());
|
||||||
|
|
||||||
|
// #36: keep the kiosk unit + launcher hardened (CPU/mem cap + GPU-vs-headless
|
||||||
|
// flags) on already-deployed nodes via OTA; no-op if the kiosk isn't installed.
|
||||||
|
tokio::spawn(bootstrap::ensure_kiosk_hardened());
|
||||||
|
|
||||||
// Spawn periodic container snapshot (for crash recovery)
|
// Spawn periodic container snapshot (for crash recovery)
|
||||||
crash_recovery::spawn_snapshot_task(config.data_dir.clone());
|
crash_recovery::spawn_snapshot_task(config.data_dir.clone());
|
||||||
|
|
||||||
|
|||||||
@ -79,6 +79,16 @@ xset s noblank 2>/dev/null || true
|
|||||||
pkill -u archipelago -f 'chromium.*localhost' 2>/dev/null || true
|
pkill -u archipelago -f 'chromium.*localhost' 2>/dev/null || true
|
||||||
sleep 1
|
sleep 1
|
||||||
|
|
||||||
|
# GPU vs headless (#36). On a real kiosk display with a GPU, GPU rasterization is
|
||||||
|
# fast. On a GPU-less / headless server (no /dev/dri), --enable-gpu-rasterization
|
||||||
|
# forces GPU paths that fall back to software compositing and SPIN a full core at
|
||||||
|
# ~92% CPU, saturating the node. Detect the GPU and pick safe flags accordingly.
|
||||||
|
if [ -e /dev/dri/card0 ] || [ -e /dev/dri/renderD128 ]; then
|
||||||
|
GPU_FLAGS="--enable-gpu-rasterization --num-raster-threads=2"
|
||||||
|
else
|
||||||
|
GPU_FLAGS="--disable-gpu --num-raster-threads=1"
|
||||||
|
fi
|
||||||
|
|
||||||
while true; do
|
while true; do
|
||||||
sudo -u archipelago env DISPLAY=:0 HOME=/home/archipelago chromium --kiosk \
|
sudo -u archipelago env DISPLAY=:0 HOME=/home/archipelago chromium --kiosk \
|
||||||
--app=http://localhost/kiosk?safe_area_x=${KIOSK_SAFE_AREA_X_PX:-0}\&safe_area_y=${KIOSK_SAFE_AREA_Y_PX:-0} \
|
--app=http://localhost/kiosk?safe_area_x=${KIOSK_SAFE_AREA_X_PX:-0}\&safe_area_y=${KIOSK_SAFE_AREA_Y_PX:-0} \
|
||||||
@ -92,8 +102,7 @@ while true; do
|
|||||||
--disable-save-password-bubble \
|
--disable-save-password-bubble \
|
||||||
--disable-suggestions-service \
|
--disable-suggestions-service \
|
||||||
--disable-component-update \
|
--disable-component-update \
|
||||||
--enable-gpu-rasterization \
|
$GPU_FLAGS \
|
||||||
--num-raster-threads=2 \
|
|
||||||
--renderer-process-limit=2 \
|
--renderer-process-limit=2 \
|
||||||
--window-size=1920,1080 \
|
--window-size=1920,1080 \
|
||||||
--window-position=0,0 \
|
--window-position=0,0 \
|
||||||
|
|||||||
@ -20,5 +20,15 @@ TimeoutStartSec=360
|
|||||||
Restart=always
|
Restart=always
|
||||||
RestartSec=5
|
RestartSec=5
|
||||||
|
|
||||||
|
# Resource guardrail (#36). On GPU-less / headless hardware chromium could spin
|
||||||
|
# software compositing at ~92% of a core, saturating the node and starving the
|
||||||
|
# backend (it caused the .198 receive timeout + deploy storms). Cap CPU + memory
|
||||||
|
# so a runaway kiosk can never take the whole machine down. The caps apply to
|
||||||
|
# every process in this unit's cgroup (chromium/Xorg children included);
# Delegate=yes additionally hands the unit's cgroup subtree over to the service.
|
||||||
|
Delegate=yes
|
||||||
|
CPUQuota=75%
|
||||||
|
MemoryMax=1500M
|
||||||
|
MemoryHigh=1200M
|
||||||
|
|
||||||
[Install]
|
[Install]
|
||||||
WantedBy=multi-user.target
|
WantedBy=multi-user.target
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user