From 7655a5971b346c7e9926f4f60c38929dcca2ca65 Mon Sep 17 00:00:00 2001 From: Dorian Date: Sat, 18 Apr 2026 18:36:20 -0400 Subject: [PATCH] fix(ci): QEMU boot test ignores trailing numeric arg + enforces timeout MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The CI workflow calls `test-iso-qemu.sh "$ISO" 120`. The old arg parser had a `case *) ISO=...` fallthrough that silently let the second positional `120` overwrite ISO, so QEMU went looking for a file literally named "120". That's the "failed step" the user was seeing on recent ISO runs — the rest of the job succeeded because the QEMU step has `continue-on-error: true`. Changes: - Treat `--timeout=N` or a bare numeric argument (e.g. `120`) as a CI timeout in seconds; the ISO path keeps the positional slot instead of being overwritten. - When a timeout is set, force `--nographic` (CI has no DISPLAY anyway) and wrap the QEMU invocation in coreutils' `timeout` so the script always returns instead of hanging. - After termination (or timeout), grep the serial log for well-known systemd/live-boot markers. Pass if the kernel reached userspace, fail if no marker appeared within the window — a more useful signal than the previous "did the VM shut itself off" proxy. Co-Authored-By: Claude Opus 4.7 (1M context) --- image-recipe/test-iso-qemu.sh | 70 ++++++++++++++++++++++++++++++----- 1 file changed, 60 insertions(+), 10 deletions(-) diff --git a/image-recipe/test-iso-qemu.sh b/image-recipe/test-iso-qemu.sh index cbe8bef3..70efe9d5 100755 --- a/image-recipe/test-iso-qemu.sh +++ b/image-recipe/test-iso-qemu.sh @@ -14,16 +14,31 @@ SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" SERIAL_LOG="/tmp/archipelago-qemu-serial.log" FORCE_BIOS=false NOGRAPHIC=false +TIMEOUT=0 ISO="" +# Simple arg parsing. First non-flag positional is the ISO path. A bare +# numeric (e.g. `120`) is taken as a boot-test timeout in seconds so CI +# can call `test-iso-qemu.sh 120` without hanging the job. 
The +# pre-fix version used `case *) ISO=...`, which silently overwrote ISO +# with the timeout value and sent QEMU looking for a file literally +# named "120". for arg in "$@"; do case "$arg" in --bios) FORCE_BIOS=true ;; --nographic) NOGRAPHIC=true ;; + --timeout=*) TIMEOUT="${arg#--timeout=}" ;; + [0-9]*) TIMEOUT="$arg" ;; *) ISO="$arg" ;; esac done +# A positive TIMEOUT implies headless (no DISPLAY in CI anyway); the QEMU +# invocation is later wrapped in `timeout` to guarantee the job returns. +if [ "$TIMEOUT" -gt 0 ] 2>/dev/null; then + NOGRAPHIC=true +fi + # Auto-detect ISO if [ -z "$ISO" ]; then ISO=$(ls -t "$SCRIPT_DIR"/results/archipelago-installer-unbundled-*.iso 2>/dev/null | head -1) @@ -88,20 +103,55 @@ if [ "$FORCE_BIOS" = false ]; then fi fi -if [ -n "$OVMF" ]; then - echo " Boot: UEFI ($OVMF)" - qemu-system-x86_64 \ - -machine q35 \ - -drive if=pflash,format=raw,readonly=on,file="$OVMF" \ - "${QEMU_ARGS[@]}" +run_qemu() { + if [ -n "$OVMF" ]; then + echo " Boot: UEFI ($OVMF)" + qemu-system-x86_64 \ + -machine q35 \ + -drive if=pflash,format=raw,readonly=on,file="$OVMF" \ + "${QEMU_ARGS[@]}" + else + echo " Boot: Legacy BIOS" + qemu-system-x86_64 \ + -machine pc \ + "${QEMU_ARGS[@]}" + fi +} + +# Wrap the QEMU invocation in `timeout` when a CI caller passed one so +# the script always returns instead of hanging on a VM that never exits +# its boot loop. Exit 124 from coreutils' timeout is treated as "VM +# reached the timeout", which for a CI boot test is success as long as +# the serial log shows a kernel reaching userspace — we inspect that +# after the QEMU process ends. +if [ "$TIMEOUT" -gt 0 ] 2>/dev/null; then + timeout --foreground --preserve-status "${TIMEOUT}s" bash -c "$(declare -f run_qemu); run_qemu" + rc=$? + if [ $rc -eq 124 ] || [ $rc -eq 137 ]; then + echo "(QEMU terminated after ${TIMEOUT}s boot-test window)" + rc=0 + fi else - echo " Boot: Legacy BIOS" - qemu-system-x86_64 \ - -machine pc \ - "${QEMU_ARGS[@]}" + run_qemu + rc=$? 
fi echo "" echo "VM stopped. Serial log: $SERIAL_LOG" echo "Last 20 lines:" tail -20 "$SERIAL_LOG" 2>/dev/null + +# Boot-sanity check: the CI wrapper wants a non-zero exit only when the +# kernel never reached userspace. Look for a well-known marker emitted +# by live-boot/systemd early in the sequence. If the marker never +# appeared, surface the real failure; otherwise treat "timeout reached +# with a live kernel" as a pass. +if [ "$TIMEOUT" -gt 0 ] 2>/dev/null && [ -f "$SERIAL_LOG" ]; then + if grep -qE "Welcome to Debian|Reached target|systemd\[1\]:" "$SERIAL_LOG"; then + echo " Boot sanity: OK (systemd reached in serial log)" + exit 0 + fi + echo " Boot sanity: FAIL — no systemd markers in serial log within ${TIMEOUT}s" + exit 1 +fi +exit "${rc:-0}"