fix(ci): QEMU boot test ignores trailing numeric arg + enforces timeout

The CI workflow calls `test-iso-qemu.sh "$ISO" 120`. The old arg parser
had a `case *) ISO=...` fallthrough that silently let the second
positional `120` overwrite ISO, so QEMU went looking for a file literally
named "120". That's the "failed step" the user was seeing on recent ISO
runs — the rest of the job succeeded because the QEMU step has
`continue-on-error: true`.

Changes:
- Treat `--timeout=N` or a bare numeric positional argument (e.g. `120`)
  as a CI timeout in seconds; the ISO path keeps the positional slot.
- When a timeout is set, force `--nographic` (CI has no DISPLAY anyway)
  and wrap the QEMU invocation in coreutils' `timeout` so the script
  always returns instead of hanging.
- After termination (or timeout), grep the serial log for well-known
  systemd/live-boot markers. Pass if the kernel reached userspace, fail
  if no marker appeared within the window — useful signal rather than
  the previous "did the VM shut itself off" proxy.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Dorian 2026-04-18 18:36:20 -04:00
parent f5c581a725
commit 7655a5971b

View File

@ -14,16 +14,31 @@ SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
SERIAL_LOG="/tmp/archipelago-qemu-serial.log"
FORCE_BIOS=false
NOGRAPHIC=false
TIMEOUT=0
ISO=""

#######################################
# Parse the command line into the option globals.
#
# Recognised arguments:
#   --bios         force the BIOS boot path
#   --nographic    run QEMU without a display
#   --timeout=N    boot-test timeout in whole seconds
#   N              bare all-digit positional, same as --timeout=N, so CI
#                  can call `test-iso-qemu.sh <iso> 120`
#   anything else  taken as the ISO path (the last one given wins)
#
# Only arguments made up entirely of digits count as a timeout. Matching
# on a leading digit alone would swallow ISO names such as
# `2024-04-18-installer.iso` and leave ISO empty.
#
# Globals:   FORCE_BIOS, NOGRAPHIC, TIMEOUT, ISO (written)
# Arguments: the script's positional parameters
# Outputs:   a warning on stderr for a malformed --timeout value
#######################################
parse_cli_args() {
  local arg value
  for arg in "$@"; do
    case "$arg" in
      --bios) FORCE_BIOS=true ;;
      --nographic) NOGRAPHIC=true ;;
      --timeout=*)
        value="${arg#--timeout=}"
        case "$value" in
          '' | *[!0-9]*)
            # A malformed value used to be stored and then silently
            # discarded by the numeric tests further down; say so instead.
            echo "Warning: ignoring invalid timeout '$value' (expected whole seconds)" >&2
            ;;
          *) TIMEOUT="$value" ;;
        esac
        ;;
      # Empty, or contains at least one non-digit: not a timeout.
      '' | *[!0-9]*) ISO="$arg" ;;
      # Digits only.
      *) TIMEOUT="$arg" ;;
    esac
  done
}
parse_cli_args "$@"

# A positive TIMEOUT implies a headless run (CI has no DISPLAY anyway).
# The stderr redirect hides the test's complaint if TIMEOUT is somehow
# not an integer; the condition is then simply false.
if [ "$TIMEOUT" -gt 0 ] 2>/dev/null; then
  NOGRAPHIC=true
fi
# Auto-detect ISO
if [ -z "$ISO" ]; then
ISO=$(ls -t "$SCRIPT_DIR"/results/archipelago-installer-unbundled-*.iso 2>/dev/null | head -1)
@ -88,6 +103,7 @@ if [ "$FORCE_BIOS" = false ]; then
fi
fi
run_qemu() {
if [ -n "$OVMF" ]; then
echo " Boot: UEFI ($OVMF)"
qemu-system-x86_64 \
@ -100,8 +116,42 @@ else
-machine pc \
"${QEMU_ARGS[@]}"
fi
}
#######################################
# Run run_qemu with the QEMU process limited to a wall-clock budget.
#
# run_qemu stays in the current shell so it keeps access to every shell
# variable it reads, including arrays such as QEMU_ARGS. A separate
# `bash -c` child only inherits exported scalars, and arrays cannot be
# exported. For the duration of the call the `qemu-system-x86_64` command
# name is shadowed by a shell function that execs the real binary under
# coreutils' `timeout`. `timeout` therefore supervises QEMU directly:
# it sends TERM at the deadline and KILL 10 seconds later if QEMU is
# still alive.
#
# NOTE(review): this relies on run_qemu invoking QEMU by the bare name
# `qemu-system-x86_64` in every branch - confirm against the full
# function.
#
# Arguments: $1 - time limit in seconds
# Returns:   124 if the limit was hit, 137 if KILL was needed,
#            otherwise whatever run_qemu returned
#######################################
run_qemu_with_timeout() {
  local limit=$1
  local status

  # Functions take precedence over PATH lookup, so run_qemu's call lands
  # here. `timeout` execs the program itself, so there is no recursion.
  # $limit is visible through dynamic scoping while run_qemu runs.
  qemu-system-x86_64() {
    timeout --foreground --kill-after=10 "${limit}s" qemu-system-x86_64 "$@"
  }

  run_qemu
  status=$?

  unset -f qemu-system-x86_64
  return "$status"
}

# When a CI caller passed a timeout, bound the QEMU run so the script
# always returns instead of hanging on a VM that never exits its boot
# loop. Hitting the limit (124, or 137 after the KILL escalation) is not
# a failure by itself: for a CI boot test it is success as long as the
# serial log shows the kernel reaching userspace, which is inspected
# after the QEMU process ends.
if [ "$TIMEOUT" -gt 0 ] 2>/dev/null; then
  run_qemu_with_timeout "$TIMEOUT"
  rc=$?
  if [ "$rc" -eq 124 ] || [ "$rc" -eq 137 ]; then
    echo "(QEMU terminated after ${TIMEOUT}s boot-test window)"
    rc=0
  fi
else
  run_qemu
  rc=$?
fi
echo ""
echo "VM stopped. Serial log: $SERIAL_LOG"
echo "Last 20 lines:"
# Best effort: the log may not exist if QEMU never started.
tail -n 20 "$SERIAL_LOG" 2>/dev/null

# Boot-sanity check (timeout mode only): the CI wrapper wants a non-zero
# exit only when the kernel never reached userspace. Look for a
# well-known marker emitted by live-boot/systemd early in the sequence.
# A marker means pass, even if the VM had to be stopped at the timeout.
# No marker, or no serial log at all, means fail. The missing-log case
# is checked explicitly so it cannot fall through to the generic exit
# below, where rc may already have been reset to 0.
if [ "$TIMEOUT" -gt 0 ] 2>/dev/null; then
  if [ ! -f "$SERIAL_LOG" ]; then
    echo " Boot sanity: FAIL — serial log $SERIAL_LOG was never created"
    exit 1
  fi
  if grep -qE "Welcome to Debian|Reached target|systemd\[1\]:" "$SERIAL_LOG"; then
    echo " Boot sanity: OK (systemd reached in serial log)"
    exit 0
  fi
  echo " Boot sanity: FAIL — no systemd markers in serial log within ${TIMEOUT}s"
  exit 1
fi

# Interactive (no timeout) runs report QEMU's own exit status.
exit "${rc:-0}"