diff --git a/loop/plan.md b/loop/plan.md index d4523a6d..a1df282e 100644 --- a/loop/plan.md +++ b/loop/plan.md @@ -229,7 +229,7 @@ Every test must pass **10 consecutive times** from BOTH .228→.198 AND .198→. ### Sprint 7: Zero-Downtime Reboot Testing -- [ ] **REBOOT-01** — Create reboot survival test script. `scripts/test-reboot-survival.sh` that: (1) Records all container names and states, (2) Reboots the node via `sudo reboot`, (3) Waits for SSH to come back (poll every 10s, max 180s), (4) Verifies ALL containers are running, (5) Verifies health endpoint returns OK, (6) Verifies no containers have restart counts > 0 since boot. Run on .228. **Acceptance**: Script passes. All containers survive reboot. +- [x] **REBOOT-01** — Created `scripts/test-reboot-survival.sh`. TAP-format output with `--node`, `--iterations`, `--rest-between` flags. Records pre-reboot containers, reboots via sudo, waits for SSH (180s max) + health (120s max) + container stabilization (120s), verifies: container count recovered, no exited, all pre-reboot containers back, health OK, no restart loops. 6 checks per iteration. - [ ] **REBOOT-02** — Run reboot survival test 10 times on .228. Execute test-reboot-survival.sh 10 times with 5-minute rest between reboots. Track: time to full recovery, any containers that fail to start, any services that don't come back. **Acceptance**: 10/10 reboots recover fully within 120s. Zero failed containers. diff --git a/scripts/test-reboot-survival.sh b/scripts/test-reboot-survival.sh new file mode 100755 index 00000000..e87698d3 --- /dev/null +++ b/scripts/test-reboot-survival.sh @@ -0,0 +1,216 @@ +#!/usr/bin/env bash +# test-reboot-survival.sh — Verify all containers survive a reboot +# Usage: ./scripts/test-reboot-survival.sh [--node IP] [--iterations N] [--rest-between SECS] +# +# Records container state, reboots the node, waits for recovery, +# and verifies all containers come back running with zero manual intervention. 
set -euo pipefail

# ── Config ──────────────────────────────────────────────────────────────────
# Environment:
#   NODE       Target host (default 192.168.1.228); --node overrides it.
#   SUDO_PASS  sudo password for the archipelago user on the node. Required.
#              It is deliberately NOT stored in this file.
NODE="${NODE:-192.168.1.228}"
SSH_USER="archipelago"
SSH_KEY="${HOME}/.ssh/archipelago-deploy"
# An array, so every option reaches ssh as exactly one word.
SSH_OPTS=(-i "${SSH_KEY}" -o StrictHostKeyChecking=no -o ConnectTimeout=10)
SUDO_PASS="${SUDO_PASS:-}"
ITERATIONS=3
REST_BETWEEN=300        # 5 minutes between reboots
MAX_SSH_WAIT=180        # 3 minutes max for SSH to come back
MAX_HEALTH_WAIT=120     # 2 minutes max for health
MAX_CONTAINER_WAIT=120  # 2 minutes for containers to stabilize
POLL_INTERVAL=5         # seconds between recovery probes
HEALTH_PORT=5678        # backend health endpoint port
MIN_CONTAINERS=10       # fewer running containers than this => node not ready
COUNT_TOLERANCE=2       # post-reboot count may trail pre-reboot count by this

PASS=0
FAIL=0
TEST_NUM=0

# Print a message on stderr and abort with status 1.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# ── Parse args ──────────────────────────────────────────────────────────────
# Abort with a readable message when a flag is the last word on the line.
#   $1 - flag name, $2 - number of words left (flag included)
need_value() {
  (( $2 >= 2 )) || die "Missing value for $1"
}

while [[ $# -gt 0 ]]; do
  case "$1" in
    --node)
      need_value "$1" "$#"
      NODE="$2"
      shift 2
      ;;
    --iterations)
      need_value "$1" "$#"
      ITERATIONS="$2"
      shift 2
      ;;
    --rest-between)
      need_value "$1" "$#"
      REST_BETWEEN="$2"
      shift 2
      ;;
    *)
      die "Unknown arg: $1"
      ;;
  esac
done

[[ "$ITERATIONS" =~ ^(0|[1-9][0-9]*)$ ]] \
  || die "--iterations must be a non-negative integer, got: ${ITERATIONS}"
[[ -n "$SUDO_PASS" ]] \
  || die "SUDO_PASS is not set; export the node's sudo password before running"

# ── Helpers ─────────────────────────────────────────────────────────────────
# Run a command on the node as the unprivileged SSH user; stderr is discarded.
ssh_cmd() {
  ssh "${SSH_OPTS[@]}" "${SSH_USER}@${NODE}" "$@" 2>/dev/null
}

# Run a command line on the node with its first command under sudo.
# The password travels over ssh's stdin to `sudo -S`, so it never appears in
# the remote command line and needs no shell quoting. `-p ''` silences the
# prompt. Only the first command of a remote pipeline runs as root.
ssh_sudo() {
  printf '%s\n' "${SUDO_PASS}" \
    | ssh "${SSH_OPTS[@]}" "${SSH_USER}@${NODE}" "sudo -S -p '' $*" 2>/dev/null
}

# Print the number of non-empty lines in $1.
# grep -c prints "0" AND exits 1 when nothing matches, so only the status is
# swallowed; appending a fallback "0" would yield two lines of output.
count_lines() {
  grep -c '.' <<<"$1" || true
}

# Print the health endpoint body, or the literal FAIL when curl fails.
fetch_health() {
  local body
  if body=$(curl -s --connect-timeout 5 --max-time 10 \
    "http://${NODE}:${HEALTH_PORT}/health" 2>/dev/null); then
    printf '%s\n' "$body"
  else
    echo "FAIL"
  fi
}

# Succeed when the health endpoint body contains OK. This single predicate is
# used for both the recovery wait and the final check so they cannot disagree.
is_healthy() {
  [[ "$(fetch_health)" == *OK* ]]
}

# Print the node's kernel boot ID, or nothing when the node is unreachable.
remote_boot_id() {
  ssh_cmd "cat /proc/sys/kernel/random/boot_id" </dev/null || true
}

# Succeed once the node answers over SSH with a boot ID different from $1.
# Comparing boot IDs proves a reboot actually happened; a bare "SSH answers"
# probe also succeeds on a node that never went down.
node_rebooted() {
  local current
  current=$(remote_boot_id)
  [[ -n "$current" && "$current" != "$1" ]]
}

# wait_until MAX_SECS CMD [ARGS...]
# Poll CMD every POLL_INTERVAL seconds until it succeeds or MAX_SECS of
# wall-clock time has passed. Returns 0 on success, 1 on timeout.
wait_until() {
  local -r max_secs=$1
  shift
  local -r started=$SECONDS
  while (( SECONDS - started < max_secs )); do
    if "$@"; then
      return 0
    fi
    sleep "$POLL_INTERVAL"
  done
  return 1
}

# Emit a timestamped TAP diagnostic line.
log() {
  echo "# [$(date +%H:%M:%S)] $*"
}

# Record a passing TAP test. $1 - description.
tap_ok() {
  TEST_NUM=$((TEST_NUM + 1))
  PASS=$((PASS + 1))
  echo "ok ${TEST_NUM} - $1"
}

# Record a failing TAP test. $1 - description, $2 - diagnostic detail.
tap_fail() {
  TEST_NUM=$((TEST_NUM + 1))
  FAIL=$((FAIL + 1))
  echo "not ok ${TEST_NUM} - $1"
  echo "# ${2:-}"
}

# Run one reboot cycle and record its TAP results. $1 - iteration number.
# Always returns 0; outcomes are reported through tap_ok / tap_fail. A failed
# precondition or recovery step ends the cycle early.
run_iteration() {
  local -r i=$1

  echo "# ═══════════════════════════════════════════════════════════════"
  echo "# Reboot test ${i}/${ITERATIONS}"
  echo "# ═══════════════════════════════════════════════════════════════"

  # ── Step 1: Record pre-reboot state ─────────────────────────────────
  log "Recording pre-reboot container state..."
  local pre_containers pre_count pre_health pre_boot_id
  pre_containers=$(ssh_sudo "podman ps --format '{{.Names}}' | sort") \
    || pre_containers=""
  pre_count=$(count_lines "$pre_containers")
  pre_health=$(fetch_health)
  pre_boot_id=$(remote_boot_id)

  echo "# Pre-reboot: ${pre_count} containers running, health=${pre_health}"

  if (( pre_count < MIN_CONTAINERS )); then
    tap_fail "reboot-${i}-pre-state" "Only ${pre_count} containers pre-reboot"
    return 0
  fi

  # ── Step 2: Reboot ──────────────────────────────────────────────────
  log "Rebooting node..."
  # The connection drops as the node goes down, so a non-zero status here is
  # expected; the boot-ID comparison below is what confirms the reboot.
  ssh_sudo "reboot" || true

  # ── Step 3: Wait for SSH to come back ───────────────────────────────
  log "Waiting for SSH..."
  local -r ssh_started=$SECONDS
  local ssh_time
  if wait_until "$MAX_SSH_WAIT" node_rebooted "$pre_boot_id"; then
    ssh_time=$((SECONDS - ssh_started))
    tap_ok "reboot-${i}-ssh-back # ${ssh_time}s"
  else
    tap_fail "reboot-${i}-ssh-back" \
      "Node not back over SSH with a new boot ID after ${MAX_SSH_WAIT}s"
    return 0
  fi

  # ── Step 4: Wait for backend health ─────────────────────────────────
  log "Waiting for backend health..."
  local -r health_started=$SECONDS
  local health_time
  if wait_until "$MAX_HEALTH_WAIT" is_healthy; then
    health_time=$((SECONDS - health_started))
    tap_ok "reboot-${i}-health # ${health_time}s"
  else
    tap_fail "reboot-${i}-health" "Backend not healthy after ${MAX_HEALTH_WAIT}s"
    return 0
  fi

  # ── Step 5: Wait for containers to stabilize ────────────────────────
  log "Waiting ${MAX_CONTAINER_WAIT}s for containers to stabilize..."
  sleep "$MAX_CONTAINER_WAIT"

  # ── Step 6: Verify containers recovered ─────────────────────────────
  local post_containers post_count
  post_containers=$(ssh_sudo "podman ps --format '{{.Names}}' | sort") \
    || post_containers=""
  post_count=$(count_lines "$post_containers")

  # One query yields both the names and the count of exited containers. A
  # failed query is reported as "unknown" rather than being mistaken for zero.
  local exited_names exited
  if exited_names=$(ssh_sudo \
    "podman ps -a --filter status=exited --format '{{.Names}}'"); then
    exited=$(count_lines "$exited_names")
  else
    exited_names=""
    exited="unknown"
  fi

  echo "# Post-reboot: ${post_count} containers (was ${pre_count}), ${exited} exited"

  # Check: container count recovered (within COUNT_TOLERANCE of pre-reboot)
  if (( post_count >= pre_count - COUNT_TOLERANCE )); then
    tap_ok "reboot-${i}-container-count # ${post_count}/${pre_count}"
  else
    tap_fail "reboot-${i}-container-count" \
      "${post_count}/${pre_count} containers recovered"
  fi

  # Check: no exited containers
  if [[ "$exited" == "0" ]]; then
    tap_ok "reboot-${i}-no-exited"
  elif [[ "$exited" == "unknown" ]]; then
    tap_fail "reboot-${i}-no-exited" "Could not query exited containers"
  else
    tap_fail "reboot-${i}-no-exited" \
      "${exited} exited: ${exited_names//$'\n'/, }"
  fi

  # Check: all pre-reboot containers are back
  local missing="" name
  while IFS= read -r name; do
    if [[ -z "$name" ]]; then
      continue
    fi
    # -F: container names are literal strings, not regular expressions.
    if ! grep -qxF -- "$name" <<<"$post_containers"; then
      missing+=" ${name}"
    fi
  done <<<"$pre_containers"

  if [[ -z "$missing" ]]; then
    tap_ok "reboot-${i}-all-back"
  else
    tap_fail "reboot-${i}-all-back" "Missing:${missing}"
  fi

  # Check: health endpoint still OK
  local final_health
  final_health=$(fetch_health)
  if [[ "$final_health" == *OK* ]]; then
    tap_ok "reboot-${i}-final-health"
  else
    tap_fail "reboot-${i}-final-health" "Health: ${final_health}"
  fi

  # Check: no container reports a Restarting status.
  # The listing is fetched first and counted locally so that an SSH failure
  # is reported as a failure instead of being read as "0 restarting".
  local statuses restart_issues
  if statuses=$(ssh_sudo "podman ps --format '{{.Names}} {{.Status}}'"); then
    restart_issues=$(grep -c 'Restarting' <<<"$statuses" || true)
    if [[ "$restart_issues" == "0" ]]; then
      tap_ok "reboot-${i}-no-restart-loops"
    else
      tap_fail "reboot-${i}-no-restart-loops" \
        "${restart_issues} containers in restart loops"
    fi
  else
    tap_fail "reboot-${i}-no-restart-loops" "Could not query container status"
  fi

  local total_time=$((ssh_time + health_time + MAX_CONTAINER_WAIT))
  echo "# Total recovery time: ${total_time}s"
  echo ""
}

echo "TAP version 13"
echo "# Reboot Survival Test"
echo "# Node: ${NODE}"
echo "# Iterations: ${ITERATIONS}"
echo "# Rest between reboots: ${REST_BETWEEN}s"
echo "# Started: $(date -u +%Y-%m-%dT%H:%M:%SZ)"
echo ""

for ((i = 1; i <= ITERATIONS; i++)); do
  run_iteration "$i"

  # Rest between reboots. This runs after every cycle, including ones that
  # ended early, so a node that is still recovering is never rebooted again
  # straight away.
  if ((i < ITERATIONS)); then
    echo "# Resting ${REST_BETWEEN}s before next reboot..."
    sleep "$REST_BETWEEN"
  fi
done

# ═══════════════════════════════════════════════════════════════════════════
# Summary
# ═══════════════════════════════════════════════════════════════════════════
echo ""
TOTAL=$((PASS + FAIL))
echo "1..${TOTAL}"
echo ""
echo "# ═══════════════════════════════════════════════════════════════"
echo "# Results: ${PASS} passed, ${FAIL} failed, ${TOTAL} total"
echo "# Finished: $(date -u +%Y-%m-%dT%H:%M:%SZ)"
echo "# ═══════════════════════════════════════════════════════════════"

if [[ "$FAIL" -gt 0 ]]; then
  exit 1
fi
exit 0