#!/usr/bin/env bash
# test-reboot-survival.sh — Verify all containers survive a reboot
# Usage: ./scripts/test-reboot-survival.sh [--node IP] [--iterations N] [--rest-between SECS]
#
# Records container state, reboots the node, waits for recovery,
# and verifies all containers come back running with zero manual intervention.
#
# Environment:
#   NODE       Target node address (default 192.168.1.228); --node overrides it.
#   SUDO_PASS  sudo password for the "archipelago" user on the node.
#
# Output is TAP version 13 on stdout. Exit status is 1 when at least one
# check failed (or the arguments are invalid), 0 otherwise.

set -euo pipefail

# ── Config ──────────────────────────────────────────────────────────────────
NODE="${NODE:-192.168.1.228}"
SSH_KEY="${HOME}/.ssh/archipelago-deploy"
# An array keeps every option a separate word even if the key path has spaces.
SSH_OPTS=(-i "${SSH_KEY}" -o StrictHostKeyChecking=no -o ConnectTimeout=10)
# NOTE(review): the literal below is a credential committed to source. It is
# kept only as a legacy fallback so existing invocations keep working; export
# SUDO_PASS instead, then rotate this password and delete the default.
SUDO_PASS="${SUDO_PASS:-EwPDR8q45l0Upx@}"
ITERATIONS=3
REST_BETWEEN=300          # 5 minutes between reboots
MAX_SSH_WAIT=180          # 3 minutes max for SSH to come back
MAX_HEALTH_WAIT=120       # 2 minutes max for health
MAX_CONTAINER_WAIT=120    # 2 minutes for containers to stabilize
MIN_PRE_CONTAINERS=10     # fewer running containers than this => do not reboot
PASS=0
FAIL=0
TEST_NUM=0

# ── Helpers ─────────────────────────────────────────────────────────────────

# Print a message on stderr (stdout is reserved for TAP) and exit 1.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Run a command on the node as the archipelago user; ssh's stderr is dropped.
ssh_cmd() {
  ssh "${SSH_OPTS[@]}" "archipelago@${NODE}" "$@" 2>/dev/null
}

# Run a command line on the node under sudo.
# The password goes to `sudo -S` through ssh's stdin instead of being spliced
# into the remote command, so it does not appear in the remote argv and does
# not need shell quoting. "$*" is still interpreted by the remote shell, so
# callers may pass pipelines such as "podman ps | sort".
ssh_sudo() {
  ssh "${SSH_OPTS[@]}" "archipelago@${NODE}" "sudo -S $*" 2>/dev/null <<< "${SUDO_PASS}"
}

# Emit a passing TAP line. $1 = test description.
tap_ok() {
  TEST_NUM=$((TEST_NUM + 1))
  PASS=$((PASS + 1))
  echo "ok ${TEST_NUM} - $1"
}

# Emit a failing TAP line. $1 = test description, $2 = diagnostic text.
tap_fail() {
  TEST_NUM=$((TEST_NUM + 1))
  FAIL=$((FAIL + 1))
  echo "not ok ${TEST_NUM} - $1"
  echo "#   $2"
}

# Print the number of non-empty lines in $1.
# grep -c already prints "0" when nothing matches but exits 1; appending a
# fallback "0" would produce the two-line value "0\n0", so the status is
# simply ignored.
count_nonempty_lines() {
  grep -c '.' <<< "$1" || true
}

# Print the node's kernel boot id, or nothing when it cannot be read.
remote_boot_id() {
  ssh_cmd "cat /proc/sys/kernel/random/boot_id" </dev/null | tr -d '[:space:]' || true
}

# Succeed once the node answers over SSH from a new boot.
# $1 = boot id recorded before the reboot. When it is empty (it could not be
# read), fall back to a plain reachability probe.
node_rebooted() {
  local before=$1
  local now
  if [[ -z "$before" ]]; then
    ssh_cmd "echo ok" </dev/null | grep -q ok
    return
  fi
  now=$(remote_boot_id)
  [[ -n "$now" && "$now" != "$before" ]]
}

# ── Parse args ──────────────────────────────────────────────────────────────
while [[ $# -gt 0 ]]; do
  case "$1" in
    --node | --iterations | --rest-between)
      # Without this check a trailing flag dies with an "unbound variable"
      # error from set -u.
      [[ $# -ge 2 ]] || die "Missing value for $1"
      case "$1" in
        --node) NODE="$2" ;;
        --iterations) ITERATIONS="$2" ;;
        --rest-between) REST_BETWEEN="$2" ;;
      esac
      shift 2
      ;;
    *)
      die "Unknown arg: $1"
      ;;
  esac
done

# ITERATIONS is used in arithmetic below; reject anything that is not a plain
# decimal integer (a leading zero would be parsed as octal).
[[ "$ITERATIONS" =~ ^(0|[1-9][0-9]*)$ ]] ||
  die "--iterations must be a non-negative integer, got: ${ITERATIONS}"

echo "TAP version 13"
echo "# Reboot Survival Test"
echo "# Node: ${NODE}"
echo "# Iterations: ${ITERATIONS}"
echo "# Rest between reboots: ${REST_BETWEEN}s"
echo "# Started: $(date -u +%Y-%m-%dT%H:%M:%SZ)"
echo ""

# NOTE: the early `continue` paths below also skip the rest period at the end
# of the iteration.
for ((i = 1; i <= ITERATIONS; i++)); do
  echo "# ═══════════════════════════════════════════════════════════════"
  echo "# Reboot test ${i}/${ITERATIONS}"
  echo "# ═══════════════════════════════════════════════════════════════"

  # ── Step 1: Record pre-reboot state ─────────────────────────────────
  echo "# [$(date +%H:%M:%S)] Recording pre-reboot container state..."
  pre_containers=$(ssh_sudo "podman ps --format '{{.Names}}' | sort" || true)
  pre_count=$(count_nonempty_lines "$pre_containers")
  pre_health=$(curl -s --connect-timeout 5 "http://${NODE}:5678/health" 2>/dev/null || echo "FAIL")
  pre_boot_id=$(remote_boot_id)
  echo "# Pre-reboot: ${pre_count} containers running, health=${pre_health}"

  if [[ "$pre_count" -lt "$MIN_PRE_CONTAINERS" ]]; then
    tap_fail "reboot-${i}-pre-state" "Only ${pre_count} containers pre-reboot"
    continue
  fi

  # ── Step 2: Reboot ──────────────────────────────────────────────────
  echo "# [$(date +%H:%M:%S)] Rebooting node..."
  # The connection normally drops while the command runs, so a non-zero
  # status is expected; step 3 verifies that a reboot really took place.
  ssh_sudo "reboot" || true

  # ── Step 3: Wait for SSH to come back ───────────────────────────────
  echo "# [$(date +%H:%M:%S)] Waiting for SSH..."
  sleep 15 # Give it a head start
  ssh_back=false
  ssh_time=15
  for ((poll = 0; poll < MAX_SSH_WAIT / 5; poll++)); do
    sleep 5
    ssh_time=$((ssh_time + 5))
    # Comparing boot ids keeps a node that never went down (or has not gone
    # down yet) from being counted as "back".
    if node_rebooted "$pre_boot_id"; then
      ssh_back=true
      break
    fi
  done

  if [[ "$ssh_back" == "true" ]]; then
    tap_ok "reboot-${i}-ssh-back # ${ssh_time}s"
  else
    tap_fail "reboot-${i}-ssh-back" "SSH not available (or node did not reboot) after ${MAX_SSH_WAIT}s"
    # Wait longer before next iteration
    sleep 60
    continue
  fi

  # ── Step 4: Wait for backend health ─────────────────────────────────
  echo "# [$(date +%H:%M:%S)] Waiting for backend health..."
  health_ok=false
  health_time=0
  for ((poll = 0; poll < MAX_HEALTH_WAIT / 5; poll++)); do
    sleep 5
    health_time=$((health_time + 5))
    if curl -s --max-time 5 "http://${NODE}:5678/health" 2>/dev/null | grep -q OK; then
      health_ok=true
      break
    fi
  done

  if [[ "$health_ok" == "true" ]]; then
    tap_ok "reboot-${i}-health # ${health_time}s"
  else
    tap_fail "reboot-${i}-health" "Backend not healthy after ${MAX_HEALTH_WAIT}s"
    continue
  fi

  # ── Step 5: Wait for containers to stabilize ────────────────────────
  echo "# [$(date +%H:%M:%S)] Waiting ${MAX_CONTAINER_WAIT}s for containers to stabilize..."
  sleep "$MAX_CONTAINER_WAIT"

  # ── Step 6: Verify containers recovered ─────────────────────────────
  post_containers=$(ssh_sudo "podman ps --format '{{.Names}}' | sort" || true)
  post_count=$(count_nonempty_lines "$post_containers")
  # The remote grep -c prints its own "0" and exits 1 when nothing matches,
  # so the fallback can add a second line; tail/tr reduce it to one number.
  exited=$(ssh_sudo "podman ps -a --format '{{.State}}' | grep -ci exited" || echo "0")
  exited=$(echo "$exited" | tail -1 | tr -d '[:space:]')

  echo "# Post-reboot: ${post_count} containers (was ${pre_count}), ${exited} exited"

  # Check: container count recovered (within 2 of pre-reboot)
  if [[ -n "$post_count" ]] && [[ -n "$pre_count" ]] && [[ "$post_count" -ge $((pre_count - 2)) ]]; then
    tap_ok "reboot-${i}-container-count # ${post_count}/${pre_count}"
  else
    tap_fail "reboot-${i}-container-count" "${post_count}/${pre_count} containers recovered"
  fi

  # Check: no exited containers
  if [[ "$exited" == "0" ]]; then
    tap_ok "reboot-${i}-no-exited"
  else
    # Show which containers are exited. `|| true` keeps an SSH failure here
    # from aborting the whole run under set -e/pipefail before the TAP plan
    # is printed.
    exited_names=$(ssh_sudo "podman ps -a --filter status=exited --format '{{.Names}}'" | tr '\n' ',' || true)
    tap_fail "reboot-${i}-no-exited" "${exited} exited: ${exited_names}"
  fi

  # Check: all pre-reboot containers are back
  missing=""
  while IFS= read -r name; do
    [[ -z "$name" ]] && continue
    # -F: compare the name literally rather than as a regex. The here-string
    # avoids a pipeline whose writer could be reported as failed under
    # pipefail when grep -q exits early.
    if ! grep -qxF -- "$name" <<< "$post_containers"; then
      missing="${missing} ${name}"
    fi
  done <<< "$pre_containers"

  if [[ -z "$missing" ]]; then
    tap_ok "reboot-${i}-all-back"
  else
    tap_fail "reboot-${i}-all-back" "Missing:${missing}"
  fi

  # Check: health endpoint still OK
  # NOTE(review): this requires the body to be exactly "OK", while step 4
  # accepts any body containing "OK" — confirm which one matches the endpoint.
  final_health=$(curl -s --connect-timeout 5 "http://${NODE}:5678/health" 2>/dev/null || echo "FAIL")
  if [[ "$final_health" == "OK" ]]; then
    tap_ok "reboot-${i}-final-health"
  else
    tap_fail "reboot-${i}-final-health" "Health: ${final_health}"
  fi

  # Check: no container reports a "Restarting" status
  restart_issues=$(ssh_sudo "podman ps --format '{{.Names}} {{.Status}}' | grep -c 'Restarting'" || echo "0")
  restart_issues=$(echo "$restart_issues" | tail -1 | tr -d '[:space:]')
  if [[ "$restart_issues" == "0" ]]; then
    tap_ok "reboot-${i}-no-restart-loops"
  else
    tap_fail "reboot-${i}-no-restart-loops" "${restart_issues} containers in restart loops"
  fi

  total_time=$((ssh_time + health_time + MAX_CONTAINER_WAIT))
  echo "# Total recovery time: ${total_time}s"
  echo ""

  # Rest between reboots
  if ((i < ITERATIONS)); then
    echo "# Resting ${REST_BETWEEN}s before next reboot..."
    sleep "$REST_BETWEEN"
  fi
done

# ═══════════════════════════════════════════════════════════════════════════
# Summary
# ═══════════════════════════════════════════════════════════════════════════
echo ""
TOTAL=$((PASS + FAIL))
echo "1..${TOTAL}"
echo ""
echo "# ═══════════════════════════════════════════════════════════════"
echo "# Results: ${PASS} passed, ${FAIL} failed, ${TOTAL} total"
echo "# Finished: $(date -u +%Y-%m-%dT%H:%M:%SZ)"
echo "# ═══════════════════════════════════════════════════════════════"

if [[ "$FAIL" -gt 0 ]]; then
  exit 1
fi
exit 0