#!/usr/bin/env bash
# test-reboot-survival.sh — Verify all containers survive a reboot
# Usage: ./scripts/test-reboot-survival.sh [--node IP] [--iterations N] [--rest-between SECS]
#
# Records container state, reboots the node, waits for recovery,
# and verifies all containers come back running with zero manual intervention.

set -euo pipefail

# ── Config ──────────────────────────────────────────────────────────────────
# Target node; may be preset through the environment.
NODE="${NODE:-192.168.1.228}"
SSH_KEY="${HOME}/.ssh/archipelago-deploy"
# Kept as one flat string: the ssh call sites expand it unquoted so that it
# word-splits into separate ssh options.
SSH_OPTS="-i ${SSH_KEY} -o StrictHostKeyChecking=no -o ConnectTimeout=10"
# The sudo password is a secret and must not live in source control; supply it
# through the environment. An empty value is allowed for nodes whose sudo does
# not ask for a password.
SUDO_PASS="${SUDO_PASS:-}"
if [[ -z "$SUDO_PASS" ]]; then
  echo "# WARNING: SUDO_PASS is not set; assuming passwordless sudo on the node" >&2
fi
ITERATIONS=3
REST_BETWEEN=300 # 5 minutes between reboots
readonly MAX_SSH_WAIT=180 # 3 minutes max for SSH to come back
readonly MAX_HEALTH_WAIT=120 # 2 minutes max for health
readonly MAX_CONTAINER_WAIT=120 # 2 minutes for containers to stabilize

# TAP bookkeeping, updated by tap_ok / tap_fail.
PASS=0
FAIL=0
TEST_NUM=0

# ── Parse args ──────────────────────────────────────────────────────────────
#######################################
# Parse command-line options.
# Globals:   NODE, ITERATIONS, REST_BETWEEN (written when the option is given)
# Arguments: the script's argument vector
# Outputs:   error text on stderr (stdout is reserved for the TAP stream)
# Returns:   exits 1 on an unknown option, a missing option value, or a
#            non-integer --iterations
#######################################
parse_args() {
  while (( $# > 0 )); do
    case "$1" in
      --node | --iterations | --rest-between)
        # Checked explicitly: with `set -u` a missing value would otherwise
        # abort with an opaque "unbound variable" message.
        (( $# >= 2 )) || {
          echo "Missing value for $1" >&2
          exit 1
        }
        ;;
      *)
        echo "Unknown arg: $1" >&2
        exit 1
        ;;
    esac

    case "$1" in
      --node)
        NODE="$2"
        ;;
      --iterations)
        # A non-numeric count would make the main loop run zero times and the
        # script report success without testing anything.
        [[ "$2" =~ ^[0-9]+$ ]] || {
          echo "--iterations must be a non-negative integer, got: $2" >&2
          exit 1
        }
        ITERATIONS="$2"
        ;;
      --rest-between)
        REST_BETWEEN="$2"
        ;;
    esac
    shift 2
  done
}

parse_args "$@"

# ── Helpers ─────────────────────────────────────────────────────────────────
#######################################
# Run a command on the node as the archipelago user.
# Globals:   SSH_OPTS, NODE (read)
# Arguments: remote command words, handed to ssh unchanged
# Outputs:   the remote command's stdout; stderr is discarded
# Returns:   ssh's exit status
#######################################
ssh_cmd() {
  # SSH_OPTS is a flat option string, so it is left unquoted on purpose.
  # shellcheck disable=SC2086
  ssh ${SSH_OPTS} "archipelago@${NODE}" "$@" 2>/dev/null
}

#######################################
# Run a command on the node through sudo.
# The password travels over ssh's stdin to `sudo -S` rather than being spliced
# into the remote command line, which keeps it out of the remote process list
# and makes quoting characters in the password harmless.
# Globals:   SSH_OPTS, NODE, SUDO_PASS (read)
# Arguments: remote command text; all arguments are joined with spaces and
#            interpreted by the remote shell (pipelines are allowed)
# Outputs:   the remote command's stdout; stderr is discarded
# Returns:   ssh's exit status
#######################################
ssh_sudo() {
  # -p '' silences sudo's prompt. SSH_OPTS is a flat option string, so it is
  # left unquoted on purpose.
  # shellcheck disable=SC2086
  ssh ${SSH_OPTS} "archipelago@${NODE}" "sudo -S -p '' $*" 2>/dev/null <<<"${SUDO_PASS}"
}

#######################################
# Record a passing test point and print its TAP line.
# Globals:   TEST_NUM, PASS (incremented)
# Arguments: $1 - test description
# Outputs:   "ok <n> - <description>" on stdout
#######################################
tap_ok() {
  TEST_NUM=$((TEST_NUM + 1))
  PASS=$((PASS + 1))
  echo "ok ${TEST_NUM} - $1"
}

#######################################
# Record a failing test point and print its TAP line.
# Globals:   TEST_NUM, FAIL (incremented)
# Arguments: $1 - test description
#            $2 - optional detail, printed as a TAP comment line
# Outputs:   "not ok <n> - <description>" and, if given, "# <detail>" on stdout
#######################################
tap_fail() {
  TEST_NUM=$((TEST_NUM + 1))
  FAIL=$((FAIL + 1))
  echo "not ok ${TEST_NUM} - $1"
  # ${2:-} keeps a one-argument call from aborting the script under `set -u`.
  if [[ -n "${2:-}" ]]; then
    echo "# $2"
  fi
}

#######################################
# Print the TAP preamble: protocol version, then the run parameters as TAP
# comment lines, then a blank separator line.
# Globals:   NODE, ITERATIONS, REST_BETWEEN (read)
# Outputs:   the preamble on stdout
#######################################
print_header() {
  echo "TAP version 13"
  echo "# Reboot Survival Test"
  echo "# Node: ${NODE}"
  echo "# Iterations: ${ITERATIONS}"
  echo "# Rest between reboots: ${REST_BETWEEN}s"
  echo "# Started: $(date -u +%Y-%m-%dT%H:%M:%SZ)"
  echo ""
}

print_header

# Fewest running containers the node must have before a reboot is attempted;
# may be preset through the environment.
MIN_CONTAINERS="${MIN_CONTAINERS:-10}"

# Print the number of non-empty lines in $1 (0 for an empty string).
count_nonempty_lines() {
  # grep -c prints the count itself but exits 1 when the count is 0, so only
  # its status is ignored; adding a fallback value would print "0" twice.
  grep -c '.' <<<"$1" || true
}

# Print " <name>" for every non-empty line of $1 that is not an exact,
# whole-line match of some line in $2. Prints nothing when none are missing.
missing_names() {
  local name
  while IFS= read -r name; do
    [[ -z "$name" ]] && continue
    # -F: names are literal strings, not regexes ("a.b" must not match "axb").
    if ! grep -qxF -- "$name" <<<"$2"; then
      printf ' %s' "$name"
    fi
  done <<<"$1"
  return 0
}

# Run "$@" up to ($1 / 5) times, sleeping 5s before each attempt.
# Sets POLL_ELAPSED to the seconds slept. Returns 0 as soon as "$@" succeeds,
# 1 when the attempts run out.
poll_until() {
  local attempts=$(($1 / 5)) attempt
  shift
  POLL_ELAPSED=0
  for ((attempt = 0; attempt < attempts; attempt++)); do
    sleep 5
    POLL_ELAPSED=$((POLL_ELAPSED + 5))
    if "$@"; then
      return 0
    fi
  done
  return 1
}

# Succeed when the node answers over SSH on a boot other than the one
# identified by $1 (a kernel boot_id). With an empty $1 there is no baseline,
# so plain SSH reachability is accepted instead.
node_back_up() {
  local boot_id_before=$1 boot_id_now
  if [[ -z "$boot_id_before" ]]; then
    [[ "$(ssh_cmd "echo ok" || true)" == *ok* ]]
    return
  fi
  boot_id_now=$(ssh_cmd "cat /proc/sys/kernel/random/boot_id") || return 1
  [[ -n "$boot_id_now" && "$boot_id_now" != "$boot_id_before" ]]
}

# Succeed when the backend health endpoint responds with a body containing "OK".
node_healthy() {
  local body
  body=$(curl -s --max-time 5 "http://${NODE}:5678/health" 2>/dev/null) || return 1
  [[ "$body" == *OK* ]]
}

# One reboot cycle per iteration. A failed precondition or wait emits a single
# failing test point and skips the rest of that cycle.
for ((i = 1; i <= ITERATIONS; i++)); do
  echo "# ═══════════════════════════════════════════════════════════════"
  echo "# Reboot test ${i}/${ITERATIONS}"
  echo "# ═══════════════════════════════════════════════════════════════"

  # ── Step 1: Record pre-reboot state ─────────────────────────────────
  echo "# [$(date +%H:%M:%S)] Recording pre-reboot container state..."
  pre_containers=$(ssh_sudo "podman ps --format '{{.Names}}' | sort" 2>/dev/null || true)
  pre_count=$(count_nonempty_lines "$pre_containers")
  pre_health=$(curl -s --connect-timeout 5 "http://${NODE}:5678/health" 2>/dev/null || echo "FAIL")
  # Baseline used in step 3 to tell a fresh boot from the not-yet-down node.
  pre_boot_id=$(ssh_cmd "cat /proc/sys/kernel/random/boot_id" || true)

  echo "# Pre-reboot: ${pre_count} containers running, health=${pre_health}"

  if (( pre_count < MIN_CONTAINERS )); then
    tap_fail "reboot-${i}-pre-state" "Only ${pre_count} containers pre-reboot"
    continue
  fi

  # ── Step 2: Reboot ──────────────────────────────────────────────────
  echo "# [$(date +%H:%M:%S)] Rebooting node..."
  # The connection normally drops mid-command, so a non-zero status is expected.
  ssh_sudo "reboot" 2>/dev/null || true

  # ── Step 3: Wait for SSH to come back ───────────────────────────────
  echo "# [$(date +%H:%M:%S)] Waiting for SSH..."
  sleep 15 # Give it a head start
  ssh_time=15
  if poll_until "$MAX_SSH_WAIT" node_back_up "$pre_boot_id"; then
    ssh_time=$((ssh_time + POLL_ELAPSED))
    tap_ok "reboot-${i}-ssh-back # ${ssh_time}s"
  else
    tap_fail "reboot-${i}-ssh-back" "SSH not available on a fresh boot after ${MAX_SSH_WAIT}s"
    # Wait longer before next iteration
    sleep 60
    continue
  fi

  # ── Step 4: Wait for backend health ─────────────────────────────────
  echo "# [$(date +%H:%M:%S)] Waiting for backend health..."
  if poll_until "$MAX_HEALTH_WAIT" node_healthy; then
    health_time=$POLL_ELAPSED
    tap_ok "reboot-${i}-health # ${health_time}s"
  else
    tap_fail "reboot-${i}-health" "Backend not healthy after ${MAX_HEALTH_WAIT}s"
    continue
  fi

  # ── Step 5: Wait for containers to stabilize ────────────────────────
  echo "# [$(date +%H:%M:%S)] Waiting ${MAX_CONTAINER_WAIT}s for containers to stabilize..."
  sleep "$MAX_CONTAINER_WAIT"

  # ── Step 6: Verify containers recovered ─────────────────────────────
  post_containers=$(ssh_sudo "podman ps --format '{{.Names}}' | sort" 2>/dev/null || true)
  post_count=$(count_nonempty_lines "$post_containers")
  # Counted locally so that a failed query is reported as "unknown" instead of
  # being mistaken for "0 exited".
  if container_states=$(ssh_sudo "podman ps -a --format '{{.State}}'" 2>/dev/null); then
    exited=$(grep -ci 'exited' <<<"$container_states" || true)
  else
    exited="unknown"
  fi

  echo "# Post-reboot: ${post_count} containers (was ${pre_count}), ${exited} exited"

  # Check: container count recovered (within 2 of pre-reboot)
  if (( post_count >= pre_count - 2 )); then
    tap_ok "reboot-${i}-container-count # ${post_count}/${pre_count}"
  else
    tap_fail "reboot-${i}-container-count" "${post_count}/${pre_count} containers recovered"
  fi

  # Check: no exited containers
  if [[ "$exited" == "0" ]]; then
    tap_ok "reboot-${i}-no-exited"
  elif [[ "$exited" == "unknown" ]]; then
    tap_fail "reboot-${i}-no-exited" "Could not query container states"
  else
    # Show which containers are exited. Best effort: a failed lookup must not
    # abort the run under `set -e` / pipefail.
    exited_names=$(ssh_sudo "podman ps -a --filter status=exited --format '{{.Names}}'" 2>/dev/null | tr '\n' ',' || true)
    tap_fail "reboot-${i}-no-exited" "${exited} exited: ${exited_names}"
  fi

  # Check: all pre-reboot containers are back
  missing=$(missing_names "$pre_containers" "$post_containers")
  if [[ -z "$missing" ]]; then
    tap_ok "reboot-${i}-all-back"
  else
    tap_fail "reboot-${i}-all-back" "Missing:${missing}"
  fi

  # Check: health endpoint still OK
  final_health=$(curl -s --connect-timeout 5 "http://${NODE}:5678/health" 2>/dev/null || echo "FAIL")
  if [[ "$final_health" == "OK" ]]; then
    tap_ok "reboot-${i}-final-health"
  else
    tap_fail "reboot-${i}-final-health" "Health: ${final_health}"
  fi

  # Check: no running container reports a "Restarting" status
  if container_status=$(ssh_sudo "podman ps --format '{{.Names}} {{.Status}}'" 2>/dev/null); then
    restart_issues=$(grep -c 'Restarting' <<<"$container_status" || true)
    if [[ "$restart_issues" == "0" ]]; then
      tap_ok "reboot-${i}-no-restart-loops"
    else
      tap_fail "reboot-${i}-no-restart-loops" "${restart_issues} containers in restart loops"
    fi
  else
    tap_fail "reboot-${i}-no-restart-loops" "Could not query container status"
  fi

  total_time=$((ssh_time + health_time + MAX_CONTAINER_WAIT))
  echo "# Total recovery time: ${total_time}s"
  echo ""

  # Rest between reboots
  if (( i < ITERATIONS )); then
    echo "# Resting ${REST_BETWEEN}s before next reboot..."
    sleep "$REST_BETWEEN"
  fi
done

# ═══════════════════════════════════════════════════════════════════════════
# Summary
# ═══════════════════════════════════════════════════════════════════════════
# Emits the TAP plan line ("1..N") followed by a human-readable tally, then
# exits 1 if any test point failed and 0 otherwise.
echo ""
TOTAL=$((PASS + FAIL))
echo "1..${TOTAL}"
echo ""
echo "# ═══════════════════════════════════════════════════════════════"
echo "# Results: ${PASS} passed, ${FAIL} failed, ${TOTAL} total"
echo "# Finished: $(date -u +%Y-%m-%dT%H:%M:%SZ)"
echo "# ═══════════════════════════════════════════════════════════════"

if (( FAIL > 0 )); then
  exit 1
fi
exit 0