#!/usr/bin/env bash
# test-failure-recovery.sh — Inject failures and verify auto-recovery
#
# Tests resilience scenarios on the primary server:
#   1. Container crash → health monitor auto-restart
#   2. Backend restart → service recovers, containers intact
#   3. Tor restart → hidden services recover
#   4. Full reboot → everything comes back up
#   5. Tor traffic block → Tor and backend healthy after unblocking
#
# Usage: ./scripts/test-failure-recovery.sh [target-ip] [--skip-reboot]
#   target-ip:     host to test (default: 192.168.1.228)
#   --skip-reboot: skip the reboot test (default: included); may be given
#                  before or after target-ip
#
# Environment:
#   ARCHIPELAGO_SSH_KEY  private key used for SSH
#                        (default: $HOME/.ssh/archipelago-deploy)

# -e is not enabled: a failing probe is recorded through check() and the
# remaining scenarios still run.
set -uo pipefail

DEFAULT_TARGET="192.168.1.228"
TARGET="$DEFAULT_TARGET"
SKIP_REBOOT=""

#######################################
# Parse command-line arguments.
# Globals:   TARGET (written), SKIP_REBOOT (written), DEFAULT_TARGET (read)
# Arguments: the script's "$@"; the first non-empty positional argument is
#            the target host, --skip-reboot may appear anywhere, further
#            positional arguments are ignored.
# Returns:   exits 2 on an unknown option.
#######################################
parse_args() {
  local arg
  local have_target="false"

  TARGET="$DEFAULT_TARGET"
  SKIP_REBOOT=""

  for arg in "$@"; do
    case "$arg" in
      --skip-reboot)
        # Later code compares SKIP_REBOOT against this exact string.
        SKIP_REBOOT="--skip-reboot"
        ;;
      -*)
        # A mistyped flag must not silently lead to a real reboot.
        printf 'Unknown option: %s\n' "$arg" >&2
        printf 'Usage: %s [target-ip] [--skip-reboot]\n' "${0##*/}" >&2
        exit 2
        ;;
      "")
        # An empty argument keeps the default target.
        ;;
      *)
        if [[ "$have_target" == "false" ]]; then
          TARGET="$arg"
          have_target="true"
        fi
        ;;
    esac
  done
}

parse_args "$@"

SSH_KEY="${ARCHIPELAGO_SSH_KEY:-$HOME/.ssh/archipelago-deploy}"
# SSH is expanded unquoted (word-split) wherever it is used as a command, so
# the key path must not contain whitespace.
SSH="ssh -i $SSH_KEY -o StrictHostKeyChecking=no -o ConnectTimeout=10 archipelago@$TARGET"

PASS=0
FAIL=0

#######################################
# Record and print one test result.
# Globals:   PASS, FAIL (incremented)
# Arguments: $1 - test description
#            $2 - "true" for a pass; anything else counts as a failure
# Outputs:   one result line on stdout
#######################################
check() {
  local name="$1"
  local ok="$2"

  if [[ "$ok" == "true" ]]; then
    echo " ✅ $name"
    # Plain assignment: ((PASS++)) would return non-zero when PASS is 0.
    PASS=$((PASS + 1))
  else
    echo " ❌ $name"
    FAIL=$((FAIL + 1))
  fi
}

#######################################
# Poll http://localhost/health on the target until it answers HTTP 200.
# Globals:   SSH (read)
# Arguments: $1 - number of attempts; attempts are separated by a 1s pause,
#                 so this is the nominal wait in seconds
#            $2 - label used by call sites (not printed)
# Outputs:   progress lines on stdout
# Returns:   0 once healthy, 1 if every attempt failed
#######################################
wait_for_health() {
  local max_wait="$1"
  local status attempt
  local started=$SECONDS

  echo " Waiting for health (max ${max_wait}s)..."
  for ((attempt = 1; attempt <= max_wait; attempt++)); do
    # A failed SSH/curl yields "000", which never equals "200".
    status=$($SSH "curl -s -o /dev/null -w '%{http_code}' --max-time 5 http://localhost/health" 2>/dev/null) \
      || status="000"
    if [[ "$status" == "200" ]]; then
      # Report wall-clock time; each attempt may take longer than 1s.
      echo " Healthy after $((SECONDS - started))s"
      return 0
    fi
    sleep 1
  done

  echo " NOT healthy after $((SECONDS - started))s"
  return 1
}

#######################################
# Poll `podman inspect` on the target until a container reports "running".
# Globals:   SSH (read)
# Arguments: $1 - container name
#            $2 - nominal maximum wait in seconds (polled every 5s)
# Outputs:   progress lines on stdout
# Returns:   0 once running, 1 if it never reached that state
#######################################
wait_for_container() {
  local name="$1"
  local max_wait="$2"
  local status attempt
  local attempts=$((max_wait / 5))
  local started=$SECONDS

  # Always probe at least once, even for max_wait < 5.
  if ((attempts < 1)); then
    attempts=1
  fi

  echo " Waiting for $name to be running (max ${max_wait}s)..."
  for ((attempt = 1; attempt <= attempts; attempt++)); do
    status=$($SSH "sudo podman inspect $name --format '{{.State.Status}}' 2>/dev/null" 2>/dev/null \
      | tr -d '[:space:]')
    if [[ "$status" == "running" ]]; then
      echo " $name running after ~$((SECONDS - started))s"
      return 0
    fi
    sleep 5
  done

  echo " $name NOT running after ${max_wait}s"
  return 1
}

echo "💥 Failure Recovery Test — $TARGET"
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"

# ━━━ Scenario 1: Container crash (bitcoin-knots) ━━━
echo ""
echo "Scenario 1: Container crash — bitcoin-knots"
echo " Stopping bitcoin-knots..."
# Scenario 1 (continued): stop the container, then expect the health monitor
# to bring it back.
$SSH "sudo podman stop bitcoin-knots 2>/dev/null" >/dev/null 2>&1
BK_STATUS=$($SSH "sudo podman inspect bitcoin-knots --format '{{.State.Status}}' 2>/dev/null" 2>/dev/null \
  | tr -d '[:space:]')
# An empty status means the probe itself failed; do not count that as
# "stopped".
if [[ -n "$BK_STATUS" && "$BK_STATUS" != "running" ]]; then
  check "bitcoin-knots stopped" "true"
else
  check "bitcoin-knots stopped" "false"
fi

# Wait for health monitor to detect and restart
if wait_for_container "bitcoin-knots" 120; then
  check "Health monitor auto-restarted bitcoin-knots" "true"
else
  check "Health monitor auto-restarted bitcoin-knots" "false"
fi

# ━━━ Scenario 2: Backend restart ━━━
echo ""
echo "Scenario 2: Backend restart — systemctl restart archipelago"
CONTAINERS_BEFORE=$($SSH "sudo podman ps --format '{{.Names}}' 2>/dev/null | wc -l" 2>/dev/null \
  | tr -d '[:space:]')
echo " Containers before: $CONTAINERS_BEFORE"
$SSH "sudo systemctl restart archipelago" >/dev/null 2>&1
sleep 3

if wait_for_health 30 "backend"; then
  check "Backend recovered" "true"
else
  check "Backend recovered" "false"
fi

CONTAINERS_AFTER=$($SSH "sudo podman ps --format '{{.Names}}' 2>/dev/null | wc -l" 2>/dev/null \
  | tr -d '[:space:]')
# Both counts must be integers (a failed SSH call yields an empty string).
# One container fewer than before is tolerated.
if [[ "$CONTAINERS_BEFORE" =~ ^[0-9]+$ && "$CONTAINERS_AFTER" =~ ^[0-9]+$ ]] \
  && ((CONTAINERS_AFTER >= CONTAINERS_BEFORE - 1)); then
  check "Containers intact after backend restart ($CONTAINERS_AFTER)" "true"
else
  check "Containers intact after backend restart ($CONTAINERS_AFTER)" "false"
fi

# ━━━ Scenario 3: Tor restart ━━━
echo ""
echo "Scenario 3: Tor restart — systemctl restart tor"
$SSH "sudo systemctl restart tor" >/dev/null 2>&1
sleep 5
TOR_STATUS=$($SSH "sudo systemctl is-active tor" 2>/dev/null | tr -d '[:space:]')
if [[ "$TOR_STATUS" == "active" ]]; then
  check "Tor service active" "true"
else
  check "Tor service active" "false"
fi

# Verify hostname still exists
TOR_ADDR=$($SSH "cat /var/lib/archipelago/tor-hostnames/archipelago 2>/dev/null" 2>/dev/null \
  | tr -d '[:space:]')
# The dot is matched literally; an unescaped '.' would also accept e.g.
# "fooXonion".
ONION_RE='^.+\.onion$'
if [[ "$TOR_ADDR" =~ $ONION_RE ]]; then
  check "Tor address still valid" "true"
else
  check "Tor address still valid" "false"
fi

# ━━━ Scenario 4: Full reboot ━━━
if [[ "$SKIP_REBOOT" == "--skip-reboot" ]]; then
  echo ""
  echo "Scenario 4: Full reboot — SKIPPED (--skip-reboot)"
else
  echo ""
  echo "Scenario 4: Full reboot"
  echo " Rebooting server..."
  # The kernel generates a new boot_id on every boot. Comparing it before and
  # after tells a rebooted host apart from one that has not gone down yet.
  BOOT_ID_BEFORE=$($SSH "cat /proc/sys/kernel/random/boot_id" 2>/dev/null | tr -d '[:space:]')
  REBOOT_STARTED=$SECONDS
  # The SSH session is usually cut by the reboot, so its status is ignored.
  $SSH "sudo reboot" >/dev/null 2>&1 || true

  # Wait for server to go down
  sleep 15

  # Wait for server to come back (36 probes, 5s apart)
  echo " Waiting for server to come back online..."
  BACK_ONLINE="false"
  for ((attempt = 1; attempt <= 36; attempt++)); do
    BOOT_ID_NOW=$($SSH "cat /proc/sys/kernel/random/boot_id" 2>/dev/null | tr -d '[:space:]')
    # If the pre-reboot id could not be read, any successful probe counts.
    if [[ -n "$BOOT_ID_NOW" && "$BOOT_ID_NOW" != "$BOOT_ID_BEFORE" ]]; then
      BACK_ONLINE="true"
      echo " SSH accessible after ~$((SECONDS - REBOOT_STARTED))s"
      break
    fi
    sleep 5
  done
  check "Server back online after reboot" "$BACK_ONLINE"

  if [[ "$BACK_ONLINE" == "true" ]]; then
    # Wait for health
    if wait_for_health 120 "post-reboot"; then
      check "Backend healthy after reboot" "true"
    else
      check "Backend healthy after reboot" "false"
    fi

    # Check containers — boot startup may take 30-60s to start all containers
    echo " Waiting 60s for boot container startup..."
    sleep 60
    CONTAINERS_REBOOT=$($SSH "sudo podman ps --format '{{.Names}}' 2>/dev/null | wc -l" 2>/dev/null \
      | tr -d '[:space:]')
    # Minimum expected container count; override for hosts that run fewer.
    MIN_CONTAINERS="${ARCHIPELAGO_MIN_CONTAINERS:-10}"
    if [[ "$CONTAINERS_REBOOT" =~ ^[0-9]+$ && "$MIN_CONTAINERS" =~ ^[0-9]+$ ]] \
      && ((CONTAINERS_REBOOT >= MIN_CONTAINERS)); then
      check "Containers running after reboot ($CONTAINERS_REBOOT)" "true"
    else
      check "Containers running after reboot ($CONTAINERS_REBOOT)" "false"
    fi

    # Check Tor
    TOR_REBOOT=$($SSH "sudo systemctl is-active tor" 2>/dev/null | tr -d '[:space:]')
    if [[ "$TOR_REBOOT" == "active" ]]; then
      check "Tor active after reboot" "true"
    else
      check "Tor active after reboot" "false"
    fi
  fi
fi

# ━━━ Scenario 5: Tor traffic block ━━━
echo ""
echo "Scenario 5: Tor traffic block (10s)"

# If the run is interrupted while the DROP rules are installed, remove them
# before exiting so the target is not left with Tor traffic blocked.
unblock_tor_on_interrupt() {
  $SSH "sudo iptables -D OUTPUT -p tcp --dport 9001 -j DROP 2>/dev/null; sudo iptables -D OUTPUT -p tcp --dport 9050 -j DROP 2>/dev/null" >/dev/null 2>&1
  exit 130
}
trap unblock_tor_on_interrupt INT TERM

echo " Blocking Tor traffic..."
if ! $SSH "sudo iptables -A OUTPUT -p tcp --dport 9001 -j DROP && sudo iptables -A OUTPUT -p tcp --dport 9050 -j DROP" 2>/dev/null; then
  # Without the rules in place the checks that follow prove nothing.
  echo " WARNING: could not install the iptables DROP rules" >&2
fi
sleep 10
echo " Unblocking Tor traffic..."
# Scenario 5 (continued): remove the DROP rules for ports 9001 and 9050 and
# verify the services. Both deletions are always attempted; the remote
# command reports failure if either rule could not be removed.
if ! $SSH 'rc=0; sudo iptables -D OUTPUT -p tcp --dport 9001 -j DROP 2>/dev/null || rc=1; sudo iptables -D OUTPUT -p tcp --dport 9050 -j DROP 2>/dev/null || rc=1; exit $rc' 2>/dev/null; then
  # Not counted as a test failure, but the operator must know the target may
  # still be dropping Tor traffic.
  echo " WARNING: could not remove one or more iptables DROP rules; check with: sudo iptables -S OUTPUT" >&2
fi
sleep 5

TOR_AFTER_BLOCK=$($SSH "sudo systemctl is-active tor" 2>/dev/null | tr -d '[:space:]')
if [[ "$TOR_AFTER_BLOCK" == "active" ]]; then
  check "Tor recovered after traffic block" "true"
else
  check "Tor recovered after traffic block" "false"
fi

# --max-time keeps a hung backend from stalling the whole run.
HEALTH_AFTER=$($SSH "curl -s -o /dev/null -w '%{http_code}' --max-time 5 http://localhost/health" 2>/dev/null)
if [[ "$HEALTH_AFTER" == "200" ]]; then
  check "Backend healthy after Tor block" "true"
else
  check "Backend healthy after Tor block" "false"
fi

# ━━━ SUMMARY ━━━
echo ""
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
echo "Results: $PASS passed, $FAIL failed"
# Exit status mirrors the result: 0 when nothing failed, 1 otherwise.
if [[ "$FAIL" -eq 0 ]]; then
  echo "✅ All failure recovery tests passed!"
  exit 0
else
  echo "❌ $FAIL tests failed"
  exit 1
fi