Added start_stopped_containers() to crash_recovery.rs that starts all exited/created containers on backend startup, fixing the issue where containers didn't come back after clean reboot (PID marker removed by systemd stop). Created test-failure-recovery.sh covering 5 failure scenarios: container crash, backend restart, Tor restart, full reboot, and Tor traffic block (UPTIME-02). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
192 lines
6.7 KiB
Bash
Executable File
192 lines
6.7 KiB
Bash
Executable File
#!/usr/bin/env bash
|
|
# test-failure-recovery.sh — Inject failures and verify auto-recovery
|
|
#
|
|
# Tests resilience scenarios on the primary server:
|
|
# 1. Container crash → health monitor auto-restart
|
|
# 2. Backend restart → service recovers, containers intact
|
|
# 3. Tor restart → hidden services recover
|
|
# 4. Full reboot → everything comes back up
# 5. Tor traffic block → Tor and backend still healthy after the block is lifted
|
|
#
|
|
# Usage: ./scripts/test-failure-recovery.sh [target-ip] [--skip-reboot]
|
|
# --skip-reboot: skip the reboot test (default: included)
|
|
|
|
# errexit (-e) is not enabled: failing remote commands are recorded through
# check() and the script keeps going to the next scenario.
set -uo pipefail

#######################################
# Parse command-line arguments into the TARGET / SKIP_REBOOT globals.
# The flag is recognised in any position, so "--skip-reboot" given as the
# only argument is no longer mistaken for the target host (which would also
# leave the destructive reboot scenario enabled).
# Globals:   TARGET (written), SKIP_REBOOT (written)
# Arguments: [target-ip] [--skip-reboot], in any order; the first non-flag
#            argument is the target, later non-flag arguments are ignored.
# Returns:   0
#######################################
parse_args() {
  local arg
  TARGET=""
  SKIP_REBOOT=""
  for arg; do
    case "$arg" in
      --skip-reboot)
        SKIP_REBOOT="--skip-reboot"
        ;;
      *)
        if [ -z "$TARGET" ]; then
          TARGET="$arg"
        fi
        ;;
    esac
  done
  TARGET="${TARGET:-192.168.1.228}"
}

# ${1+"$@"} expands to nothing when there are no arguments, which avoids an
# "unbound variable" error from "$@" under `set -u` on older bash releases.
parse_args ${1+"$@"}

SSH_KEY="${ARCHIPELAGO_SSH_KEY:-$HOME/.ssh/archipelago-deploy}"
# SSH is a flat command string that call sites expand unquoted ($SSH "cmd"),
# so the key path must not contain whitespace.
SSH="ssh -i $SSH_KEY -o StrictHostKeyChecking=no -o ConnectTimeout=10 archipelago@$TARGET"

# Counters updated by check().
PASS=0
FAIL=0
|
|
|
|
#######################################
# Print the outcome of one assertion and update the pass/fail counters.
# Globals:   PASS, FAIL (incremented)
# Arguments: $1 - human-readable name of the check
#            $2 - the literal string "true" for a pass; any other value
#                 is counted as a failure
# Outputs:   one result line on stdout
# Returns:   0
#######################################
check() {
  local name="$1"
  local ok="$2"
  if [ "$ok" = "true" ]; then
    echo " ✅ $name"
    # Arithmetic assignment rather than ((PASS++)): the (( )) command returns
    # status 1 when the pre-increment value is 0, which made the first call
    # report failure to its caller.
    PASS=$((PASS + 1))
  else
    echo " ❌ $name"
    FAIL=$((FAIL + 1))
  fi
}
|
|
|
|
#######################################
# Poll the target's HTTP health endpoint until it answers 200.
# Each attempt runs curl on the target over SSH against
# http://localhost/health, then sleeps one second before the next try.
# Globals:   SSH (read) - ssh command prefix, expanded unquoted on purpose
# Arguments: $1 - maximum number of attempts (reported as seconds)
#            $2 - optional label for the call site; accepted but not used
# Outputs:   progress lines on stdout
# Returns:   0 once the endpoint returns 200, 1 if it never does
#######################################
wait_for_health() {
  local max_wait="$1"
  # Locals, so the poll loop no longer clobbers a caller's STATUS or i.
  local attempt status
  echo " Waiting for health (max ${max_wait}s)..."
  for ((attempt = 1; attempt <= max_wait; attempt++)); do
    # A failed ssh/curl falls back to "000" so the comparison stays a plain
    # string test.
    status=$($SSH "curl -s -o /dev/null -w '%{http_code}' --max-time 5 http://localhost/health" 2>/dev/null || echo "000")
    if [ "$status" = "200" ]; then
      echo " Healthy after ${attempt}s"
      return 0
    fi
    sleep 1
  done
  echo " NOT healthy after ${max_wait}s"
  return 1
}
|
|
|
|
#######################################
# Poll a podman container on the target until its state is "running".
# The state is read over SSH with `sudo podman inspect`; attempts are spaced
# five seconds apart.
# Globals:   SSH (read) - ssh command prefix, expanded unquoted on purpose
# Arguments: $1 - container name (interpolated into the remote command)
#            $2 - maximum wait in seconds
# Outputs:   progress lines on stdout
# Returns:   0 once the container is running, 1 otherwise
#######################################
wait_for_container() {
  local name="$1"
  local max_wait="$2"
  # Locals, so the poll loop no longer clobbers a caller's STATUS or i.
  local attempt status
  # Round up: plain max_wait / 5 truncates, so a wait shorter than five
  # seconds used to return failure without inspecting the container once.
  local attempts=$(((max_wait + 4) / 5))
  echo " Waiting for $name to be running (max ${max_wait}s)..."
  for ((attempt = 1; attempt <= attempts; attempt++)); do
    status=$($SSH "sudo podman inspect $name --format '{{.State.Status}}' 2>/dev/null" 2>/dev/null | tr -d '[:space:]')
    if [ "$status" = "running" ]; then
      echo " $name running after ~$((attempt * 5))s"
      return 0
    fi
    sleep 5
  done
  echo " $name NOT running after ${max_wait}s"
  return 1
}
|
|
|
|
echo "💥 Failure Recovery Test — $TARGET"
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"

# ━━━ Scenario 1: Container crash (bitcoin-knots) ━━━
# Stop the container, confirm it is really down, then give the health monitor
# up to 120s to bring it back.
# NOTE(review): `podman stop` is a graceful stop rather than a crash — confirm
# the health monitor treats a stopped container like a crashed one.
echo ""
echo "Scenario 1: Container crash — bitcoin-knots"
echo " Stopping bitcoin-knots..."
$SSH "sudo podman stop bitcoin-knots 2>/dev/null" >/dev/null 2>&1

BK_STATUS=$($SSH "sudo podman inspect bitcoin-knots --format '{{.State.Status}}' 2>/dev/null" 2>/dev/null | tr -d '[:space:]')
# An empty status means the SSH/inspect call itself failed; that must not be
# reported as "stopped", so require a known, non-running state.
if [[ -n "$BK_STATUS" && "$BK_STATUS" != "running" ]]; then
  check "bitcoin-knots stopped" "true"
else
  check "bitcoin-knots stopped" "false"
fi

# Wait for health monitor to detect and restart
if wait_for_container "bitcoin-knots" 120; then
  check "Health monitor auto-restarted bitcoin-knots" "true"
else
  check "Health monitor auto-restarted bitcoin-knots" "false"
fi
|
|
|
|
# ━━━ Scenario 2: Backend restart ━━━
# Restart the archipelago service, wait for /health to answer, and compare
# the number of running containers before and after.
echo ""
echo "Scenario 2: Backend restart — systemctl restart archipelago"
CONTAINERS_BEFORE=$($SSH "sudo podman ps --format '{{.Names}}' 2>/dev/null | wc -l" 2>/dev/null | tr -d '[:space:]')
echo " Containers before: $CONTAINERS_BEFORE"

$SSH "sudo systemctl restart archipelago" >/dev/null 2>&1
sleep 3

if wait_for_health 30 "backend"; then
  check "Backend recovered" "true"
else
  check "Backend recovered" "false"
fi

CONTAINERS_AFTER=$($SSH "sudo podman ps --format '{{.Names}}' 2>/dev/null | wc -l" 2>/dev/null | tr -d '[:space:]')
# Both counts must be numeric before they are compared: an empty baseline
# (failed SSH) used to evaluate as 0 and let the check pass vacuously, and an
# empty "after" value produced a "[: integer expression expected" error.
# One container fewer than before is still accepted, as in the original check.
if [[ "$CONTAINERS_BEFORE" =~ ^[0-9]+$ && "$CONTAINERS_AFTER" =~ ^[0-9]+$ ]] \
  && [ "$CONTAINERS_AFTER" -ge "$((CONTAINERS_BEFORE - 1))" ]; then
  check "Containers intact after backend restart ($CONTAINERS_AFTER)" "true"
else
  check "Containers intact after backend restart ($CONTAINERS_AFTER)" "false"
fi
|
|
|
|
# ━━━ Scenario 3: Tor restart ━━━
# Restart tor, then confirm the unit is active and the stored hostname file
# still holds an onion address.
echo ""
echo "Scenario 3: Tor restart — systemctl restart tor"
$SSH "sudo systemctl restart tor" >/dev/null 2>&1
sleep 5

TOR_STATUS=$($SSH "sudo systemctl is-active tor" 2>/dev/null | tr -d '[:space:]')
if [ "$TOR_STATUS" = "active" ]; then
  check "Tor service active" "true"
else
  check "Tor service active" "false"
fi

# Verify hostname still exists
TOR_ADDR=$($SSH "cat /var/lib/archipelago/tor-hostnames/archipelago 2>/dev/null" 2>/dev/null | tr -d '[:space:]')
# Literal suffix match: the previous grep pattern '.onion$' left the dot
# unescaped, so any character before "onion" (e.g. "fooXonion") was accepted.
if [[ "$TOR_ADDR" == *.onion ]]; then
  check "Tor address still valid" "true"
else
  check "Tor address still valid" "false"
fi
|
|
|
|
# ━━━ Scenario 4: Full reboot ━━━
# Reboot the target, wait for SSH on the new boot, then verify backend
# health, the running-container count and the tor unit.
if [ "$SKIP_REBOOT" = "--skip-reboot" ]; then
  echo ""
  echo "Scenario 4: Full reboot — SKIPPED (--skip-reboot)"
else
  echo ""
  echo "Scenario 4: Full reboot"

  # Record the kernel boot ID first. A successful SSH login alone does not
  # prove the reboot happened: if shutdown takes longer than the fixed 15s
  # pause below, the login would still reach the old boot.
  BOOT_ID_BEFORE=$($SSH "cat /proc/sys/kernel/random/boot_id 2>/dev/null" 2>/dev/null | tr -d '[:space:]')

  echo " Rebooting server..."
  # The SSH session is expected to drop while the host goes down.
  $SSH "sudo reboot" >/dev/null 2>&1 || true

  # Wait for server to go down
  sleep 15

  # Wait for server to come back (max 180s)
  echo " Waiting for server to come back online..."
  BACK_ONLINE="false"
  for ((i = 1; i <= 36; i++)); do
    BOOT_ID_NOW=$($SSH "cat /proc/sys/kernel/random/boot_id 2>/dev/null" 2>/dev/null | tr -d '[:space:]')
    # If the pre-reboot ID could not be read it is empty, and any successful
    # read counts as "back online" (the original behaviour).
    if [[ -n "$BOOT_ID_NOW" && "$BOOT_ID_NOW" != "$BOOT_ID_BEFORE" ]]; then
      BACK_ONLINE="true"
      echo " SSH accessible after ~$((i * 5 + 15))s"
      break
    fi
    sleep 5
  done
  check "Server back online after reboot" "$BACK_ONLINE"

  if [ "$BACK_ONLINE" = "true" ]; then
    # Wait for health
    if wait_for_health 120 "post-reboot"; then
      check "Backend healthy after reboot" "true"
    else
      check "Backend healthy after reboot" "false"
    fi

    # Check containers — boot startup may take 30-60s to start all containers
    echo " Waiting 60s for boot container startup..."
    sleep 60
    CONTAINERS_REBOOT=$($SSH "sudo podman ps --format '{{.Names}}' 2>/dev/null | wc -l" 2>/dev/null | tr -d '[:space:]')
    # Guard the numeric test: an empty count (failed SSH) is a failure, not a
    # "[: integer expression expected" error.
    if [[ "$CONTAINERS_REBOOT" =~ ^[0-9]+$ ]] && [ "$CONTAINERS_REBOOT" -ge 10 ]; then
      check "Containers running after reboot ($CONTAINERS_REBOOT)" "true"
    else
      check "Containers running after reboot ($CONTAINERS_REBOOT)" "false"
    fi

    # Check Tor
    TOR_REBOOT=$($SSH "sudo systemctl is-active tor" 2>/dev/null | tr -d '[:space:]')
    if [ "$TOR_REBOOT" = "active" ]; then
      check "Tor active after reboot" "true"
    else
      check "Tor active after reboot" "false"
    fi
  fi
fi
|
|
|
|
# ━━━ Scenario 5: Tor traffic block ━━━
# Drop outbound TCP to ports 9001 and 9050 on the target for 10 seconds,
# lift the block, then confirm tor and the backend are still healthy.

# Remove the DROP rules added below. Deleting a rule that is not present
# fails quietly, so calling this more than once is harmless.
unblock_tor_traffic() {
  $SSH "sudo iptables -D OUTPUT -p tcp --dport 9001 -j DROP 2>/dev/null; sudo iptables -D OUTPUT -p tcp --dport 9050 -j DROP 2>/dev/null" 2>/dev/null
}

echo ""
echo "Scenario 5: Tor traffic block (10s)"
echo " Blocking Tor traffic..."
# Make sure the rules never outlive the script: an interrupt during the
# block window would otherwise leave the target's Tor traffic dropped.
trap unblock_tor_traffic EXIT
trap 'exit 130' INT TERM
$SSH "sudo iptables -A OUTPUT -p tcp --dport 9001 -j DROP && sudo iptables -A OUTPUT -p tcp --dport 9050 -j DROP" 2>/dev/null

sleep 10

echo " Unblocking Tor traffic..."
unblock_tor_traffic
trap - EXIT INT TERM

sleep 5
TOR_AFTER_BLOCK=$($SSH "sudo systemctl is-active tor" 2>/dev/null | tr -d '[:space:]')
if [ "$TOR_AFTER_BLOCK" = "active" ]; then
  check "Tor recovered after traffic block" "true"
else
  check "Tor recovered after traffic block" "false"
fi

# --max-time matches the probe in wait_for_health so a hung backend cannot
# stall the script here.
HEALTH_AFTER=$($SSH "curl -s -o /dev/null -w '%{http_code}' --max-time 5 http://localhost/health" 2>/dev/null)
if [ "$HEALTH_AFTER" = "200" ]; then
  check "Backend healthy after Tor block" "true"
else
  check "Backend healthy after Tor block" "false"
fi
|
|
|
|
# ━━━ SUMMARY ━━━
# Print the totals gathered by check() and turn them into the exit status:
# 0 when nothing failed, 1 otherwise.
printf '\n'
printf '%s\n' "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
printf '%s\n' "Results: $PASS passed, $FAIL failed"

if [ "$FAIL" -eq 0 ]; then
  printf '%s\n' "✅ All failure recovery tests passed!"
  exit 0
fi

printf '%s\n' "❌ $FAIL tests failed"
exit 1
|