#!/usr/bin/env bash
# chaos-test.sh — Chaos/resilience test for Archipelago server.
#
# Tests the server's ability to survive adverse conditions:
#   1. Process kill (SIGKILL) followed by automatic recovery
#   2. Graceful service restart
#   3. 100 concurrent RPC requests (server must stay healthy)
#   4. Container stop/start cycling
#   5. RPC error handling (unknown method, malformed JSON, missing params)
#   6. Three rapid service restarts in a row
#   7. Post-chaos sanity check of a few read-only RPC endpoints
#
# Not covered by this script: high disk usage warnings and network
# interruption recovery.
#
# Usage:
#   ssh archipelago@192.168.1.228 "cd ~/archy && bash scripts/chaos-test.sh"
#
# Environment:
#   CHAOS_DURATION_HOURS  Shown in the banner only (default 0.5). Every test
#                         in this script runs exactly once regardless.
#   CHAOS_PASSWORD        Password sent to auth.login (default: password123).
#   CHAOS_CONTAINER_ID    Container used by Test 4 (default: filebrowser).
#                         Set to the empty string to skip Test 4.
#
# Exit status: 0 when no test failed, 1 otherwise (or when the pre-check
# finds the server unhealthy).

# -e is deliberately omitted: individual commands are expected to fail while
# the server is being killed/restarted, and each test decides PASS/FAIL itself.
set -uo pipefail

CHAOS_DURATION_HOURS="${CHAOS_DURATION_HOURS:-0.5}"
CHAOS_PASSWORD="${CHAOS_PASSWORD:-password123}"
RPC_URL="http://localhost:5678/rpc/v1"
HEALTH_URL="http://localhost/health"
MAX_RECOVERY_WAIT=60

# Colors (real escape bytes, so plain printf '%s' can emit them)
RED=$'\033[0;31m'
GREEN=$'\033[0;32m'
YELLOW=$'\033[1;33m'
NC=$'\033[0m'

PASS=0
FAIL=0
TESTS=()

log() { printf '%s\n' "${GREEN}[CHAOS]${NC} $*"; }
warn() { printf '%s\n' "${YELLOW}[WARN]${NC} $*"; }
fail() { printf '%s\n' "${RED}[FAIL]${NC} $*"; }

# Record one test outcome.
# Arguments: $1 - test name, $2 - "PASS" (anything else counts as a failure)
# Globals:   PASS, FAIL, TESTS (written)
record() {
  local name="$1" result="$2"
  if [[ "$result" == "PASS" ]]; then
    PASS=$((PASS + 1))
    TESTS+=("PASS $name")
  else
    FAIL=$((FAIL + 1))
    TESTS+=("FAIL $name")
  fi
}

# Private scratch directory: holds the cookie jar and the per-request markers
# of the concurrent-load test. Removed on every exit path by the trap below.
WORK_DIR=$(mktemp -d) || {
  fail "Could not create temporary directory — aborting"
  exit 1
}
COOKIE_FILE="$WORK_DIR/cookies"
LOAD_DIR="$WORK_DIR/load"

cleanup() {
  rm -rf -- "${WORK_DIR:?}"
}
trap cleanup EXIT

# Log in via auth.login and store the session cookies in COOKIE_FILE.
# Output is discarded; the return status is curl's.
authenticate() {
  local pw="$CHAOS_PASSWORD"
  # Escape backslashes and double quotes so the password stays valid JSON.
  pw=${pw//\\/\\\\}
  pw=${pw//\"/\\\"}
  curl -s -c "$COOKIE_FILE" -X POST "$RPC_URL" \
    -H "Content-Type: application/json" \
    -d "{\"method\":\"auth.login\",\"params\":{\"password\":\"$pw\"}}" > /dev/null 2>&1
}

# Call an RPC method using the stored session.
# Arguments: $1 - method name, $2 - params as raw JSON (default: null)
# Outputs:   response body on stdout
rpc() {
  local method="$1"
  local params="${2:-null}"
  local csrf
  # Last field of the csrf_token line in the cookie jar; empty if absent.
  csrf=$(awk '/csrf_token/ { tok = $NF } END { print tok }' "$COOKIE_FILE" 2>/dev/null || true)
  curl -s -b "$COOKIE_FILE" -X POST "$RPC_URL" \
    -H "Content-Type: application/json" \
    -H "X-CSRF-Token: $csrf" \
    -d "{\"method\":\"$method\",\"params\":$params}" 2>/dev/null
}

# Poll HEALTH_URL every 2 seconds until it answers successfully.
# Arguments: $1 - timeout in seconds (default: MAX_RECOVERY_WAIT)
# Returns:   0 once healthy, 1 if the timeout elapsed first
wait_for_health() {
  local timeout="${1:-$MAX_RECOVERY_WAIT}"
  local elapsed=0
  while [[ "$elapsed" -lt "$timeout" ]]; do
    if curl -sf "$HEALTH_URL" > /dev/null 2>&1; then
      return 0
    fi
    sleep 2
    elapsed=$((elapsed + 2))
  done
  return 1
}

# Returns 0 when `podman ps` lists the named container with an "Up" status.
# Arguments: $1 - container name
container_is_up() {
  local name="$1" state
  state=$(podman ps --filter "name=^${name}$" --format "{{.Status}}" 2>/dev/null | head -n 1 || true)
  grep -qi "up" <<<"$state"
}

# POST a syntactically broken JSON body and print the HTTP status code.
# curl itself prints 000 when it gets no HTTP response, so its failure status
# is ignored here rather than appending a second "000".
post_malformed_json() {
  curl -s -o /dev/null -w "%{http_code}" -X POST "$RPC_URL" \
    -H "Content-Type: application/json" \
    -d '{broken}' 2>/dev/null || true
}

echo ""
echo "============================================"
echo " Archipelago Chaos Test Suite"
echo "============================================"
echo " Duration: ${CHAOS_DURATION_HOURS}h"
echo ""

# Pre-check
if ! curl -sf "$HEALTH_URL" > /dev/null 2>&1; then
  fail "Server not healthy at $HEALTH_URL — aborting"
  exit 1
fi
log "Server is healthy"

authenticate

# =============================================================================
# Test 1: Process Kill Recovery
# =============================================================================
log "=== Test 1: Process Kill Recovery ==="

log "Killing archipelago process..."
# SIGKILL is intentional: the test simulates a hard crash.
if ! sudo systemctl kill --signal=SIGKILL archipelago 2>/dev/null; then
  # Fallback when the unit cannot be signalled through systemd.
  victim_pid=$(pgrep -f "/usr/local/bin/archipelago" | head -n 1 || true)
  if [[ -n "$victim_pid" ]]; then
    sudo kill -9 "$victim_pid" 2>/dev/null || warn "Could not kill PID $victim_pid"
  else
    warn "No archipelago process found to kill"
  fi
fi

sleep 2
if wait_for_health 30; then
  log "Backend recovered after SIGKILL in <30s"
  record "Process kill recovery" "PASS"
else
  fail "Backend did not recover after SIGKILL within 30s"
  record "Process kill recovery" "FAIL"
  # Try to restart manually
  sudo systemctl start archipelago
  sleep 5
fi

authenticate

# =============================================================================
# Test 2: Graceful Restart
# =============================================================================
log "=== Test 2: Graceful Restart ==="

log "Restarting archipelago service..."
sudo systemctl restart archipelago
sleep 2

if wait_for_health 20; then
  log "Backend restarted gracefully"
  record "Graceful restart" "PASS"
else
  fail "Backend did not come up after restart"
  record "Graceful restart" "FAIL"
fi

authenticate

# =============================================================================
# Test 3: Concurrent RPC Requests
# =============================================================================
log "=== Test 3: Concurrent RPC Load (100 requests) ==="

CONCURRENT_PASS=0
CONCURRENT_FAIL=0
mkdir -p -- "$LOAD_DIR"

# Each request drops its own marker file, so no two jobs write the same file.
# The requests carry no session cookie (as before); the counts are reported
# for information and do not decide PASS/FAIL.
for ((i = 1; i <= 100; i++)); do
  (
    result=$(curl -sf -X POST "$RPC_URL" \
      -H "Content-Type: application/json" \
      -d '{"method":"system.stats"}' 2>/dev/null)
    if grep -q "cpu_usage_percent" <<<"$result"; then
      : > "$LOAD_DIR/ok.$i"
    else
      : > "$LOAD_DIR/fail.$i"
    fi
  ) &
done
wait

shopt -s nullglob
ok_marks=("$LOAD_DIR"/ok.*)
fail_marks=("$LOAD_DIR"/fail.*)
shopt -u nullglob
CONCURRENT_PASS=${#ok_marks[@]}
CONCURRENT_FAIL=${#fail_marks[@]}
log "Concurrent responses: $CONCURRENT_PASS with stats payload, $CONCURRENT_FAIL without"

# Re-authenticate in case cookies expired during load
authenticate

# Check server still healthy
if curl -sf "$HEALTH_URL" > /dev/null 2>&1; then
  log "Server survived 100 concurrent requests"
  record "Concurrent RPC load" "PASS"
else
  fail "Server crashed under concurrent load"
  record "Concurrent RPC load" "FAIL"
  sudo systemctl restart archipelago
  sleep 5
  authenticate
fi

# =============================================================================
# Test 4: Container Stop/Start Cycling
# =============================================================================
log "=== Test 4: Container Stop/Start Cycling ==="

# Use filebrowser as test container (lightweight, quick to restart) unless
# overridden; an explicitly empty CHAOS_CONTAINER_ID skips the test.
CONTAINER_ID="${CHAOS_CONTAINER_ID-filebrowser}"

if [[ -n "$CONTAINER_ID" ]]; then
  log "Testing with container: $CONTAINER_ID"

  # Stop
  rpc "package.stop" "{\"id\":\"$CONTAINER_ID\"}" > /dev/null
  sleep 3

  # Verify stopped (informational only; does not change the test result)
  if container_is_up "$CONTAINER_ID"; then
    warn "Container $CONTAINER_ID still listed as up after package.stop"
  fi

  # Start
  rpc "package.start" "{\"id\":\"$CONTAINER_ID\"}" > /dev/null
  sleep 10

  # Verify running (check both container-status and podman directly)
  status=$(rpc "container-status" "{\"id\":\"$CONTAINER_ID\"}")
  if grep -qi "running" <<<"$status" || container_is_up "$CONTAINER_ID"; then
    log "Container $CONTAINER_ID stop/start cycle OK"
    record "Container cycling" "PASS"
  else
    warn "Container $CONTAINER_ID may not have restarted"
    record "Container cycling" "FAIL"
  fi
else
  warn "No running containers found, skipping container test"
  TESTS+=("SKIP Container cycling (no containers)")
fi

# =============================================================================
# Test 5: RPC Error Handling
# =============================================================================
log "=== Test 5: RPC Error Handling ==="

# Invalid method
result=$(rpc "nonexistent.method")
if grep -qiE "error|unknown" <<<"$result"; then
  log "Invalid method correctly returns error"
  err_pass=true
else
  fail "Invalid method did not return error"
  err_pass=false
fi

# Malformed JSON — server should not crash (any HTTP response is acceptable)
http_code=$(post_malformed_json)
if [[ -n "$http_code" && "$http_code" != "000" ]]; then
  log "Malformed JSON handled without crash (HTTP $http_code)"
else
  # Server may have been restarting from previous test, wait and retry
  sleep 3
  http_code=$(post_malformed_json)
  if [[ -n "$http_code" && "$http_code" != "000" ]]; then
    log "Malformed JSON handled without crash (HTTP $http_code, retry)"
  else
    warn "Server unreachable for malformed JSON test"
    err_pass=false
  fi
fi

# Missing params
result=$(rpc "backup.create")
if grep -qiE "error|missing" <<<"$result"; then
  log "Missing params correctly returns error"
else
  fail "Missing params did not return error"
  err_pass=false
fi

if [[ "$err_pass" == true ]]; then
  record "RPC error handling" "PASS"
else
  record "RPC error handling" "FAIL"
fi

# =============================================================================
# Test 6: Rapid Restart Cycling
# =============================================================================
log "=== Test 6: Rapid Restart Cycling ==="

# A single flag guarantees exactly one PASS or FAIL entry for this test.
rapid_ok=true
for cycle in 1 2 3; do
  sudo systemctl restart archipelago
  sleep 3
  if ! wait_for_health 15; then
    fail "Failed to recover on cycle $cycle"
    rapid_ok=false
    break
  fi
done

if [[ "$rapid_ok" == true ]]; then
  log "Server survived 3 rapid restarts"
  record "Rapid restart cycling" "PASS"
else
  record "Rapid restart cycling" "FAIL"
fi

authenticate

# =============================================================================
# Test 7: Data Integrity After Chaos
# =============================================================================
log "=== Test 7: Data Integrity Check ==="

# Check system stats still work
stats=$(rpc "system.stats")
if grep -q "cpu_usage_percent" <<<"$stats"; then
  log "System stats OK"
  data_ok=true
else
  fail "System stats broken"
  data_ok=false
fi

# Check update status
update=$(rpc "update.status")
if grep -q "current_version" <<<"$update"; then
  log "Update status OK"
else
  fail "Update status broken"
  data_ok=false
fi

# Check backup list
backups=$(rpc "backup.list")
if grep -q "backups" <<<"$backups"; then
  log "Backup list OK"
else
  fail "Backup list broken"
  data_ok=false
fi

if [[ "$data_ok" == true ]]; then
  record "Data integrity" "PASS"
else
  record "Data integrity" "FAIL"
fi

# =============================================================================
# Summary
# =============================================================================
# (Temporary files are removed by the EXIT trap.)

echo ""
echo "============================================"
echo " Chaos Test Results"
echo "============================================"
for r in "${TESTS[@]}"; do
  case "$r" in
    PASS*) printf '%s\n' " ${GREEN}$r${NC}" ;;
    FAIL*) printf '%s\n' " ${RED}$r${NC}" ;;
    SKIP*) printf '%s\n' " ${YELLOW}$r${NC}" ;;
  esac
done
echo ""
echo " Passed: $PASS Failed: $FAIL"
echo "============================================"

if [[ "$FAIL" -gt 0 ]]; then
  exit 1
fi