archy/scripts/chaos-test.sh
Dorian f07ce10b1a refactor: update dependencies and remove unused code
- Added new dependencies: `adler2`, `crc32fast`, `flate2`, `miniz_oxide`, and `libredox`.
- Updated existing dependencies: `tokio-rustls` to version 0.26.4 and `filetime` to version 0.2.27.
- Removed the `backup.rs` file as it is no longer needed.
- Introduced tests for configuration and credential management.
- Enhanced the `identity` module to generate W3C compliant DID documents.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-12 00:19:30 +00:00

340 lines
9.7 KiB
Bash
Executable File

#!/usr/bin/env bash
# chaos-test.sh — Chaos/resilience test for Archipelago server.
#
# Tests the server's ability to survive adverse conditions:
# - Process kills (verify systemd restart)
# - Container stop/start cycling
# - Concurrent RPC requests (verify no crashes)
# - High disk usage warnings
# - Network interruption recovery
#
# Usage:
# ssh archipelago@192.168.1.228 "cd ~/archy && bash scripts/chaos-test.sh"
#
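# Duration: each test below runs once. CHAOS_DURATION_HOURS (default 0.5) is
# printed in the banner, but nothing in this script currently uses it to
# lengthen or repeat the run.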
# Strict-ish mode: unset variables are errors and a pipeline fails when any
# stage fails. `-e` is deliberately not enabled; the tests below inspect
# failures themselves.
set -u
set -o pipefail

# Tunables and endpoints.
: "${CHAOS_DURATION_HOURS:=0.5}"   # caller may override via the environment
RPC_URL="http://localhost:5678/rpc/v1"
HEALTH_URL="http://localhost/health"
MAX_RECOVERY_WAIT=60               # default timeout (seconds) for wait_for_health

# Colors (kept as literal escape text; expanded later by `echo -e`).
NC='\033[0m'
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'

# Result bookkeeping, updated by record().
PASS=0
FAIL=0
TESTS=()
# Console helpers. Each prints one line to stdout: a coloured tag followed by
# all arguments joined with spaces. Backslash escapes in the text are expanded.
#   _chaos_say COLOR TAG [words...]
_chaos_say() {
  local tint="$1" tag="$2"
  shift 2
  printf '%b\n' "${tint}[${tag}]${NC} $*"
}

log()  { _chaos_say "$GREEN" "CHAOS" "$@"; }
warn() { _chaos_say "$YELLOW" "WARN" "$@"; }
fail() { _chaos_say "$RED" "FAIL" "$@"; }
# record NAME RESULT
# Tally one test outcome. A RESULT of exactly "PASS" increments PASS; any
# other value counts as a failure and increments FAIL. A "PASS <name>" or
# "FAIL <name>" entry is appended to TESTS for the final summary.
record() {
  local label="$1" outcome="$2"
  case "$outcome" in
    PASS)
      PASS=$((PASS + 1))
      TESTS+=("PASS $label")
      ;;
    *)
      FAIL=$((FAIL + 1))
      TESTS+=("FAIL $label")
      ;;
  esac
}
# Authenticate
# Session cookies are kept in a private temp file. The EXIT trap removes it on
# every exit path, including the early abort when the server is unhealthy.
COOKIE_FILE=$(mktemp) || { echo "mktemp failed, cannot store session cookies" >&2; exit 1; }
trap 'rm -f "$COOKIE_FILE"' EXIT

# authenticate
# POST auth.login to RPC_URL and store the resulting cookies in COOKIE_FILE.
# The password comes from CHAOS_PASSWORD when set, otherwise the historical
# default is used. All curl output is discarded; the return status is curl's.
authenticate() {
  local password="${CHAOS_PASSWORD:-password123}"
  # Escape backslashes first, then double quotes, so the value stays a valid
  # JSON string whatever the caller supplies.
  password=${password//\\/\\\\}
  password=${password//\"/\\\"}
  curl -s -c "$COOKIE_FILE" -X POST "$RPC_URL" \
    -H "Content-Type: application/json" \
    -d "{\"method\":\"auth.login\",\"params\":{\"password\":\"${password}\"}}" > /dev/null 2>&1
}
# rpc METHOD [PARAMS_JSON]
# POST {"method":METHOD,"params":PARAMS_JSON} to RPC_URL using the cookies in
# COOKIE_FILE, and print the response body on stdout. PARAMS_JSON is inserted
# verbatim and defaults to null. The X-CSRF-Token header carries the last
# field of any cookie-file line mentioning csrf_token (empty when there is
# none or the file is unreadable). curl's stderr is discarded.
rpc() {
  local method="$1" params="${2:-null}"
  local token payload
  token=$(awk '/csrf_token/ {print $NF}' "$COOKIE_FILE" 2>/dev/null || true)
  payload="{\"method\":\"$method\",\"params\":$params}"
  curl -s -b "$COOKIE_FILE" -X POST "$RPC_URL" \
    -H "Content-Type: application/json" \
    -H "X-CSRF-Token: $token" \
    -d "$payload" 2>/dev/null
}
# wait_for_health [TIMEOUT_SECONDS]
# Poll HEALTH_URL every 2 seconds until it answers successfully.
# Arguments: $1 - whole seconds to keep trying (default: MAX_RECOVERY_WAIT)
# Returns:   0 as soon as a probe succeeds, 1 once the deadline has passed.
#
# The deadline is wall-clock (bash's SECONDS counter), so time spent inside a
# slow probe counts against the timeout. Each probe is capped with --max-time
# so a connection that is accepted but never answered cannot block the loop.
wait_for_health() {
  local timeout="${1:-$MAX_RECOVERY_WAIT}"
  local deadline=$((SECONDS + timeout))
  while [ "$SECONDS" -lt "$deadline" ]; do
    if curl -sf --max-time 5 "$HEALTH_URL" > /dev/null 2>&1; then
      return 0
    fi
    sleep 2
  done
  return 1
}
# Banner: suite title plus the configured duration value.
printf '%s\n' \
  "" \
  "============================================" \
  " Archipelago Chaos Test Suite" \
  "============================================" \
  " Duration: ${CHAOS_DURATION_HOURS}h" \
  ""

# Pre-check: refuse to start unless the health endpoint already answers.
curl -sf "$HEALTH_URL" > /dev/null 2>&1 || {
  fail "Server not healthy at $HEALTH_URL — aborting"
  exit 1
}
log "Server is healthy"

# Obtain a session before the first test.
authenticate
# =============================================================================
# Test 1: Process Kill Recovery
# =============================================================================
# SIGKILL the service and expect the health endpoint to answer again without
# any manual intervention.
log "=== Test 1: Process Kill Recovery ==="
log "Killing archipelago process..."
if ! sudo systemctl kill --signal=SIGKILL archipelago 2>/dev/null; then
  # systemctl could not deliver the signal; fall back to killing the first
  # process whose command line matches the binary path.
  sudo kill -9 $(pgrep -f "/usr/local/bin/archipelago" | head -1) 2>/dev/null
fi
sleep 2

if ! wait_for_health 30; then
  fail "Backend did not recover after SIGKILL within 30s"
  record "Process kill recovery" "FAIL"
  # Try to restart manually
  sudo systemctl start archipelago
  sleep 5
else
  log "Backend recovered after SIGKILL in <30s"
  record "Process kill recovery" "PASS"
fi

# The kill ended the old process, so log in again.
authenticate
# =============================================================================
# Test 2: Graceful Restart
# =============================================================================
# Restart through systemd and give the service 20 seconds to report healthy.
log "=== Test 2: Graceful Restart ==="
log "Restarting archipelago service..."
sudo systemctl restart archipelago
sleep 2

restart_verdict="FAIL"
if wait_for_health 20; then
  restart_verdict="PASS"
  log "Backend restarted gracefully"
else
  fail "Backend did not come up after restart"
fi
record "Graceful restart" "$restart_verdict"

# Log in again after the restart.
authenticate
# =============================================================================
# Test 3: Concurrent RPC Requests
# =============================================================================
# Fire 100 system.stats requests in parallel, then check the server is still
# healthy. The verdict depends only on the health check: the requests are sent
# without the session cookie, so the per-request counts are informational.
log "=== Test 3: Concurrent RPC Load (100 requests) ==="
CONCURRENT_PASS=0
CONCURRENT_FAIL=0
CONCURRENT_PIDS=()
for _ in {1..100}; do
  (
    result=$(curl -sf -X POST "$RPC_URL" \
      -H "Content-Type: application/json" \
      -d '{"method":"system.stats"}' 2>/dev/null)
    # The subshell's exit status carries the outcome back to the parent, so no
    # shared scratch files in /tmp are needed.
    printf '%s' "$result" | grep -q "cpu_usage_percent"
  ) &
  CONCURRENT_PIDS+=("$!")
done

# Reap every job individually to collect its outcome.
for pid in "${CONCURRENT_PIDS[@]}"; do
  if wait "$pid"; then
    CONCURRENT_PASS=$((CONCURRENT_PASS + 1))
  else
    CONCURRENT_FAIL=$((CONCURRENT_FAIL + 1))
  fi
done
log "Concurrent requests returning stats: $CONCURRENT_PASS, without stats: $CONCURRENT_FAIL"

# Re-authenticate in case cookies expired during load
authenticate
# Check server still healthy
if curl -sf "$HEALTH_URL" > /dev/null 2>&1; then
  log "Server survived 100 concurrent requests"
  record "Concurrent RPC load" "PASS"
else
  fail "Server crashed under concurrent load"
  record "Concurrent RPC load" "FAIL"
  sudo systemctl restart archipelago
  sleep 5
  authenticate
fi
# =============================================================================
# Test 4: Container Stop/Start Cycling
# =============================================================================
# Stop and start one package through the RPC API and confirm it comes back.
log "=== Test 4: Container Stop/Start Cycling ==="
# Use filebrowser as test container (lightweight, quick to restart).
# CHAOS_CONTAINER_ID overrides the choice; setting it to an empty string skips
# this test.
CONTAINER_ID="${CHAOS_CONTAINER_ID-filebrowser}"

# container_is_up NAME
# Succeeds when `podman ps` lists a container with exactly this name whose
# status text contains "up". Uses grep -q rather than `grep -c ... || echo 0`,
# because grep -c already prints 0 on no match and the fallback would add a
# second line, making the value unusable in a numeric test.
container_is_up() {
  podman ps --filter "name=^${1}$" --format "{{.Status}}" 2>/dev/null \
    | head -1 \
    | grep -qi "up"
}

if [ -n "$CONTAINER_ID" ]; then
  log "Testing with container: $CONTAINER_ID"
  # Stop
  rpc "package.stop" "{\"id\":\"$CONTAINER_ID\"}" > /dev/null
  sleep 3
  # Query the status endpoint while stopped. Its reply format is not known
  # here, so only podman is consulted for the stop check, and a container that
  # is still up produces a warning rather than a failure.
  rpc "container-status" "{\"id\":\"$CONTAINER_ID\"}" > /dev/null
  if container_is_up "$CONTAINER_ID"; then
    warn "Container $CONTAINER_ID still reported up by podman after package.stop"
  fi
  # Start
  rpc "package.start" "{\"id\":\"$CONTAINER_ID\"}" > /dev/null
  sleep 10
  # Verify running (check both container-status and podman directly)
  status=$(rpc "container-status" "{\"id\":\"$CONTAINER_ID\"}")
  if echo "$status" | grep -qi "running" || container_is_up "$CONTAINER_ID"; then
    log "Container $CONTAINER_ID stop/start cycle OK"
    record "Container cycling" "PASS"
  else
    warn "Container $CONTAINER_ID may not have restarted"
    record "Container cycling" "FAIL"
  fi
else
  warn "No running containers found, skipping container test"
  TESTS+=("SKIP Container cycling (no containers)")
fi
# =============================================================================
# Test 5: RPC Error Handling
# =============================================================================
# Three probes: an unknown method, a malformed JSON body, and a call without
# its params. The test passes only if all three behave.
log "=== Test 5: RPC Error Handling ==="
err_pass=true

# probe_malformed_json
# POST an invalid JSON body and print the HTTP status code. Prints "000" when
# no response was received. curl itself writes 000 for %{http_code} when the
# request fails, so no extra fallback text may be appended to its output:
# doing so would turn the value into "000000" and hide an unreachable server.
probe_malformed_json() {
  local code
  code=$(curl -s -o /dev/null -w "%{http_code}" -X POST "$RPC_URL" \
    -H "Content-Type: application/json" -d '{broken}' 2>/dev/null) || true
  printf '%s' "${code:-000}"
}

# Invalid method
result=$(rpc "nonexistent.method")
if echo "$result" | grep -qi "error\|unknown"; then
  log "Invalid method correctly returns error"
else
  fail "Invalid method did not return error"
  err_pass=false
fi

# Malformed JSON — server should not crash (any response is acceptable)
http_code=$(probe_malformed_json)
if [ "$http_code" != "000" ]; then
  log "Malformed JSON handled without crash (HTTP $http_code)"
else
  # Server may have been restarting from previous test, wait and retry
  sleep 3
  http_code=$(probe_malformed_json)
  if [ "$http_code" != "000" ]; then
    log "Malformed JSON handled without crash (HTTP $http_code, retry)"
  else
    warn "Server unreachable for malformed JSON test"
    err_pass=false
  fi
fi

# Missing params
result=$(rpc "backup.create")
if echo "$result" | grep -qi "error\|missing"; then
  log "Missing params correctly returns error"
else
  fail "Missing params did not return error"
  err_pass=false
fi

if [ "$err_pass" = true ]; then
  record "RPC error handling" "PASS"
else
  record "RPC error handling" "FAIL"
fi
# =============================================================================
# Test 6: Rapid Reconnection
# =============================================================================
# Restart the service three times in quick succession. Exactly one result is
# recorded: PASS only when every cycle recovered and the final health probe
# succeeds, FAIL otherwise.
log "=== Test 6: Rapid Restart Cycling ==="
rapid_ok=true
for i in 1 2 3; do
  sudo systemctl restart archipelago
  sleep 3
  if ! wait_for_health 15; then
    fail "Failed to recover on cycle $i"
    rapid_ok=false
    break
  fi
done

# Final probe, skipped when a cycle has already failed.
if [ "$rapid_ok" = true ] && ! curl -sf "$HEALTH_URL" > /dev/null 2>&1; then
  fail "Server unhealthy after rapid restarts"
  rapid_ok=false
fi

if [ "$rapid_ok" = true ]; then
  log "Server survived 3 rapid restarts"
  record "Rapid restart cycling" "PASS"
else
  record "Rapid restart cycling" "FAIL"
fi
authenticate
# =============================================================================
# Test 7: Data Integrity After Chaos
# =============================================================================
# After all the restarts, three read-only RPC calls must still return a
# response containing their expected field name.
log "=== Test 7: Data Integrity Check ==="
data_ok=true

# check_rpc_field METHOD MARKER LABEL
# Call METHOD with no params and look for MARKER in the reply. Logs
# "<LABEL> OK" on a match; otherwise logs "<LABEL> broken" and clears data_ok,
# so every failing check is named in the output.
check_rpc_field() {
  local method="$1" marker="$2" label="$3"
  local reply
  reply=$(rpc "$method")
  if echo "$reply" | grep -q "$marker"; then
    log "$label OK"
  else
    fail "$label broken"
    data_ok=false
  fi
}

# Check system stats still work
check_rpc_field "system.stats" "cpu_usage_percent" "System stats"
# Check update status
check_rpc_field "update.status" "current_version" "Update status"
# Check backup list
check_rpc_field "backup.list" "backups" "Backup list"

if [ "$data_ok" = true ]; then
  record "Data integrity" "PASS"
else
  record "Data integrity" "FAIL"
fi
# =============================================================================
# Summary
# =============================================================================
# Drop the session cookies, print one coloured line per recorded result, then
# exit non-zero if anything failed.
rm -f "$COOKIE_FILE"

printf '%s\n' \
  "" \
  "============================================" \
  " Chaos Test Results" \
  "============================================"

for entry in "${TESTS[@]}"; do
  if [[ "$entry" == PASS* ]]; then
    tint="$GREEN"
  elif [[ "$entry" == FAIL* ]]; then
    tint="$RED"
  elif [[ "$entry" == SKIP* ]]; then
    tint="$YELLOW"
  else
    # Entries with any other prefix are not printed.
    continue
  fi
  echo -e " ${tint}${entry}${NC}"
done

printf '%s\n' \
  "" \
  " Passed: $PASS Failed: $FAIL" \
  "============================================"

if (( FAIL > 0 )); then
  exit 1
fi