#!/usr/bin/env bash
# FINAL-202: 72-Hour Stability Test
# Monitors a running Archipelago node for 72 hours, checking health every 5 minutes.
#
# Usage: bash test-stability-72h.sh [node] [password] [duration_hours]
#   node            host/IP of the node            (default: 192.168.1.228)
#   password        password sent to auth.login    (default: password123)
#   duration_hours  whole hours to keep monitoring (default: 72)
#
# Logs results to /tmp/stability-test-<timestamp>.log and failures to
# /tmp/stability-failures-<timestamp>.log.
# Exit status: 0 if no failure was recorded, 1 otherwise, 2 on bad arguments.
set -euo pipefail

NODE="${1:-192.168.1.228}"
BASE="http://${NODE}"
PASS="${2:-password123}"
DURATION_HOURS="${3:-72}"
CHECK_INTERVAL=300 # 5 minutes
CURL_TIMEOUT=10    # seconds, applied to every HTTP/RPC request

# Reject anything that would break the arithmetic below.
if ! [[ "$DURATION_HOURS" =~ ^[0-9]+$ ]]; then
  printf 'error: duration_hours must be a non-negative integer, got: %s\n' \
    "$DURATION_HOURS" >&2
  exit 2
fi

TIMESTAMP=$(date +%Y%m%d-%H%M%S)
LOG_FILE="/tmp/stability-test-${TIMESTAMP}.log"
FAIL_LOG="/tmp/stability-failures-${TIMESTAMP}.log"
SSH_KEY="$HOME/.ssh/archipelago-deploy"

# The cookie jar holds the session cookie: use an unpredictable name and
# remove it on every exit path.
COOKIE_JAR=$(mktemp "${TMPDIR:-/tmp}/stability-cookies.XXXXXX")
trap 'rm -f -- "$COOKIE_JAR"' EXIT

START_TIME=$(date +%s)
# 10# forces base 10 so a value such as "08" is not parsed as octal.
END_TIME=$((START_TIME + 10#$DURATION_HOURS * 3600))

readonly NODE BASE PASS DURATION_HOURS CHECK_INTERVAL CURL_TIMEOUT
readonly TIMESTAMP LOG_FILE FAIL_LOG SSH_KEY COOKIE_JAR START_TIME END_TIME

# Write a timestamped line to stdout and append it to LOG_FILE.
log() {
  echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*" | tee -a "$LOG_FILE"
}

# Append a timestamped FAIL line to both LOG_FILE and FAIL_LOG (not stdout).
fail_log() {
  echo "[$(date '+%Y-%m-%d %H:%M:%S')] FAIL: $*" | tee -a "$LOG_FILE" >>"$FAIL_LOG"
}

# Escape backslashes and double quotes so $1 can be embedded in a JSON string.
# Control characters are not handled.
json_escape() {
  local s=$1
  s=${s//\\/\\\\}
  s=${s//\"/\\\"}
  printf '%s' "$s"
}

# Call auth.login and store the session cookie in COOKIE_JAR.
# Returns curl's exit status; the response body is discarded, so a zero
# status only means the request completed at transport level.
login() {
  local payload
  payload=$(printf '{"jsonrpc":"2.0","id":1,"method":"auth.login","params":{"password":"%s"}}' \
    "$(json_escape "$PASS")")
  # Body is fed over stdin so the password does not appear in curl's argv.
  printf '%s' "$payload" \
    | curl -s -m "$CURL_TIMEOUT" -c "$COOKIE_JAR" \
      -H "Content-Type: application/json" \
      --data-binary @- "${BASE}/rpc/" >/dev/null 2>&1
}

# Issue a JSON-RPC call and print the raw response on stdout.
#   $1 - method name
#   $2 - params as a JSON value (optional, defaults to {})
# Returns curl's exit status.
rpc() {
  local method=$1
  # Not written as ${2:-{}}: bash ends that expansion at the first '}' and
  # would append a stray brace whenever params are supplied.
  local params=${2:-}
  [[ -n "$params" ]] || params='{}'
  curl -s -m "$CURL_TIMEOUT" -b "$COOKIE_JAR" -c "$COOKIE_JAR" \
    -H "Content-Type: application/json" \
    -d "{\"jsonrpc\":\"2.0\",\"id\":1,\"method\":\"${method}\",\"params\":${params}}" \
    "${BASE}/rpc/" 2>/dev/null
}

# Succeeds when the JSON-RPC response in $1 contains a "result" member.
rpc_ok() {
  [[ "$1" == *'"result"'* ]]
}

# Print the HTTP status code for a request; all arguments are passed to curl.
# curl itself writes 000 when no response was received, so nothing is
# appended on failure (appending would produce "000000").
http_status() {
  local code
  code=$(curl -s -o /dev/null -w '%{http_code}' "$@" 2>/dev/null) || true
  printf '%s\n' "${code:-000}"
}

# Run a command line on the node over SSH; prints its stdout, returns ssh's status.
# BatchMode stops ssh from blocking on a password prompt during an unattended run.
# NOTE(review): StrictHostKeyChecking=no skips host key verification — confirm
# this is acceptable for the network the test runs on.
remote_exec() {
  ssh -i "$SSH_KEY" -o BatchMode=yes -o ConnectTimeout=10 \
    -o StrictHostKeyChecking=no "archipelago@${NODE}" "$1" 2>/dev/null
}

# Print the value of Python expression $2 evaluated with `r` bound to the
# parsed JSON document $1. Returns non-zero if parsing or evaluation fails.
json_query() {
  local json=$1 expr=$2
  python3 -c "import sys,json; r=json.load(sys.stdin); print(${expr})" \
    <<<"$json" 2>/dev/null
}

# Run one round of health checks against the node.
# Globals: BASE, SSH_KEY (read); LOG_FILE, FAIL_LOG (appended via log/fail_log)
# Returns: the number of failed checks in this round (0 means healthy).
# Must be called in a conditional context (`if`, `||`) so a non-zero return
# does not trip `set -e`.
check_health() {
  local failures=0

  # 1. Backend health
  local health_code
  health_code=$(http_status -m "$CURL_TIMEOUT" "${BASE}/health")
  if [[ "$health_code" != "200" ]]; then
    fail_log "Backend health endpoint returned $health_code"
    failures=$((failures + 1))
  fi

  # 2. UI loads
  local ui_code
  ui_code=$(http_status -m "$CURL_TIMEOUT" "${BASE}/")
  if [[ "$ui_code" != "200" && "$ui_code" != "302" ]]; then
    fail_log "Web UI returned $ui_code"
    failures=$((failures + 1))
  fi

  # 3. RPC responds (one re-login attempt in case the session expired)
  local rpc_resp
  rpc_resp=$(rpc "system.info") || rpc_resp=""
  if ! rpc_ok "$rpc_resp"; then
    login || true
    rpc_resp=$(rpc "system.info") || rpc_resp=""
    if ! rpc_ok "$rpc_resp"; then
      fail_log "RPC system.info failed after re-login"
      failures=$((failures + 1))
    fi
  fi

  # 4. WebSocket endpoint: any HTTP status counts as reachable
  local ws_code
  ws_code=$(http_status -m 5 -H "Upgrade: websocket" "${BASE}/ws/")
  if [[ "$ws_code" == "000" ]]; then
    fail_log "WebSocket endpoint unreachable"
    failures=$((failures + 1))
  fi

  # 5-9. Host-level checks, only when the deploy SSH key is present
  if [[ -f "$SSH_KEY" ]]; then
    # 5. Containers whose status mentions exited/dead/oom (first 5)
    local crashed
    crashed=$(remote_exec 'sudo podman ps -a --format "{{.Names}} {{.Status}}" 2>/dev/null | grep -i "exited\|dead\|oom" | head -5') \
      || crashed=""
    if [[ -n "$crashed" ]]; then
      fail_log "Crashed/dead containers: $crashed"
      failures=$((failures + 1))
    fi

    # 6. Memory usage (informational)
    local mem_info
    mem_info=$(remote_exec 'free -m | grep Mem | awk "{printf \"%d/%dMB (%.0f%%)\", \$3, \$2, \$3/\$2*100}"') \
      || mem_info="unknown"
    log " Memory: $mem_info"

    # 7. Disk usage (informational)
    local disk_info
    disk_info=$(remote_exec 'df -h / | tail -1 | awk "{print \$3\"/\"\$2\" (\"\$5\" used)\"}"') \
      || disk_info="unknown"
    log " Disk: $disk_info"

    # 8. OOM kills: counts every "Out of memory" line in the current dmesg
    # output. grep -c already prints 0 when nothing matches (and exits 1), so
    # the remote side uses `|| true` rather than echoing a second 0.
    local oom_count
    oom_count=$(remote_exec 'dmesg 2>/dev/null | grep -c "Out of memory" || true') \
      || oom_count="unknown"
    if [[ "$oom_count" =~ ^[0-9]+$ ]] && ((10#$oom_count > 0)); then
      fail_log "OOM kills detected: $oom_count"
      failures=$((failures + 1))
    fi

    # 9. archipelago service: is-active prints the state itself and exits
    # non-zero for anything but "active", so no fallback echo is needed remotely.
    local svc_status
    svc_status=$(remote_exec 'systemctl is-active archipelago 2>/dev/null || true') \
      || svc_status=""
    svc_status=${svc_status:-unknown}
    if [[ "$svc_status" != "active" ]]; then
      fail_log "Archipelago service status: $svc_status"
      failures=$((failures + 1))
    fi
  fi

  # 10. Tor services (informational)
  local tor_resp
  tor_resp=$(rpc "tor.list-services") || tor_resp=""
  if rpc_ok "$tor_resp"; then
    local tor_count
    tor_count=$(json_query "$tor_resp" "len(r.get('result',{}).get('services',[]))") \
      || tor_count="0"
    log " Tor services: $tor_count"
  fi

  # 11. Peer connections (informational)
  local peers_resp
  peers_resp=$(rpc "network.list-peers") || peers_resp=""
  if rpc_ok "$peers_resp"; then
    local peer_count
    peer_count=$(json_query "$peers_resp" "len(r.get('result',{}).get('peers',[]))") \
      || peer_count="0"
    log " Connected peers: $peer_count"
  fi

  # 12. Ecash wallet balance (informational)
  local ecash_resp
  ecash_resp=$(rpc "wallet.ecash-balance") || ecash_resp=""
  if rpc_ok "$ecash_resp"; then
    local balance
    balance=$(json_query "$ecash_resp" "r.get('result',{}).get('balance',0)") \
      || balance="0"
    log " Ecash balance: $balance sats"
  fi

  return "$failures"
}

# Run the monitoring loop until END_TIME, then print the report and exit.
main() {
  local total_checks=0    # rounds executed
  local failed_checks=0   # rounds with at least one failure
  local total_failures=0  # individual failures summed over all rounds
  local consecutive=0 max_consecutive=0
  local end_display elapsed rc remaining

  # ─── Main Loop ────────────────────────────────────────────────────
  # BSD date takes the epoch via -r, GNU date via -d @epoch; try both.
  end_display=$(date -r "$END_TIME" '+%Y-%m-%d %H:%M:%S' 2>/dev/null \
    || date -d "@$END_TIME" '+%Y-%m-%d %H:%M:%S' 2>/dev/null \
    || echo 'unknown')

  log "╔════════════════════════════════════════════════════════════════╗"
  log "║ 72-Hour Stability Test — Archipelago ║"
  log "╚════════════════════════════════════════════════════════════════╝"
  log "Target: $NODE"
  log "Duration: ${DURATION_HOURS}h (until ${end_display})"
  log "Check interval: ${CHECK_INTERVAL}s"
  log "Log file: $LOG_FILE"
  log "Failure log: $FAIL_LOG"
  log ""

  # Initial login. A transport failure is reported instead of silently
  # aborting under `set -e`; check_health retries the login on its own.
  rc=0
  login || rc=$?
  if ((rc == 0)); then
    log "Authenticated to node"
  else
    log "WARNING: initial login failed (curl exit $rc) — checks will retry"
  fi

  while (($(date +%s) < END_TIME)); do
    total_checks=$((total_checks + 1))
    elapsed=$(($(date +%s) - START_TIME))
    log "Check #${total_checks} ($((elapsed / 3600))h$(( (elapsed % 3600) / 60 ))m elapsed)"

    rc=0
    check_health || rc=$?
    if ((rc == 0)); then
      consecutive=0
      log " Status: OK"
    else
      failed_checks=$((failed_checks + 1))
      total_failures=$((total_failures + rc))
      consecutive=$((consecutive + 1))
      if ((consecutive > max_consecutive)); then
        max_consecutive=$consecutive
      fi
      log " Status: $rc failure(s) this check"
      if ((consecutive >= 5)); then
        log "WARNING: 5 consecutive check failures — node may be down!"
      fi
    fi

    # Do not sleep past the end of the test window.
    remaining=$((END_TIME - $(date +%s)))
    if ((remaining <= 0)); then
      break
    fi
    sleep "$((remaining < CHECK_INTERVAL ? remaining : CHECK_INTERVAL))"
  done

  # ─── Final Report ─────────────────────────────────────────────────
  log ""
  log "╔════════════════════════════════════════════════════════════════╗"
  log "║ 72-Hour Stability Test — COMPLETE ║"
  log "╚════════════════════════════════════════════════════════════════╝"
  log ""
  log "Duration: ${DURATION_HOURS}h"
  log "Total checks: $total_checks"
  log "Failed checks: $failed_checks"
  log "Total failures: $total_failures"
  log "Max consecutive failures: $max_consecutive"
  log ""

  # Uptime is the share of rounds with no failure. It is based on
  # failed_checks, not total_failures: one round can record several failures.
  local uptime_pct=0
  if ((total_checks > 0)); then
    uptime_pct=$(awk -v p="$((total_checks - failed_checks))" -v t="$total_checks" \
      'BEGIN { printf "%.1f", p / t * 100 }' 2>/dev/null || echo "?")
  fi
  log "Uptime: ${uptime_pct}%"

  if ((total_failures == 0)); then
    log "RESULT: PASS — Zero failures over ${DURATION_HOURS}h"
    exit 0
  fi
  log "RESULT: FAIL — $total_failures failures detected"
  log "See failure details: $FAIL_LOG"
  exit 1
}

main "$@"