223 lines
8.5 KiB
Bash
223 lines
8.5 KiB
Bash
|
|
#!/usr/bin/env bash
|
||
|
|
# FINAL-202: 72-Hour Stability Test
|
||
|
|
# Monitors a running Archipelago node for 72 hours, checking health every 5 minutes.
|
||
|
|
# Usage: bash test-stability-72h.sh [node-ip] [password] [duration-hours]
|
||
|
|
# Logs results to /tmp/stability-test-<timestamp>.log
|
||
|
|
|
||
|
|
set -euo pipefail
|
||
|
|
|
||
|
|
# ─── Configuration ────────────────────────────────────────────────
# Positional arguments (all optional): $1 node address, $2 password,
# $3 test duration in whole hours.
NODE="${1:-192.168.1.228}"
BASE="http://${NODE}"
# NOTE(review): a default password is hard-coded here; prefer passing the real
# one as $2 so it does not live in the script.
PASS="${2:-password123}"
DURATION_HOURS="${3:-72}"
CHECK_INTERVAL=300 # 5 minutes

# Reject anything that is not a plain non-negative integer before it reaches
# the arithmetic below, where it would otherwise fail with an obscure error.
case "$DURATION_HOURS" in
*[!0-9]*)
  echo "error: duration-hours must be a non-negative integer, got '${DURATION_HOURS}'" >&2
  exit 2
  ;;
esac
# Force base 10 so values such as "08" are not parsed as invalid octal.
DURATION_HOURS=$((10#$DURATION_HOURS))

# The cookie jar holds the session cookie, so create it with an unpredictable
# name (mktemp) rather than a fixed path in /tmp, and remove it on exit.
COOKIE_JAR=$(mktemp "${TMPDIR:-/tmp}/stability-cookies.XXXXXX")
trap 'rm -f -- "$COOKIE_JAR"' EXIT

TIMESTAMP=$(date +%Y%m%d-%H%M%S)
LOG_FILE="/tmp/stability-test-${TIMESTAMP}.log"
FAIL_LOG="/tmp/stability-failures-${TIMESTAMP}.log"

# Run-wide counters updated by the main loop.
TOTAL_CHECKS=0
TOTAL_FAILURES=0
CONSECUTIVE_FAILURES=0
MAX_CONSECUTIVE=0

# Epoch seconds bounding the test window.
START_TIME=$(date +%s)
END_TIME=$((START_TIME + DURATION_HOURS * 3600))
|
||
|
|
|
||
|
|
# Write a timestamped message to stdout and append the same line to LOG_FILE.
# Arguments: message words (joined with spaces).
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*" | tee -a "$LOG_FILE"
}
|
||
|
|
# Record a failure: append a timestamped "FAIL:" line to both LOG_FILE and
# FAIL_LOG. Nothing reaches stdout because tee's output is redirected into
# FAIL_LOG.
# Arguments: message words (joined with spaces).
fail_log() {
  printf '[%s] FAIL: %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*" |
    tee -a "$LOG_FILE" >> "$FAIL_LOG"
}
|
||
|
|
|
||
|
|
# Send an auth.login JSON-RPC request to the node and store any cookies the
# server sets in COOKIE_JAR. The response body is discarded.
# Globals:  PASS, BASE, COOKIE_JAR (read)
# Returns:  curl's exit status. Zero only means the HTTP request completed;
#           the response is not inspected, so it does not prove the password
#           was accepted.
login() {
  local escaped payload

  # Escape backslashes first, then double quotes, so a password containing
  # either character still produces valid JSON.
  escaped=${PASS//\\/\\\\}
  escaped=${escaped//\"/\\\"}
  payload="{\"jsonrpc\":\"2.0\",\"id\":1,\"method\":\"auth.login\",\"params\":{\"password\":\"${escaped}\"}}"

  # -m 10 matches the timeout used by rpc(); without it a stalled connection
  # would block the whole monitoring loop.
  curl -s -m 10 -c "$COOKIE_JAR" -H "Content-Type: application/json" \
    -d "$payload" \
    "${BASE}/rpc/" > /dev/null 2>&1
}
|
||
|
|
|
||
|
|
# Issue a JSON-RPC 2.0 call against the node and print the raw response body.
# Globals:   BASE, COOKIE_JAR (read; curl also rewrites the cookie jar)
# Arguments: $1 - method name
#            $2 - params as a JSON value (optional, defaults to {})
# Outputs:   response body on stdout; curl's stderr is discarded
# Returns:   curl's exit status
rpc() {
  local method=$1
  local params='{}'

  # Assign the default explicitly: writing ${2:-{}} ends the expansion at the
  # first '}', which appends a stray brace whenever params are supplied.
  if [ -n "${2:-}" ]; then
    params=$2
  fi

  curl -s -m 10 -b "$COOKIE_JAR" -c "$COOKIE_JAR" \
    -H "Content-Type: application/json" \
    -d "{\"jsonrpc\":\"2.0\",\"id\":1,\"method\":\"${method}\",\"params\":${params}}" \
    "${BASE}/rpc/" 2>/dev/null
}
|
||
|
|
|
||
|
|
# Print the HTTP status code of a request; all arguments are passed to curl.
# Prints "000" when no code was produced. Returns curl's exit status.
_http_status() {
  local code rc=0
  # curl emits the -w output even when the transfer fails, so the exit status
  # is captured separately rather than appending a fallback to the output
  # (which would turn "000" into "000000").
  code=$(curl -s -o /dev/null -w "%{http_code}" "$@" 2>/dev/null) || rc=$?
  printf '%s' "${code:-000}"
  return "$rc"
}

# Run a command on the node over SSH as the archipelago user.
# Arguments: $1 - private key path, $2 - remote command line.
_node_ssh() {
  # BatchMode=yes makes ssh fail instead of waiting on a password/passphrase
  # prompt, which would stall an unattended run.
  ssh -i "$1" -o BatchMode=yes -o ConnectTimeout=10 -o StrictHostKeyChecking=no \
    "archipelago@${NODE}" "$2" 2>/dev/null
}

# Run one round of health checks against the node.
# Globals:  BASE, NODE, HOME (read); calls rpc, login, log and fail_log.
# Outputs:  informational lines via log, one fail_log entry per failed check.
# Returns:  the number of failed checks (0 means everything passed).
check_health() {
  local failures=0

  # 1. Backend health (an incomplete transfer counts as a failure)
  local health_code
  health_code=$(_http_status -m 10 "${BASE}/health") || health_code="000"
  if [ "$health_code" != "200" ]; then
    fail_log "Backend health endpoint returned $health_code"
    failures=$((failures + 1))
  fi

  # 2. UI loads
  local ui_code
  ui_code=$(_http_status -m 10 "${BASE}/") || ui_code="000"
  if [ "$ui_code" != "200" ] && [ "$ui_code" != "302" ]; then
    fail_log "Web UI returned $ui_code"
    failures=$((failures + 1))
  fi

  # 3. RPC responds. The substring test is done in-shell: piping into
  # `grep -q` can fail spuriously under pipefail when grep exits early.
  local rpc_resp
  rpc_resp=$(rpc "system.info" 2>/dev/null) || true
  if [[ "$rpc_resp" != *'"result"'* ]]; then
    # Try re-login
    login || true
    rpc_resp=$(rpc "system.info" 2>/dev/null) || true
    if [[ "$rpc_resp" != *'"result"'* ]]; then
      fail_log "RPC system.info failed after re-login"
      failures=$((failures + 1))
    fi
  fi

  # 4. WebSocket endpoint. Only "no HTTP response at all" (000) is a failure;
  # curl's exit status is ignored so a timeout after a response still passes.
  local ws_code
  ws_code=$(_http_status -m 5 -H "Upgrade: websocket" "${BASE}/ws/") || true
  if [ "$ws_code" = "000" ]; then
    fail_log "WebSocket endpoint unreachable"
    failures=$((failures + 1))
  fi

  # 5. Check containers via SSH (only when the deploy key exists locally)
  local ssh_key="$HOME/.ssh/archipelago-deploy"
  if [ -f "$ssh_key" ]; then
    local crashed
    crashed=$(_node_ssh "$ssh_key" \
      'sudo podman ps -a --format "{{.Names}} {{.Status}}" 2>/dev/null | grep -i "exited\|dead\|oom" | head -5') || crashed=""
    if [ -n "$crashed" ]; then
      fail_log "Crashed/dead containers: $crashed"
      failures=$((failures + 1))
    fi

    # 6. Check memory usage (informational only)
    local mem_info
    mem_info=$(_node_ssh "$ssh_key" \
      'free -m | grep Mem | awk "{printf \"%d/%dMB (%.0f%%)\", \$3, \$2, \$3/\$2*100}"') || mem_info=""
    log " Memory: ${mem_info:-unknown}"

    # 7. Check disk usage (informational only)
    local disk_info
    disk_info=$(_node_ssh "$ssh_key" \
      'df -h / | tail -1 | awk "{print \$3\"/\"\$2\" (\"\$5\" used)\"}"') || disk_info=""
    log " Disk: ${disk_info:-unknown}"

    # 8. Count OOM-killer messages in the kernel log. This scans whatever
    # dmesg currently holds, not only entries since the test started.
    # `grep -c` already prints 0 when nothing matches (and exits 1), so the
    # remote command uses `|| true` instead of echoing a second "0".
    local oom_count
    oom_count=$(_node_ssh "$ssh_key" \
      'dmesg 2>/dev/null | grep -c "Out of memory" || true') || oom_count="unknown"
    case "$oom_count" in
    '' | *[!0-9]*) oom_count="unknown" ;;
    esac
    if [ "$oom_count" != "unknown" ] && [ "$oom_count" -gt 0 ]; then
      fail_log "OOM kills detected: $oom_count"
      failures=$((failures + 1))
    fi

    # 9. Check archipelago service. `systemctl is-active` prints the state
    # itself, so no fallback is echoed remotely; empty output or an SSH
    # failure is reported as "unknown".
    local svc_status
    svc_status=$(_node_ssh "$ssh_key" \
      'systemctl is-active archipelago 2>/dev/null || true') || svc_status=""
    svc_status=${svc_status:-unknown}
    if [ "$svc_status" != "active" ]; then
      fail_log "Archipelago service status: $svc_status"
      failures=$((failures + 1))
    fi
  fi

  # 10. Check Tor services (informational only)
  local tor_resp
  tor_resp=$(rpc "tor.list-services" 2>/dev/null) || true
  if [[ "$tor_resp" == *'"result"'* ]]; then
    local tor_count
    tor_count=$(python3 -c "import sys,json; r=json.load(sys.stdin); print(len(r.get('result',{}).get('services',[])))" 2>/dev/null <<< "$tor_resp" || echo "0")
    log " Tor services: $tor_count"
  fi

  # 11. Check peer connections (informational only)
  local peers_resp
  peers_resp=$(rpc "network.list-peers" 2>/dev/null) || true
  if [[ "$peers_resp" == *'"result"'* ]]; then
    local peer_count
    peer_count=$(python3 -c "import sys,json; r=json.load(sys.stdin); print(len(r.get('result',{}).get('peers',[])))" 2>/dev/null <<< "$peers_resp" || echo "0")
    log " Connected peers: $peer_count"
  fi

  # 12. Ecash wallet balance check (informational only)
  local ecash_resp
  ecash_resp=$(rpc "wallet.ecash-balance" 2>/dev/null) || true
  if [[ "$ecash_resp" == *'"result"'* ]]; then
    local balance
    balance=$(python3 -c "import sys,json; r=json.load(sys.stdin); print(r.get('result',{}).get('balance',0))" 2>/dev/null <<< "$ecash_resp" || echo "0")
    log " Ecash balance: $balance sats"
  fi

  return "$failures"
}
|
||
|
|
|
||
|
|
# ─── Main Loop ────────────────────────────────────────────────────
# Runs check_health every CHECK_INTERVAL seconds until END_TIME, tracking
# both the number of individual failures and the number of checks that had
# at least one failure.
log "╔════════════════════════════════════════════════════════════════╗"
log "║ 72-Hour Stability Test — Archipelago ║"
log "╚════════════════════════════════════════════════════════════════╝"
log "Target: $NODE"
# `date -r <epoch>` is tried first and `date -d @<epoch>` second, so whichever
# form the local date(1) accepts is used.
END_DISPLAY=$(date -r "$END_TIME" '+%Y-%m-%d %H:%M:%S' 2>/dev/null ||
  date -d "@${END_TIME}" '+%Y-%m-%d %H:%M:%S' 2>/dev/null ||
  echo 'unknown')
log "Duration: ${DURATION_HOURS}h (until ${END_DISPLAY})"
log "Check interval: ${CHECK_INTERVAL}s"
log "Log file: $LOG_FILE"
log "Failure log: $FAIL_LOG"
log ""

# Initial login. A failure here must not abort the script silently under
# `set -e`; check_health re-attempts the login whenever RPC calls fail.
if login; then
  log "Authenticated to node"
else
  log "WARNING: initial login request failed — will retry during health checks"
fi

# Number of checks with at least one failure (TOTAL_FAILURES counts the
# individual failures instead, so it can exceed TOTAL_CHECKS).
FAILED_CHECKS=0

while [ "$(date +%s)" -lt "$END_TIME" ]; do
  TOTAL_CHECKS=$((TOTAL_CHECKS + 1))
  ELAPSED_S=$(($(date +%s) - START_TIME))
  ELAPSED_H=$((ELAPSED_S / 3600))
  ELAPSED_M=$(((ELAPSED_S % 3600) / 60))

  log "Check #${TOTAL_CHECKS} (${ELAPSED_H}h${ELAPSED_M}m elapsed)"

  if check_health; then
    CONSECUTIVE_FAILURES=0
    log " Status: OK"
  else
    # $? here is check_health's return value: the failure count for this check.
    FAIL_RESULT=$?
    TOTAL_FAILURES=$((TOTAL_FAILURES + FAIL_RESULT))
    FAILED_CHECKS=$((FAILED_CHECKS + 1))
    CONSECUTIVE_FAILURES=$((CONSECUTIVE_FAILURES + 1))
    if [ "$CONSECUTIVE_FAILURES" -gt "$MAX_CONSECUTIVE" ]; then
      MAX_CONSECUTIVE=$CONSECUTIVE_FAILURES
    fi
    log " Status: $FAIL_RESULT failure(s) this check"

    if [ "$CONSECUTIVE_FAILURES" -ge 5 ]; then
      log "WARNING: 5 consecutive check failures — node may be down!"
    fi
  fi

  # Sleep until the next check, but never past the end of the test window.
  REMAINING=$((END_TIME - $(date +%s)))
  if [ "$REMAINING" -le 0 ]; then
    break
  elif [ "$REMAINING" -lt "$CHECK_INTERVAL" ]; then
    sleep "$REMAINING"
  else
    sleep "$CHECK_INTERVAL"
  fi
done

# ─── Final Report ─────────────────────────────────────────────────
log ""
log "╔════════════════════════════════════════════════════════════════╗"
log "║ 72-Hour Stability Test — COMPLETE ║"
log "╚════════════════════════════════════════════════════════════════╝"
log ""
log "Duration: ${DURATION_HOURS}h"
log "Total checks: $TOTAL_CHECKS"
log "Failed checks: $FAILED_CHECKS"
log "Total failures: $TOTAL_FAILURES"
log "Max consecutive failures: $MAX_CONSECUTIVE"
log ""

# Uptime is the share of checks that passed completely. It is based on
# FAILED_CHECKS, not TOTAL_FAILURES, so it always stays within 0-100%.
UPTIME_PCT=0
if [ "$TOTAL_CHECKS" -gt 0 ]; then
  PASSED=$((TOTAL_CHECKS - FAILED_CHECKS))
  UPTIME_PCT=$(LC_ALL=C awk -v p="$PASSED" -v t="$TOTAL_CHECKS" \
    'BEGIN { printf "%.1f", p / t * 100 }' 2>/dev/null || echo "?")
fi
log "Uptime: ${UPTIME_PCT}%"

if [ "$TOTAL_FAILURES" -eq 0 ]; then
  log "RESULT: PASS — Zero failures over ${DURATION_HOURS}h"
  exit 0
else
  log "RESULT: FAIL — $TOTAL_FAILURES failures detected"
  log "See failure details: $FAIL_LOG"
  exit 1
fi
|