archy/scripts/test-stability-72h.sh
Dorian f07ce10b1a refactor: update dependencies and remove unused code
- Added new dependencies: `adler2`, `crc32fast`, `flate2`, `miniz_oxide`, and `libredox`.
- Updated existing dependencies: `tokio-rustls` to version 0.26.4 and `filetime` to version 0.2.27.
- Removed the `backup.rs` file as it is no longer needed.
- Introduced tests for configuration and credential management.
- Enhanced the `identity` module to generate W3C compliant DID documents.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-12 00:19:30 +00:00

223 lines
8.6 KiB
Bash
Executable File

#!/usr/bin/env bash
# FINAL-202: 72-Hour Stability Test
# Monitors a running Archipelago node for 72 hours, checking health every 5 minutes.
# Usage: bash test-stability-72h.sh <node-ip> [password] [duration-hours]
#   node-ip         host or IP of the node          (default: 192.168.1.228)
#   password        password sent to auth.login     (default: password123)
#   duration-hours  whole number of hours to run    (default: 72)
# Logs results to /tmp/stability-test-<timestamp>.log
set -euo pipefail

NODE="${1:-192.168.1.228}"
BASE="http://${NODE}"
PASS="${2:-password123}"
DURATION_HOURS="${3:-72}"

# DURATION_HOURS feeds shell arithmetic below; reject anything that is not a
# plain non-negative integer so the failure is an explicit usage error.
if ! [[ "$DURATION_HOURS" =~ ^[0-9]+$ ]]; then
  echo "error: duration-hours must be a non-negative whole number, got '${DURATION_HOURS}'" >&2
  exit 2
fi
# Force base 10 so a value such as "08" is not parsed as invalid octal.
DURATION_HOURS=$((10#$DURATION_HOURS))

CHECK_INTERVAL=300 # 5 minutes

# The cookie jar holds the session cookie written by curl. A private,
# per-run file keeps concurrent runs from overwriting each other's session
# and avoids a predictable name in a shared temp directory.
COOKIE_JAR=$(mktemp "${TMPDIR:-/tmp}/stability-cookies.XXXXXX")
trap 'rm -f -- "$COOKIE_JAR"' EXIT

TIMESTAMP=$(date +%Y%m%d-%H%M%S)
LOG_FILE="/tmp/stability-test-${TIMESTAMP}.log"
FAIL_LOG="/tmp/stability-failures-${TIMESTAMP}.log"

# Counters updated by the main loop.
TOTAL_CHECKS=0
TOTAL_FAILURES=0
CONSECUTIVE_FAILURES=0
MAX_CONSECUTIVE=0

# Test window, in seconds since the epoch.
START_TIME=$(date +%s)
END_TIME=$((START_TIME + DURATION_HOURS * 3600))
# Writes a timestamped message to stdout and appends the same line to LOG_FILE.
# Globals:   LOG_FILE (read)
# Arguments: the message words, joined with spaces
log() {
  local stamp
  stamp=$(date '+%Y-%m-%d %H:%M:%S')
  printf '%s\n' "[${stamp}] $*" | tee -a "$LOG_FILE"
}
# Appends a timestamped "FAIL:" line to both LOG_FILE and FAIL_LOG.
# Nothing is printed to stdout: tee's output is redirected into FAIL_LOG.
# Globals:   LOG_FILE (read), FAIL_LOG (read)
# Arguments: the failure description words, joined with spaces
fail_log() {
  local stamp
  stamp=$(date '+%Y-%m-%d %H:%M:%S')
  printf '%s\n' "[${stamp}] FAIL: $*" | tee -a "$LOG_FILE" >> "$FAIL_LOG"
}
# Sends an auth.login JSON-RPC request and stores the returned cookies in
# COOKIE_JAR. The response body and curl diagnostics are discarded.
# Globals:   PASS (read), BASE (read), COOKIE_JAR (read; file written by curl)
# Returns:   curl's exit status. A zero status only means the HTTP exchange
#            completed; the response is not inspected here.
login() {
  # Escape backslashes and double quotes so a password containing them still
  # yields valid JSON. Control characters are not escaped.
  local bs='\' dq='"'
  local escaped_pass=$PASS
  escaped_pass=${escaped_pass//"$bs"/"$bs$bs"}
  escaped_pass=${escaped_pass//"$dq"/"$bs$dq"}

  # -m 10 matches the timeout used by rpc so an unresponsive node cannot
  # stall the monitor indefinitely.
  curl -s -m 10 -c "$COOKIE_JAR" -H "Content-Type: application/json" \
    -d "{\"jsonrpc\":\"2.0\",\"id\":1,\"method\":\"auth.login\",\"params\":{\"password\":\"${escaped_pass}\"}}" \
    "${BASE}/rpc/" > /dev/null 2>&1
}
# Sends one JSON-RPC 2.0 request to the node and prints the raw response.
# Globals:   BASE (read), COOKIE_JAR (read; curl also rewrites it)
# Arguments: $1 - method name
#            $2 - params as a JSON value (optional, defaults to {})
# Outputs:   response body on stdout; curl diagnostics are discarded
# Returns:   curl's exit status
rpc() {
  local method="$1"
  # A brace default cannot be written inline as ${2:-{}}: bash ends the
  # expansion at the first closing brace, so a caller-supplied $2 would get a
  # stray "}" appended and the request body would be invalid JSON.
  local params="${2:-}"
  if [ -z "$params" ]; then
    params='{}'
  fi

  curl -s -m 10 -b "$COOKIE_JAR" -c "$COOKIE_JAR" \
    -H "Content-Type: application/json" \
    -d "{\"jsonrpc\":\"2.0\",\"id\":1,\"method\":\"${method}\",\"params\":${params}}" \
    "${BASE}/rpc/" 2>/dev/null
}
# Runs one command on the node over SSH with the options shared by all checks.
# Globals:   NODE (read)
# Arguments: $1 - path to the private key
#            $2 - command line handed to the remote shell
# Outputs:   the remote command's stdout; stderr is discarded
# Returns:   ssh's exit status
_node_ssh() {
  ssh -i "$1" -o ConnectTimeout=10 -o StrictHostKeyChecking=no \
    "archipelago@${NODE}" "$2" 2>/dev/null
}

# Prints the HTTP status code curl reports for a URL.
# Arguments: $1 - timeout in seconds
#            $2 - URL
#            $3... - extra curl arguments (optional)
# Outputs:   the status code, or 000 when curl produced nothing
_http_code() {
  local timeout="$1" url="$2" code
  shift 2
  # curl already writes 000 through -w when the transfer fails, so its exit
  # status is ignored here; echoing a fallback as well would produce "000000".
  code=$(curl -s -o /dev/null -w "%{http_code}" -m "$timeout" "$@" "$url" 2>/dev/null) || true
  printf '%s' "${code:-000}"
}

# Runs every health probe once, recording each problem with fail_log and
# writing informational values with log.
# Globals:   BASE, NODE, HOME (read)
# Returns:   the number of failed probes (0 when everything passed)
check_health() {
  local failures=0

  # 1. Backend health
  local health_code
  health_code=$(_http_code 10 "${BASE}/health")
  if [ "$health_code" != "200" ]; then
    fail_log "Backend health endpoint returned $health_code"
    failures=$((failures + 1))
  fi

  # 2. UI loads (a redirect is accepted as well)
  local ui_code
  ui_code=$(_http_code 10 "${BASE}/")
  if [ "$ui_code" != "200" ] && [ "$ui_code" != "302" ]; then
    fail_log "Web UI returned $ui_code"
    failures=$((failures + 1))
  fi

  # 3. RPC responds
  local rpc_resp
  rpc_resp=$(rpc "server.echo" '{"message":"stability-check"}' 2>/dev/null) || true
  if ! grep -q '"result"' <<<"$rpc_resp"; then
    # The session may have expired: log in again and retry once.
    login || true
    rpc_resp=$(rpc "server.echo" '{"message":"stability-check"}' 2>/dev/null) || true
    if ! grep -q '"result"' <<<"$rpc_resp"; then
      fail_log "RPC server.echo failed after re-login"
      failures=$((failures + 1))
    fi
  fi

  # 4. WebSocket endpoint: any HTTP status counts as reachable, only the
  #    "no response" code 000 is a failure.
  local ws_code
  ws_code=$(_http_code 5 "${BASE}/ws/" -H "Upgrade: websocket")
  if [ "$ws_code" = "000" ]; then
    fail_log "WebSocket endpoint unreachable"
    failures=$((failures + 1))
  fi

  # 5. Check containers via SSH (only when the deploy key is present)
  local ssh_key="$HOME/.ssh/archipelago-deploy"
  if [ -f "$ssh_key" ]; then
    local crashed
    crashed=$(_node_ssh "$ssh_key" \
      'sudo podman ps -a --format "{{.Names}} {{.Status}}" 2>/dev/null | grep -i "exited\|dead\|oom" | head -5' || true)
    if [ -n "$crashed" ]; then
      fail_log "Crashed/dead containers: $crashed"
      failures=$((failures + 1))
    fi

    # 6. Check memory usage (informational only)
    local mem_info
    mem_info=$(_node_ssh "$ssh_key" \
      'free -m | grep Mem | awk "{printf \"%d/%dMB (%.0f%%)\", \$3, \$2, \$3/\$2*100}"' || echo "unknown")
    log " Memory: $mem_info"

    # 7. Check disk usage (informational only)
    local disk_info
    disk_info=$(_node_ssh "$ssh_key" \
      'df -h / | tail -1 | awk "{print \$3\"/\"\$2\" (\"\$5\" used)\"}"' || echo "unknown")
    log " Disk: $disk_info"

    # 8. Check for OOM kills.
    # grep -c prints "0" AND exits non-zero when nothing matches, so the
    # remote side uses "|| true"; echoing a second 0 would yield "0\n0" and a
    # false failure on every check.
    # NOTE(review): dmesg is not filtered by time, so this counts every match
    # still in the kernel log, not only those since the test started.
    local oom_count
    oom_count=$(_node_ssh "$ssh_key" \
      'dmesg 2>/dev/null | grep -c "Out of memory" || true') || oom_count=""
    if ! [[ "$oom_count" =~ ^[0-9]+$ ]]; then
      oom_count="unknown"
    fi
    if [ "$oom_count" != "0" ] && [ "$oom_count" != "unknown" ]; then
      fail_log "OOM kills detected: $oom_count"
      failures=$((failures + 1))
    fi

    # 9. Check archipelago service.
    # is-active prints the state itself even when it exits non-zero, so no
    # fallback text is added remotely; "unknown" covers an SSH failure or an
    # empty answer.
    local svc_status
    svc_status=$(_node_ssh "$ssh_key" \
      'systemctl is-active archipelago 2>/dev/null || true') || svc_status=""
    svc_status=${svc_status:-unknown}
    if [ "$svc_status" != "active" ]; then
      fail_log "Archipelago service status: $svc_status"
      failures=$((failures + 1))
    fi
  fi

  # 10. Check Tor services (informational only)
  local tor_resp
  tor_resp=$(rpc "tor.list-services" 2>/dev/null) || true
  if grep -q '"result"' <<<"$tor_resp"; then
    local tor_count
    tor_count=$(python3 -c "import sys,json; r=json.load(sys.stdin); print(len(r.get('result',{}).get('services',[])))" <<<"$tor_resp" 2>/dev/null || echo "0")
    log " Tor services: $tor_count"
  fi

  # 11. Check peer connections (informational only)
  local peers_resp
  peers_resp=$(rpc "network.list-peers" 2>/dev/null) || true
  if grep -q '"result"' <<<"$peers_resp"; then
    local peer_count
    peer_count=$(python3 -c "import sys,json; r=json.load(sys.stdin); print(len(r.get('result',{}).get('peers',[])))" <<<"$peers_resp" 2>/dev/null || echo "0")
    log " Connected peers: $peer_count"
  fi

  # 12. Ecash wallet balance check (informational only)
  local ecash_resp
  ecash_resp=$(rpc "wallet.ecash-balance" 2>/dev/null) || true
  if grep -q '"result"' <<<"$ecash_resp"; then
    local balance
    balance=$(python3 -c "import sys,json; r=json.load(sys.stdin); print(r.get('result',{}).get('balance',0))" <<<"$ecash_resp" 2>/dev/null || echo "0")
    log " Ecash balance: $balance sats"
  fi

  return "$failures"
}
# ─── Main Loop ────────────────────────────────────────────────────
log "╔════════════════════════════════════════════════════════════════╗"
log "║ 72-Hour Stability Test — Archipelago ║"
log "╚════════════════════════════════════════════════════════════════╝"
log "Target: $NODE"
# "date -r <epoch>" is tried first and "date -d @<epoch>" second, so whichever
# form the local date accepts is used; "unknown" is printed if both fail.
log "Duration: ${DURATION_HOURS}h (until $(date -r "$END_TIME" '+%Y-%m-%d %H:%M:%S' 2>/dev/null || date -d "@$END_TIME" '+%Y-%m-%d %H:%M:%S' 2>/dev/null || echo 'unknown'))"
log "Check interval: ${CHECK_INTERVAL}s"
log "Log file: $LOG_FILE"
log "Failure log: $FAIL_LOG"
log ""

# Initial login. Under "set -e" a bare failing login would end the script
# without any message; a failed request is logged instead, and check_health
# logs in again whenever its RPC probe gets no result.
if login; then
  log "Authenticated to node"
else
  log "WARNING: initial login request failed — will retry during health checks"
fi

# Number of checks with at least one failure. TOTAL_FAILURES is the sum of
# individual probe failures and can exceed TOTAL_CHECKS, so it cannot be used
# to derive an uptime percentage.
FAILED_CHECKS=0

while [ "$(date +%s)" -lt "$END_TIME" ]; do
  TOTAL_CHECKS=$((TOTAL_CHECKS + 1))
  ELAPSED_S=$(($(date +%s) - START_TIME))
  ELAPSED_H=$((ELAPSED_S / 3600))
  ELAPSED_M=$(((ELAPSED_S % 3600) / 60))
  log "Check #${TOTAL_CHECKS} (${ELAPSED_H}h${ELAPSED_M}m elapsed)"

  if check_health; then
    CONSECUTIVE_FAILURES=0
    log " Status: OK"
  else
    # In the else branch $? still holds check_health's return value, which is
    # the number of probes that failed.
    FAIL_RESULT=$?
    TOTAL_FAILURES=$((TOTAL_FAILURES + FAIL_RESULT))
    FAILED_CHECKS=$((FAILED_CHECKS + 1))
    CONSECUTIVE_FAILURES=$((CONSECUTIVE_FAILURES + 1))
    if [ "$CONSECUTIVE_FAILURES" -gt "$MAX_CONSECUTIVE" ]; then
      MAX_CONSECUTIVE=$CONSECUTIVE_FAILURES
    fi
    log " Status: $FAIL_RESULT failure(s) this check"
    if [ "$CONSECUTIVE_FAILURES" -ge 5 ]; then
      log "WARNING: 5 consecutive check failures — node may be down!"
    fi
  fi

  # Never sleep past the end of the test window.
  REMAINING=$((END_TIME - $(date +%s)))
  if [ "$REMAINING" -le 0 ]; then
    break
  fi
  if [ "$REMAINING" -lt "$CHECK_INTERVAL" ]; then
    sleep "$REMAINING"
  else
    sleep "$CHECK_INTERVAL"
  fi
done

# ─── Final Report ─────────────────────────────────────────────────
log ""
log "╔════════════════════════════════════════════════════════════════╗"
log "║ 72-Hour Stability Test — COMPLETE ║"
log "╚════════════════════════════════════════════════════════════════╝"
log ""
log "Duration: ${DURATION_HOURS}h"
log "Total checks: $TOTAL_CHECKS"
log "Failed checks: $FAILED_CHECKS"
log "Total failures: $TOTAL_FAILURES"
log "Max consecutive failures: $MAX_CONSECUTIVE"
log ""

# Uptime is the share of checks in which every probe passed.
UPTIME_PCT=0
if [ "$TOTAL_CHECKS" -gt 0 ]; then
  PASSED=$((TOTAL_CHECKS - FAILED_CHECKS))
  # Values are passed as arguments rather than spliced into the Python source.
  UPTIME_PCT=$(python3 -c 'import sys; p, t = int(sys.argv[1]), int(sys.argv[2]); print(f"{p / t * 100:.1f}")' "$PASSED" "$TOTAL_CHECKS" 2>/dev/null || echo "?")
fi
log "Uptime: ${UPTIME_PCT}%"

if [ "$TOTAL_FAILURES" -eq 0 ]; then
  log "RESULT: PASS — Zero failures over ${DURATION_HOURS}h"
  exit 0
else
  log "RESULT: FAIL — $TOTAL_FAILURES failures detected"
  log "See failure details: $FAIL_LOG"
  exit 1
fi