#!/bin/bash
# generate-stability-report.sh — Compile stability report from monitoring data
# Run after 30-day soak test period
# Usage: ./scripts/generate-stability-report.sh [TARGET_IP]
#
# The script prints a banner locally, then runs one small shell snippet per
# report section on the target node over SSH and relays whatever the snippet
# prints. Sections are independent: a failure in one never aborts the others,
# which is why `set -e` is deliberately not enabled.

set -u

TARGET="${1:-192.168.1.228}"
SSH_KEY="${HOME}/.ssh/archipelago-deploy"
# Kept as an array so a key path containing spaces survives word-splitting.
# NOTE(review): StrictHostKeyChecking=no disables host-key verification;
# consider `accept-new` if the installed OpenSSH supports it.
SSH_OPTS=(-i "${SSH_KEY}" -o StrictHostKeyChecking=no -o ConnectTimeout=10)

#######################################
# Run a shell snippet on the target node and relay its stdout.
# Globals:   SSH_OPTS (read), TARGET (read)
# Arguments: none — the snippet is read from stdin. Callers use a quoted
#            here-doc so that nothing in the snippet is expanded locally.
# Outputs:   the snippet's stdout, plus a one-line notice when ssh itself
#            failed (ssh reserves exit status 255 for its own errors).
# Returns:   0 always, so an unreachable node never aborts the report.
#######################################
run_remote() {
  local snippet rc=0
  snippet=$(cat)
  # stderr (ssh warnings and remote diagnostics) is intentionally discarded to
  # keep the report clean; connection failures are surfaced via the exit code.
  ssh "${SSH_OPTS[@]}" "archipelago@${TARGET}" "${snippet}" \
    </dev/null 2>/dev/null || rc=$?
  if (( rc == 255 )); then
    echo " (unable to reach ${TARGET} over SSH)"
  fi
  return 0
}

echo "╔════════════════════════════════════════════════════════════════╗"
echo "║ Archipelago Stability Report ║"
echo "╚════════════════════════════════════════════════════════════════╝"
echo ""
echo "Node: ${TARGET}"
echo "Generated: $(date -u +%Y-%m-%dT%H:%M:%SZ)"
echo ""

# Uptime metrics
echo "═══ Uptime Metrics ═══"
run_remote <<'REMOTE'
CSV=/var/lib/archipelago/uptime-monitor/metrics.csv
if [ -f "$CSV" ]; then
  # Line 1 is skipped as the header row.
  TOTAL=$(tail -n +2 "$CSV" | wc -l)
  # grep -c prints the count itself and exits 1 when it is zero, so only the
  # exit status is neutralised; appending another "0" would corrupt the value.
  OK=$(grep -c ',200,' "$CSV" 2>/dev/null || true)
  OK=${OK:-0}
  if [ "$TOTAL" -gt 0 ]; then
    PCT=$(python3 -c "print(round($OK / $TOTAL * 100, 3))" 2>/dev/null || echo '?')
    echo " Total checks: $TOTAL"
    echo " Healthy: $OK"
    echo " Uptime: ${PCT}%"
    FIRST=$(head -2 "$CSV" | tail -1 | cut -d, -f1)
    LAST=$(tail -1 "$CSV" | cut -d, -f1)
    echo " Period: $FIRST to $LAST"
  fi
else
  echo ' No uptime data found'
fi
REMOTE
echo ""

# Reboot test results
echo "═══ Daily Reboot Tests ═══"
run_remote <<'REMOTE'
CSV=/var/lib/archipelago/monitoring/reboot-test.csv
if [ -f "$CSV" ]; then
  # See the note on grep -c exit status: "|| true", never "|| echo 0".
  REBOOTS=$(grep -c ',reboot,' "$CSV" 2>/dev/null || true)
  VERIFIED=$(grep -c ',verify,' "$CSV" 2>/dev/null || true)
  OK=$(grep -c ',verify,.*,OK,' "$CSV" 2>/dev/null || true)
  if [ "${VERIFIED:-0}" -gt 0 ]; then
    # Mean of column 7 over the verify rows, truncated to whole seconds.
    AVG=$(awk -F, '/,verify,/ { sum += $7; n++ }
                   END { if (n > 0) print int(sum / n); else print 0 }' "$CSV")
    echo " Total reboots: ${REBOOTS:-0}"
    echo " Verified recoveries: $VERIFIED"
    echo " Successful: ${OK:-0}"
    echo " Avg recovery time: ${AVG}s"
  fi
else
  echo ' No reboot test data (starts at 4 AM daily)'
fi
REMOTE
echo ""

# Federation sync
echo "═══ Federation Sync ═══"
run_remote <<'REMOTE'
CSV=/var/lib/archipelago/monitoring/sync-check.csv
if [ -f "$CSV" ]; then
  TOTAL=$(tail -n +2 "$CSV" | wc -l)
  # Skip line 1 exactly as TOTAL does, and force a numeric comparison: a
  # non-numeric field (such as a header label) compares as a string in awk
  # and would otherwise be counted as a successful sync.
  OK=$(awk -F, 'NR > 1 && $2 + 0 > 0 { n++ } END { print n + 0 }' "$CSV")
  if [ "$TOTAL" -gt 0 ]; then
    PCT=$(python3 -c "print(round($OK / $TOTAL * 100, 1))" 2>/dev/null || echo '?')
    echo " Total syncs: $TOTAL"
    echo " Successful: $OK"
    echo " Success rate: ${PCT}%"
  fi
else
  echo ' No sync data yet'
fi
REMOTE
echo ""

# Memory trend (columns 7 and 8 of the uptime metrics, first vs. last row)
echo "═══ Memory Trend ═══"
run_remote <<'REMOTE'
CSV=/var/lib/archipelago/uptime-monitor/metrics.csv
if [ -f "$CSV" ]; then
  # Nothing is printed when the file holds only its first (header) line, so
  # the header is never formatted as if it were a reading.
  awk -F, '
    NR == 2 { first  = sprintf(" %s: %s/%s MB used", $1, $7, $8) }
    NR > 1  { latest = sprintf(" %s: %s/%s MB used", $1, $7, $8) }
    END {
      if (NR > 1) {
        print " First reading:";  print first
        print " Latest reading:"; print latest
      }
    }
  ' "$CSV"
fi
REMOTE
echo ""

# Disk trend (columns 9 and 10 of the uptime metrics, first vs. last row)
echo "═══ Disk Trend ═══"
run_remote <<'REMOTE'
CSV=/var/lib/archipelago/uptime-monitor/metrics.csv
if [ -f "$CSV" ]; then
  awk -F, '
    NR == 2 { first  = sprintf(" %s: %s/%s GB", $1, $9, $10) }
    NR > 1  { latest = sprintf(" %s: %s/%s GB", $1, $9, $10) }
    END {
      if (NR > 1) {
        print " First reading:";  print first
        print " Latest reading:"; print latest
      }
    }
  ' "$CSV"
fi
REMOTE
echo ""

# Container health
echo "═══ Container Health ═══"
run_remote <<'REMOTE'
# Prefer podman; fall back to docker when podman is not on PATH.
DOCKER=podman
command -v podman >/dev/null 2>&1 || DOCKER=docker
RUNNING=$(sudo "$DOCKER" ps --format '{{.Names}}' 2>/dev/null | wc -l)
EXITED=$(sudo "$DOCKER" ps -a --filter status=exited --format '{{.Names}}' 2>/dev/null | wc -l)
echo " Running: $RUNNING"
echo " Exited: $EXITED"
if [ "$EXITED" -gt 0 ]; then
  echo ' Exited containers:'
  sudo "$DOCKER" ps -a --filter status=exited --format ' {{.Names}}: {{.Status}}' 2>/dev/null
fi
REMOTE
echo ""

# OOM kills
echo "═══ OOM Kills ═══"
run_remote <<'REMOTE'
# Count err-level "... out of memory: ..." kernel lines, one per kill.
# NOTE(review): the earlier pattern 'oom-kill' appears to match only lines
# logged below err level on current kernels, so under --level=err,crit it
# would always report 0 — confirm against the target node's kernel.
OOM=$(sudo dmesg --level=err,crit 2>/dev/null | grep -ci 'out of memory' || true)
echo " OOM kills since boot: ${OOM:-0}"
REMOTE
echo ""
echo "═══ Report Complete ═══"