SOAK-03: daily-reboot-test.sh deployed on both nodes via cron (4 AM). Systemd oneshot verifies recovery on boot, logs to reboot-test.csv. SOAK-04: generate-stability-report.sh compiles metrics from uptime-monitor, reboot-test, sync-check CSVs. Initial .228 report: 99.847% uptime, 0 OOM kills, 32/32 containers. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
87 lines
2.9 KiB
Bash
Executable File
87 lines
2.9 KiB
Bash
Executable File
#!/bin/bash
#
# daily-reboot-test.sh — scheduled reboot drill with recovery verification
#
# Cron entry: 0 4 * * * /opt/archipelago/scripts/daily-reboot-test.sh
#
# Flow:
#   1. Snapshot the pre-reboot state
#   2. Reboot the node
#   3. On the next boot, systemd invokes this script again through a oneshot
#      service; that run verifies recovery and logs the result
#
# Results are appended to /var/lib/archipelago/monitoring/reboot-test.csv

# Locations and endpoint shared by both phases of the script.
LOG_DIR='/var/lib/archipelago/monitoring'
LOG_FILE="$LOG_DIR/reboot-test.csv"
STATE_FILE="$LOG_DIR/reboot-test-state.json"
HEALTH_URL='http://localhost:5678/health'

mkdir -p "$LOG_DIR"

# Seed the CSV with its column header the first time the log is created.
[ -f "$LOG_FILE" ] ||
  printf '%s\n' 'timestamp,phase,containers_pre,containers_post,exited,health,recovery_secs' > "$LOG_FILE"

# UTC timestamp (ISO 8601, second precision) stamped on rows written by this run.
TIMESTAMP=$(date -u '+%Y-%m-%dT%H:%M:%SZ')

# Phase selection: a state file left behind by the pre-reboot pass means this
# invocation is the post-boot verification pass; otherwise this is the
# pre-reboot pass.
#
# Reads:  LOG_DIR, LOG_FILE, STATE_FILE, HEALTH_URL, TIMESTAMP (set earlier)
# Writes: one row to LOG_FILE per pass; reboot-test-summary.json after a
#         verification; STATE_FILE before a reboot (removed after verification)
if [ -f "$STATE_FILE" ]; then
  # ---- POST-REBOOT VERIFICATION ----

  # Container count recorded before the reboot. The path is handed to Python
  # as an argument instead of being spliced into the source text, so quoting
  # characters in the path cannot break the snippet. Any failure (unreadable
  # file, invalid JSON, non-integer value) falls back to 0.
  PRE_COUNT=$(python3 -c 'import json, sys; print(json.load(open(sys.argv[1])).get("containers", 0))' "$STATE_FILE" 2>/dev/null || echo 0)
  [[ "$PRE_COUNT" =~ ^[0-9]+$ ]] || PRE_COUNT=0

  # Poll the backend health endpoint: up to 60 attempts, each preceded by a
  # 5 s sleep and bounded by a 5 s curl timeout (roughly 5-10 minutes total).
  HEALTH="fail"
  START_WAIT=$(date +%s)
  for ((attempt = 1; attempt <= 60; attempt++)); do
    sleep 5
    # Treat a failed request as "fail" outright instead of appending "fail"
    # to whatever partial body curl may already have printed.
    if ! HEALTH=$(curl -s --max-time 5 "$HEALTH_URL" 2>/dev/null); then
      HEALTH="fail"
    fi
    if [ "$HEALTH" = "OK" ]; then
      break
    fi
  done
  WAIT_SECS=$(( $(date +%s) - START_WAIT ))

  # Anything other than an exact "OK" is recorded as "fail". A raw response
  # body may be empty, multi-line, or contain commas/quotes, any of which
  # would corrupt the CSV row and the JSON summary written below.
  [ "$HEALTH" = "OK" ] || HEALTH="fail"

  # Give containers another 60 s to settle before counting them.
  sleep 60

  # Prefer podman; fall back to docker when podman is not installed.
  DOCKER=podman
  command -v podman >/dev/null 2>&1 || DOCKER=docker

  POST_COUNT=$(sudo "$DOCKER" ps --format '{{.Names}}' 2>/dev/null | wc -l)

  # grep -c already prints "0" (and exits 1) when nothing matches, so adding
  # "|| echo 0" would yield "0<newline>0" and split the CSV row in two.
  # Only default the value when grep produced no output at all.
  EXITED=$(sudo "$DOCKER" ps -a --format '{{.State}}' 2>/dev/null | grep -ci exited)
  EXITED=${EXITED:-0}

  # Health-wait time plus the fixed 60 s stabilization delay.
  RECOVERY_SECS=$((WAIT_SECS + 60))

  echo "${TIMESTAMP},verify,${PRE_COUNT},${POST_COUNT},${EXITED},${HEALTH},${RECOVERY_SECS}" >> "$LOG_FILE"

  # The state file has served its purpose; removing it makes the next
  # invocation a pre-reboot pass again.
  rm -f "$STATE_FILE"

  # Refresh the summary: total verification rows and how many reported OK.
  TOTAL=$(grep -c ",verify," "$LOG_FILE" 2>/dev/null)
  TOTAL=${TOTAL:-0}
  OK=$(grep -c ",verify,.*,OK," "$LOG_FILE" 2>/dev/null)
  OK=${OK:-0}

  cat > "${LOG_DIR}/reboot-test-summary.json" << EOF
{
  "total_reboots": ${TOTAL},
  "successful": ${OK},
  "last_test": "${TIMESTAMP}",
  "last_recovery_secs": ${RECOVERY_SECS},
  "last_health": "${HEALTH}",
  "last_containers": "${POST_COUNT}/${PRE_COUNT}"
}
EOF
else
  # ---- PRE-REBOOT: record state and schedule the reboot ----

  # Prefer podman; fall back to docker when podman is not installed.
  DOCKER=podman
  command -v podman >/dev/null 2>&1 || DOCKER=docker

  CONTAINERS=$(sudo "$DOCKER" ps --format '{{.Names}}' 2>/dev/null | wc -l)

  # Save state for the post-reboot verification. If it cannot be written the
  # post-boot run would have nothing to verify, so do not reboot at all.
  if ! printf '{"timestamp": "%s", "containers": %s}\n' "$TIMESTAMP" "$CONTAINERS" > "$STATE_FILE"; then
    echo "daily-reboot-test: cannot write ${STATE_FILE}; skipping reboot" >&2
    exit 1
  fi

  echo "${TIMESTAMP},reboot,${CONTAINERS},,,," >> "$LOG_FILE"

  # Reboot in 30 seconds from a background subshell so the cron job itself
  # can finish cleanly first.
  (sleep 30 && sudo reboot) &
fi