#!/bin/bash
# daily-reboot-test.sh — Automated daily reboot with recovery verification
# Run via cron: 0 4 * * * /opt/archipelago/scripts/daily-reboot-test.sh
#
# 1. Records pre-reboot state
# 2. Reboots the node
# 3. After reboot, systemd runs this again via a oneshot service
#    that verifies recovery and logs the result
#
# The phase is selected by the state file: when it exists this run verifies
# recovery and removes it; otherwise this run records state and reboots.
#
# Logs to /var/lib/archipelago/monitoring/reboot-test.csv

set -u

LOG_DIR="/var/lib/archipelago/monitoring"
LOG_FILE="${LOG_DIR}/reboot-test.csv"
STATE_FILE="${LOG_DIR}/reboot-test-state.json"
SUMMARY_FILE="${LOG_DIR}/reboot-test-summary.json"
HEALTH_URL="http://localhost:5678/health"

# Prefer podman; fall back to docker when podman is not installed.
DOCKER=podman
command -v podman >/dev/null 2>&1 || DOCKER=docker

# Print the number of currently running containers.
count_running() {
  sudo "$DOCKER" ps --format '{{.Names}}' 2>/dev/null | wc -l | tr -d '[:space:]'
}

# Print the number of containers whose state is "exited".
count_exited() {
  # grep -c prints "0" itself and exits 1 when nothing matches, so the
  # status is swallowed instead of being answered with a second "0".
  sudo "$DOCKER" ps -a --format '{{.State}}' 2>/dev/null | grep -ci exited || true
}

# Print how many lines of the CSV log match the regex in $1.
count_matches() {
  local n
  # Only fall back to 0 when grep printed nothing (e.g. unreadable file).
  n=$(grep -c -- "$1" "$LOG_FILE" 2>/dev/null) || n=${n:-0}
  printf '%s\n' "$n"
}

if ! mkdir -p "$LOG_DIR"; then
  echo "daily-reboot-test: cannot create ${LOG_DIR}" >&2
  exit 1
fi

# Create CSV header if needed
if [[ ! -f "$LOG_FILE" ]]; then
  echo "timestamp,phase,containers_pre,containers_post,exited,health,recovery_secs" > "$LOG_FILE"
fi

TIMESTAMP=$(date -u +%Y-%m-%dT%H:%M:%SZ)

# Check if we're in the verification phase (state file exists from pre-reboot)
if [[ -f "$STATE_FILE" ]]; then
  # POST-REBOOT VERIFICATION
  # The path is passed as an argument rather than pasted into Python source.
  PRE_COUNT=$(python3 -c 'import json, sys; print(json.load(open(sys.argv[1])).get("containers", 0))' \
    "$STATE_FILE" 2>/dev/null || echo 0)
  [[ "$PRE_COUNT" =~ ^[0-9]+$ ]] || PRE_COUNT=0

  # Wait for backend health (max 5 min)
  HEALTH="fail"
  START_WAIT=$(date +%s)
  for ((attempt = 1; attempt <= 60; attempt++)); do
    sleep 5
    # Assign the fallback outside the substitution so a partial body from a
    # failed request is replaced by "fail" instead of being prefixed to it.
    HEALTH=$(curl -s --max-time 5 "$HEALTH_URL" 2>/dev/null) || HEALTH="fail"
    if [[ "$HEALTH" == "OK" ]]; then
      break
    fi
  done
  WAIT_SECS=$(( $(date +%s) - START_WAIT ))

  # The response body lands in a CSV field and a JSON string: drop the
  # characters that would break either format and cap the length.
  HEALTH=$(printf '%s' "$HEALTH" | tr -d '\r\n,"\\' | cut -c1-64)

  # Wait another 60s for containers to stabilize
  sleep 60

  # Count containers
  POST_COUNT=$(count_running)
  EXITED=$(count_exited)
  RECOVERY_SECS=$((WAIT_SECS + 60))

  echo "${TIMESTAMP},verify,${PRE_COUNT},${POST_COUNT},${EXITED},${HEALTH},${RECOVERY_SECS}" >> "$LOG_FILE"

  # Clean up state file
  rm -f -- "$STATE_FILE"

  # Update summary
  TOTAL=$(count_matches ",verify,")
  OK=$(count_matches ",verify,.*,OK,")

  cat > "$SUMMARY_FILE" << EOF
{
  "total_reboots": ${TOTAL},
  "successful": ${OK},
  "last_test": "${TIMESTAMP}",
  "last_recovery_secs": ${RECOVERY_SECS},
  "last_health": "${HEALTH}",
  "last_containers": "${POST_COUNT}/${PRE_COUNT}"
}
EOF
else
  # PRE-REBOOT: Record state and schedule reboot
  CONTAINERS=$(count_running)

  # Save state for post-reboot verification. Without this file the next run
  # would take the pre-reboot branch again, so never reboot if it cannot be
  # written.
  if ! printf '{"timestamp": "%s", "containers": %s}\n' \
    "$TIMESTAMP" "$CONTAINERS" > "$STATE_FILE"; then
    echo "daily-reboot-test: cannot write ${STATE_FILE}; not rebooting" >&2
    exit 1
  fi

  echo "${TIMESTAMP},reboot,${CONTAINERS},,,," >> "$LOG_FILE"

  # Reboot in 30 seconds (allows cron to finish cleanly)
  (sleep 30 && sudo reboot) &
fi