#!/usr/bin/env bash
# federation-health-check.sh — Track federation and DWN sync state
#
# Runs every 5 minutes via cron. Records federation peer counts (total /
# online / offline) and DWN sync status to CSV, then refreshes a JSON summary.
#
# Install: */5 * * * * /opt/archipelago/scripts/federation-health-check.sh
#
# Output: /var/lib/archipelago/federation-health/
#   - checks.csv: timestamp, peer_count, peers_online, peers_offline,
#                 dwn_sync_status, dwn_messages_synced, federation_ok, error
#   - summary.json: aggregate stats
#
# Environment (optional):
#   FED_HEALTH_PASSWORD  password used for auth.login
#
# Requires: curl, python3, awk.

set -uo pipefail

LOG_DIR="/var/lib/archipelago/federation-health"
LOG_FILE="$LOG_DIR/checks.csv"
RPC_URL="http://localhost:5678/rpc/v1"
CSV_HEADER="timestamp,peer_count,peers_online,peers_offline,dwn_sync_status,dwn_messages_synced,federation_ok,error"
# A peer counts as online when its last_seen is less than this many seconds old.
ONLINE_WINDOW_SECS=600
# Synthetic response substituted when curl itself fails (timeout, refused, ...).
RPC_FALLBACK='{"result":null,"error":{"message":"RPC timeout"}}'

COOKIE_JAR=""   # per-run cookie jar, created by main()
CSRF=""         # CSRF token read from the cookie jar after login

# Remove the per-run cookie jar (installed as the EXIT trap).
cleanup() {
  if [[ -n "$COOKIE_JAR" ]]; then
    rm -f -- "$COOKIE_JAR"
  fi
}

#######################################
# Render one value as a single CSV field.
# CR/LF become spaces so a record never spans lines; fields containing a
# comma or a double quote are wrapped in quotes with inner quotes doubled.
# Arguments: $1 - raw value
# Outputs:   the encoded field on stdout (no trailing newline)
#######################################
csv_field() {
  local value=$1
  value=${value//$'\r'/ }
  value=${value//$'\n'/ }
  if [[ "$value" == *[,\"]* ]]; then
    value=${value//\"/\"\"}
    printf '"%s"' "$value"
  else
    printf '%s' "$value"
  fi
}

#######################################
# Authenticate against the RPC endpoint and load the CSRF token.
# Globals:   RPC_URL, COOKIE_JAR (read); CSRF (written)
# Returns:   0 always. A failed login is not fatal here; the RPC calls that
#            follow report the resulting error.
#######################################
login() {
  local password payload
  # SECURITY: "password123" is the legacy hard-coded credential, retained only
  # so existing installs keep working. Set FED_HEALTH_PASSWORD instead.
  # TODO: remove the built-in default.
  password=${FED_HEALTH_PASSWORD:-password123}

  # Build the JSON with a real encoder so any password is escaped correctly.
  # It is handed to python through the environment and to curl on stdin, which
  # keeps it out of both process argument lists.
  payload=$(FED_HEALTH_LOGIN_PW="$password" python3 -c '
import json
import os

print(json.dumps(
    {"method": "auth.login",
     "params": {"password": os.environ["FED_HEALTH_LOGIN_PW"]}},
    separators=(",", ":")))
' 2>/dev/null) || payload=""

  curl -s --connect-timeout 5 --max-time 30 -c "$COOKIE_JAR" "$RPC_URL" \
    -H "Content-Type: application/json" \
    --data @- <<<"$payload" >/dev/null 2>&1 || true

  # The token is the last field of the first cookie-jar line naming csrf_token.
  CSRF=$(awk '/csrf_token/ { print $NF; exit }' "$COOKIE_JAR" 2>/dev/null) \
    || CSRF=""
}

# POST a ready-made JSON body ($1) to the RPC endpoint; on any curl failure
# print RPC_FALLBACK instead so callers always receive a JSON document.
_rpc_post() {
  curl -s --connect-timeout 5 --max-time 30 -b "$COOKIE_JAR" \
    -H "Content-Type: application/json" \
    -H "X-CSRF-Token: $CSRF" \
    -X POST "$RPC_URL" \
    -d "$1" 2>/dev/null \
    || printf '%s\n' "$RPC_FALLBACK"
}

# Call RPC method $1 without parameters; prints the raw response.
rpc() {
  _rpc_post "{\"method\":\"$1\"}"
}

# Call RPC method $1 with params $2 (must already be valid JSON).
rpc_params() {
  _rpc_post "{\"method\":\"$1\",\"params\":$2}"
}

#######################################
# Summarise a federation.list-nodes response read from stdin.
# Globals:   ONLINE_WINDOW_SECS (read)
# Outputs:   one tab-separated line:
#              ok<TAB>peer_count<TAB>peers_online
#              err<TAB>message
# Returns:   non-zero only if python3 itself cannot run.
#######################################
parse_federation() {
  python3 -c '
import datetime
import json
import sys

window = float(sys.argv[1])


def clean(text):
    # Collapse all whitespace so the message stays on one tab-free line.
    return " ".join(str(text).split())


try:
    doc = json.load(sys.stdin)
except ValueError:
    print("err\tinvalid JSON in RPC response")
    sys.exit(0)
if not isinstance(doc, dict):
    print("err\tunexpected RPC response shape")
    sys.exit(0)

error = doc.get("error")
if error:
    message = error.get("message", "") if isinstance(error, dict) else error
    print("err\t" + (clean(message) or "unknown RPC error"))
    sys.exit(0)

result = doc.get("result")
nodes = result.get("nodes") if isinstance(result, dict) else None
if not isinstance(nodes, list):
    nodes = []

now = datetime.datetime.now(datetime.timezone.utc)
online = 0
for node in nodes:
    last_seen = node.get("last_seen") if isinstance(node, dict) else None
    if not last_seen:
        continue
    try:
        seen = datetime.datetime.fromisoformat(last_seen.replace("Z", "+00:00"))
        if (now - seen).total_seconds() < window:
            online += 1
    except Exception:
        # Unparseable or timezone-less timestamps count as offline.
        continue

print("ok\t%d\t%d" % (len(nodes), online))
' "$ONLINE_WINDOW_SECS" 2>/dev/null
}

#######################################
# Extract sync_status and messages_synced from a dwn.status response on stdin.
# Outputs:   status<TAB>messages_synced; falls back to "unknown" and 0 when
#            the response has no usable result object.
#######################################
parse_dwn() {
  python3 -c '
import json
import sys

try:
    doc = json.load(sys.stdin)
except ValueError:
    doc = {}
result = doc.get("result") if isinstance(doc, dict) else None
if not isinstance(result, dict):
    result = {}
status = " ".join(str(result.get("sync_status", "unknown")).split()) or "unknown"
print("%s\t%s" % (status, result.get("messages_synced", 0)))
' 2>/dev/null
}

#######################################
# Aggregate the data rows of a checks CSV.
# Arguments: $1 - path to the CSV (first line is the header)
# Outputs:   "total ok all_online ok_pct all_online_pct first_timestamp"
#            where ok = rows with federation_ok == true, and all_online =
#            successful rows whose peers_offline is 0 (failed checks carry
#            placeholder zeros and must not count as "all peers online").
# Returns:   non-zero when the file has no data rows.
#######################################
summarize_csv() {
  LC_ALL=C awk -F, '
    NR == 2 { start = $1 }
    NR > 1 {
      total++
      if ($7 == "true") {
        ok++
        if ($4 == 0) all_online++
      }
    }
    END {
      if (total == 0) exit 1
      printf "%d %d %d %.2f %.2f %s\n", total, ok, all_online,
        ok / total * 100, all_online / total * 100, start
    }
  ' "$1"
}

#######################################
# Regenerate summary.json from the CSV plus the current run's peer counts.
# Arguments: $1 - timestamp of this check
#            $2 - current peer count
#            $3 - current peers online
# Globals:   LOG_DIR, LOG_FILE (read)
#######################################
write_summary() {
  local timestamp=$1 peer_count=$2 peers_online=$3
  local stats total ok all_online ok_pct all_pct start tmp

  # No data rows yet: nothing to summarise.
  stats=$(summarize_csv "$LOG_FILE") || return 0
  read -r total ok all_online ok_pct all_pct start <<<"$stats"
  # Keep the JSON string well-formed even if the CSV was hand-edited.
  start=${start//[\"\\]/}

  # Write to a sibling file and rename so readers never see a partial summary.
  tmp="$LOG_DIR/summary.json.tmp.$$"
  {
    printf '{\n'
    printf '  "start": "%s",\n' "$start"
    printf '  "last_check": "%s",\n' "$timestamp"
    printf '  "total_checks": %s,\n' "$total"
    printf '  "federation_ok_count": %s,\n' "$ok"
    printf '  "federation_success_rate": %s,\n' "$ok_pct"
    printf '  "all_peers_online_count": %s,\n' "$all_online"
    printf '  "all_peers_online_rate": %s,\n' "$all_pct"
    printf '  "current_peer_count": %s,\n' "$peer_count"
    printf '  "current_peers_online": %s\n' "$peers_online"
    printf '}\n'
  } > "$tmp" || { rm -f -- "$tmp"; return 1; }
  mv -f -- "$tmp" "$LOG_DIR/summary.json"
}

# Run one health check: log in, query federation + DWN, append a CSV row,
# refresh the summary.
main() {
  local timestamp fed_resp fed_summary fed_state fed_a fed_b fed_error=""
  local peer_count=0 peers_online=0 peers_offline
  local dwn_resp dwn_fields dwn_status dwn_synced

  if ! mkdir -p -- "$LOG_DIR"; then
    printf 'federation-health-check: cannot create %s\n' "$LOG_DIR" >&2
    return 1
  fi

  # Write the CSV header if the file is missing or empty.
  if [[ ! -s "$LOG_FILE" ]]; then
    printf '%s\n' "$CSV_HEADER" > "$LOG_FILE" || return 1
  fi

  timestamp=$(date -u +"%Y-%m-%dT%H:%M:%SZ")

  # Session cookies live in an unpredictable per-run file removed on exit.
  if ! COOKIE_JAR=$(mktemp "${TMPDIR:-/tmp}/fed-health-cookies.XXXXXX"); then
    printf 'federation-health-check: mktemp failed\n' >&2
    return 1
  fi
  trap cleanup EXIT

  # The federation RPCs need an authenticated session.
  login

  # Get federation node list.
  fed_resp=$(rpc "federation.list-nodes")
  fed_summary=$(printf '%s' "$fed_resp" | parse_federation) || fed_summary=""
  IFS=$'\t' read -r fed_state fed_a fed_b <<<"$fed_summary" || true

  case "$fed_state" in
    ok)
      if [[ "$fed_a" =~ ^[0-9]+$ && "$fed_b" =~ ^[0-9]+$ ]]; then
        peer_count=$fed_a
        peers_online=$fed_b
      else
        fed_error="unparseable federation.list-nodes response"
      fi
      ;;
    err)
      fed_error=${fed_a:-unknown RPC error}
      ;;
    *)
      fed_error="unparseable federation.list-nodes response"
      ;;
  esac

  if [[ -n "$fed_error" ]]; then
    printf '%s,0,0,0,error,0,false,%s\n' \
      "$timestamp" "$(csv_field "$fed_error")" >> "$LOG_FILE"
  else
    peers_offline=$((peer_count - peers_online))

    # Get DWN sync status.
    dwn_resp=$(rpc "dwn.status")
    dwn_fields=$(printf '%s' "$dwn_resp" | parse_dwn) || dwn_fields=""
    IFS=$'\t' read -r dwn_status dwn_synced <<<"$dwn_fields" || true
    dwn_status=${dwn_status:-unknown}
    # This column sits before the ones the summary parses by position, so it
    # must never contain a separator or a quote.
    dwn_status=${dwn_status//[,\"]/_}
    [[ "$dwn_synced" =~ ^-?[0-9]+$ ]] || dwn_synced=0

    printf '%s,%s,%s,%s,%s,%s,true,\n' \
      "$timestamp" "$peer_count" "$peers_online" "$peers_offline" \
      "$dwn_status" "$dwn_synced" >> "$LOG_FILE"
  fi

  # Generate summary report.
  write_summary "$timestamp" "$peer_count" "$peers_online"
}

main