#!/bin/bash
#
# Container Doctor — diagnose and fix common container health issues
#
# Usage:
#   sudo ./scripts/container-doctor.sh            # Run locally on node
#   ./scripts/container-doctor.sh user@host       # Run remotely via SSH
#
# Fixes:
#   1. Stale podman ps/stats processes (>10 = pileup)
#   2. Orphaned conmon/crun processes holding ports
#   3. Container tor conflicting with system tor (system tor is preferred)
#   4. Tor hidden service directory permissions (must be 700)
#   5. SearXNG read-only root / cap-drop ALL
#   6. Bitcoin Knots prune+txindex conflict
#   7. Containers stuck with exit code 127 (binary not found)
#
# Safe to run multiple times (idempotent). Never blocks deploy (exit 0 always).
#

# Deliberately no `set -e` / `set -u`: every fix is best-effort and the script
# must keep going (and finally exit 0) whatever an individual command returns.
set -o pipefail

# Source pinned image versions (single source of truth)
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
if [[ -f "$SCRIPT_DIR/image-versions.sh" ]]; then
  # shellcheck disable=SC1091 — optional file shipped next to this script
  . "$SCRIPT_DIR/image-versions.sh"
fi

FIXES_APPLIED=0
CHECKS_PASSED=0
FIX_NAMES=()

# Print a timestamped, prefixed message on stdout.
log() {
  printf '[%s] DOCTOR: %s\n' "$(date +%H:%M:%S)" "$*"
}

#######################################
# Run one fix function and record its outcome.
# Globals:   FIXES_APPLIED, CHECKS_PASSED, FIX_NAMES (written)
# Arguments: $1 - short name used in the summary; $2.. - command to run
# The command returns 0 when it changed something, non-zero when there was
# nothing to do.
#######################################
run_fix() {
  local name="$1"
  shift
  if "$@"; then
    FIXES_APPLIED=$((FIXES_APPLIED + 1))
    FIX_NAMES+=("$name")
  else
    CHECKS_PASSED=$((CHECKS_PASSED + 1))
  fi
}

# Succeed when a container named $1 exists in any state. A podman error
# (exit 125) or a missing podman binary counts as "does not exist".
has_container() {
  podman container exists "$1" 2>/dev/null
}

# Print the pid of the first process listening on TCP port $1 (nothing if the
# port is free).
port_holder_pid() {
  ss -tlnp 2>/dev/null | grep ":${1} " | grep -oP 'pid=\K\d+' | head -n 1
}

# Stop process $1: SIGTERM first, SIGKILL only if it is still alive ~1s later.
terminate_pid() {
  local pid="$1" attempt
  [[ "$pid" =~ ^[0-9]+$ ]] || return 0
  # A failing kill means the process is already gone or is not ours to signal.
  kill "$pid" 2>/dev/null || return 0
  for attempt in 1 2 3 4 5; do
    kill -0 "$pid" 2>/dev/null || return 0
    sleep 0.2
  done
  kill -9 "$pid" 2>/dev/null || true
  sleep 0.5
}

# Terminate whatever is still listening on TCP port $1 (no-op if it is free).
free_port() {
  local holder
  holder=$(port_holder_pid "$1" || true)
  if [[ -n "$holder" ]]; then
    terminate_pid "$holder"
  fi
}

# ── Fix 1: Stale podman processes ────────────────────────────
# Kills `podman ps` / `podman stats` processes when more than 10 have piled up.
# Returns 0 if processes were killed, 1 if the count was within limits.
fix_stale_podman() {
  local pattern='podman (ps|stats)'
  local count after

  count=$(pgrep -f "$pattern" 2>/dev/null | wc -l)
  count=${count:-0}
  if ((count <= 10)); then
    return 1
  fi

  log "Killing $count stale podman ps/stats processes"
  pkill -f "$pattern" 2>/dev/null || true
  sleep 2
  after=$(pgrep -f "$pattern" 2>/dev/null | wc -l)
  after=${after:-0}
  log "Reduced from $count to $after"
  return 0
}

# ── Fix 2: Orphaned conmon holding ports ─────────────────────
# Terminates conmon processes whose container podman no longer knows about.
# Returns 0 if at least one conmon was terminated, 1 otherwise.
fix_orphaned_conmon() {
  local -a pids=()
  local pid cid ports rc
  local fixed=1

  # Only look at conmon processes of the current user: podman run by this
  # user cannot see other users' containers, so theirs would all look orphaned.
  mapfile -t pids < <(pgrep -u "$EUID" -f 'conmon.*--exit-command' 2>/dev/null)
  if ((${#pids[@]} == 0)); then
    return 1
  fi

  for pid in "${pids[@]}"; do
    [[ -r "/proc/${pid}/cmdline" ]] || continue

    # Extract the 64-hex container ID that follows "-c " in the conmon args.
    cid=$(tr '\0' ' ' <"/proc/${pid}/cmdline" 2>/dev/null \
      | grep -oP '(?<=-c )[a-f0-9]{64}' \
      | head -n 1 || true)
    [[ -n "$cid" ]] || continue

    # Exit status 1 is podman's explicit "no such container". Any other
    # failure (e.g. 125, podman itself erroring) must not get a live
    # container's conmon killed.
    rc=0
    podman container exists "$cid" &>/dev/null || rc=$?
    ((rc == 1)) || continue

    # Up to three listening ports owned by this pid, for the log line only.
    ports=$(ss -tlnp 2>/dev/null \
      | awk -v tag="pid=${pid}," 'index($0, tag) { n = split($4, a, ":"); print a[n] }' \
      | head -n 3 | paste -sd, - || true)
    log "Killing orphaned conmon pid=$pid (ports: ${ports:-none})"
    terminate_pid "$pid"
    fixed=0
  done

  return "$fixed"
}

# ── Fix 3: Ensure system Tor is running (preferred over container) ──
# Removes the archy-tor container if present and makes sure the system
# tor@default service is enabled and started.
# Returns 0 if the container was removed or a service start was attempted,
# 1 if nothing had to be done.
fix_system_tor_conflict() {
  local changed=1

  # System Tor is preferred over container Tor.
  if has_container archy-tor; then
    podman stop archy-tor &>/dev/null || true
    if podman rm -f archy-tor &>/dev/null; then
      log "Removed archy-tor container (system Tor is preferred)"
      changed=0
    else
      log "WARNING: could not remove archy-tor container"
    fi
  fi

  # Ensure system Tor is enabled and running
  if command -v tor >/dev/null 2>&1 \
    && ! systemctl is-active --quiet tor@default 2>/dev/null; then
    systemctl enable tor tor@default 2>/dev/null || true
    systemctl start tor tor@default 2>/dev/null || true
    if systemctl is-active --quiet tor@default 2>/dev/null; then
      log "Started system Tor"
    else
      log "WARNING: system Tor did not start"
    fi
    changed=0
  fi

  return "$changed"
}

# ── Fix 4: Tor hidden service permissions ────────────────────
# Sets mode 700 on every hidden_service_* directory directly below the known
# Tor data directories and restarts tor@default if anything was changed.
# Returns 0 if at least one directory was fixed, 1 otherwise.
fix_tor_permissions() {
  local -a tor_dirs=("/var/lib/archipelago/tor" "/var/lib/tor")
  local base dir mode
  local fixed=1

  for base in "${tor_dirs[@]}"; do
    [[ -d "$base" ]] || continue
    while IFS= read -r -d '' dir; do
      mode=$(stat -c '%a' -- "$dir" 2>/dev/null) || continue
      [[ "$mode" == "700" ]] && continue
      if chmod 700 -- "$dir" 2>/dev/null; then
        log "Fixed permissions on $dir ($mode -> 700)"
        fixed=0
      else
        log "WARNING: could not fix permissions on $dir ($mode)"
      fi
    done < <(find "$base" -maxdepth 1 -name 'hidden_service_*' -type d -print0 2>/dev/null)
  done

  if ((fixed != 0)); then
    return 1
  fi

  # Restart system Tor to pick up the changes
  systemctl restart tor@default 2>/dev/null || true
  return 0
}

# ── Fix 5: SearXNG read-only / cap-drop ─────────────────────
# Recreates the searxng container when it has exited, has a read-only root
# filesystem, or drops ALL capabilities.
# Globals:   SEARXNG_IMAGE (read, optional — set by image-versions.sh)
# Returns 0 if a recreation was attempted, 1 if nothing was done.
fix_searxng() {
  has_container searxng || return 1

  local state readonly_root cap_drop
  state=$(podman inspect searxng --format '{{.State.Status}}' 2>/dev/null || true)
  readonly_root=$(podman inspect searxng --format '{{.HostConfig.ReadonlyRootfs}}' 2>/dev/null || true)
  cap_drop=$(podman inspect searxng --format '{{.HostConfig.CapDrop}}' 2>/dev/null || true)

  # Fix if: exited, or has read-only root, or has cap-drop ALL
  if [[ "$state" != "exited" && "$readonly_root" != "true" \
    && "$cap_drop" != *ALL* && "$cap_drop" != *all* ]]; then
    return 1
  fi

  # Resolve the image BEFORE touching the container: image-versions.sh is
  # optional (and absent on hosts that only received this script), and
  # removing the container without an image to recreate it from would leave
  # the node without SearXNG.
  local image="${SEARXNG_IMAGE:-}"
  if [[ -z "$image" ]]; then
    image=$(podman inspect searxng --format '{{.ImageName}}' 2>/dev/null || true)
  fi
  if [[ -z "$image" ]]; then
    log "SearXNG needs recreating but no image is known — leaving container untouched"
    return 1
  fi

  log "Recreating SearXNG (readonly=$readonly_root, cap_drop=$cap_drop, state=$state)"

  # Get current port mapping ("8080/tcp=8888" -> 8888); default to 8888.
  local binding host_port
  binding=$(podman inspect searxng \
    --format '{{range $k,$v := .HostConfig.PortBindings}}{{$k}}={{range $v}}{{.HostPort}}{{end}}{{println}}{{end}}' \
    2>/dev/null | head -n 1 || true)
  host_port="${binding##*=}"
  [[ "$host_port" =~ ^[0-9]+$ ]] || host_port=8888

  podman stop searxng &>/dev/null || true
  podman rm -f searxng &>/dev/null || true

  # Anything still listening now that the container is gone is stale
  # (typically a leftover conmon) and would block the new port binding.
  free_port "$host_port"

  if podman run -d \
    --name searxng \
    --restart=unless-stopped \
    --security-opt=no-new-privileges:true \
    --tmpfs /tmp:rw,noexec,nosuid,size=256m \
    -v searxng-config:/etc/searxng:rw \
    -v searxng-cache:/var/cache/searxng:rw \
    -p "${host_port}:8080" \
    --memory=512m \
    "$image" 2>&1; then
    log "SearXNG recreated (no readonly, no cap-drop ALL)"
  else
    log "WARNING: failed to recreate SearXNG from image $image"
  fi
  return 0
}

# ── Fix 6: Bitcoin Knots prune+txindex conflict ──────────────
# When bitcoin.conf enables pruning but the bitcoin-knots container was
# created with a txindex argument, recreates the container without arguments.
# Returns 0 if a recreation was attempted, 1 if nothing was done.
fix_bitcoin_txindex() {
  has_container bitcoin-knots || return 1

  local data_dir="/var/lib/archipelago/bitcoin"
  local conf="${data_dir}/bitcoin.conf"

  # Check if bitcoin.conf has prune enabled. prune=0 explicitly DISABLES
  # pruning and is compatible with txindex, so it must not trigger the fix.
  if [[ ! -f "$conf" ]] || ! grep -Eq '^prune=0*[1-9]' "$conf"; then
    return 1
  fi

  # Check if container args include txindex
  local cmd
  cmd=$(podman inspect bitcoin-knots --format '{{json .Config.Cmd}}' 2>/dev/null || true)
  if [[ "$cmd" != *txindex* ]]; then
    return 1
  fi

  log "Bitcoin Knots: prune+txindex conflict detected"

  # Get current config
  local image network
  image=$(podman inspect bitcoin-knots --format '{{.ImageName}}' 2>/dev/null || true)
  network=$(podman inspect bitcoin-knots --format '{{.HostConfig.NetworkMode}}' 2>/dev/null || true)
  if [[ -z "$image" ]]; then
    # Without the image name the container could be removed but not recreated.
    log "Bitcoin Knots: cannot determine image — leaving container untouched"
    return 1
  fi

  # Read per-installation RPC password
  local secrets_dir="/var/lib/archipelago/secrets"
  local rpc_pass="archipelago"
  if [[ -s "$secrets_dir/bitcoin-rpc-password" ]]; then
    rpc_pass=$(<"$secrets_dir/bitcoin-rpc-password")
  fi

  # Ensure bitcoin.conf has the RPC credentials the health check relies on.
  # NOTE(review): in the reviewed copy of this file the body of the
  # here-document that rewrote bitcoin.conf at this point was missing, so the
  # full list of settings could not be recovered. Only the credentials visibly
  # used by the health check below are appended, and the file is never
  # truncated (that would also drop the prune= setting). Restore the complete
  # settings list from version control if more is required.
  if ! grep -q 'rpcuser=' "$conf"; then
    if ! {
      printf '\n'
      printf '%s\n' 'rpcuser=archipelago' "rpcpassword=${rpc_pass}"
    } >>"$conf"; then
      log "WARNING: could not add RPC settings to $conf"
    fi
  fi

  # NOTE(review): the step that emptied the txindex directory was lost with
  # the here-document as well. Only the visible rmdir is kept, which removes
  # the directory when it is already empty — confirm whether its contents
  # should be deleted too.
  local txindex_dir="${data_dir}/indexes/txindex"
  if [[ -d "$txindex_dir" ]] && rmdir -- "$txindex_dir" 2>/dev/null; then
    log "Removed stale txindex directory"
  fi

  # Recreate without txindex
  podman stop bitcoin-knots &>/dev/null || true
  podman rm -f bitcoin-knots &>/dev/null || true
  sleep 2

  # Kill stale conmon on port 8332/8333
  local p
  for p in 8332 8333; do
    free_port "$p"
  done
  sleep 1

  # Keep a custom network or host networking; anything else (empty or the
  # default bridge) is moved to archy-net.
  local -a net_args
  if [[ -n "$network" && "$network" != "bridge" ]]; then
    net_args=("--network=$network")
  else
    net_args=("--network=archy-net")
  fi

  if podman run -d \
    --name bitcoin-knots \
    --restart=always \
    "${net_args[@]}" \
    -p 8332:8332 \
    -p 8333:8333 \
    -v "${data_dir}:/home/bitcoin/.bitcoin" \
    --memory=2g \
    --cap-drop=ALL \
    --cap-add=CHOWN \
    --cap-add=FOWNER \
    --cap-add=SETUID \
    --cap-add=SETGID \
    --cap-add=DAC_OVERRIDE \
    --security-opt=no-new-privileges:true \
    --health-cmd="bitcoin-cli -rpcuser=archipelago -rpcpassword=$rpc_pass getblockchaininfo || exit 1" \
    --health-interval=30s \
    --health-retries=3 \
    "$image" 2>&1; then
    log "Bitcoin Knots recreated without txindex (prune mode)"
  else
    log "WARNING: failed to recreate Bitcoin Knots from image $image"
  fi
  return 0
}

# ── Fix 7: Exit code 127 containers ─────────────────────────
# Removes every container that exited with code 127 (except searxng, which
# has its own fix) and re-runs it from its recorded create command.
# Returns 0 if at least one container was handled, 1 otherwise.
fix_exit_127() {
  local -a names=() fixed_names=() run_args=()
  local name create_cmd arg

  mapfile -t names < <(podman ps -a --format '{{.Names}} {{.Status}}' 2>/dev/null \
    | awk '/Exited \(127\)/ { print $1 }')
  if ((${#names[@]} == 0)); then
    return 1
  fi

  for name in "${names[@]}"; do
    # Skip containers handled by other fixes
    [[ "$name" == "searxng" ]] && continue

    log "Container $name has exit code 127 — recreating"

    # The create command must be read before the container is removed.
    create_cmd=$(podman inspect "$name" --format '{{json .Config.CreateCommand}}' 2>/dev/null || true)
    podman rm -f "$name" &>/dev/null || true

    # Decode the JSON argv into a bash array, NUL-delimited, so every argument
    # is passed on verbatim. No string re-quoting and no eval: arguments
    # containing quotes, `$` or `;` can neither break nor inject commands.
    run_args=()
    if [[ -n "$create_cmd" && "$create_cmd" != "null" ]]; then
      while IFS= read -r -d '' arg; do
        run_args+=("$arg")
      done < <(printf '%s' "$create_cmd" | python3 -c '
import json, sys
args = json.load(sys.stdin)
# Skip "podman" and "run", output the rest
sys.stdout.write("".join(a + "\0" for a in args[2:]))
' 2>/dev/null)
    fi

    if ((${#run_args[@]} > 0)) && podman run "${run_args[@]}" 2>&1; then
      fixed_names+=("$name")
      log "Recreated $name from original args"
    else
      fixed_names+=("$name(removed)")
      log "Removed $name — will be recreated on next deploy"
    fi
  done

  if ((${#fixed_names[@]} > 0)); then
    return 0
  fi
  return 1
}

# ── Remote mode ──────────────────────────────────────────────
# Copies this script (and image-versions.sh, when present) into a private
# temporary directory on host $1 and runs it there with sudo.
# Globals:   ARCHIPELAGO_SSH_KEY (read, optional), SCRIPT_DIR (read)
# Never fails the caller: problems are logged and the function returns.
run_remote() {
  local host="$1"
  local ssh_key="${ARCHIPELAGO_SSH_KEY:-$HOME/.ssh/archipelago-deploy}"
  # NOTE(review): StrictHostKeyChecking=no disables host-key verification;
  # kept from the original behaviour.
  local -a ssh_opts=(
    -o StrictHostKeyChecking=no
    -o ServerAliveInterval=15
    -o ServerAliveCountMax=4
    -i "$ssh_key"
  )
  local dir_re='^/tmp/container-doctor\.[A-Za-z0-9]{6}$'
  local remote_dir

  if [[ "$host" == -* ]]; then
    log "Refusing remote host '$host' (looks like an option)"
    return 0
  fi

  log "Running container doctor on $host"

  # A private mktemp directory instead of a fixed /tmp path: a file planted
  # under a predictable name by another user must never be run through sudo.
  remote_dir=$(ssh "${ssh_opts[@]}" "$host" 'mktemp -d /tmp/container-doctor.XXXXXX' 2>/dev/null \
    | tail -n 1)
  if [[ ! "$remote_dir" =~ $dir_re ]]; then
    log "Could not create a temporary directory on $host — skipping"
    return 0
  fi

  # Copy script to remote and execute — but only if the copy succeeded.
  if scp "${ssh_opts[@]}" "${BASH_SOURCE[0]}" "$host:$remote_dir/container-doctor.sh" >/dev/null 2>&1; then
    # The remote copy sources image-versions.sh from its own directory.
    if [[ -f "$SCRIPT_DIR/image-versions.sh" ]]; then
      scp "${ssh_opts[@]}" "$SCRIPT_DIR/image-versions.sh" \
        "$host:$remote_dir/image-versions.sh" >/dev/null 2>&1 \
        || log "WARNING: could not copy image-versions.sh to $host"
    fi
    ssh "${ssh_opts[@]}" "$host" \
      "sudo bash '$remote_dir/container-doctor.sh' --local; rm -rf -- '$remote_dir'" 2>&1
  else
    log "Could not copy container doctor to $host — skipping"
    ssh "${ssh_opts[@]}" "$host" "rm -rf -- '$remote_dir'" >/dev/null 2>&1 || true
  fi
  return 0
}

# ── Main ─────────────────────────────────────────────────────
# Arguments: $1 - optional: remote host (user@host), or --local
main() {
  local target="${1:-}"

  # If remote host provided, run via SSH
  if [[ -n "$target" && "$target" != "--local" ]]; then
    run_remote "$target"
    exit 0
  fi

  # Running locally (on the node itself)
  log "Starting container health check"

  run_fix "stale-podman" fix_stale_podman
  run_fix "orphaned-conmon" fix_orphaned_conmon
  run_fix "system-tor" fix_system_tor_conflict
  run_fix "tor-permissions" fix_tor_permissions
  run_fix "searxng" fix_searxng
  run_fix "bitcoin-txindex" fix_bitcoin_txindex
  run_fix "exit-127" fix_exit_127

  echo ""
  if ((FIXES_APPLIED > 0)); then
    log "Done: $FIXES_APPLIED fixes applied (${FIX_NAMES[*]}), $CHECKS_PASSED checks passed"
  else
    log "Done: all $CHECKS_PASSED checks passed — no fixes needed"
  fi
  exit 0
}

main "$@"