Critical bug: the doctor runs as root but containers are rootless under the archipelago user. When checking if a conmon process has an associated container, the root podman database was queried (empty), causing ALL conmon processes to be identified as orphaned and killed. This terminated running containers every 30 minutes. Fix: use sudo -u archipelago to query the rootless podman database. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
427 lines
14 KiB
Bash
Executable File
427 lines
14 KiB
Bash
Executable File
#!/bin/bash
|
|
#
|
|
# Container Doctor — diagnose and fix common container health issues
|
|
#
|
|
# Usage:
|
|
# sudo ./scripts/container-doctor.sh # Run locally on node
|
|
# ./scripts/container-doctor.sh user@host # Run remotely via SSH
|
|
#
|
|
# Fixes:
|
|
# 1. Stale podman ps/stats processes (>10 = pileup)
|
|
# 2. Orphaned conmon/crun processes holding ports
|
|
# 3. Container Tor (archy-tor) conflicting with system Tor — container is removed, system Tor is started
|
|
# 4. Tor hidden service directory permissions (must be 700)
|
|
# 5. SearXNG read-only root / cap-drop ALL
|
|
# 6. Bitcoin Knots prune+txindex conflict
|
|
# 7. Containers stuck with exit code 127 (binary not found)
|
|
# 8. Stopped core containers (rootless restart policy workaround)
|
|
#
|
|
# Safe to run multiple times (idempotent). Never blocks deploy (exit 0 always).
|
|
#
|
|
|
|
# A pipeline fails if any of its stages fails, not only the last one.
set -o pipefail

# Pinned image versions live next to this script (single source of truth);
# they are loaded only when that file is actually present.
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
if [ -f "$SCRIPT_DIR/image-versions.sh" ]; then
  . "$SCRIPT_DIR/image-versions.sh"
fi

# Run-wide tallies, updated by run_fix as each check completes.
FIX_NAMES=()
CHECKS_PASSED=0
FIXES_APPLIED=0
|
|
|
|
# Print a message on stdout prefixed with the current HH:MM:SS time and a
# "DOCTOR:" tag. All arguments are joined into one message.
log() {
  local stamp
  stamp=$(date +%H:%M:%S)
  echo "[$stamp] DOCTOR: $*"
}
|
|
|
|
# Run one doctor check and tally its outcome.
#   $1   - short label appended to FIX_NAMES when the check reports a fix
#   $2.. - command to run (normally a fix_* function) plus its arguments
# A zero exit status from the command means "a fix was applied"; any other
# status is counted in CHECKS_PASSED.
run_fix() {
  local label="$1"
  shift

  if ! "$@"; then
    CHECKS_PASSED=$((CHECKS_PASSED + 1))
    return
  fi

  FIXES_APPLIED=$((FIXES_APPLIED + 1))
  FIX_NAMES+=("$label")
}
|
|
|
|
# ── Fix 1: Stale podman processes ────────────────────────────
|
|
# Fix 1: clear an accumulation of "podman ps" / "podman stats" processes.
# Returns 0 when more than 10 matching processes were found and signalled
# with pkill, 1 when the count was at or below that threshold.
fix_stale_podman() {
  local pattern="podman (ps|stats)"
  local before remaining

  before=$(pgrep -f "$pattern" 2>/dev/null | wc -l)
  before=${before:-0}

  # Nothing to clean up unless the count exceeds the pile-up threshold.
  if ! [ "$before" -gt 10 ]; then
    return 1
  fi

  log "Killing $before stale podman ps/stats processes"
  pkill -f "$pattern" 2>/dev/null || true

  # Give the signalled processes a moment to exit before recounting.
  sleep 2
  remaining=$(pgrep -f "$pattern" 2>/dev/null | wc -l)
  remaining=${remaining:-0}
  log "Reduced from $before to $remaining"
  return 0
}
|
|
|
|
# ── Fix 2: Orphaned conmon holding ports ─────────────────────
|
|
# Print the 64-hex container ID a conmon process was started for (the value
# of its "-c" argument), or nothing when it cannot be read.
#   $1 - conmon PID
_doctor_conmon_cid() {
  [ -r "/proc/$1/cmdline" ] || return 0
  tr '\0' ' ' < "/proc/$1/cmdline" 2>/dev/null \
    | grep -oP '(?<=-c )[a-f0-9]{64}' \
    | head -n 1
}

# Fix 2: kill conmon processes whose container no longer exists.
#
# The doctor runs as root while the containers are rootless under the
# archipelago user, so container existence is checked in the rootless store
# first and then in root's own store (other fixes in this script create
# containers with plain `podman`).
#
# A conmon is killed ONLY when both stores answer "no such container"
# (`podman container exists` exit status 1). Any other failure — sudo error,
# unknown user, unreachable runtime dir, podman internal error — means "cannot
# tell" and the process is left alone; treating a failed query as "container
# missing" is what used to kill every running container.
#
# Returns 0 if at least one orphaned conmon was signalled, 1 otherwise.
fix_orphaned_conmon() {
  local pids
  pids=$(pgrep -f "conmon.*--exit-command" 2>/dev/null || true)
  if [ -z "$pids" ]; then
    return 1
  fi

  # Resolve the archipelago user's runtime dir from its real UID instead of
  # assuming 1000 (1000 is kept as the fallback).
  local uid
  uid=$(id -u archipelago 2>/dev/null) || uid=1000
  local -a rootless=(sudo -u archipelago "XDG_RUNTIME_DIR=/run/user/${uid}" podman)

  # Safety probe: if the rootless store cannot be queried at all, no verdict
  # about any container is trustworthy — do nothing.
  if ! "${rootless[@]}" ps --quiet >/dev/null 2>&1; then
    log "Cannot query rootless podman as archipelago — skipping orphaned conmon check"
    return 1
  fi

  local fixed=false
  local pid cid rc port_info
  for pid in $pids; do
    cid=$(_doctor_conmon_cid "$pid" || true)
    if [ -z "$cid" ]; then
      continue
    fi

    # Exit status 1 is the only answer that means "definitely not there".
    "${rootless[@]}" container exists "$cid" >/dev/null 2>&1
    rc=$?
    if [ "$rc" -ne 1 ]; then
      continue
    fi

    podman container exists "$cid" >/dev/null 2>&1
    rc=$?
    if [ "$rc" -ne 1 ]; then
      continue
    fi

    # The trailing comma keeps pid=12 from also matching pid=123.
    port_info=$(ss -tlnp 2>/dev/null | grep -F "pid=${pid}," | grep -oP ':\K\d+' | head -3 | tr '\n' ',' | sed 's/,$//')
    log "Killing orphaned conmon pid=$pid (ports: ${port_info:-none})"
    kill "$pid" 2>/dev/null || kill -9 "$pid" 2>/dev/null || true
    fixed=true
  done

  if $fixed; then
    return 0
  fi
  return 1
}
|
|
|
|
# ── Fix 3: Ensure system Tor is running (preferred over container) ──
|
|
# Fix 3: prefer system Tor over the archy-tor container.
#
# Removes the archy-tor container if it exists, then makes sure the
# tor@default systemd unit is enabled and running when a `tor` binary is
# installed.
#
# Returns 0 if anything was changed (container removed and/or Tor started),
# 1 if nothing needed doing or the only attempted action failed.
#
# NOTE(review): the container lookup uses root's podman store (plain
# `podman`), while fix_orphaned_conmon / fix_stopped_core_containers query the
# archipelago user's rootless store — confirm which store owns archy-tor.
fix_system_tor_conflict() {
  local changed=false

  # Capture the listing first: piping podman straight into `grep -q` can
  # fail under pipefail when grep exits early and podman gets SIGPIPE.
  local names
  names=$(podman ps -a --format '{{.Names}}' 2>/dev/null || true)
  if grep -qx 'archy-tor' <<<"$names"; then
    podman stop archy-tor 2>/dev/null || true
    podman rm -f archy-tor 2>/dev/null || true
    log "Removed archy-tor container (system Tor is preferred)"
    changed=true
  fi

  # Ensure system Tor is enabled and running.
  if command -v tor >/dev/null 2>&1 \
    && ! systemctl is-active tor@default >/dev/null 2>&1; then
    systemctl enable tor tor@default 2>/dev/null || true
    if systemctl start tor tor@default 2>/dev/null; then
      log "Started system Tor"
      changed=true
    else
      log "WARNING: failed to start system Tor"
    fi
  fi

  if $changed; then
    return 0
  fi
  return 1
}
|
|
|
|
# ── Fix 4: Tor hidden service permissions ────────────────────
|
|
# Fix 4: force mode 700 on Tor hidden-service directories.
#
# Arguments: optional list of base directories to scan; defaults to
#            /var/lib/archipelago/tor and /var/lib/tor.
# Only direct children named hidden_service_* are touched. When at least one
# directory was actually re-moded, tor@default is restarted so Tor picks up
# the change.
# Returns 0 if any permission was fixed, 1 otherwise.
fix_tor_permissions() {
  local -a tor_dirs=("$@")
  if [ ${#tor_dirs[@]} -eq 0 ]; then
    tor_dirs=("/var/lib/archipelago/tor" "/var/lib/tor")
  fi

  local fixed=false
  local base dir perms
  for base in "${tor_dirs[@]}"; do
    if [ ! -d "$base" ]; then
      continue
    fi
    while IFS= read -r dir; do
      perms=$(stat -c '%a' "$dir" 2>/dev/null)
      if [ "$perms" = "700" ]; then
        continue
      fi
      # Only report (and later restart Tor for) changes that really happened.
      if chmod 700 "$dir" 2>/dev/null; then
        log "Fixed permissions on $dir ($perms -> 700)"
        fixed=true
      else
        log "WARNING: could not set mode 700 on $dir (currently ${perms:-unknown})"
      fi
    done < <(find "$base" -maxdepth 1 -name "hidden_service_*" -type d 2>/dev/null)
  done

  # If we fixed permissions, restart system Tor to pick up the changes.
  if $fixed; then
    systemctl restart tor@default 2>/dev/null || true
    return 0
  fi
  return 1
}
|
|
|
|
# ── Fix 5: SearXNG read-only / cap-drop ─────────────────────
|
|
# Fix 5: recreate the searxng container when it is exited, has a read-only
# root filesystem, or drops ALL capabilities.
#
# The image comes from SEARXNG_IMAGE (normally set by image-versions.sh); if
# that is unset the image name recorded on the existing container is reused.
# When no image can be determined the container is left untouched — removing
# it without being able to start a replacement would only make things worse
# (image-versions.sh is absent when the script is run from /tmp on a remote
# node).
#
# Returns 0 if the container was removed/recreated, 1 if nothing was changed.
#
# NOTE(review): this uses root's podman store (plain `podman`), while
# fix_orphaned_conmon / fix_stopped_core_containers query the archipelago
# user's rootless store — confirm which store owns searxng.
fix_searxng() {
  # Capture the listing first: `podman | grep -q` can fail under pipefail
  # when grep exits early and podman receives SIGPIPE.
  local names
  names=$(podman ps -a --format '{{.Names}}' 2>/dev/null || true)
  if ! grep -qx 'searxng' <<<"$names"; then
    return 1
  fi

  local state readonly_root cap_drop
  state=$(podman inspect searxng --format '{{.State.Status}}' 2>/dev/null || true)
  readonly_root=$(podman inspect searxng --format '{{.HostConfig.ReadonlyRootfs}}' 2>/dev/null || true)
  cap_drop=$(podman inspect searxng --format '{{.HostConfig.CapDrop}}' 2>/dev/null || true)

  # Fix if: exited, or has read-only root, or has cap-drop ALL.
  local needs_fix=false
  if [ "$state" = "exited" ]; then
    needs_fix=true
  fi
  if [ "$readonly_root" = "true" ]; then
    needs_fix=true
  fi
  if [[ "$cap_drop" == *"ALL"* ]] || [[ "$cap_drop" == *"all"* ]]; then
    needs_fix=true
  fi
  if ! $needs_fix; then
    return 1
  fi

  # Resolve the replacement image BEFORE destroying anything.
  local image="${SEARXNG_IMAGE:-}"
  if [ -z "$image" ]; then
    image=$(podman inspect searxng --format '{{.ImageName}}' 2>/dev/null || true)
  fi
  if [ -z "$image" ]; then
    log "WARNING: SearXNG needs recreating but no image is known (SEARXNG_IMAGE unset) — leaving it untouched"
    return 1
  fi

  log "Recreating SearXNG (readonly=$readonly_root, cap_drop=$cap_drop, state=$state)"

  # Get current host port; each output line looks like "8080/tcp=8888".
  local port host_port
  port=$(podman inspect searxng --format '{{range $k,$v := .HostConfig.PortBindings}}{{$k}}={{range $v}}{{.HostPort}}{{end}}{{println}}{{end}}' 2>/dev/null | head -1)
  host_port="${port##*=}"
  if ! [[ "$host_port" =~ ^[0-9]+$ ]]; then
    host_port=8888
  fi

  podman stop searxng 2>/dev/null || true
  podman rm -f searxng 2>/dev/null || true

  # Kill whatever still holds the port now that the container is gone
  # (typically a stale conmon); looked up after removal so a process that
  # exited together with the container is not targeted.
  local holder_pid
  holder_pid=$(ss -tlnp 2>/dev/null | grep ":${host_port} " | grep -oP 'pid=\K\d+' | head -1)
  if [ -n "$holder_pid" ]; then
    kill -9 "$holder_pid" 2>/dev/null || true
    sleep 2
  fi

  if podman run -d \
    --name searxng \
    --restart=unless-stopped \
    --security-opt=no-new-privileges:true \
    --tmpfs /tmp:rw,noexec,nosuid,size=256m \
    -v searxng-config:/etc/searxng:rw \
    -v searxng-cache:/var/cache/searxng:rw \
    -p "${host_port}:8080" \
    --memory=512m \
    "$image" 2>&1; then
    log "SearXNG recreated (no readonly, no cap-drop ALL)"
  else
    log "WARNING: SearXNG was removed but could not be recreated from $image"
  fi
  return 0
}
|
|
|
|
# ── Fix 6: Bitcoin Knots prune+txindex conflict ──────────────
|
|
# Fix 6: resolve the Bitcoin Knots prune + txindex conflict by recreating the
# bitcoin-knots container without the txindex argument.
#
# Arguments: $1 - Bitcoin data directory (optional; defaults to
#                 /var/lib/archipelago/bitcoin)
# Acts only when ALL of these hold:
#   * a bitcoin-knots container exists,
#   * bitcoin.conf enables pruning (prune=<non-zero>; prune=0 means pruning is
#     disabled and is therefore NOT a conflict),
#   * the container's command line mentions txindex,
#   * the container's image name can be read (otherwise it could not be
#     recreated, so it is left untouched).
# Returns 0 if the container was removed/recreated, 1 otherwise.
#
# NOTE(review): this uses root's podman store (plain `podman`), while
# fix_stopped_core_containers manages bitcoin-knots in the archipelago user's
# rootless store — confirm which store owns the container.
fix_bitcoin_txindex() {
  local data_dir="${1:-/var/lib/archipelago/bitcoin}"
  local conf="$data_dir/bitcoin.conf"
  local txindex_dir="$data_dir/indexes/txindex"

  # Capture the listing first: `podman | grep -q` can fail under pipefail
  # when grep exits early and podman receives SIGPIPE.
  local names
  names=$(podman ps -a --format '{{.Names}}' 2>/dev/null || true)
  if ! grep -qx 'bitcoin-knots' <<<"$names"; then
    return 1
  fi

  # Check if bitcoin.conf has pruning enabled (a non-zero prune value).
  if [ ! -f "$conf" ] || ! grep -Eq '^prune=[1-9]' "$conf"; then
    return 1
  fi

  # Check if container args include txindex.
  local cmd
  cmd=$(podman inspect bitcoin-knots --format '{{json .Config.Cmd}}' 2>/dev/null || true)
  if [[ "$cmd" != *txindex* ]]; then
    return 1
  fi

  log "Bitcoin Knots: prune+txindex conflict detected"

  # Get current config; bail out before touching anything if the image is
  # unknown, because the container could not be started again.
  local image network
  image=$(podman inspect bitcoin-knots --format '{{.ImageName}}' 2>/dev/null || true)
  network=$(podman inspect bitcoin-knots --format '{{.HostConfig.NetworkMode}}' 2>/dev/null || true)
  if [ -z "$image" ]; then
    log "WARNING: cannot determine bitcoin-knots image — leaving container untouched"
    return 1
  fi

  # Read per-installation RPC password (falls back to the built-in default).
  local SECRETS_DIR="/var/lib/archipelago/secrets"
  local BTC_RPC_PASS="archipelago"
  if [ -f "$SECRETS_DIR/bitcoin-rpc-password" ]; then
    BTC_RPC_PASS=$(cat "$SECRETS_DIR/bitcoin-rpc-password")
  fi

  # Ensure bitcoin.conf has all RPC settings.
  # NOTE(review): this replaces the whole file (including any custom prune
  # value) when no rpcuser= line is present — confirm that is intended.
  if ! grep -q 'rpcuser=' "$conf"; then
    cat > "$conf" <<BCONF
server=1
prune=550
rpcuser=archipelago
rpcpassword=$BTC_RPC_PASS
rpcbind=0.0.0.0
rpcallowip=127.0.0.1/32
rpcallowip=10.88.0.0/16
rpcport=8332
listen=1
printtoconsole=1
BCONF
    log "Updated bitcoin.conf with full RPC settings"
  fi

  # Remove stale txindex if present.
  if [ -d "$txindex_dir" ]; then
    find "$txindex_dir" -type f -delete 2>/dev/null
    rmdir "$txindex_dir" 2>/dev/null || true
    log "Removed stale txindex directory"
  fi

  # Recreate without txindex.
  podman stop bitcoin-knots 2>/dev/null || true
  podman rm -f bitcoin-knots 2>/dev/null || true
  sleep 2

  # Kill stale conmon on port 8332/8333.
  local p cpid
  for p in 8332 8333; do
    cpid=$(ss -tlnp 2>/dev/null | grep ":${p} " | grep -oP 'pid=\K\d+' | head -1)
    if [ -n "$cpid" ]; then
      kill -9 "$cpid" 2>/dev/null || true
    fi
  done
  sleep 1

  # Keep a custom or host network; an empty or default "bridge" network is
  # replaced by archy-net.
  local net_arg
  case "$network" in
    "" | bridge) net_arg="--network=archy-net" ;;
    *) net_arg="--network=$network" ;;
  esac

  # NOTE(review): the RPC password is embedded in the health command and is
  # therefore visible via `podman inspect` — consider relying on bitcoin.conf.
  if podman run -d \
    --name bitcoin-knots \
    --restart=always \
    "$net_arg" \
    -p 8332:8332 \
    -p 8333:8333 \
    -v "$data_dir:/home/bitcoin/.bitcoin" \
    --memory=2g \
    --cap-drop=ALL \
    --cap-add=CHOWN \
    --cap-add=FOWNER \
    --cap-add=SETUID \
    --cap-add=SETGID \
    --cap-add=DAC_OVERRIDE \
    --security-opt=no-new-privileges:true \
    --health-cmd="bitcoin-cli -rpcuser=archipelago -rpcpassword=$BTC_RPC_PASS getblockchaininfo || exit 1" \
    --health-interval=30s \
    --health-retries=3 \
    "$image" 2>&1; then
    log "Bitcoin Knots recreated without txindex (prune mode)"
  else
    log "WARNING: bitcoin-knots was removed but could not be recreated from $image"
  fi
  return 0
}
|
|
|
|
# ── Fix 7: Exit code 127 containers ─────────────────────────
|
|
# Fix 7: recreate containers that exited with status 127 (binary not found).
#
# For every container whose `podman ps -a` status contains "Exited (127)"
# (searxng is skipped; it has its own fix) the container is removed and, if
# its recorded CreateCommand is available, re-run with the same arguments.
# The arguments are passed to podman as an array — they are never
# re-interpreted by the shell, so values containing spaces, quotes, `$(...)`
# or backticks are preserved literally and cannot execute anything.
# Containers that cannot be recreated stay removed and are reported as
# "name(removed)".
#
# Requires python3 to decode the JSON CreateCommand.
# Returns 0 if at least one container was handled, 1 otherwise.
fix_exit_127() {
  local containers
  containers=$(podman ps -a --format '{{.Names}} {{.Status}}' 2>/dev/null \
    | awk '/Exited \(127\)/ { print $1 }' || true)
  if [ -z "$containers" ]; then
    return 1
  fi

  local -a fixed_names=()
  local -a recreate_args
  local name create_cmd arg
  for name in $containers; do
    # Skip containers handled by other fixes.
    if [ "$name" = "searxng" ]; then
      continue
    fi

    log "Container $name has exit code 127 — recreating"

    # Must be read before the container is removed.
    create_cmd=$(podman inspect "$name" --format '{{json .Config.CreateCommand}}' 2>/dev/null || true)

    podman rm -f "$name" 2>/dev/null || true

    # Decode the JSON array into NUL-separated words, dropping the leading
    # "podman" and "run" elements.
    recreate_args=()
    if [ -n "$create_cmd" ] && [ "$create_cmd" != "null" ]; then
      while IFS= read -r -d '' arg; do
        recreate_args+=("$arg")
      done < <(printf '%s' "$create_cmd" | python3 -c '
import json, sys
args = json.load(sys.stdin)
sys.stdout.write("".join(a + "\0" for a in args[2:]))
' 2>/dev/null)
    fi

    if [ ${#recreate_args[@]} -gt 0 ] && podman run "${recreate_args[@]}" 2>&1; then
      fixed_names+=("$name")
      log "Recreated $name from original args"
    else
      fixed_names+=("$name(removed)")
      log "Removed $name — will be recreated on next deploy"
    fi
  done

  if [ ${#fixed_names[@]} -gt 0 ]; then
    return 0
  fi
  return 1
}
|
|
|
|
# ── Fix 8: Restart stopped core containers ──────────────────
|
|
# Rootless Podman 4.x restart policies don't auto-restart on crash.
|
|
# This check restarts any exited core containers (tiers 0-2).
|
|
# Fix 8: start core containers that are in the "exited" or "stopped" state.
#
# The doctor runs as root but the containers are rootless under the
# archipelago user, so every podman call goes through
# `sudo -u archipelago` with that user's runtime directory. The runtime
# directory is derived from the user's real UID (falling back to 1000 when
# the lookup fails) instead of assuming UID 1000.
#
# Containers that are missing, running, or in any other state are ignored.
# Returns 0 if at least one container was started, 1 otherwise.
fix_stopped_core_containers() {
  local -a core_containers=(
    bitcoin-knots lnd electrumx mempool-api archy-mempool-web
    archy-mempool-db archy-btcpay-db archy-nbxplorer btcpay-server
  )
  local -a restarted=()

  local uid
  uid=$(id -u archipelago 2>/dev/null) || uid=1000
  local -a rootless=(sudo -u archipelago "XDG_RUNTIME_DIR=/run/user/${uid}" podman)

  local name state
  for name in "${core_containers[@]}"; do
    state=$("${rootless[@]}" inspect "$name" --format '{{.State.Status}}' 2>/dev/null || echo "missing")
    case "$state" in
      exited | stopped)
        log "Restarting stopped container: $name"
        if "${rootless[@]}" start "$name" 2>/dev/null; then
          restarted+=("$name")
        else
          log "WARNING: failed to start $name"
        fi
        ;;
    esac
  done

  if [ ${#restarted[@]} -gt 0 ]; then
    return 0
  fi
  return 1
}
|
|
|
|
# ── Main ─────────────────────────────────────────────────────
|
|
|
|
# Run this script on a remote node over SSH.
#   $1 - SSH destination (user@host)
# The key defaults to ~/.ssh/archipelago-deploy and can be overridden with
# ARCHIPELAGO_SSH_KEY.
#
# The script is streamed over the SSH connection into a private file created
# by `mktemp` on the remote side, executed with sudo in --local mode, and
# removed afterwards. This avoids a predictable /tmp path (which another
# local user could pre-create) and avoids running a stale copy when the
# upload fails.
run_doctor_remote() {
  local remote_host="$1"
  local ssh_key="${ARCHIPELAGO_SSH_KEY:-$HOME/.ssh/archipelago-deploy}"
  # An array keeps a key path containing spaces as a single argument.
  local -a ssh_opts=(
    -o StrictHostKeyChecking=no
    -o ServerAliveInterval=15
    -o ServerAliveCountMax=4
    -i "$ssh_key"
  )
  # Single-quoted on purpose: expanded by the REMOTE shell, not locally.
  # shellcheck disable=SC2016
  local remote_cmd='tmp=$(mktemp /tmp/container-doctor.XXXXXX) || exit 1; cat >"$tmp" && sudo bash "$tmp" --local; rc=$?; rm -f "$tmp"; exit $rc'

  log "Running container doctor on $remote_host"

  if ! ssh "${ssh_opts[@]}" "$remote_host" "$remote_cmd" < "$0" 2>&1; then
    log "WARNING: remote container doctor run on $remote_host failed"
  fi
}

# If remote host provided, run via SSH. Always exits 0 so a failed doctor
# run never blocks a deploy.
if [ -n "$1" ] && [ "$1" != "--local" ]; then
  run_doctor_remote "$1"
  exit 0
fi
|
|
|
|
# Running locally (on the node itself): execute every check in order, then
# print a one-line summary. The exit status is always 0.
log "Starting container health check"

# "label:function" pairs, run top to bottom through run_fix.
doctor_checks=(
  "stale-podman:fix_stale_podman"
  "orphaned-conmon:fix_orphaned_conmon"
  "system-tor:fix_system_tor_conflict"
  "tor-permissions:fix_tor_permissions"
  "searxng:fix_searxng"
  "bitcoin-txindex:fix_bitcoin_txindex"
  "exit-127:fix_exit_127"
  "stopped-core:fix_stopped_core_containers"
)
for doctor_check in "${doctor_checks[@]}"; do
  run_fix "${doctor_check%%:*}" "${doctor_check#*:}"
done

echo ""
if ! [ $FIXES_APPLIED -gt 0 ]; then
  log "Done: all $CHECKS_PASSED checks passed — no fixes needed"
else
  log "Done: $FIXES_APPLIED fixes applied (${FIX_NAMES[*]}), $CHECKS_PASSED checks passed"
fi

exit 0
|