archy/scripts/container-doctor.sh
Dorian 430d174389 feat: Phase 2 — systemd sandboxing, Bitcoin RPC localhost binding, Tailscale deprivilege
- Service runs as unprivileged `archipelago` user instead of root
- Added systemd sandboxing: ProtectSystem=strict, NoNewPrivileges, PrivateTmp,
  MemoryDenyWriteExecute, RestrictNamespaces, SystemCallFilter
- Bitcoin RPC rpcallowip restricted to localhost + Podman subnet (10.88.0.0/16)
- Tailscale container: removed --privileged, uses cap-drop ALL + cap-add NET_ADMIN/NET_RAW

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-18 00:42:29 +00:00

415 lines
14 KiB
Bash
Executable File

#!/bin/bash
#
# Container Doctor — diagnose and fix common container health issues
#
# Usage:
# sudo ./scripts/container-doctor.sh # Run locally on node
# ./scripts/container-doctor.sh user@host # Run remotely via SSH
#
# Fixes:
# 1. Stale podman ps/stats processes (>10 = pileup)
# 2. Orphaned conmon/crun processes holding ports
# 3. System tor conflicting with container tor
# 4. Tor hidden service directory permissions (must be 700)
# 5. SearXNG read-only root / cap-drop ALL
# 6. Bitcoin Knots prune+txindex conflict
# 7. Containers stuck with exit code 127 (binary not found)
#
# Safe to run multiple times (idempotent). Never blocks deploy (exit 0 always).
#
# A pipeline's status is that of its last failing stage, not only its
# final stage.
set -o pipefail

# Tallies printed in the summary line at the end of a local run.
CHECKS_PASSED=0         # checks that found nothing to fix
FIXES_APPLIED=0         # checks that reported applying a fix
declare -a FIX_NAMES=() # labels of the checks that applied a fix
# Print one message on stdout, prefixed with the current wall-clock time
# (HH:MM:SS) and the DOCTOR tag.
#   $* - message words; joined with single spaces
log() {
  printf '[%s] DOCTOR: %s\n' "$(date +%H:%M:%S)" "$*"
}
# Run one doctor check and record its outcome in the global tallies.
#   $1   - label recorded in FIX_NAMES when the check applied a fix
#   $2.. - command (plus arguments) to run; exit status 0 means "a fix was
#          applied", any other status means "nothing needed fixing"
# Globals: FIXES_APPLIED, FIX_NAMES, CHECKS_PASSED (updated)
#
# Deliberately uses no local variables: bash locals are dynamically scoped,
# so a check that assigns an undeclared variable with the same name (for
# example a `for name in ...` loop) would overwrite the label before it is
# recorded. Positional parameters cannot be clobbered by the callee.
run_fix() {
  if "${@:2}"; then
    FIX_NAMES+=("$1")
    FIXES_APPLIED=$((FIXES_APPLIED + 1))
  else
    CHECKS_PASSED=$((CHECKS_PASSED + 1))
  fi
}
# ── Fix 1: Stale podman processes ────────────────────────────
# Kill piled-up `podman ps` / `podman stats` processes.
# More than 10 matching processes is treated as a pile-up; at or below that
# nothing is touched.
# Returns: 0 when a kill was issued, 1 when no action was needed.
fix_stale_podman() {
  local pattern="podman (ps|stats)"
  local before remaining

  before=$(pgrep -f "$pattern" 2>/dev/null | wc -l)
  before=${before:-0}
  if ! [ "$before" -gt 10 ]; then
    return 1
  fi

  log "Killing $before stale podman ps/stats processes"
  pkill -f "$pattern" 2>/dev/null || true
  sleep 2

  # Re-count only to report how effective the kill was.
  remaining=$(pgrep -f "$pattern" 2>/dev/null | wc -l)
  remaining=${remaining:-0}
  log "Reduced from $before to $remaining"
  return 0
}
# ── Fix 2: Orphaned conmon holding ports ─────────────────────
# Kill conmon processes whose container podman no longer knows about.
#
# For every process matching "conmon.*--exit-command" the container ID is
# read from the argument following "-c" on its command line; if
# `podman inspect` fails for that ID the process is treated as orphaned.
# SIGTERM is sent first and SIGKILL only if the process is still present
# after a short grace period (about one second).
#
# NOTE(review): `podman inspect` only sees containers of the invoking user;
# conmon processes belonging to another user's rootless containers would
# look orphaned here — confirm this script only runs where that cannot
# happen.
#
# Returns: 0 when at least one process was killed, 1 otherwise.
fix_orphaned_conmon() {
  local pids pid cid port_info attempt
  local killed_any=false

  pids=$(pgrep -f "conmon.*--exit-command" 2>/dev/null || true)
  if [ -z "$pids" ]; then
    return 1
  fi

  for pid in $pids; do
    # The process may already have exited since pgrep ran.
    [ -r "/proc/$pid/cmdline" ] || continue

    # /proc cmdline is NUL-separated; only the first ID match is used so the
    # value handed to podman never contains a newline.
    cid=$({ tr '\0' ' ' < "/proc/$pid/cmdline"; } 2>/dev/null \
      | grep -oP '(?<=-c )[a-f0-9]{64}' | head -n 1 || true)
    [ -n "$cid" ] || continue

    # Container still exists: this conmon is legitimate.
    if podman inspect "$cid" &>/dev/null; then
      continue
    fi

    # Ports are collected for the log line only. The trailing comma keeps
    # pid=12 from also matching pid=123.
    port_info=$(ss -tlnp 2>/dev/null | grep -F "pid=$pid," \
      | grep -oP ':\K\d+' | head -3 | tr '\n' ',' | sed 's/,$//')
    log "Killing orphaned conmon pid=$pid (ports: ${port_info:-none})"

    kill "$pid" 2>/dev/null || true
    for attempt in 1 2 3 4 5; do
      kill -0 "$pid" 2>/dev/null || break
      sleep 0.2
    done
    if kill -0 "$pid" 2>/dev/null; then
      kill -9 "$pid" 2>/dev/null || true
    fi
    killed_any=true
  done

  if $killed_any; then
    return 0
  fi
  return 1
}
# ── Fix 3: System tor conflict ───────────────────────────────
# Stop and disable a host-level tor service that occupies port 9050 while
# the archy-tor container runs on the host network.
#
# Steps: confirm a container named exactly "archy-tor" exists and uses
# NetworkMode "host"; find the PID listening on :9050; treat it as the
# system tor when its executable path ends in /tor and its cgroup does not
# look like a container cgroup. Then stop and disable the tor@default and
# tor units and restart the container.
#
# Returns: 0 when the system tor was stopped, 1 when nothing was done.
fix_system_tor_conflict() {
  local names net_mode listener_pid exe

  # The listing is captured before matching. With `set -o pipefail`,
  # piping podman straight into `grep -q` can report failure when grep
  # exits on the first match and podman is then hit by SIGPIPE.
  names=$(podman ps -a --format '{{.Names}}' 2>/dev/null || true)
  if ! grep -qx 'archy-tor' <<<"$names"; then
    return 1
  fi

  net_mode=$(podman inspect archy-tor --format '{{.HostConfig.NetworkMode}}' 2>/dev/null || true)
  if [ "$net_mode" != "host" ]; then
    return 1
  fi

  # First PID listening on TCP port 9050, if any.
  listener_pid=$(ss -tlnp 2>/dev/null | grep ':9050 ' \
    | grep -oP 'pid=\K\d+' | head -1 || true)
  if [ -z "$listener_pid" ]; then
    return 1
  fi

  exe=$(readlink "/proc/$listener_pid/exe" 2>/dev/null || true)
  if [[ "$exe" != */tor ]]; then
    return 1
  fi

  # A listener inside a container is the container tor itself. Podman
  # cgroup paths contain "libpod-<id>"; some releases add a trailing
  # "container" component, so both spellings are accepted.
  if grep -qE 'container|libpod' "/proc/$listener_pid/cgroup" 2>/dev/null; then
    return 1
  fi

  log "System tor (pid=$listener_pid) conflicts with container tor on port 9050"
  systemctl stop tor@default 2>/dev/null || true
  systemctl stop tor 2>/dev/null || true
  systemctl disable tor@default 2>/dev/null || true
  systemctl disable tor 2>/dev/null || true
  sleep 2
  # Restart container tor now that the port is free.
  podman restart archy-tor 2>/dev/null || true
  log "Disabled system tor, restarted container tor"
  return 0
}
# ── Fix 4: Tor hidden service permissions ────────────────────
# Set mode 700 on Tor hidden-service directories that have any other mode.
#
# Arguments: $@ - optional base directories to scan; when none are given
#                 /var/lib/archipelago/tor and /var/lib/tor are used.
# Only directories named hidden_service_* directly below a base directory
# are considered. When at least one directory was changed, the archy-tor
# container is restarted (errors from the restart are ignored).
#
# Returns: 0 when at least one directory was changed, 1 otherwise.
fix_tor_permissions() {
  local -a roots=("$@")
  local root dir mode
  local changed=false

  if [ ${#roots[@]} -eq 0 ]; then
    roots=("/var/lib/archipelago/tor" "/var/lib/tor")
  fi

  for root in "${roots[@]}"; do
    if [ ! -d "$root" ]; then
      continue
    fi
    # NUL-delimited so unusual directory names survive intact.
    while IFS= read -r -d '' dir; do
      mode=$(stat -c '%a' "$dir" 2>/dev/null || true)
      if [ "$mode" = "700" ]; then
        continue
      fi
      # Only report (and count) a fix when chmod actually succeeded.
      if chmod 700 "$dir" 2>/dev/null; then
        log "Fixed permissions on $dir ($mode -> 700)"
        changed=true
      else
        log "Could not fix permissions on $dir (mode ${mode:-unknown})"
      fi
    done < <(find "$root" -maxdepth 1 -name "hidden_service_*" -type d -print0 2>/dev/null)
  done

  if $changed; then
    podman restart archy-tor 2>/dev/null || true
    return 0
  fi
  return 1
}
# ── Fix 5: SearXNG read-only / cap-drop ─────────────────────
# Recreate the searxng container when it is exited, was created with a
# read-only root filesystem, or drops ALL capabilities.
#
# The replacement keeps the first published host port (default 8888 when
# none is found), maps it to container port 8080 and is started without
# --read-only and without --cap-drop. Before the new container is started,
# the process that was listening on that host port is killed with SIGKILL.
#
# Returns: 0 when the container was removed for recreation (even if the
#          following `podman run` failed — that is logged), 1 when searxng
#          is absent or needs no change.
fix_searxng() {
  local names state readonly_root cap_drop
  local bindings binding host_port conmon_pid
  local needs_fix=false

  # The listing is captured before matching. With `set -o pipefail`,
  # piping podman straight into `grep -q` can report failure when grep
  # exits on the first match and podman is then hit by SIGPIPE.
  names=$(podman ps -a --format '{{.Names}}' 2>/dev/null || true)
  if ! grep -qx 'searxng' <<<"$names"; then
    return 1
  fi

  state=$(podman inspect searxng --format '{{.State.Status}}' 2>/dev/null || true)
  readonly_root=$(podman inspect searxng --format '{{.HostConfig.ReadonlyRootfs}}' 2>/dev/null || true)
  cap_drop=$(podman inspect searxng --format '{{.HostConfig.CapDrop}}' 2>/dev/null || true)

  # Any one of the three conditions triggers a recreate.
  if [ "$state" = "exited" ]; then
    needs_fix=true
  fi
  if [ "$readonly_root" = "true" ]; then
    needs_fix=true
  fi
  if [[ "$cap_drop" == *"ALL"* ]] || [[ "$cap_drop" == *"all"* ]]; then
    needs_fix=true
  fi
  if ! $needs_fix; then
    return 1
  fi

  log "Recreating SearXNG (readonly=$readonly_root, cap_drop=$cap_drop, state=$state)"

  # Each output line has the form "<container-port>=<host-port>"; only the
  # first line is used.
  bindings=$(podman inspect searxng --format '{{range $k,$v := .HostConfig.PortBindings}}{{$k}}={{range $v}}{{.HostPort}}{{end}}{{println}}{{end}}' 2>/dev/null || true)
  binding=${bindings%%$'\n'*}
  host_port="${binding##*=}"
  host_port="${host_port:-8888}"

  # Remember who holds the port now, so a stale holder can be removed after
  # the container itself is gone.
  conmon_pid=$(ss -tlnp 2>/dev/null | grep ":${host_port} " \
    | grep -oP 'pid=\K\d+' | head -1 || true)

  podman stop searxng 2>/dev/null || true
  podman rm -f searxng 2>/dev/null || true
  if [ -n "$conmon_pid" ]; then
    kill -9 "$conmon_pid" 2>/dev/null || true
    sleep 2
  fi

  if podman run -d \
    --name searxng \
    --restart=unless-stopped \
    --security-opt=no-new-privileges:true \
    --tmpfs /tmp:rw,noexec,nosuid,size=256m \
    -v searxng-config:/etc/searxng:rw \
    -v searxng-cache:/var/cache/searxng:rw \
    -p "${host_port}:8080" \
    --memory=512m \
    docker.io/searxng/searxng:latest 2>&1; then
    log "SearXNG recreated (no readonly, no cap-drop ALL)"
  else
    # Do not claim success: the old container is already removed.
    log "SearXNG recreation failed: podman run returned an error, searxng is not running"
  fi
  return 0
}
# ── Fix 6: Bitcoin Knots prune+txindex conflict ──────────────
# Resolve a prune + txindex conflict on the bitcoin-knots container by
# recreating it from its image with no command arguments (so without
# txindex).
#
# Arguments:
#   $1 - bitcoin data directory (default /var/lib/archipelago/bitcoin);
#        bitcoin.conf and indexes/txindex are looked up below it and it is
#        mounted at /home/bitcoin/.bitcoin in the new container
#   $2 - secrets directory (default /var/lib/archipelago/secrets); the file
#        bitcoin-rpc-password in it overrides the default RPC password
#
# A conflict means: the container exists, bitcoin.conf enables pruning
# (prune=<non-zero>; prune=0 disables pruning and is not a conflict), and
# the container command contains "txindex".
#
# Side effects when a conflict is found: bitcoin.conf is overwritten with a
# fixed template if it has no rpcuser= entry, the txindex directory is
# deleted, listeners on ports 8332/8333 are killed with SIGKILL, and the
# container is removed and recreated.
#
# NOTE(review): the RPC password is embedded in --health-cmd and is
# therefore visible in the process list and in `podman inspect`.
#
# Returns: 0 when the container was removed for recreation (a failing
#          `podman run` is logged), 1 when there is no conflict or the
#          image name cannot be determined.
fix_bitcoin_txindex() {
  local data_dir="${1:-/var/lib/archipelago/bitcoin}"
  local secrets_dir="${2:-/var/lib/archipelago/secrets}"
  local conf="$data_dir/bitcoin.conf"
  local txindex_dir="$data_dir/indexes/txindex"
  local names cmd image network rpc_pass port holder
  local -a net_arg=()

  # The listing is captured before matching. With `set -o pipefail`,
  # piping podman straight into `grep -q` can report failure when grep
  # exits on the first match and podman is then hit by SIGPIPE.
  names=$(podman ps -a --format '{{.Names}}' 2>/dev/null || true)
  if ! grep -qx 'bitcoin-knots' <<<"$names"; then
    return 1
  fi

  # Pruning counts as enabled only for a non-zero value.
  if [ ! -f "$conf" ] || ! grep -Eq '^prune=0*[1-9]' "$conf"; then
    return 1
  fi

  cmd=$(podman inspect bitcoin-knots --format '{{json .Config.Cmd}}' 2>/dev/null || true)
  if ! grep -q "txindex" <<<"$cmd"; then
    return 1
  fi

  log "Bitcoin Knots: prune+txindex conflict detected"

  image=$(podman inspect bitcoin-knots --format '{{.ImageName}}' 2>/dev/null || true)
  network=$(podman inspect bitcoin-knots --format '{{.HostConfig.NetworkMode}}' 2>/dev/null || true)
  # Without an image name the container could be removed but not recreated,
  # so stop before anything destructive happens.
  if [ -z "$image" ]; then
    log "Bitcoin Knots: could not determine image name, leaving container untouched"
    return 1
  fi

  # Per-installation RPC password, falling back to the built-in default.
  rpc_pass="archipelago"
  if [ -f "$secrets_dir/bitcoin-rpc-password" ]; then
    rpc_pass=$(< "$secrets_dir/bitcoin-rpc-password")
  fi

  # A config without any rpcuser= line is replaced wholesale.
  if ! grep -q 'rpcuser=' "$conf"; then
    printf '%s\n' \
      "server=1" \
      "prune=550" \
      "rpcuser=archipelago" \
      "rpcpassword=$rpc_pass" \
      "rpcbind=0.0.0.0" \
      "rpcallowip=127.0.0.1/32" \
      "rpcallowip=10.88.0.0/16" \
      "rpcport=8332" \
      "listen=1" \
      "printtoconsole=1" > "$conf"
    log "Updated bitcoin.conf with full RPC settings"
  fi

  # Remove the stale index, including any sub-directories.
  if [ -d "$txindex_dir" ]; then
    rm -rf -- "$txindex_dir" 2>/dev/null || true
    log "Removed stale txindex directory"
  fi

  podman stop bitcoin-knots 2>/dev/null || true
  podman rm -f bitcoin-knots 2>/dev/null || true
  sleep 2

  # Anything still listening on the RPC/P2P ports would block the new
  # container from binding them.
  for port in 8332 8333; do
    holder=$(ss -tlnp 2>/dev/null | grep ":${port} " \
      | grep -oP 'pid=\K\d+' | head -1 || true)
    if [ -n "$holder" ]; then
      kill -9 "$holder" 2>/dev/null || true
    fi
  done
  sleep 1

  # Keep the previous network, except that an empty value or the default
  # "bridge" network is replaced by archy-net.
  case "$network" in
    "" | bridge) net_arg=(--network=archy-net) ;;
    *) net_arg=("--network=$network") ;;
  esac

  if podman run -d \
    --name bitcoin-knots \
    --restart=always \
    "${net_arg[@]}" \
    -p 8332:8332 \
    -p 8333:8333 \
    -v "$data_dir:/home/bitcoin/.bitcoin" \
    --memory=2g \
    --cap-drop=ALL \
    --cap-add=CHOWN \
    --cap-add=FOWNER \
    --cap-add=SETUID \
    --cap-add=SETGID \
    --cap-add=DAC_OVERRIDE \
    --security-opt=no-new-privileges:true \
    --health-cmd="bitcoin-cli -rpcuser=archipelago -rpcpassword=$rpc_pass getblockchaininfo || exit 1" \
    --health-interval=30s \
    --health-retries=3 \
    "$image" 2>&1; then
    log "Bitcoin Knots recreated without txindex (prune mode)"
  else
    log "Bitcoin Knots recreation failed: podman run returned an error, bitcoin-knots is not running"
  fi
  return 0
}
# ── Fix 7: Exit code 127 containers ─────────────────────────
# Remove every container whose status shows "Exited (127)" and, where the
# original create command is recorded, start it again with the same
# arguments.
#
# The searxng container is skipped (it has its own check). For the others
# the JSON array in .Config.CreateCommand is decoded with python3; its
# first two elements are dropped and the rest is passed to `podman run`.
# NOTE(review): this assumes the recorded command starts with exactly
# "podman run" (or "podman create") and no global options — confirm.
# If the command is missing, empty, or cannot be decoded, the container is
# only removed.
#
# Returns: 0 when at least one container was handled, 1 otherwise.
fix_exit_127() {
  local containers name create_cmd arg
  local -a fixed_names=()
  local -a recreate_args=()

  containers=$(podman ps -a --format '{{.Names}} {{.Status}}' 2>/dev/null \
    | grep 'Exited (127)' | awk '{print $1}' || true)
  if [ -z "$containers" ]; then
    return 1
  fi

  for name in $containers; do
    # Skip containers handled by other fixes.
    if [ "$name" = "searxng" ]; then
      continue
    fi
    log "Container $name has exit code 127 — recreating"

    # Must be read before the container is removed.
    create_cmd=$(podman inspect "$name" --format '{{json .Config.CreateCommand}}' 2>/dev/null || true)
    podman rm -f "$name" 2>/dev/null || true

    # Decode into an array, one NUL-terminated argument at a time, so that
    # spaces, quotes, `$` and `;` inside an argument reach podman verbatim
    # and are never interpreted by the shell.
    recreate_args=()
    if [ -n "$create_cmd" ] && [ "$create_cmd" != "null" ]; then
      while IFS= read -r -d '' arg; do
        recreate_args+=("$arg")
      done < <(printf '%s' "$create_cmd" | python3 -c '
import json, sys
for arg in json.load(sys.stdin)[2:]:
    sys.stdout.write(arg + "\0")
' 2>/dev/null)
    fi

    if [ ${#recreate_args[@]} -gt 0 ]; then
      podman run "${recreate_args[@]}" 2>&1 || true
      fixed_names+=("$name")
      log "Recreated $name from original args"
    else
      fixed_names+=("$name(removed)")
      log "Removed $name — will be recreated on next deploy"
    fi
  done

  if [ ${#fixed_names[@]} -gt 0 ]; then
    return 0
  fi
  return 1
}
# ── Main ─────────────────────────────────────────────────────
# Remote mode: a first argument other than --local is taken as an SSH
# target. The script copies itself to /tmp on that host and runs there
# under sudo with --local. The exit status is always 0 so a failing doctor
# run never blocks a deploy.
# NOTE(review): host key checking is disabled and the remote path in /tmp is
# fixed and predictable — confirm both are acceptable for the target hosts.
if [ -n "$1" ] && [ "$1" != "--local" ]; then
  REMOTE_HOST="$1"
  SSH_KEY="${ARCHIPELAGO_SSH_KEY:-$HOME/.ssh/archipelago-deploy}"
  # An array keeps a key path containing spaces as a single argument.
  SSH_OPTS=(
    -o StrictHostKeyChecking=no
    -o ServerAliveInterval=15
    -o ServerAliveCountMax=4
    -i "$SSH_KEY"
  )
  log "Running container doctor on $REMOTE_HOST"
  # Run remotely only when the copy succeeded; otherwise an older copy left
  # in /tmp could be executed instead of this version.
  if scp "${SSH_OPTS[@]}" "$0" "$REMOTE_HOST:/tmp/container-doctor.sh" 2>/dev/null; then
    ssh "${SSH_OPTS[@]}" "$REMOTE_HOST" "sudo bash /tmp/container-doctor.sh --local" 2>&1
  else
    log "Could not copy script to $REMOTE_HOST, remote run skipped"
  fi
  exit 0
fi

# Local mode (on the node itself): run every check in order. run_fix counts
# a check as a fix when it exits 0 and as passed otherwise.
log "Starting container health check"
run_fix "stale-podman" fix_stale_podman
run_fix "orphaned-conmon" fix_orphaned_conmon
run_fix "system-tor" fix_system_tor_conflict
run_fix "tor-permissions" fix_tor_permissions
run_fix "searxng" fix_searxng
run_fix "bitcoin-txindex" fix_bitcoin_txindex
run_fix "exit-127" fix_exit_127

echo ""
if [ "$FIXES_APPLIED" -gt 0 ]; then
  log "Done: $FIXES_APPLIED fixes applied (${FIX_NAMES[*]}), $CHECKS_PASSED checks passed"
else
  log "Done: all $CHECKS_PASSED checks passed — no fixes needed"
fi
exit 0